diff options
Diffstat (limited to 'drivers/infiniband/hw')
48 files changed, 25342 insertions, 30 deletions
diff --git a/drivers/infiniband/hw/ipath/Kconfig b/drivers/infiniband/hw/ipath/Kconfig new file mode 100644 index 0000000..9ea67c4 --- /dev/null +++ b/drivers/infiniband/hw/ipath/Kconfig @@ -0,0 +1,16 @@ +config IPATH_CORE + tristate "PathScale InfiniPath Driver" + depends on 64BIT && PCI_MSI && NET + ---help--- + This is a low-level driver for PathScale InfiniPath host channel + adapters (HCAs) based on the HT-400 and PE-800 chips. + +config INFINIBAND_IPATH + tristate "PathScale InfiniPath Verbs Driver" + depends on IPATH_CORE && INFINIBAND + ---help--- + This is a driver that provides InfiniBand verbs support for + PathScale InfiniPath host channel adapters (HCAs). This + allows these devices to be used with both kernel upper level + protocols such as IP-over-InfiniBand as well as with userspace + applications (in conjunction with InfiniBand userspace access). diff --git a/drivers/infiniband/hw/ipath/Makefile b/drivers/infiniband/hw/ipath/Makefile new file mode 100644 index 0000000..b4d084a --- /dev/null +++ b/drivers/infiniband/hw/ipath/Makefile @@ -0,0 +1,36 @@ +EXTRA_CFLAGS += -DIPATH_IDSTR='"PathScale kernel.org driver"' \ + -DIPATH_KERN_TYPE=0 + +obj-$(CONFIG_IPATH_CORE) += ipath_core.o +obj-$(CONFIG_INFINIBAND_IPATH) += ib_ipath.o + +ipath_core-y := \ + ipath_diag.o \ + ipath_driver.o \ + ipath_eeprom.o \ + ipath_file_ops.o \ + ipath_fs.o \ + ipath_ht400.o \ + ipath_init_chip.o \ + ipath_intr.o \ + ipath_layer.o \ + ipath_pe800.o \ + ipath_stats.o \ + ipath_sysfs.o \ + ipath_user_pages.o + +ipath_core-$(CONFIG_X86_64) += ipath_wc_x86_64.o + +ib_ipath-y := \ + ipath_cq.o \ + ipath_keys.o \ + ipath_mad.o \ + ipath_mr.o \ + ipath_qp.o \ + ipath_rc.o \ + ipath_ruc.o \ + ipath_srq.o \ + ipath_uc.o \ + ipath_ud.o \ + ipath_verbs.o \ + ipath_verbs_mcast.o diff --git a/drivers/infiniband/hw/ipath/ipath_common.h b/drivers/infiniband/hw/ipath/ipath_common.h new file mode 100644 index 0000000..48a5524 --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_common.h @@ -0,0 +1,616 @@ +/* + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _IPATH_COMMON_H +#define _IPATH_COMMON_H + +/* + * This file contains defines, structures, etc. that are used + * to communicate between kernel and user code. + */ + +/* This is the IEEE-assigned OUI for PathScale, Inc. */ +#define IPATH_SRC_OUI_1 0x00 +#define IPATH_SRC_OUI_2 0x11 +#define IPATH_SRC_OUI_3 0x75 + +/* version of protocol header (known to chip also). In the long run, + * we should be able to generate and accept a range of version numbers; + * for now we only accept one, and it's compiled in. + */ +#define IPS_PROTO_VERSION 2 + +/* + * These are compile time constants that you may want to enable or disable + * if you are trying to debug problems with code or performance. + * IPATH_VERBOSE_TRACING define as 1 if you want additional tracing in + * fastpath code + * IPATH_TRACE_REGWRITES define as 1 if you want register writes to be + * traced in faspath code + * _IPATH_TRACING define as 0 if you want to remove all tracing in a + * compilation unit + * _IPATH_DEBUGGING define as 0 if you want to remove debug prints + */ + +/* + * The value in the BTH QP field that InfiniPath uses to differentiate + * an infinipath protocol IB packet vs standard IB transport + */ +#define IPATH_KD_QP 0x656b79 + +/* + * valid states passed to ipath_set_linkstate() user call + */ +#define IPATH_IB_LINKDOWN 0 +#define IPATH_IB_LINKARM 1 +#define IPATH_IB_LINKACTIVE 2 +#define IPATH_IB_LINKINIT 3 +#define IPATH_IB_LINKDOWN_SLEEP 4 +#define IPATH_IB_LINKDOWN_DISABLE 5 + +/* + * stats maintained by the driver. For now, at least, this is global + * to all minor devices. + */ +struct infinipath_stats { + /* number of interrupts taken */ + __u64 sps_ints; + /* number of interrupts for errors */ + __u64 sps_errints; + /* number of errors from chip (not incl. packet errors or CRC) */ + __u64 sps_errs; + /* number of packet errors from chip other than CRC */ + __u64 sps_pkterrs; + /* number of packets with CRC errors (ICRC and VCRC) */ + __u64 sps_crcerrs; + /* number of hardware errors reported (parity, etc.) */ + __u64 sps_hwerrs; + /* number of times IB link changed state unexpectedly */ + __u64 sps_iblink; + /* no longer used; left for compatibility */ + __u64 sps_unused3; + /* number of kernel (port0) packets received */ + __u64 sps_port0pkts; + /* number of "ethernet" packets sent by driver */ + __u64 sps_ether_spkts; + /* number of "ethernet" packets received by driver */ + __u64 sps_ether_rpkts; + /* number of SMA packets sent by driver */ + __u64 sps_sma_spkts; + /* number of SMA packets received by driver */ + __u64 sps_sma_rpkts; + /* number of times all ports rcvhdrq was full and packet dropped */ + __u64 sps_hdrqfull; + /* number of times all ports egrtid was full and packet dropped */ + __u64 sps_etidfull; + /* + * number of times we tried to send from driver, but no pio buffers + * avail + */ + __u64 sps_nopiobufs; + /* number of ports currently open */ + __u64 sps_ports; + /* list of pkeys (other than default) accepted (0 means not set) */ + __u16 sps_pkeys[4]; + /* lids for up to 4 infinipaths, indexed by infinipath # */ + __u16 sps_lid[4]; + /* number of user ports per chip (not IB ports) */ + __u32 sps_nports; + /* not our interrupt, or already handled */ + __u32 sps_nullintr; + /* max number of packets handled per receive call */ + __u32 sps_maxpkts_call; + /* avg number of packets handled per receive call */ + __u32 sps_avgpkts_call; + /* total number of pages locked */ + __u64 sps_pagelocks; + /* total number of pages unlocked */ + __u64 sps_pageunlocks; + /* + * Number of packets dropped in kernel other than errors (ether + * packets if ipath not configured, sma/mad, etc.) + */ + __u64 sps_krdrops; + /* mlids for up to 4 infinipaths, indexed by infinipath # */ + __u16 sps_mlid[4]; + /* pad for future growth */ + __u64 __sps_pad[45]; +}; + +/* + * These are the status bits readable (in ascii form, 64bit value) + * from the "status" sysfs file. + */ +#define IPATH_STATUS_INITTED 0x1 /* basic initialization done */ +#define IPATH_STATUS_DISABLED 0x2 /* hardware disabled */ +/* Device has been disabled via admin request */ +#define IPATH_STATUS_ADMIN_DISABLED 0x4 +#define IPATH_STATUS_OIB_SMA 0x8 /* ipath_mad kernel SMA running */ +#define IPATH_STATUS_SMA 0x10 /* user SMA running */ +/* Chip has been found and initted */ +#define IPATH_STATUS_CHIP_PRESENT 0x20 +/* IB link is at ACTIVE, usable for data traffic */ +#define IPATH_STATUS_IB_READY 0x40 +/* link is configured, LID, MTU, etc. have been set */ +#define IPATH_STATUS_IB_CONF 0x80 +/* no link established, probably no cable */ +#define IPATH_STATUS_IB_NOCABLE 0x100 +/* A Fatal hardware error has occurred. */ +#define IPATH_STATUS_HWERROR 0x200 + +/* + * The list of usermode accessible registers. Also see Reg_* later in file. + */ +typedef enum _ipath_ureg { + /* (RO) DMA RcvHdr to be used next. */ + ur_rcvhdrtail = 0, + /* (RW) RcvHdr entry to be processed next by host. */ + ur_rcvhdrhead = 1, + /* (RO) Index of next Eager index to use. */ + ur_rcvegrindextail = 2, + /* (RW) Eager TID to be processed next */ + ur_rcvegrindexhead = 3, + /* For internal use only; max register number. */ + _IPATH_UregMax +} ipath_ureg; + +/* bit values for spi_runtime_flags */ +#define IPATH_RUNTIME_HT 0x1 +#define IPATH_RUNTIME_PCIE 0x2 +#define IPATH_RUNTIME_FORCE_WC_ORDER 0x4 +#define IPATH_RUNTIME_RCVHDR_COPY 0x8 + +/* + * This structure is returned by ipath_userinit() immediately after + * open to get implementation-specific info, and info specific to this + * instance. + * + * This struct must have explict pad fields where type sizes + * may result in different alignments between 32 and 64 bit + * programs, since the 64 bit * bit kernel requires the user code + * to have matching offsets + */ +struct ipath_base_info { + /* version of hardware, for feature checking. */ + __u32 spi_hw_version; + /* version of software, for feature checking. */ + __u32 spi_sw_version; + /* InfiniPath port assigned, goes into sent packets */ + __u32 spi_port; + /* + * IB MTU, packets IB data must be less than this. + * The MTU is in bytes, and will be a multiple of 4 bytes. + */ + __u32 spi_mtu; + /* + * Size of a PIO buffer. Any given packet's total size must be less + * than this (in words). Included is the starting control word, so + * if 513 is returned, then total pkt size is 512 words or less. + */ + __u32 spi_piosize; + /* size of the TID cache in infinipath, in entries */ + __u32 spi_tidcnt; + /* size of the TID Eager list in infinipath, in entries */ + __u32 spi_tidegrcnt; + /* size of a single receive header queue entry. */ + __u32 spi_rcvhdrent_size; + /* + * Count of receive header queue entries allocated. + * This may be less than the spu_rcvhdrcnt passed in!. + */ + __u32 spi_rcvhdr_cnt; + + /* per-chip and other runtime features bitmap (IPATH_RUNTIME_*) */ + __u32 spi_runtime_flags; + + /* address where receive buffer queue is mapped into */ + __u64 spi_rcvhdr_base; + + /* user program. */ + + /* base address of eager TID receive buffers. */ + __u64 spi_rcv_egrbufs; + + /* Allocated by initialization code, not by protocol. */ + + /* + * Size of each TID buffer in host memory, starting at + * spi_rcv_egrbufs. The buffers are virtually contiguous. + */ + __u32 spi_rcv_egrbufsize; + /* + * The special QP (queue pair) value that identifies an infinipath + * protocol packet from standard IB packets. More, probably much + * more, to be added. + */ + __u32 spi_qpair; + + /* + * User register base for init code, not to be used directly by + * protocol or applications. + */ + __u64 __spi_uregbase; + /* + * Maximum buffer size in bytes that can be used in a single TID + * entry (assuming the buffer is aligned to this boundary). This is + * the minimum of what the hardware and software support Guaranteed + * to be a power of 2. + */ + __u32 spi_tid_maxsize; + /* + * alignment of each pio send buffer (byte count + * to add to spi_piobufbase to get to second buffer) + */ + __u32 spi_pioalign; + /* + * The index of the first pio buffer available to this process; + * needed to do lookup in spi_pioavailaddr; not added to + * spi_piobufbase. + */ + __u32 spi_pioindex; + /* number of buffers mapped for this process */ + __u32 spi_piocnt; + + /* + * Base address of writeonly pio buffers for this process. + * Each buffer has spi_piosize words, and is aligned on spi_pioalign + * boundaries. spi_piocnt buffers are mapped from this address + */ + __u64 spi_piobufbase; + + /* + * Base address of readonly memory copy of the pioavail registers. + * There are 2 bits for each buffer. + */ + __u64 spi_pioavailaddr; + + /* + * Address where driver updates a copy of the interface and driver + * status (IPATH_STATUS_*) as a 64 bit value. It's followed by a + * string indicating hardware error, if there was one. + */ + __u64 spi_status; + + /* number of chip ports available to user processes */ + __u32 spi_nports; + /* unit number of chip we are using */ + __u32 spi_unit; + /* num bufs in each contiguous set */ + __u32 spi_rcv_egrperchunk; + /* size in bytes of each contiguous set */ + __u32 spi_rcv_egrchunksize; + /* total size of mmap to cover full rcvegrbuffers */ + __u32 spi_rcv_egrbuftotlen; +} __attribute__ ((aligned(8))); + + +/* + * This version number is given to the driver by the user code during + * initialization in the spu_userversion field of ipath_user_info, so + * the driver can check for compatibility with user code. + * + * The major version changes when data structures + * change in an incompatible way. The driver must be the same or higher + * for initialization to succeed. In some cases, a higher version + * driver will not interoperate with older software, and initialization + * will return an error. + */ +#define IPATH_USER_SWMAJOR 1 + +/* + * Minor version differences are always compatible + * a within a major version, however if if user software is larger + * than driver software, some new features and/or structure fields + * may not be implemented; the user code must deal with this if it + * cares, or it must abort after initialization reports the difference + */ +#define IPATH_USER_SWMINOR 2 + +#define IPATH_USER_SWVERSION ((IPATH_USER_SWMAJOR<<16) | IPATH_USER_SWMINOR) + +#define IPATH_KERN_TYPE 0 + +/* + * Similarly, this is the kernel version going back to the user. It's + * slightly different, in that we want to tell if the driver was built as + * part of a PathScale release, or from the driver from OpenIB, kernel.org, + * or a standard distribution, for support reasons. The high bit is 0 for + * non-PathScale, and 1 for PathScale-built/supplied. + * + * It's returned by the driver to the user code during initialization in the + * spi_sw_version field of ipath_base_info, so the user code can in turn + * check for compatibility with the kernel. +*/ +#define IPATH_KERN_SWVERSION ((IPATH_KERN_TYPE<<31) | IPATH_USER_SWVERSION) + +/* + * This structure is passed to ipath_userinit() to tell the driver where + * user code buffers are, sizes, etc. The offsets and sizes of the + * fields must remain unchanged, for binary compatibility. It can + * be extended, if userversion is changed so user code can tell, if needed + */ +struct ipath_user_info { + /* + * version of user software, to detect compatibility issues. + * Should be set to IPATH_USER_SWVERSION. + */ + __u32 spu_userversion; + + /* desired number of receive header queue entries */ + __u32 spu_rcvhdrcnt; + + /* size of struct base_info to write to */ + __u32 spu_base_info_size; + + /* + * number of words in KD protocol header + * This tells InfiniPath how many words to copy to rcvhdrq. If 0, + * kernel uses a default. Once set, attempts to set any other value + * are an error (EAGAIN) until driver is reloaded. + */ + __u32 spu_rcvhdrsize; + + /* + * cache line aligned (64 byte) user address to + * which the rcvhdrtail register will be written by infinipath + * whenever it changes, so that no chip registers are read in + * the performance path. + */ + __u64 spu_rcvhdraddr; + + /* + * address of struct base_info to write to + */ + __u64 spu_base_info; + +} __attribute__ ((aligned(8))); + +/* User commands. */ + +#define IPATH_CMD_MIN 16 + +#define IPATH_CMD_USER_INIT 16 /* set up userspace */ +#define IPATH_CMD_PORT_INFO 17 /* find out what resources we got */ +#define IPATH_CMD_RECV_CTRL 18 /* control receipt of packets */ +#define IPATH_CMD_TID_UPDATE 19 /* update expected TID entries */ +#define IPATH_CMD_TID_FREE 20 /* free expected TID entries */ +#define IPATH_CMD_SET_PART_KEY 21 /* add partition key */ + +#define IPATH_CMD_MAX 21 + +struct ipath_port_info { + __u32 num_active; /* number of active units */ + __u32 unit; /* unit (chip) assigned to caller */ + __u32 port; /* port on unit assigned to caller */ +}; + +struct ipath_tid_info { + __u32 tidcnt; + /* make structure same size in 32 and 64 bit */ + __u32 tid__unused; + /* virtual address of first page in transfer */ + __u64 tidvaddr; + /* pointer (same size 32/64 bit) to __u16 tid array */ + __u64 tidlist; + + /* + * pointer (same size 32/64 bit) to bitmap of TIDs used + * for this call; checked for being large enough at open + */ + __u64 tidmap; +}; + +struct ipath_cmd { + __u32 type; /* command type */ + union { + struct ipath_tid_info tid_info; + struct ipath_user_info user_info; + /* address in userspace of struct ipath_port_info to + write result to */ + __u64 port_info; + /* enable/disable receipt of packets */ + __u32 recv_ctrl; + /* partition key to set */ + __u16 part_key; + } cmd; +}; + +struct ipath_iovec { + /* Pointer to data, but same size 32 and 64 bit */ + __u64 iov_base; + + /* + * Length of data; don't need 64 bits, but want + * ipath_sendpkt to remain same size as before 32 bit changes, so... + */ + __u64 iov_len; +}; + +/* + * Describes a single packet for send. Each packet can have one or more + * buffers, but the total length (exclusive of IB headers) must be less + * than the MTU, and if using the PIO method, entire packet length, + * including IB headers, must be less than the ipath_piosize value (words). + * Use of this necessitates including sys/uio.h + */ +struct __ipath_sendpkt { + __u32 sps_flags; /* flags for packet (TBD) */ + __u32 sps_cnt; /* number of entries to use in sps_iov */ + /* array of iov's describing packet. TEMPORARY */ + struct ipath_iovec sps_iov[4]; +}; + +/* Passed into SMA special file's ->read and ->write methods. */ +struct ipath_sma_pkt +{ + __u32 unit; /* unit on which to send packet */ + __u64 data; /* address of payload in userspace */ + __u32 len; /* length of payload */ +}; + +/* + * Data layout in I2C flash (for GUID, etc.) + * All fields are little-endian binary unless otherwise stated + */ +#define IPATH_FLASH_VERSION 1 +struct ipath_flash { + /* flash layout version (IPATH_FLASH_VERSION) */ + __u8 if_fversion; + /* checksum protecting if_length bytes */ + __u8 if_csum; + /* + * valid length (in use, protected by if_csum), including + * if_fversion and if_sum themselves) + */ + __u8 if_length; + /* the GUID, in network order */ + __u8 if_guid[8]; + /* number of GUIDs to use, starting from if_guid */ + __u8 if_numguid; + /* the board serial number, in ASCII */ + char if_serial[12]; + /* board mfg date (YYYYMMDD ASCII) */ + char if_mfgdate[8]; + /* last board rework/test date (YYYYMMDD ASCII) */ + char if_testdate[8]; + /* logging of error counts, TBD */ + __u8 if_errcntp[4]; + /* powered on hours, updated at driver unload */ + __u8 if_powerhour[2]; + /* ASCII free-form comment field */ + char if_comment[32]; + /* 78 bytes used, min flash size is 128 bytes */ + __u8 if_future[50]; +}; + +/* + * These are the counters implemented in the chip, and are listed in order. + * The InterCaps naming is taken straight from the chip spec. + */ +struct infinipath_counters { + __u64 LBIntCnt; + __u64 LBFlowStallCnt; + __u64 Reserved1; + __u64 TxUnsupVLErrCnt; + __u64 TxDataPktCnt; + __u64 TxFlowPktCnt; + __u64 TxDwordCnt; + __u64 TxLenErrCnt; + __u64 TxMaxMinLenErrCnt; + __u64 TxUnderrunCnt; + __u64 TxFlowStallCnt; + __u64 TxDroppedPktCnt; + __u64 RxDroppedPktCnt; + __u64 RxDataPktCnt; + __u64 RxFlowPktCnt; + __u64 RxDwordCnt; + __u64 RxLenErrCnt; + __u64 RxMaxMinLenErrCnt; + __u64 RxICRCErrCnt; + __u64 RxVCRCErrCnt; + __u64 RxFlowCtrlErrCnt; + __u64 RxBadFormatCnt; + __u64 RxLinkProblemCnt; + __u64 RxEBPCnt; + __u64 RxLPCRCErrCnt; + __u64 RxBufOvflCnt; + __u64 RxTIDFullErrCnt; + __u64 RxTIDValidErrCnt; + __u64 RxPKeyMismatchCnt; + __u64 RxP0HdrEgrOvflCnt; + __u64 RxP1HdrEgrOvflCnt; + __u64 RxP2HdrEgrOvflCnt; + __u64 RxP3HdrEgrOvflCnt; + __u64 RxP4HdrEgrOvflCnt; + __u64 RxP5HdrEgrOvflCnt; + __u64 RxP6HdrEgrOvflCnt; + __u64 RxP7HdrEgrOvflCnt; + __u64 RxP8HdrEgrOvflCnt; + __u64 Reserved6; + __u64 Reserved7; + __u64 IBStatusChangeCnt; + __u64 IBLinkErrRecoveryCnt; + __u64 IBLinkDownedCnt; + __u64 IBSymbolErrCnt; +}; + +/* + * The next set of defines are for packet headers, and chip register + * and memory bits that are visible to and/or used by user-mode software + * The other bits that are used only by the driver or diags are in + * ipath_registers.h + */ + +/* RcvHdrFlags bits */ +#define INFINIPATH_RHF_LENGTH_MASK 0x7FF +#define INFINIPATH_RHF_LENGTH_SHIFT 0 +#define INFINIPATH_RHF_RCVTYPE_MASK 0x7 +#define INFINIPATH_RHF_RCVTYPE_SHIFT 11 +#define INFINIPATH_RHF_EGRINDEX_MASK 0x7FF +#define INFINIPATH_RHF_EGRINDEX_SHIFT 16 +#define INFINIPATH_RHF_H_ICRCERR 0x80000000 +#define INFINIPATH_RHF_H_VCRCERR 0x40000000 +#define INFINIPATH_RHF_H_PARITYERR 0x20000000 +#define INFINIPATH_RHF_H_LENERR 0x10000000 +#define INFINIPATH_RHF_H_MTUERR 0x08000000 +#define INFINIPATH_RHF_H_IHDRERR 0x04000000 +#define INFINIPATH_RHF_H_TIDERR 0x02000000 +#define INFINIPATH_RHF_H_MKERR 0x01000000 +#define INFINIPATH_RHF_H_IBERR 0x00800000 +#define INFINIPATH_RHF_L_SWA 0x00008000 +#define INFINIPATH_RHF_L_SWB 0x00004000 + +/* infinipath header fields */ +#define INFINIPATH_I_VERS_MASK 0xF +#define INFINIPATH_I_VERS_SHIFT 28 +#define INFINIPATH_I_PORT_MASK 0xF +#define INFINIPATH_I_PORT_SHIFT 24 +#define INFINIPATH_I_TID_MASK 0x7FF +#define INFINIPATH_I_TID_SHIFT 13 +#define INFINIPATH_I_OFFSET_MASK 0x1FFF +#define INFINIPATH_I_OFFSET_SHIFT 0 + +/* K_PktFlags bits */ +#define INFINIPATH_KPF_INTR 0x1 + +/* SendPIO per-buffer control */ +#define INFINIPATH_SP_LENGTHP1_MASK 0x3FF +#define INFINIPATH_SP_LENGTHP1_SHIFT 0 +#define INFINIPATH_SP_INTR 0x80000000 +#define INFINIPATH_SP_TEST 0x40000000 +#define INFINIPATH_SP_TESTEBP 0x20000000 + +/* SendPIOAvail bits */ +#define INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT 1 +#define INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT 0 + +#endif /* _IPATH_COMMON_H */ diff --git a/drivers/infiniband/hw/ipath/ipath_cq.c b/drivers/infiniband/hw/ipath/ipath_cq.c new file mode 100644 index 0000000..7ece113 --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_cq.c @@ -0,0 +1,295 @@ +/* + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/err.h> +#include <linux/vmalloc.h> + +#include "ipath_verbs.h" + +/** + * ipath_cq_enter - add a new entry to the completion queue + * @cq: completion queue + * @entry: work completion entry to add + * @sig: true if @entry is a solicitated entry + * + * This may be called with one of the qp->s_lock or qp->r_rq.lock held. + */ +void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int solicited) +{ + unsigned long flags; + u32 next; + + spin_lock_irqsave(&cq->lock, flags); + + if (cq->head == cq->ibcq.cqe) + next = 0; + else + next = cq->head + 1; + if (unlikely(next == cq->tail)) { + spin_unlock_irqrestore(&cq->lock, flags); + if (cq->ibcq.event_handler) { + struct ib_event ev; + + ev.device = cq->ibcq.device; + ev.element.cq = &cq->ibcq; + ev.event = IB_EVENT_CQ_ERR; + cq->ibcq.event_handler(&ev, cq->ibcq.cq_context); + } + return; + } + cq->queue[cq->head] = *entry; + cq->head = next; + + if (cq->notify == IB_CQ_NEXT_COMP || + (cq->notify == IB_CQ_SOLICITED && solicited)) { + cq->notify = IB_CQ_NONE; + cq->triggered++; + /* + * This will cause send_complete() to be called in + * another thread. + */ + tasklet_hi_schedule(&cq->comptask); + } + + spin_unlock_irqrestore(&cq->lock, flags); + + if (entry->status != IB_WC_SUCCESS) + to_idev(cq->ibcq.device)->n_wqe_errs++; +} + +/** + * ipath_poll_cq - poll for work completion entries + * @ibcq: the completion queue to poll + * @num_entries: the maximum number of entries to return + * @entry: pointer to array where work completions are placed + * + * Returns the number of completion entries polled. + * + * This may be called from interrupt context. Also called by ib_poll_cq() + * in the generic verbs code. + */ +int ipath_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) +{ + struct ipath_cq *cq = to_icq(ibcq); + unsigned long flags; + int npolled; + + spin_lock_irqsave(&cq->lock, flags); + + for (npolled = 0; npolled < num_entries; ++npolled, ++entry) { + if (cq->tail == cq->head) + break; + *entry = cq->queue[cq->tail]; + if (cq->tail == cq->ibcq.cqe) + cq->tail = 0; + else + cq->tail++; + } + + spin_unlock_irqrestore(&cq->lock, flags); + + return npolled; +} + +static void send_complete(unsigned long data) +{ + struct ipath_cq *cq = (struct ipath_cq *)data; + + /* + * The completion handler will most likely rearm the notification + * and poll for all pending entries. If a new completion entry + * is added while we are in this routine, tasklet_hi_schedule() + * won't call us again until we return so we check triggered to + * see if we need to call the handler again. + */ + for (;;) { + u8 triggered = cq->triggered; + + cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); + + if (cq->triggered == triggered) + return; + } +} + +/** + * ipath_create_cq - create a completion queue + * @ibdev: the device this completion queue is attached to + * @entries: the minimum size of the completion queue + * @context: unused by the InfiniPath driver + * @udata: unused by the InfiniPath driver + * + * Returns a pointer to the completion queue or negative errno values + * for failure. + * + * Called by ib_create_cq() in the generic verbs code. + */ +struct ib_cq *ipath_create_cq(struct ib_device *ibdev, int entries, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct ipath_cq *cq; + struct ib_wc *wc; + struct ib_cq *ret; + + /* + * Need to use vmalloc() if we want to support large #s of + * entries. + */ + cq = kmalloc(sizeof(*cq), GFP_KERNEL); + if (!cq) { + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + /* + * Need to use vmalloc() if we want to support large #s of entries. + */ + wc = vmalloc(sizeof(*wc) * (entries + 1)); + if (!wc) { + kfree(cq); + ret = ERR_PTR(-ENOMEM); + goto bail; + } + /* + * ib_create_cq() will initialize cq->ibcq except for cq->ibcq.cqe. + * The number of entries should be >= the number requested or return + * an error. + */ + cq->ibcq.cqe = entries; + cq->notify = IB_CQ_NONE; + cq->triggered = 0; + spin_lock_init(&cq->lock); + tasklet_init(&cq->comptask, send_complete, (unsigned long)cq); + cq->head = 0; + cq->tail = 0; + cq->queue = wc; + + ret = &cq->ibcq; + +bail: + return ret; +} + +/** + * ipath_destroy_cq - destroy a completion queue + * @ibcq: the completion queue to destroy. + * + * Returns 0 for success. + * + * Called by ib_destroy_cq() in the generic verbs code. + */ +int ipath_destroy_cq(struct ib_cq *ibcq) +{ + struct ipath_cq *cq = to_icq(ibcq); + + tasklet_kill(&cq->comptask); + vfree(cq->queue); + kfree(cq); + + return 0; +} + +/** + * ipath_req_notify_cq - change the notification type for a completion queue + * @ibcq: the completion queue + * @notify: the type of notification to request + * + * Returns 0 for success. + * + * This may be called from interrupt context. Also called by + * ib_req_notify_cq() in the generic verbs code. + */ +int ipath_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify notify) +{ + struct ipath_cq *cq = to_icq(ibcq); + unsigned long flags; + + spin_lock_irqsave(&cq->lock, flags); + /* + * Don't change IB_CQ_NEXT_COMP to IB_CQ_SOLICITED but allow + * any other transitions. + */ + if (cq->notify != IB_CQ_NEXT_COMP) + cq->notify = notify; + spin_unlock_irqrestore(&cq->lock, flags); + return 0; +} + +int ipath_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) +{ + struct ipath_cq *cq = to_icq(ibcq); + struct ib_wc *wc, *old_wc; + u32 n; + int ret; + + /* + * Need to use vmalloc() if we want to support large #s of entries. + */ + wc = vmalloc(sizeof(*wc) * (cqe + 1)); + if (!wc) { + ret = -ENOMEM; + goto bail; + } + + spin_lock_irq(&cq->lock); + if (cq->head < cq->tail) + n = cq->ibcq.cqe + 1 + cq->head - cq->tail; + else + n = cq->head - cq->tail; + if (unlikely((u32)cqe < n)) { + spin_unlock_irq(&cq->lock); + vfree(wc); + ret = -EOVERFLOW; + goto bail; + } + for (n = 0; cq->tail != cq->head; n++) { + wc[n] = cq->queue[cq->tail]; + if (cq->tail == cq->ibcq.cqe) + cq->tail = 0; + else + cq->tail++; + } + cq->ibcq.cqe = cqe; + cq->head = n; + cq->tail = 0; + old_wc = cq->queue; + cq->queue = wc; + spin_unlock_irq(&cq->lock); + + vfree(old_wc); + + ret = 0; + +bail: + return ret; +} diff --git a/drivers/infiniband/hw/ipath/ipath_debug.h b/drivers/infiniband/hw/ipath/ipath_debug.h new file mode 100644 index 0000000..593e289 --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_debug.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _IPATH_DEBUG_H +#define _IPATH_DEBUG_H + +#ifndef _IPATH_DEBUGGING /* debugging enabled or not */ +#define _IPATH_DEBUGGING 1 +#endif + +#if _IPATH_DEBUGGING + +/* + * Mask values for debugging. The scheme allows us to compile out any + * of the debug tracing stuff, and if compiled in, to enable or disable + * dynamically. This can be set at modprobe time also: + * modprobe infinipath.ko infinipath_debug=7 + */ + +#define __IPATH_INFO 0x1 /* generic low verbosity stuff */ +#define __IPATH_DBG 0x2 /* generic debug */ +#define __IPATH_TRSAMPLE 0x8 /* generate trace buffer sample entries */ +/* leave some low verbosity spots open */ +#define __IPATH_VERBDBG 0x40 /* very verbose debug */ +#define __IPATH_PKTDBG 0x80 /* print packet data */ +/* print process startup (init)/exit messages */ +#define __IPATH_PROCDBG 0x100 +/* print mmap/nopage stuff, not using VDBG any more */ +#define __IPATH_MMDBG 0x200 +#define __IPATH_USER_SEND 0x1000 /* use user mode send */ +#define __IPATH_KERNEL_SEND 0x2000 /* use kernel mode send */ +#define __IPATH_EPKTDBG 0x4000 /* print ethernet packet data */ +#define __IPATH_SMADBG 0x8000 /* sma packet debug */ +#define __IPATH_IPATHDBG 0x10000 /* Ethernet (IPATH) general debug on */ +#define __IPATH_IPATHWARN 0x20000 /* Ethernet (IPATH) warnings on */ +#define __IPATH_IPATHERR 0x40000 /* Ethernet (IPATH) errors on */ +#define __IPATH_IPATHPD 0x80000 /* Ethernet (IPATH) packet dump on */ +#define __IPATH_IPATHTABLE 0x100000 /* Ethernet (IPATH) table dump on */ + +#else /* _IPATH_DEBUGGING */ + +/* + * define all of these even with debugging off, for the few places that do + * if(infinipath_debug & _IPATH_xyzzy), but in a way that will make the + * compiler eliminate the code + */ + +#define __IPATH_INFO 0x0 /* generic low verbosity stuff */ +#define __IPATH_DBG 0x0 /* generic debug */ +#define __IPATH_TRSAMPLE 0x0 /* generate trace buffer sample entries */ +#define __IPATH_VERBDBG 0x0 /* very verbose debug */ +#define __IPATH_PKTDBG 0x0 /* print packet data */ +#define __IPATH_PROCDBG 0x0 /* print process startup (init)/exit messages */ +/* print mmap/nopage stuff, not using VDBG any more */ +#define __IPATH_MMDBG 0x0 +#define __IPATH_EPKTDBG 0x0 /* print ethernet packet data */ +#define __IPATH_SMADBG 0x0 /* print process startup (init)/exit messages */#define __IPATH_IPATHDBG 0x0 /* Ethernet (IPATH) table dump on */ +#define __IPATH_IPATHWARN 0x0 /* Ethernet (IPATH) warnings on */ +#define __IPATH_IPATHERR 0x0 /* Ethernet (IPATH) errors on */ +#define __IPATH_IPATHPD 0x0 /* Ethernet (IPATH) packet dump on */ +#define __IPATH_IPATHTABLE 0x0 /* Ethernet (IPATH) packet dump on */ + +#endif /* _IPATH_DEBUGGING */ + +#define __IPATH_VERBOSEDBG __IPATH_VERBDBG + +#endif /* _IPATH_DEBUG_H */ diff --git a/drivers/infiniband/hw/ipath/ipath_diag.c b/drivers/infiniband/hw/ipath/ipath_diag.c new file mode 100644 index 0000000..7d3fb69 --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_diag.c @@ -0,0 +1,367 @@ +/* + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * This file contains support for diagnostic functions. It is accessed by + * opening the ipath_diag device, normally minor number 129. Diagnostic use + * of the InfiniPath chip may render the chip or board unusable until the + * driver is unloaded, or in some cases, until the system is rebooted. + * + * Accesses to the chip through this interface are not similar to going + * through the /sys/bus/pci resource mmap interface. + */ + +#include <linux/pci.h> +#include <asm/uaccess.h> + +#include "ipath_common.h" +#include "ipath_kernel.h" +#include "ips_common.h" +#include "ipath_layer.h" + +int ipath_diag_inuse; +static int diag_set_link; + +static int ipath_diag_open(struct inode *in, struct file *fp); +static int ipath_diag_release(struct inode *in, struct file *fp); +static ssize_t ipath_diag_read(struct file *fp, char __user *data, + size_t count, loff_t *off); +static ssize_t ipath_diag_write(struct file *fp, const char __user *data, + size_t count, loff_t *off); + +static struct file_operations diag_file_ops = { + .owner = THIS_MODULE, + .write = ipath_diag_write, + .read = ipath_diag_read, + .open = ipath_diag_open, + .release = ipath_diag_release +}; + +static struct cdev *diag_cdev; +static struct class_device *diag_class_dev; + +int ipath_diag_init(void) +{ + return ipath_cdev_init(IPATH_DIAG_MINOR, "ipath_diag", + &diag_file_ops, &diag_cdev, &diag_class_dev); +} + +void ipath_diag_cleanup(void) +{ + ipath_cdev_cleanup(&diag_cdev, &diag_class_dev); +} + +/** + * ipath_read_umem64 - read a 64-bit quantity from the chip into user space + * @dd: the infinipath device + * @uaddr: the location to store the data in user memory + * @caddr: the source chip address (full pointer, not offset) + * @count: number of bytes to copy (multiple of 32 bits) + * + * This function also localizes all chip memory accesses. + * The copy should be written such that we read full cacheline packets + * from the chip. This is usually used for a single qword + * + * NOTE: This assumes the chip address is 64-bit aligned. + */ +static int ipath_read_umem64(struct ipath_devdata *dd, void __user *uaddr, + const void __iomem *caddr, size_t count) +{ + const u64 __iomem *reg_addr = caddr; + const u64 __iomem *reg_end = reg_addr + (count / sizeof(u64)); + int ret; + + /* not very efficient, but it works for now */ + if (reg_addr < dd->ipath_kregbase || + reg_end > dd->ipath_kregend) { + ret = -EINVAL; + goto bail; + } + while (reg_addr < reg_end) { + u64 data = readq(reg_addr); + if (copy_to_user(uaddr, &data, sizeof(u64))) { + ret = -EFAULT; + goto bail; + } + reg_addr++; + uaddr++; + } + ret = 0; +bail: + return ret; +} + +/** + * ipath_write_umem64 - write a 64-bit quantity to the chip from user space + * @dd: the infinipath device + * @caddr: the destination chip address (full pointer, not offset) + * @uaddr: the source of the data in user memory + * @count: the number of bytes to copy (multiple of 32 bits) + * + * This is usually used for a single qword + * NOTE: This assumes the chip address is 64-bit aligned. + */ + +static int ipath_write_umem64(struct ipath_devdata *dd, void __iomem *caddr, + const void __user *uaddr, size_t count) +{ + u64 __iomem *reg_addr = caddr; + const u64 __iomem *reg_end = reg_addr + (count / sizeof(u64)); + int ret; + + /* not very efficient, but it works for now */ + if (reg_addr < dd->ipath_kregbase || + reg_end > dd->ipath_kregend) { + ret = -EINVAL; + goto bail; + } + while (reg_addr < reg_end) { + u64 data; + if (copy_from_user(&data, uaddr, sizeof(data))) { + ret = -EFAULT; + goto bail; + } + writeq(data, reg_addr); + + reg_addr++; + uaddr++; + } + ret = 0; +bail: + return ret; +} + +/** + * ipath_read_umem32 - read a 32-bit quantity from the chip into user space + * @dd: the infinipath device + * @uaddr: the location to store the data in user memory + * @caddr: the source chip address (full pointer, not offset) + * @count: number of bytes to copy + * + * read 32 bit values, not 64 bit; for memories that only + * support 32 bit reads; usually a single dword. + */ +static int ipath_read_umem32(struct ipath_devdata *dd, void __user *uaddr, + const void __iomem *caddr, size_t count) +{ + const u32 __iomem *reg_addr = caddr; + const u32 __iomem *reg_end = reg_addr + (count / sizeof(u32)); + int ret; + + if (reg_addr < (u32 __iomem *) dd->ipath_kregbase || + reg_end > (u32 __iomem *) dd->ipath_kregend) { + ret = -EINVAL; + goto bail; + } + /* not very efficient, but it works for now */ + while (reg_addr < reg_end) { + u32 data = readl(reg_addr); + if (copy_to_user(uaddr, &data, sizeof(data))) { + ret = -EFAULT; + goto bail; + } + + reg_addr++; + uaddr++; + } + ret = 0; +bail: + return ret; +} + +/** + * ipath_write_umem32 - write a 32-bit quantity to the chip from user space + * @dd: the infinipath device + * @caddr: the destination chip address (full pointer, not offset) + * @uaddr: the source of the data in user memory + * @count: number of bytes to copy + * + * write 32 bit values, not 64 bit; for memories that only + * support 32 bit write; usually a single dword. + */ + +static int ipath_write_umem32(struct ipath_devdata *dd, void __iomem *caddr, + const void __user *uaddr, size_t count) +{ + u32 __iomem *reg_addr = caddr; + const u32 __iomem *reg_end = reg_addr + (count / sizeof(u32)); + int ret; + + if (reg_addr < (u32 __iomem *) dd->ipath_kregbase || + reg_end > (u32 __iomem *) dd->ipath_kregend) { + ret = -EINVAL; + goto bail; + } + while (reg_addr < reg_end) { + u32 data; + if (copy_from_user(&data, uaddr, sizeof(data))) { + ret = -EFAULT; + goto bail; + } + writel(data, reg_addr); + + reg_addr++; + uaddr++; + } + ret = 0; +bail: + return ret; +} + +static int ipath_diag_open(struct inode *in, struct file *fp) +{ + struct ipath_devdata *dd; + int unit = 0; /* XXX this is bogus */ + unsigned long flags; + int ret; + + dd = ipath_lookup(unit); + + mutex_lock(&ipath_mutex); + spin_lock_irqsave(&ipath_devs_lock, flags); + + if (ipath_diag_inuse) { + ret = -EBUSY; + goto bail; + } + + list_for_each_entry(dd, &ipath_dev_list, ipath_list) { + /* + * we need at least one infinipath device to be present + * (don't use INITTED, because we want to be able to open + * even if device is in freeze mode, which cleared INITTED). + * There is a small amount of risk to this, which is why we + * also verify kregbase is set. + */ + + if (!(dd->ipath_flags & IPATH_PRESENT) || + !dd->ipath_kregbase) + continue; + + ipath_diag_inuse = 1; + diag_set_link = 0; + ret = 0; + goto bail; + } + + ret = -ENODEV; + +bail: + spin_unlock_irqrestore(&ipath_devs_lock, flags); + mutex_unlock(&ipath_mutex); + + /* Only expose a way to reset the device if we + make it into diag mode. */ + if (ret == 0) + ipath_expose_reset(&dd->pcidev->dev); + + return ret; +} + +static int ipath_diag_release(struct inode *i, struct file *f) +{ + mutex_lock(&ipath_mutex); + ipath_diag_inuse = 0; + mutex_unlock(&ipath_mutex); + return 0; +} + +static ssize_t ipath_diag_read(struct file *fp, char __user *data, + size_t count, loff_t *off) +{ + int unit = 0; /* XXX provide for reads on other units some day */ + struct ipath_devdata *dd; + void __iomem *kreg_base; + ssize_t ret; + + dd = ipath_lookup(unit); + if (!dd) { + ret = -ENODEV; + goto bail; + } + + kreg_base = dd->ipath_kregbase; + + if (count == 0) + ret = 0; + else if ((count % 4) || (*off % 4)) + /* address or length is not 32-bit aligned, hence invalid */ + ret = -EINVAL; + else if ((count % 8) || (*off % 8)) + /* address or length not 64-bit aligned; do 32-bit reads */ + ret = ipath_read_umem32(dd, data, kreg_base + *off, count); + else + ret = ipath_read_umem64(dd, data, kreg_base + *off, count); + + if (ret >= 0) { + *off += count; + ret = count; + } + +bail: + return ret; +} + +static ssize_t ipath_diag_write(struct file *fp, const char __user *data, + size_t count, loff_t *off) +{ + int unit = 0; /* XXX this is bogus */ + struct ipath_devdata *dd; + void __iomem *kreg_base; + ssize_t ret; + + dd = ipath_lookup(unit); + if (!dd) { + ret = -ENODEV; + goto bail; + } + kreg_base = dd->ipath_kregbase; + + if (count == 0) + ret = 0; + else if ((count % 4) || (*off % 4)) + /* address or length is not 32-bit aligned, hence invalid */ + ret = -EINVAL; + else if ((count % 8) || (*off % 8)) + /* address or length not 64-bit aligned; do 32-bit writes */ + ret = ipath_write_umem32(dd, kreg_base + *off, data, count); + else + ret = ipath_write_umem64(dd, kreg_base + *off, data, count); + + if (ret >= 0) { + *off += count; + ret = count; + } + +bail: + return ret; +} diff --git a/drivers/infiniband/hw/ipath/ipath_driver.c b/drivers/infiniband/hw/ipath/ipath_driver.c new file mode 100644 index 0000000..e7617c3 --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_driver.c @@ -0,0 +1,1983 @@ +/* + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/spinlock.h> +#include <linux/idr.h> +#include <linux/pci.h> +#include <linux/delay.h> +#include <linux/netdevice.h> +#include <linux/vmalloc.h> + +#include "ipath_kernel.h" +#include "ips_common.h" +#include "ipath_layer.h" + +static void ipath_update_pio_bufs(struct ipath_devdata *); + +const char *ipath_get_unit_name(int unit) +{ + static char iname[16]; + snprintf(iname, sizeof iname, "infinipath%u", unit); + return iname; +} + +EXPORT_SYMBOL_GPL(ipath_get_unit_name); + +#define DRIVER_LOAD_MSG "PathScale " IPATH_DRV_NAME " loaded: " +#define PFX IPATH_DRV_NAME ": " + +/* + * The size has to be longer than this string, so we can append + * board/chip information to it in the init code. + */ +const char ipath_core_version[] = IPATH_IDSTR "\n"; + +static struct idr unit_table; +DEFINE_SPINLOCK(ipath_devs_lock); +LIST_HEAD(ipath_dev_list); + +wait_queue_head_t ipath_sma_state_wait; + +unsigned ipath_debug = __IPATH_INFO; + +module_param_named(debug, ipath_debug, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(debug, "mask for debug prints"); +EXPORT_SYMBOL_GPL(ipath_debug); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("PathScale <support@pathscale.com>"); +MODULE_DESCRIPTION("Pathscale InfiniPath driver"); + +const char *ipath_ibcstatus_str[] = { + "Disabled", + "LinkUp", + "PollActive", + "PollQuiet", + "SleepDelay", + "SleepQuiet", + "LState6", /* unused */ + "LState7", /* unused */ + "CfgDebounce", + "CfgRcvfCfg", + "CfgWaitRmt", + "CfgIdle", + "RecovRetrain", + "LState0xD", /* unused */ + "RecovWaitRmt", + "RecovIdle", +}; + +/* + * These variables are initialized in the chip-specific files + * but are defined here. + */ +u16 ipath_gpio_sda_num, ipath_gpio_scl_num; +u64 ipath_gpio_sda, ipath_gpio_scl; +u64 infinipath_i_bitsextant; +ipath_err_t infinipath_e_bitsextant, infinipath_hwe_bitsextant; +u32 infinipath_i_rcvavail_mask, infinipath_i_rcvurg_mask; + +static void __devexit ipath_remove_one(struct pci_dev *); +static int __devinit ipath_init_one(struct pci_dev *, + const struct pci_device_id *); + +/* Only needed for registration, nothing else needs this info */ +#define PCI_VENDOR_ID_PATHSCALE 0x1fc1 +#define PCI_DEVICE_ID_INFINIPATH_HT 0xd +#define PCI_DEVICE_ID_INFINIPATH_PE800 0x10 + +static const struct pci_device_id ipath_pci_tbl[] = { + {PCI_DEVICE(PCI_VENDOR_ID_PATHSCALE, + PCI_DEVICE_ID_INFINIPATH_HT)}, + {PCI_DEVICE(PCI_VENDOR_ID_PATHSCALE, + PCI_DEVICE_ID_INFINIPATH_PE800)}, +}; + +MODULE_DEVICE_TABLE(pci, ipath_pci_tbl); + +static struct pci_driver ipath_driver = { + .name = IPATH_DRV_NAME, + .probe = ipath_init_one, + .remove = __devexit_p(ipath_remove_one), + .id_table = ipath_pci_tbl, +}; + +/* + * This is where port 0's rcvhdrtail register is written back; we also + * want nothing else sharing the cache line, so make it a cache line + * in size. Used for all units. + */ +volatile __le64 *ipath_port0_rcvhdrtail; +dma_addr_t ipath_port0_rcvhdrtail_dma; +static int port0_rcvhdrtail_refs; + +static inline void read_bars(struct ipath_devdata *dd, struct pci_dev *dev, + u32 *bar0, u32 *bar1) +{ + int ret; + + ret = pci_read_config_dword(dev, PCI_BASE_ADDRESS_0, bar0); + if (ret) + ipath_dev_err(dd, "failed to read bar0 before enable: " + "error %d\n", -ret); + + ret = pci_read_config_dword(dev, PCI_BASE_ADDRESS_1, bar1); + if (ret) + ipath_dev_err(dd, "failed to read bar1 before enable: " + "error %d\n", -ret); + + ipath_dbg("Read bar0 %x bar1 %x\n", *bar0, *bar1); +} + +static void ipath_free_devdata(struct pci_dev *pdev, + struct ipath_devdata *dd) +{ + unsigned long flags; + + pci_set_drvdata(pdev, NULL); + + if (dd->ipath_unit != -1) { + spin_lock_irqsave(&ipath_devs_lock, flags); + idr_remove(&unit_table, dd->ipath_unit); + list_del(&dd->ipath_list); + spin_unlock_irqrestore(&ipath_devs_lock, flags); + } + dma_free_coherent(&pdev->dev, sizeof(*dd), dd, dd->ipath_dma_addr); +} + +static struct ipath_devdata *ipath_alloc_devdata(struct pci_dev *pdev) +{ + unsigned long flags; + struct ipath_devdata *dd; + dma_addr_t dma_addr; + int ret; + + if (!idr_pre_get(&unit_table, GFP_KERNEL)) { + dd = ERR_PTR(-ENOMEM); + goto bail; + } + + dd = dma_alloc_coherent(&pdev->dev, sizeof(*dd), &dma_addr, + GFP_KERNEL); + + if (!dd) { + dd = ERR_PTR(-ENOMEM); + goto bail; + } + + dd->ipath_dma_addr = dma_addr; + dd->ipath_unit = -1; + + spin_lock_irqsave(&ipath_devs_lock, flags); + + ret = idr_get_new(&unit_table, dd, &dd->ipath_unit); + if (ret < 0) { + printk(KERN_ERR IPATH_DRV_NAME + ": Could not allocate unit ID: error %d\n", -ret); + ipath_free_devdata(pdev, dd); + dd = ERR_PTR(ret); + goto bail_unlock; + } + + dd->pcidev = pdev; + pci_set_drvdata(pdev, dd); + + list_add(&dd->ipath_list, &ipath_dev_list); + +bail_unlock: + spin_unlock_irqrestore(&ipath_devs_lock, flags); + +bail: + return dd; +} + +static inline struct ipath_devdata *__ipath_lookup(int unit) +{ + return idr_find(&unit_table, unit); +} + +struct ipath_devdata *ipath_lookup(int unit) +{ + struct ipath_devdata *dd; + unsigned long flags; + + spin_lock_irqsave(&ipath_devs_lock, flags); + dd = __ipath_lookup(unit); + spin_unlock_irqrestore(&ipath_devs_lock, flags); + + return dd; +} + +int ipath_count_units(int *npresentp, int *nupp, u32 *maxportsp) +{ + int nunits, npresent, nup; + struct ipath_devdata *dd; + unsigned long flags; + u32 maxports; + + nunits = npresent = nup = maxports = 0; + + spin_lock_irqsave(&ipath_devs_lock, flags); + + list_for_each_entry(dd, &ipath_dev_list, ipath_list) { + nunits++; + if ((dd->ipath_flags & IPATH_PRESENT) && dd->ipath_kregbase) + npresent++; + if (dd->ipath_lid && + !(dd->ipath_flags & (IPATH_DISABLED | IPATH_LINKDOWN + | IPATH_LINKUNK))) + nup++; + if (dd->ipath_cfgports > maxports) + maxports = dd->ipath_cfgports; + } + + spin_unlock_irqrestore(&ipath_devs_lock, flags); + + if (npresentp) + *npresentp = npresent; + if (nupp) + *nupp = nup; + if (maxportsp) + *maxportsp = maxports; + + return nunits; +} + +static int init_port0_rcvhdrtail(struct pci_dev *pdev) +{ + int ret; + + mutex_lock(&ipath_mutex); + + if (!ipath_port0_rcvhdrtail) { + ipath_port0_rcvhdrtail = + dma_alloc_coherent(&pdev->dev, + IPATH_PORT0_RCVHDRTAIL_SIZE, + &ipath_port0_rcvhdrtail_dma, + GFP_KERNEL); + + if (!ipath_port0_rcvhdrtail) { + ret = -ENOMEM; + goto bail; + } + } + port0_rcvhdrtail_refs++; + ret = 0; + +bail: + mutex_unlock(&ipath_mutex); + + return ret; +} + +static void cleanup_port0_rcvhdrtail(struct pci_dev *pdev) +{ + mutex_lock(&ipath_mutex); + + if (!--port0_rcvhdrtail_refs) { + dma_free_coherent(&pdev->dev, IPATH_PORT0_RCVHDRTAIL_SIZE, + (void *) ipath_port0_rcvhdrtail, + ipath_port0_rcvhdrtail_dma); + ipath_port0_rcvhdrtail = NULL; + } + + mutex_unlock(&ipath_mutex); +} + +/* + * These next two routines are placeholders in case we don't have per-arch + * code for controlling write combining. If explicit control of write + * combining is not available, performance will probably be awful. + */ + +int __attribute__((weak)) ipath_enable_wc(struct ipath_devdata *dd) +{ + return -EOPNOTSUPP; +} + +void __attribute__((weak)) ipath_disable_wc(struct ipath_devdata *dd) +{ +} + +static int __devinit ipath_init_one(struct pci_dev *pdev, + const struct pci_device_id *ent) +{ + int ret, len, j; + struct ipath_devdata *dd; + unsigned long long addr; + u32 bar0 = 0, bar1 = 0; + u8 rev; + + ret = init_port0_rcvhdrtail(pdev); + if (ret < 0) { + printk(KERN_ERR IPATH_DRV_NAME + ": Could not allocate port0_rcvhdrtail: error %d\n", + -ret); + goto bail; + } + + dd = ipath_alloc_devdata(pdev); + if (IS_ERR(dd)) { + ret = PTR_ERR(dd); + printk(KERN_ERR IPATH_DRV_NAME + ": Could not allocate devdata: error %d\n", -ret); + goto bail_rcvhdrtail; + } + + ipath_cdbg(VERBOSE, "initializing unit #%u\n", dd->ipath_unit); + + read_bars(dd, pdev, &bar0, &bar1); + + ret = pci_enable_device(pdev); + if (ret) { + /* This can happen iff: + * + * We did a chip reset, and then failed to reprogram the + * BAR, or the chip reset due to an internal error. We then + * unloaded the driver and reloaded it. + * + * Both reset cases set the BAR back to initial state. For + * the latter case, the AER sticky error bit at offset 0x718 + * should be set, but the Linux kernel doesn't yet know + * about that, it appears. If the original BAR was retained + * in the kernel data structures, this may be OK. + */ + ipath_dev_err(dd, "enable unit %d failed: error %d\n", + dd->ipath_unit, -ret); + goto bail_devdata; + } + addr = pci_resource_start(pdev, 0); + len = pci_resource_len(pdev, 0); + ipath_cdbg(VERBOSE, "regbase (0) %llx len %d irq %x, vend %x/%x " + "driver_data %lx\n", addr, len, pdev->irq, ent->vendor, + ent->device, ent->driver_data); + + read_bars(dd, pdev, &bar0, &bar1); + + if (!bar1 && !(bar0 & ~0xf)) { + if (addr) { + dev_info(&pdev->dev, "BAR is 0 (probable RESET), " + "rewriting as %llx\n", addr); + ret = pci_write_config_dword( + pdev, PCI_BASE_ADDRESS_0, addr); + if (ret) { + ipath_dev_err(dd, "rewrite of BAR0 " + "failed: err %d\n", -ret); + goto bail_disable; + } + ret = pci_write_config_dword( + pdev, PCI_BASE_ADDRESS_1, addr >> 32); + if (ret) { + ipath_dev_err(dd, "rewrite of BAR1 " + "failed: err %d\n", -ret); + goto bail_disable; + } + } else { + ipath_dev_err(dd, "BAR is 0 (probable RESET), " + "not usable until reboot\n"); + ret = -ENODEV; + goto bail_disable; + } + } + + ret = pci_request_regions(pdev, IPATH_DRV_NAME); + if (ret) { + dev_info(&pdev->dev, "pci_request_regions unit %u fails: " + "err %d\n", dd->ipath_unit, -ret); + goto bail_disable; + } + + ret = pci_set_dma_mask(pdev, DMA_64BIT_MASK); + if (ret) { + dev_info(&pdev->dev, "pci_set_dma_mask unit %u " + "fails: %d\n", dd->ipath_unit, ret); + goto bail_regions; + } + + pci_set_master(pdev); + + /* + * Save BARs to rewrite after device reset. Save all 64 bits of + * BAR, just in case. + */ + dd->ipath_pcibar0 = addr; + dd->ipath_pcibar1 = addr >> 32; + dd->ipath_deviceid = ent->device; /* save for later use */ + dd->ipath_vendorid = ent->vendor; + + /* setup the chip-specific functions, as early as possible. */ + switch (ent->device) { + case PCI_DEVICE_ID_INFINIPATH_HT: + ipath_init_ht400_funcs(dd); + break; + case PCI_DEVICE_ID_INFINIPATH_PE800: + ipath_init_pe800_funcs(dd); + break; + default: + ipath_dev_err(dd, "Found unknown PathScale deviceid 0x%x, " + "failing\n", ent->device); + return -ENODEV; + } + + for (j = 0; j < 6; j++) { + if (!pdev->resource[j].start) + continue; + ipath_cdbg(VERBOSE, "BAR %d start %lx, end %lx, len %lx\n", + j, pdev->resource[j].start, + pdev->resource[j].end, + pci_resource_len(pdev, j)); + } + + if (!addr) { + ipath_dev_err(dd, "No valid address in BAR 0!\n"); + ret = -ENODEV; + goto bail_regions; + } + + dd->ipath_deviceid = ent->device; /* save for later use */ + dd->ipath_vendorid = ent->vendor; + + ret = pci_read_config_byte(pdev, PCI_REVISION_ID, &rev); + if (ret) { + ipath_dev_err(dd, "Failed to read PCI revision ID unit " + "%u: err %d\n", dd->ipath_unit, -ret); + goto bail_regions; /* shouldn't ever happen */ + } + dd->ipath_pcirev = rev; + + dd->ipath_kregbase = ioremap_nocache(addr, len); + + if (!dd->ipath_kregbase) { + ipath_dbg("Unable to map io addr %llx to kvirt, failing\n", + addr); + ret = -ENOMEM; + goto bail_iounmap; + } + dd->ipath_kregend = (u64 __iomem *) + ((void __iomem *)dd->ipath_kregbase + len); + dd->ipath_physaddr = addr; /* used for io_remap, etc. */ + /* for user mmap */ + dd->ipath_kregvirt = (u64 __iomem *) phys_to_virt(addr); + ipath_cdbg(VERBOSE, "mapped io addr %llx to kregbase %p " + "kregvirt %p\n", addr, dd->ipath_kregbase, + dd->ipath_kregvirt); + + /* + * clear ipath_flags here instead of in ipath_init_chip as it is set + * by ipath_setup_htconfig. + */ + dd->ipath_flags = 0; + + if (dd->ipath_f_bus(dd, pdev)) + ipath_dev_err(dd, "Failed to setup config space; " + "continuing anyway\n"); + + /* + * set up our interrupt handler; SA_SHIRQ probably not needed, + * since MSI interrupts shouldn't be shared but won't hurt for now. + * check 0 irq after we return from chip-specific bus setup, since + * that can affect this due to setup + */ + if (!pdev->irq) + ipath_dev_err(dd, "irq is 0, BIOS error? Interrupts won't " + "work\n"); + else { + ret = request_irq(pdev->irq, ipath_intr, SA_SHIRQ, + IPATH_DRV_NAME, dd); + if (ret) { + ipath_dev_err(dd, "Couldn't setup irq handler, " + "irq=%u: %d\n", pdev->irq, ret); + goto bail_iounmap; + } + } + + ret = ipath_init_chip(dd, 0); /* do the chip-specific init */ + if (ret) + goto bail_iounmap; + + ret = ipath_enable_wc(dd); + + if (ret) { + ipath_dev_err(dd, "Write combining not enabled " + "(err %d): performance may be poor\n", + -ret); + ret = 0; + } + + ipath_device_create_group(&pdev->dev, dd); + ipathfs_add_device(dd); + ipath_user_add(dd); + ipath_layer_add(dd); + + goto bail; + +bail_iounmap: + iounmap((volatile void __iomem *) dd->ipath_kregbase); + +bail_regions: + pci_release_regions(pdev); + +bail_disable: + pci_disable_device(pdev); + +bail_devdata: + ipath_free_devdata(pdev, dd); + +bail_rcvhdrtail: + cleanup_port0_rcvhdrtail(pdev); + +bail: + return ret; +} + +static void __devexit ipath_remove_one(struct pci_dev *pdev) +{ + struct ipath_devdata *dd; + + ipath_cdbg(VERBOSE, "removing, pdev=%p\n", pdev); + if (!pdev) + return; + + dd = pci_get_drvdata(pdev); + ipath_layer_del(dd); + ipath_user_del(dd); + ipathfs_remove_device(dd); + ipath_device_remove_group(&pdev->dev, dd); + ipath_cdbg(VERBOSE, "Releasing pci memory regions, dd %p, " + "unit %u\n", dd, (u32) dd->ipath_unit); + if (dd->ipath_kregbase) { + ipath_cdbg(VERBOSE, "Unmapping kregbase %p\n", + dd->ipath_kregbase); + iounmap((volatile void __iomem *) dd->ipath_kregbase); + dd->ipath_kregbase = NULL; + } + pci_release_regions(pdev); + ipath_cdbg(VERBOSE, "calling pci_disable_device\n"); + pci_disable_device(pdev); + + ipath_free_devdata(pdev, dd); + cleanup_port0_rcvhdrtail(pdev); +} + +/* general driver use */ +DEFINE_MUTEX(ipath_mutex); + +static DEFINE_SPINLOCK(ipath_pioavail_lock); + +/** + * ipath_disarm_piobufs - cancel a range of PIO buffers + * @dd: the infinipath device + * @first: the first PIO buffer to cancel + * @cnt: the number of PIO buffers to cancel + * + * cancel a range of PIO buffers, used when they might be armed, but + * not triggered. Used at init to ensure buffer state, and also user + * process close, in case it died while writing to a PIO buffer + * Also after errors. + */ +void ipath_disarm_piobufs(struct ipath_devdata *dd, unsigned first, + unsigned cnt) +{ + unsigned i, last = first + cnt; + u64 sendctrl, sendorig; + + ipath_cdbg(PKT, "disarm %u PIObufs first=%u\n", cnt, first); + sendorig = dd->ipath_sendctrl | INFINIPATH_S_DISARM; + for (i = first; i < last; i++) { + sendctrl = sendorig | + (i << INFINIPATH_S_DISARMPIOBUF_SHIFT); + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + sendctrl); + } + + /* + * Write it again with current value, in case ipath_sendctrl changed + * while we were looping; no critical bits that would require + * locking. + * + * Write a 0, and then the original value, reading scratch in + * between. This seems to avoid a chip timing race that causes + * pioavail updates to memory to stop. + */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + 0); + sendorig = ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl); +} + +/** + * ipath_wait_linkstate - wait for an IB link state change to occur + * @dd: the infinipath device + * @state: the state to wait for + * @msecs: the number of milliseconds to wait + * + * wait up to msecs milliseconds for IB link state change to occur for + * now, take the easy polling route. Currently used only by + * ipath_layer_set_linkstate. Returns 0 if state reached, otherwise + * -ETIMEDOUT state can have multiple states set, for any of several + * transitions. + */ +int ipath_wait_linkstate(struct ipath_devdata *dd, u32 state, int msecs) +{ + dd->ipath_sma_state_wanted = state; + wait_event_interruptible_timeout(ipath_sma_state_wait, + (dd->ipath_flags & state), + msecs_to_jiffies(msecs)); + dd->ipath_sma_state_wanted = 0; + + if (!(dd->ipath_flags & state)) { + u64 val; + ipath_cdbg(SMA, "Didn't reach linkstate %s within %u ms\n", + /* test INIT ahead of DOWN, both can be set */ + (state & IPATH_LINKINIT) ? "INIT" : + ((state & IPATH_LINKDOWN) ? "DOWN" : + ((state & IPATH_LINKARMED) ? "ARM" : "ACTIVE")), + msecs); + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus); + ipath_cdbg(VERBOSE, "ibcc=%llx ibcstatus=%llx (%s)\n", + (unsigned long long) ipath_read_kreg64( + dd, dd->ipath_kregs->kr_ibcctrl), + (unsigned long long) val, + ipath_ibcstatus_str[val & 0xf]); + } + return (dd->ipath_flags & state) ? 0 : -ETIMEDOUT; +} + +void ipath_decode_err(char *buf, size_t blen, ipath_err_t err) +{ + *buf = '\0'; + if (err & INFINIPATH_E_RHDRLEN) + strlcat(buf, "rhdrlen ", blen); + if (err & INFINIPATH_E_RBADTID) + strlcat(buf, "rbadtid ", blen); + if (err & INFINIPATH_E_RBADVERSION) + strlcat(buf, "rbadversion ", blen); + if (err & INFINIPATH_E_RHDR) + strlcat(buf, "rhdr ", blen); + if (err & INFINIPATH_E_RLONGPKTLEN) + strlcat(buf, "rlongpktlen ", blen); + if (err & INFINIPATH_E_RSHORTPKTLEN) + strlcat(buf, "rshortpktlen ", blen); + if (err & INFINIPATH_E_RMAXPKTLEN) + strlcat(buf, "rmaxpktlen ", blen); + if (err & INFINIPATH_E_RMINPKTLEN) + strlcat(buf, "rminpktlen ", blen); + if (err & INFINIPATH_E_RFORMATERR) + strlcat(buf, "rformaterr ", blen); + if (err & INFINIPATH_E_RUNSUPVL) + strlcat(buf, "runsupvl ", blen); + if (err & INFINIPATH_E_RUNEXPCHAR) + strlcat(buf, "runexpchar ", blen); + if (err & INFINIPATH_E_RIBFLOW) + strlcat(buf, "ribflow ", blen); + if (err & INFINIPATH_E_REBP) + strlcat(buf, "EBP ", blen); + if (err & INFINIPATH_E_SUNDERRUN) + strlcat(buf, "sunderrun ", blen); + if (err & INFINIPATH_E_SPIOARMLAUNCH) + strlcat(buf, "spioarmlaunch ", blen); + if (err & INFINIPATH_E_SUNEXPERRPKTNUM) + strlcat(buf, "sunexperrpktnum ", blen); + if (err & INFINIPATH_E_SDROPPEDDATAPKT) + strlcat(buf, "sdroppeddatapkt ", blen); + if (err & INFINIPATH_E_SDROPPEDSMPPKT) + strlcat(buf, "sdroppedsmppkt ", blen); + if (err & INFINIPATH_E_SMAXPKTLEN) + strlcat(buf, "smaxpktlen ", blen); + if (err & INFINIPATH_E_SMINPKTLEN) + strlcat(buf, "sminpktlen ", blen); + if (err & INFINIPATH_E_SUNSUPVL) + strlcat(buf, "sunsupVL ", blen); + if (err & INFINIPATH_E_SPKTLEN) + strlcat(buf, "spktlen ", blen); + if (err & INFINIPATH_E_INVALIDADDR) + strlcat(buf, "invalidaddr ", blen); + if (err & INFINIPATH_E_RICRC) + strlcat(buf, "CRC ", blen); + if (err & INFINIPATH_E_RVCRC) + strlcat(buf, "VCRC ", blen); + if (err & INFINIPATH_E_RRCVEGRFULL) + strlcat(buf, "rcvegrfull ", blen); + if (err & INFINIPATH_E_RRCVHDRFULL) + strlcat(buf, "rcvhdrfull ", blen); + if (err & INFINIPATH_E_IBSTATUSCHANGED) + strlcat(buf, "ibcstatuschg ", blen); + if (err & INFINIPATH_E_RIBLOSTLINK) + strlcat(buf, "riblostlink ", blen); + if (err & INFINIPATH_E_HARDWARE) + strlcat(buf, "hardware ", blen); + if (err & INFINIPATH_E_RESET) + strlcat(buf, "reset ", blen); +} + +/** + * get_rhf_errstring - decode RHF errors + * @err: the err number + * @msg: the output buffer + * @len: the length of the output buffer + * + * only used one place now, may want more later + */ +static void get_rhf_errstring(u32 err, char *msg, size_t len) +{ + /* if no errors, and so don't need to check what's first */ + *msg = '\0'; + + if (err & INFINIPATH_RHF_H_ICRCERR) + strlcat(msg, "icrcerr ", len); + if (err & INFINIPATH_RHF_H_VCRCERR) + strlcat(msg, "vcrcerr ", len); + if (err & INFINIPATH_RHF_H_PARITYERR) + strlcat(msg, "parityerr ", len); + if (err & INFINIPATH_RHF_H_LENERR) + strlcat(msg, "lenerr ", len); + if (err & INFINIPATH_RHF_H_MTUERR) + strlcat(msg, "mtuerr ", len); + if (err & INFINIPATH_RHF_H_IHDRERR) + /* infinipath hdr checksum error */ + strlcat(msg, "ipathhdrerr ", len); + if (err & INFINIPATH_RHF_H_TIDERR) + strlcat(msg, "tiderr ", len); + if (err & INFINIPATH_RHF_H_MKERR) + /* bad port, offset, etc. */ + strlcat(msg, "invalid ipathhdr ", len); + if (err & INFINIPATH_RHF_H_IBERR) + strlcat(msg, "iberr ", len); + if (err & INFINIPATH_RHF_L_SWA) + strlcat(msg, "swA ", len); + if (err & INFINIPATH_RHF_L_SWB) + strlcat(msg, "swB ", len); +} + +/** + * ipath_get_egrbuf - get an eager buffer + * @dd: the infinipath device + * @bufnum: the eager buffer to get + * @err: unused + * + * must only be called if ipath_pd[port] is known to be allocated + */ +static inline void *ipath_get_egrbuf(struct ipath_devdata *dd, u32 bufnum, + int err) +{ + return dd->ipath_port0_skbs ? + (void *)dd->ipath_port0_skbs[bufnum]->data : NULL; +} + +/** + * ipath_alloc_skb - allocate an skb and buffer with possible constraints + * @dd: the infinipath device + * @gfp_mask: the sk_buff SFP mask + */ +struct sk_buff *ipath_alloc_skb(struct ipath_devdata *dd, + gfp_t gfp_mask) +{ + struct sk_buff *skb; + u32 len; + + /* + * Only fully supported way to handle this is to allocate lots + * extra, align as needed, and then do skb_reserve(). That wastes + * a lot of memory... I'll have to hack this into infinipath_copy + * also. + */ + + /* + * We need 4 extra bytes for unaligned transfer copying + */ + if (dd->ipath_flags & IPATH_4BYTE_TID) { + /* we need a 4KB multiple alignment, and there is no way + * to do it except to allocate extra and then skb_reserve + * enough to bring it up to the right alignment. + */ + len = dd->ipath_ibmaxlen + 4 + (1 << 11) - 1; + } + else + len = dd->ipath_ibmaxlen + 4; + skb = __dev_alloc_skb(len, gfp_mask); + if (!skb) { + ipath_dev_err(dd, "Failed to allocate skbuff, length %u\n", + len); + goto bail; + } + if (dd->ipath_flags & IPATH_4BYTE_TID) { + u32 una = ((1 << 11) - 1) & (unsigned long)(skb->data + 4); + if (una) + skb_reserve(skb, 4 + (1 << 11) - una); + else + skb_reserve(skb, 4); + } else + skb_reserve(skb, 4); + +bail: + return skb; +} + +/** + * ipath_rcv_layer - receive a packet for the layered (ethernet) driver + * @dd: the infinipath device + * @etail: the sk_buff number + * @tlen: the total packet length + * @hdr: the ethernet header + * + * Separate routine for better overall optimization + */ +static void ipath_rcv_layer(struct ipath_devdata *dd, u32 etail, + u32 tlen, struct ether_header *hdr) +{ + u32 elen; + u8 pad, *bthbytes; + struct sk_buff *skb, *nskb; + + if (dd->ipath_port0_skbs && hdr->sub_opcode == OPCODE_ENCAP) { + /* + * Allocate a new sk_buff to replace the one we give + * to the network stack. + */ + nskb = ipath_alloc_skb(dd, GFP_ATOMIC); + if (!nskb) { + /* count OK packets that we drop */ + ipath_stats.sps_krdrops++; + return; + } + + bthbytes = (u8 *) hdr->bth; + pad = (bthbytes[1] >> 4) & 3; + /* +CRC32 */ + elen = tlen - (sizeof(*hdr) + pad + sizeof(u32)); + + skb = dd->ipath_port0_skbs[etail]; + dd->ipath_port0_skbs[etail] = nskb; + skb_put(skb, elen); + + dd->ipath_f_put_tid(dd, etail + (u64 __iomem *) + ((char __iomem *) dd->ipath_kregbase + + dd->ipath_rcvegrbase), 0, + virt_to_phys(nskb->data)); + + __ipath_layer_rcv(dd, hdr, skb); + + /* another ether packet received */ + ipath_stats.sps_ether_rpkts++; + } + else if (hdr->sub_opcode == OPCODE_LID_ARP) + __ipath_layer_rcv_lid(dd, hdr); +} + +/* + * ipath_kreceive - receive a packet + * @dd: the infinipath device + * + * called from interrupt handler for errors or receive interrupt + */ +void ipath_kreceive(struct ipath_devdata *dd) +{ + u64 *rc; + void *ebuf; + const u32 rsize = dd->ipath_rcvhdrentsize; /* words */ + const u32 maxcnt = dd->ipath_rcvhdrcnt * rsize; /* words */ + u32 etail = -1, l, hdrqtail; + struct ips_message_header *hdr; + u32 eflags, i, etype, tlen, pkttot = 0; + static u64 totcalls; /* stats, may eventually remove */ + char emsg[128]; + + if (!dd->ipath_hdrqtailptr) { + ipath_dev_err(dd, + "hdrqtailptr not set, can't do receives\n"); + goto bail; + } + + /* There is already a thread processing this queue. */ + if (test_and_set_bit(0, &dd->ipath_rcv_pending)) + goto bail; + + if (dd->ipath_port0head == + (u32)le64_to_cpu(*dd->ipath_hdrqtailptr)) + goto done; + +gotmore: + /* + * read only once at start. If in flood situation, this helps + * performance slightly. If more arrive while we are processing, + * we'll come back here and do them + */ + hdrqtail = (u32)le64_to_cpu(*dd->ipath_hdrqtailptr); + + for (i = 0, l = dd->ipath_port0head; l != hdrqtail; i++) { + u32 qp; + u8 *bthbytes; + + rc = (u64 *) (dd->ipath_pd[0]->port_rcvhdrq + (l << 2)); + hdr = (struct ips_message_header *)&rc[1]; + /* + * could make a network order version of IPATH_KD_QP, and + * do the obvious shift before masking to speed this up. + */ + qp = ntohl(hdr->bth[1]) & 0xffffff; + bthbytes = (u8 *) hdr->bth; + + eflags = ips_get_hdr_err_flags((__le32 *) rc); + etype = ips_get_rcv_type((__le32 *) rc); + /* total length */ + tlen = ips_get_length_in_bytes((__le32 *) rc); + ebuf = NULL; + if (etype != RCVHQ_RCV_TYPE_EXPECTED) { + /* + * it turns out that the chips uses an eager buffer + * for all non-expected packets, whether it "needs" + * one or not. So always get the index, but don't + * set ebuf (so we try to copy data) unless the + * length requires it. + */ + etail = ips_get_index((__le32 *) rc); + if (tlen > sizeof(*hdr) || + etype == RCVHQ_RCV_TYPE_NON_KD) + ebuf = ipath_get_egrbuf(dd, etail, 0); + } + + /* + * both tiderr and ipathhdrerr are set for all plain IB + * packets; only ipathhdrerr should be set. + */ + + if (etype != RCVHQ_RCV_TYPE_NON_KD && etype != + RCVHQ_RCV_TYPE_ERROR && ips_get_ipath_ver( + hdr->iph.ver_port_tid_offset) != + IPS_PROTO_VERSION) { + ipath_cdbg(PKT, "Bad InfiniPath protocol version " + "%x\n", etype); + } + + if (eflags & ~(INFINIPATH_RHF_H_TIDERR | + INFINIPATH_RHF_H_IHDRERR)) { + get_rhf_errstring(eflags, emsg, sizeof emsg); + ipath_cdbg(PKT, "RHFerrs %x hdrqtail=%x typ=%u " + "tlen=%x opcode=%x egridx=%x: %s\n", + eflags, l, etype, tlen, bthbytes[0], + ips_get_index((__le32 *) rc), emsg); + } else if (etype == RCVHQ_RCV_TYPE_NON_KD) { + int ret = __ipath_verbs_rcv(dd, rc + 1, + ebuf, tlen); + if (ret == -ENODEV) + ipath_cdbg(VERBOSE, + "received IB packet, " + "not SMA (QP=%x)\n", qp); + } else if (etype == RCVHQ_RCV_TYPE_EAGER) { + if (qp == IPATH_KD_QP && + bthbytes[0] == ipath_layer_rcv_opcode && + ebuf) + ipath_rcv_layer(dd, etail, tlen, + (struct ether_header *)hdr); + else + ipath_cdbg(PKT, "typ %x, opcode %x (eager, " + "qp=%x), len %x; ignored\n", + etype, bthbytes[0], qp, tlen); + } + else if (etype == RCVHQ_RCV_TYPE_EXPECTED) + ipath_dbg("Bug: Expected TID, opcode %x; ignored\n", + be32_to_cpu(hdr->bth[0]) & 0xff); + else if (eflags & (INFINIPATH_RHF_H_TIDERR | + INFINIPATH_RHF_H_IHDRERR)) { + /* + * This is a type 3 packet, only the LRH is in the + * rcvhdrq, the rest of the header is in the eager + * buffer. + */ + u8 opcode; + if (ebuf) { + bthbytes = (u8 *) ebuf; + opcode = *bthbytes; + } + else + opcode = 0; + get_rhf_errstring(eflags, emsg, sizeof emsg); + ipath_dbg("Err %x (%s), opcode %x, egrbuf %x, " + "len %x\n", eflags, emsg, opcode, etail, + tlen); + } else { + /* + * error packet, type of error unknown. + * Probably type 3, but we don't know, so don't + * even try to print the opcode, etc. + */ + ipath_dbg("Error Pkt, but no eflags! egrbuf %x, " + "len %x\nhdrq@%lx;hdrq+%x rhf: %llx; " + "hdr %llx %llx %llx %llx %llx\n", + etail, tlen, (unsigned long) rc, l, + (unsigned long long) rc[0], + (unsigned long long) rc[1], + (unsigned long long) rc[2], + (unsigned long long) rc[3], + (unsigned long long) rc[4], + (unsigned long long) rc[5]); + } + l += rsize; + if (l >= maxcnt) + l = 0; + /* + * update for each packet, to help prevent overflows if we + * have lots of packets. + */ + (void)ipath_write_ureg(dd, ur_rcvhdrhead, + dd->ipath_rhdrhead_intr_off | l, 0); + if (etype != RCVHQ_RCV_TYPE_EXPECTED) + (void)ipath_write_ureg(dd, ur_rcvegrindexhead, + etail, 0); + } + + pkttot += i; + + dd->ipath_port0head = l; + + if (hdrqtail != (u32)le64_to_cpu(*dd->ipath_hdrqtailptr)) + /* more arrived while we handled first batch */ + goto gotmore; + + if (pkttot > ipath_stats.sps_maxpkts_call) + ipath_stats.sps_maxpkts_call = pkttot; + ipath_stats.sps_port0pkts += pkttot; + ipath_stats.sps_avgpkts_call = + ipath_stats.sps_port0pkts / ++totcalls; + +done: + clear_bit(0, &dd->ipath_rcv_pending); + smp_mb__after_clear_bit(); + +bail:; +} + +/** + * ipath_update_pio_bufs - update shadow copy of the PIO availability map + * @dd: the infinipath device + * + * called whenever our local copy indicates we have run out of send buffers + * NOTE: This can be called from interrupt context by some code + * and from non-interrupt context by ipath_getpiobuf(). + */ + +static void ipath_update_pio_bufs(struct ipath_devdata *dd) +{ + unsigned long flags; + int i; + const unsigned piobregs = (unsigned)dd->ipath_pioavregs; + + /* If the generation (check) bits have changed, then we update the + * busy bit for the corresponding PIO buffer. This algorithm will + * modify positions to the value they already have in some cases + * (i.e., no change), but it's faster than changing only the bits + * that have changed. + * + * We would like to do this atomicly, to avoid spinlocks in the + * critical send path, but that's not really possible, given the + * type of changes, and that this routine could be called on + * multiple cpu's simultaneously, so we lock in this routine only, + * to avoid conflicting updates; all we change is the shadow, and + * it's a single 64 bit memory location, so by definition the update + * is atomic in terms of what other cpu's can see in testing the + * bits. The spin_lock overhead isn't too bad, since it only + * happens when all buffers are in use, so only cpu overhead, not + * latency or bandwidth is affected. + */ +#define _IPATH_ALL_CHECKBITS 0x5555555555555555ULL + if (!dd->ipath_pioavailregs_dma) { + ipath_dbg("Update shadow pioavail, but regs_dma NULL!\n"); + return; + } + if (ipath_debug & __IPATH_VERBDBG) { + /* only if packet debug and verbose */ + volatile __le64 *dma = dd->ipath_pioavailregs_dma; + unsigned long *shadow = dd->ipath_pioavailshadow; + + ipath_cdbg(PKT, "Refill avail, dma0=%llx shad0=%lx, " + "d1=%llx s1=%lx, d2=%llx s2=%lx, d3=%llx " + "s3=%lx\n", + (unsigned long long) le64_to_cpu(dma[0]), + shadow[0], + (unsigned long long) le64_to_cpu(dma[1]), + shadow[1], + (unsigned long long) le64_to_cpu(dma[2]), + shadow[2], + (unsigned long long) le64_to_cpu(dma[3]), + shadow[3]); + if (piobregs > 4) + ipath_cdbg( + PKT, "2nd group, dma4=%llx shad4=%lx, " + "d5=%llx s5=%lx, d6=%llx s6=%lx, " + "d7=%llx s7=%lx\n", + (unsigned long long) le64_to_cpu(dma[4]), + shadow[4], + (unsigned long long) le64_to_cpu(dma[5]), + shadow[5], + (unsigned long long) le64_to_cpu(dma[6]), + shadow[6], + (unsigned long long) le64_to_cpu(dma[7]), + shadow[7]); + } + spin_lock_irqsave(&ipath_pioavail_lock, flags); + for (i = 0; i < piobregs; i++) { + u64 pchbusy, pchg, piov, pnew; + /* + * Chip Errata: bug 6641; even and odd qwords>3 are swapped + */ + if (i > 3) { + if (i & 1) + piov = le64_to_cpu( + dd->ipath_pioavailregs_dma[i - 1]); + else + piov = le64_to_cpu( + dd->ipath_pioavailregs_dma[i + 1]); + } else + piov = le64_to_cpu(dd->ipath_pioavailregs_dma[i]); + pchg = _IPATH_ALL_CHECKBITS & + ~(dd->ipath_pioavailshadow[i] ^ piov); + pchbusy = pchg << INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT; + if (pchg && (pchbusy & dd->ipath_pioavailshadow[i])) { + pnew = dd->ipath_pioavailshadow[i] & ~pchbusy; + pnew |= piov & pchbusy; + dd->ipath_pioavailshadow[i] = pnew; + } + } + spin_unlock_irqrestore(&ipath_pioavail_lock, flags); +} + +/** + * ipath_setrcvhdrsize - set the receive header size + * @dd: the infinipath device + * @rhdrsize: the receive header size + * + * called from user init code, and also layered driver init + */ +int ipath_setrcvhdrsize(struct ipath_devdata *dd, unsigned rhdrsize) +{ + int ret = 0; + + if (dd->ipath_flags & IPATH_RCVHDRSZ_SET) { + if (dd->ipath_rcvhdrsize != rhdrsize) { + dev_info(&dd->pcidev->dev, + "Error: can't set protocol header " + "size %u, already %u\n", + rhdrsize, dd->ipath_rcvhdrsize); + ret = -EAGAIN; + } else + ipath_cdbg(VERBOSE, "Reuse same protocol header " + "size %u\n", dd->ipath_rcvhdrsize); + } else if (rhdrsize > (dd->ipath_rcvhdrentsize - + (sizeof(u64) / sizeof(u32)))) { + ipath_dbg("Error: can't set protocol header size %u " + "(> max %u)\n", rhdrsize, + dd->ipath_rcvhdrentsize - + (u32) (sizeof(u64) / sizeof(u32))); + ret = -EOVERFLOW; + } else { + dd->ipath_flags |= IPATH_RCVHDRSZ_SET; + dd->ipath_rcvhdrsize = rhdrsize; + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrsize, + dd->ipath_rcvhdrsize); + ipath_cdbg(VERBOSE, "Set protocol header size to %u\n", + dd->ipath_rcvhdrsize); + } + return ret; +} + +/** + * ipath_getpiobuf - find an available pio buffer + * @dd: the infinipath device + * @pbufnum: the buffer number is placed here + * + * do appropriate marking as busy, etc. + * returns buffer number if one found (>=0), negative number is error. + * Used by ipath_sma_send_pkt and ipath_layer_send + */ +u32 __iomem *ipath_getpiobuf(struct ipath_devdata *dd, u32 * pbufnum) +{ + int i, j, starti, updated = 0; + unsigned piobcnt, iter; + unsigned long flags; + unsigned long *shadow = dd->ipath_pioavailshadow; + u32 __iomem *buf; + + piobcnt = (unsigned)(dd->ipath_piobcnt2k + + dd->ipath_piobcnt4k); + starti = dd->ipath_lastport_piobuf; + iter = piobcnt - starti; + if (dd->ipath_upd_pio_shadow) { + /* + * Minor optimization. If we had no buffers on last call, + * start out by doing the update; continue and do scan even + * if no buffers were updated, to be paranoid + */ + ipath_update_pio_bufs(dd); + /* we scanned here, don't do it at end of scan */ + updated = 1; + i = starti; + } else + i = dd->ipath_lastpioindex; + +rescan: + /* + * while test_and_set_bit() is atomic, we do that and then the + * change_bit(), and the pair is not. See if this is the cause + * of the remaining armlaunch errors. + */ + spin_lock_irqsave(&ipath_pioavail_lock, flags); + for (j = 0; j < iter; j++, i++) { + if (i >= piobcnt) + i = starti; + /* + * To avoid bus lock overhead, we first find a candidate + * buffer, then do the test and set, and continue if that + * fails. + */ + if (test_bit((2 * i) + 1, shadow) || + test_and_set_bit((2 * i) + 1, shadow)) + continue; + /* flip generation bit */ + change_bit(2 * i, shadow); + break; + } + spin_unlock_irqrestore(&ipath_pioavail_lock, flags); + + if (j == iter) { + volatile __le64 *dma = dd->ipath_pioavailregs_dma; + + /* + * first time through; shadow exhausted, but may be real + * buffers available, so go see; if any updated, rescan + * (once) + */ + if (!updated) { + ipath_update_pio_bufs(dd); + updated = 1; + i = starti; + goto rescan; + } + dd->ipath_upd_pio_shadow = 1; + /* + * not atomic, but if we lose one once in a while, that's OK + */ + ipath_stats.sps_nopiobufs++; + if (!(++dd->ipath_consec_nopiobuf % 100000)) { + ipath_dbg( + "%u pio sends with no bufavail; dmacopy: " + "%llx %llx %llx %llx; shadow: " + "%lx %lx %lx %lx\n", + dd->ipath_consec_nopiobuf, + (unsigned long long) le64_to_cpu(dma[0]), + (unsigned long long) le64_to_cpu(dma[1]), + (unsigned long long) le64_to_cpu(dma[2]), + (unsigned long long) le64_to_cpu(dma[3]), + shadow[0], shadow[1], shadow[2], + shadow[3]); + /* + * 4 buffers per byte, 4 registers above, cover rest + * below + */ + if ((dd->ipath_piobcnt2k + dd->ipath_piobcnt4k) > + (sizeof(shadow[0]) * 4 * 4)) + ipath_dbg("2nd group: dmacopy: %llx %llx " + "%llx %llx; shadow: %lx %lx " + "%lx %lx\n", + (unsigned long long) + le64_to_cpu(dma[4]), + (unsigned long long) + le64_to_cpu(dma[5]), + (unsigned long long) + le64_to_cpu(dma[6]), + (unsigned long long) + le64_to_cpu(dma[7]), + shadow[4], shadow[5], + shadow[6], shadow[7]); + } + buf = NULL; + goto bail; + } + + if (updated) + /* + * ran out of bufs, now some (at least this one we just + * got) are now available, so tell the layered driver. + */ + __ipath_layer_intr(dd, IPATH_LAYER_INT_SEND_CONTINUE); + + /* + * set next starting place. Since it's just an optimization, + * it doesn't matter who wins on this, so no locking + */ + dd->ipath_lastpioindex = i + 1; + if (dd->ipath_upd_pio_shadow) + dd->ipath_upd_pio_shadow = 0; + if (dd->ipath_consec_nopiobuf) + dd->ipath_consec_nopiobuf = 0; + if (i < dd->ipath_piobcnt2k) + buf = (u32 __iomem *) (dd->ipath_pio2kbase + + i * dd->ipath_palign); + else + buf = (u32 __iomem *) + (dd->ipath_pio4kbase + + (i - dd->ipath_piobcnt2k) * dd->ipath_4kalign); + ipath_cdbg(VERBOSE, "Return piobuf%u %uk @ %p\n", + i, (i < dd->ipath_piobcnt2k) ? 2 : 4, buf); + if (pbufnum) + *pbufnum = i; + +bail: + return buf; +} + +/** + * ipath_create_rcvhdrq - create a receive header queue + * @dd: the infinipath device + * @pd: the port data + * + * this *must* be physically contiguous memory, and for now, + * that limits it to what kmalloc can do. + */ +int ipath_create_rcvhdrq(struct ipath_devdata *dd, + struct ipath_portdata *pd) +{ + int ret = 0, amt; + + amt = ALIGN(dd->ipath_rcvhdrcnt * dd->ipath_rcvhdrentsize * + sizeof(u32), PAGE_SIZE); + if (!pd->port_rcvhdrq) { + /* + * not using REPEAT isn't viable; at 128KB, we can easily + * fail this. The problem with REPEAT is we can block here + * "forever". There isn't an inbetween, unfortunately. We + * could reduce the risk by never freeing the rcvhdrq except + * at unload, but even then, the first time a port is used, + * we could delay for some time... + */ + gfp_t gfp_flags = GFP_USER | __GFP_COMP; + + pd->port_rcvhdrq = dma_alloc_coherent( + &dd->pcidev->dev, amt, &pd->port_rcvhdrq_phys, + gfp_flags); + + if (!pd->port_rcvhdrq) { + ipath_dev_err(dd, "attempt to allocate %d bytes " + "for port %u rcvhdrq failed\n", + amt, pd->port_port); + ret = -ENOMEM; + goto bail; + } + + pd->port_rcvhdrq_size = amt; + + ipath_cdbg(VERBOSE, "%d pages at %p (phys %lx) size=%lu " + "for port %u rcvhdr Q\n", + amt >> PAGE_SHIFT, pd->port_rcvhdrq, + (unsigned long) pd->port_rcvhdrq_phys, + (unsigned long) pd->port_rcvhdrq_size, + pd->port_port); + } else { + /* + * clear for security, sanity, and/or debugging, each + * time we reuse + */ + memset(pd->port_rcvhdrq, 0, amt); + } + + /* + * tell chip each time we init it, even if we are re-using previous + * memory (we zero it at process close) + */ + ipath_cdbg(VERBOSE, "writing port %d rcvhdraddr as %lx\n", + pd->port_port, (unsigned long) pd->port_rcvhdrq_phys); + ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdraddr, + pd->port_port, pd->port_rcvhdrq_phys); + + ret = 0; +bail: + return ret; +} + +int ipath_waitfor_complete(struct ipath_devdata *dd, ipath_kreg reg_id, + u64 bits_to_wait_for, u64 * valp) +{ + unsigned long timeout; + u64 lastval, val; + int ret; + + lastval = ipath_read_kreg64(dd, reg_id); + /* wait a ridiculously long time */ + timeout = jiffies + msecs_to_jiffies(5); + do { + val = ipath_read_kreg64(dd, reg_id); + /* set so they have something, even on failures. */ + *valp = val; + if ((val & bits_to_wait_for) == bits_to_wait_for) { + ret = 0; + break; + } + if (val != lastval) + ipath_cdbg(VERBOSE, "Changed from %llx to %llx, " + "waiting for %llx bits\n", + (unsigned long long) lastval, + (unsigned long long) val, + (unsigned long long) bits_to_wait_for); + cond_resched(); + if (time_after(jiffies, timeout)) { + ipath_dbg("Didn't get bits %llx in register 0x%x, " + "got %llx\n", + (unsigned long long) bits_to_wait_for, + reg_id, (unsigned long long) *valp); + ret = -ENODEV; + break; + } + } while (1); + + return ret; +} + +/** + * ipath_waitfor_mdio_cmdready - wait for last command to complete + * @dd: the infinipath device + * + * Like ipath_waitfor_complete(), but we wait for the CMDVALID bit to go + * away indicating the last command has completed. It doesn't return data + */ +int ipath_waitfor_mdio_cmdready(struct ipath_devdata *dd) +{ + unsigned long timeout; + u64 val; + int ret; + + /* wait a ridiculously long time */ + timeout = jiffies + msecs_to_jiffies(5); + do { + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_mdio); + if (!(val & IPATH_MDIO_CMDVALID)) { + ret = 0; + break; + } + cond_resched(); + if (time_after(jiffies, timeout)) { + ipath_dbg("CMDVALID stuck in mdio reg? (%llx)\n", + (unsigned long long) val); + ret = -ENODEV; + break; + } + } while (1); + + return ret; +} + +void ipath_set_ib_lstate(struct ipath_devdata *dd, int which) +{ + static const char *what[4] = { + [0] = "DOWN", + [INFINIPATH_IBCC_LINKCMD_INIT] = "INIT", + [INFINIPATH_IBCC_LINKCMD_ARMED] = "ARMED", + [INFINIPATH_IBCC_LINKCMD_ACTIVE] = "ACTIVE" + }; + ipath_cdbg(SMA, "Trying to move unit %u to %s, current ltstate " + "is %s\n", dd->ipath_unit, + what[(which >> INFINIPATH_IBCC_LINKCMD_SHIFT) & + INFINIPATH_IBCC_LINKCMD_MASK], + ipath_ibcstatus_str[ + (ipath_read_kreg64 + (dd, dd->ipath_kregs->kr_ibcstatus) >> + INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) & + INFINIPATH_IBCS_LINKTRAININGSTATE_MASK]); + + ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl, + dd->ipath_ibcctrl | which); +} + +/** + * ipath_read_kreg64_port - read a device's per-port 64-bit kernel register + * @dd: the infinipath device + * @regno: the register number to read + * @port: the port containing the register + * + * Registers that vary with the chip implementation constants (port) + * use this routine. + */ +u64 ipath_read_kreg64_port(const struct ipath_devdata *dd, ipath_kreg regno, + unsigned port) +{ + u16 where; + + if (port < dd->ipath_portcnt && + (regno == dd->ipath_kregs->kr_rcvhdraddr || + regno == dd->ipath_kregs->kr_rcvhdrtailaddr)) + where = regno + port; + else + where = -1; + + return ipath_read_kreg64(dd, where); +} + +/** + * ipath_write_kreg_port - write a device's per-port 64-bit kernel register + * @dd: the infinipath device + * @regno: the register number to write + * @port: the port containing the register + * @value: the value to write + * + * Registers that vary with the chip implementation constants (port) + * use this routine. + */ +void ipath_write_kreg_port(const struct ipath_devdata *dd, ipath_kreg regno, + unsigned port, u64 value) +{ + u16 where; + + if (port < dd->ipath_portcnt && + (regno == dd->ipath_kregs->kr_rcvhdraddr || + regno == dd->ipath_kregs->kr_rcvhdrtailaddr)) + where = regno + port; + else + where = -1; + + ipath_write_kreg(dd, where, value); +} + +/** + * ipath_shutdown_device - shut down a device + * @dd: the infinipath device + * + * This is called to make the device quiet when we are about to + * unload the driver, and also when the device is administratively + * disabled. It does not free any data structures. + * Everything it does has to be setup again by ipath_init_chip(dd,1) + */ +void ipath_shutdown_device(struct ipath_devdata *dd) +{ + u64 val; + + ipath_dbg("Shutting down the device\n"); + + dd->ipath_flags |= IPATH_LINKUNK; + dd->ipath_flags &= ~(IPATH_INITTED | IPATH_LINKDOWN | + IPATH_LINKINIT | IPATH_LINKARMED | + IPATH_LINKACTIVE); + *dd->ipath_statusp &= ~(IPATH_STATUS_IB_CONF | + IPATH_STATUS_IB_READY); + + /* mask interrupts, but not errors */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL); + + dd->ipath_rcvctrl = 0; + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, + dd->ipath_rcvctrl); + + /* + * gracefully stop all sends allowing any in progress to trickle out + * first. + */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, 0ULL); + /* flush it */ + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + /* + * enough for anything that's going to trickle out to have actually + * done so. + */ + udelay(5); + + /* + * abort any armed or launched PIO buffers that didn't go. (self + * clearing). Will cause any packet currently being transmitted to + * go out with an EBP, and may also cause a short packet error on + * the receiver. + */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + INFINIPATH_S_ABORT); + + ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKINITCMD_DISABLE << + INFINIPATH_IBCC_LINKINITCMD_SHIFT); + + /* + * we are shutting down, so tell the layered driver. We don't do + * this on just a link state change, much like ethernet, a cable + * unplug, etc. doesn't change driver state + */ + ipath_layer_intr(dd, IPATH_LAYER_INT_IF_DOWN); + + /* disable IBC */ + dd->ipath_control &= ~INFINIPATH_C_LINKENABLE; + ipath_write_kreg(dd, dd->ipath_kregs->kr_control, + dd->ipath_control); + + /* + * clear SerdesEnable and turn the leds off; do this here because + * we are unloading, so don't count on interrupts to move along + * Turn the LEDs off explictly for the same reason. + */ + dd->ipath_f_quiet_serdes(dd); + dd->ipath_f_setextled(dd, 0, 0); + + if (dd->ipath_stats_timer_active) { + del_timer_sync(&dd->ipath_stats_timer); + dd->ipath_stats_timer_active = 0; + } + + /* + * clear all interrupts and errors, so that the next time the driver + * is loaded or device is enabled, we know that whatever is set + * happened while we were unloaded + */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear, + ~0ULL & ~INFINIPATH_HWE_MEMBISTFAILED); + ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, -1LL); + ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, -1LL); +} + +/** + * ipath_free_pddata - free a port's allocated data + * @dd: the infinipath device + * @port: the port + * @freehdrq: free the port data structure if true + * + * when closing, free up any allocated data for a port, if the + * reference count goes to zero + * Note: this also optionally frees the portdata itself! + * Any changes here have to be matched up with the reinit case + * of ipath_init_chip(), which calls this routine on reinit after reset. + */ +void ipath_free_pddata(struct ipath_devdata *dd, u32 port, int freehdrq) +{ + struct ipath_portdata *pd = dd->ipath_pd[port]; + + if (!pd) + return; + if (freehdrq) + /* + * only clear and free portdata if we are going to also + * release the hdrq, otherwise we leak the hdrq on each + * open/close cycle + */ + dd->ipath_pd[port] = NULL; + if (freehdrq && pd->port_rcvhdrq) { + ipath_cdbg(VERBOSE, "free closed port %d rcvhdrq @ %p " + "(size=%lu)\n", pd->port_port, pd->port_rcvhdrq, + (unsigned long) pd->port_rcvhdrq_size); + dma_free_coherent(&dd->pcidev->dev, pd->port_rcvhdrq_size, + pd->port_rcvhdrq, pd->port_rcvhdrq_phys); + pd->port_rcvhdrq = NULL; + } + if (port && pd->port_rcvegrbuf) { + /* always free this */ + if (pd->port_rcvegrbuf) { + unsigned e; + + for (e = 0; e < pd->port_rcvegrbuf_chunks; e++) { + void *base = pd->port_rcvegrbuf[e]; + size_t size = pd->port_rcvegrbuf_size; + + ipath_cdbg(VERBOSE, "egrbuf free(%p, %lu), " + "chunk %u/%u\n", base, + (unsigned long) size, + e, pd->port_rcvegrbuf_chunks); + dma_free_coherent( + &dd->pcidev->dev, size, base, + pd->port_rcvegrbuf_phys[e]); + } + vfree(pd->port_rcvegrbuf); + pd->port_rcvegrbuf = NULL; + vfree(pd->port_rcvegrbuf_phys); + pd->port_rcvegrbuf_phys = NULL; + } + pd->port_rcvegrbuf_chunks = 0; + } else if (port == 0 && dd->ipath_port0_skbs) { + unsigned e; + struct sk_buff **skbs = dd->ipath_port0_skbs; + + dd->ipath_port0_skbs = NULL; + ipath_cdbg(VERBOSE, "free closed port %d ipath_port0_skbs " + "@ %p\n", pd->port_port, skbs); + for (e = 0; e < dd->ipath_rcvegrcnt; e++) + if (skbs[e]) + dev_kfree_skb(skbs[e]); + vfree(skbs); + } + if (freehdrq) { + kfree(pd->port_tid_pg_list); + kfree(pd); + } +} + +static int __init infinipath_init(void) +{ + int ret; + + ipath_dbg(KERN_INFO DRIVER_LOAD_MSG "%s", ipath_core_version); + + /* + * These must be called before the driver is registered with + * the PCI subsystem. + */ + idr_init(&unit_table); + if (!idr_pre_get(&unit_table, GFP_KERNEL)) { + ret = -ENOMEM; + goto bail; + } + + ret = pci_register_driver(&ipath_driver); + if (ret < 0) { + printk(KERN_ERR IPATH_DRV_NAME + ": Unable to register driver: error %d\n", -ret); + goto bail_unit; + } + + ret = ipath_driver_create_group(&ipath_driver.driver); + if (ret < 0) { + printk(KERN_ERR IPATH_DRV_NAME ": Unable to create driver " + "sysfs entries: error %d\n", -ret); + goto bail_pci; + } + + ret = ipath_init_ipathfs(); + if (ret < 0) { + printk(KERN_ERR IPATH_DRV_NAME ": Unable to create " + "ipathfs: error %d\n", -ret); + goto bail_group; + } + + goto bail; + +bail_group: + ipath_driver_remove_group(&ipath_driver.driver); + +bail_pci: + pci_unregister_driver(&ipath_driver); + +bail_unit: + idr_destroy(&unit_table); + +bail: + return ret; +} + +static void cleanup_device(struct ipath_devdata *dd) +{ + int port; + + ipath_shutdown_device(dd); + + if (*dd->ipath_statusp & IPATH_STATUS_CHIP_PRESENT) { + /* can't do anything more with chip; needs re-init */ + *dd->ipath_statusp &= ~IPATH_STATUS_CHIP_PRESENT; + if (dd->ipath_kregbase) { + /* + * if we haven't already cleaned up before these are + * to ensure any register reads/writes "fail" until + * re-init + */ + dd->ipath_kregbase = NULL; + dd->ipath_kregvirt = NULL; + dd->ipath_uregbase = 0; + dd->ipath_sregbase = 0; + dd->ipath_cregbase = 0; + dd->ipath_kregsize = 0; + } + ipath_disable_wc(dd); + } + + if (dd->ipath_pioavailregs_dma) { + dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE, + (void *) dd->ipath_pioavailregs_dma, + dd->ipath_pioavailregs_phys); + dd->ipath_pioavailregs_dma = NULL; + } + + if (dd->ipath_pageshadow) { + struct page **tmpp = dd->ipath_pageshadow; + int i, cnt = 0; + + ipath_cdbg(VERBOSE, "Unlocking any expTID pages still " + "locked\n"); + for (port = 0; port < dd->ipath_cfgports; port++) { + int port_tidbase = port * dd->ipath_rcvtidcnt; + int maxtid = port_tidbase + dd->ipath_rcvtidcnt; + for (i = port_tidbase; i < maxtid; i++) { + if (!tmpp[i]) + continue; + ipath_release_user_pages(&tmpp[i], 1); + tmpp[i] = NULL; + cnt++; + } + } + if (cnt) { + ipath_stats.sps_pageunlocks += cnt; + ipath_cdbg(VERBOSE, "There were still %u expTID " + "entries locked\n", cnt); + } + if (ipath_stats.sps_pagelocks || + ipath_stats.sps_pageunlocks) + ipath_cdbg(VERBOSE, "%llu pages locked, %llu " + "unlocked via ipath_m{un}lock\n", + (unsigned long long) + ipath_stats.sps_pagelocks, + (unsigned long long) + ipath_stats.sps_pageunlocks); + + ipath_cdbg(VERBOSE, "Free shadow page tid array at %p\n", + dd->ipath_pageshadow); + vfree(dd->ipath_pageshadow); + dd->ipath_pageshadow = NULL; + } + + /* + * free any resources still in use (usually just kernel ports) + * at unload + */ + for (port = 0; port < dd->ipath_cfgports; port++) + ipath_free_pddata(dd, port, 1); + kfree(dd->ipath_pd); + /* + * debuggability, in case some cleanup path tries to use it + * after this + */ + dd->ipath_pd = NULL; +} + +static void __exit infinipath_cleanup(void) +{ + struct ipath_devdata *dd, *tmp; + unsigned long flags; + + ipath_exit_ipathfs(); + + ipath_driver_remove_group(&ipath_driver.driver); + + spin_lock_irqsave(&ipath_devs_lock, flags); + + /* + * turn off rcv, send, and interrupts for all ports, all drivers + * should also hard reset the chip here? + * free up port 0 (kernel) rcvhdr, egr bufs, and eventually tid bufs + * for all versions of the driver, if they were allocated + */ + list_for_each_entry_safe(dd, tmp, &ipath_dev_list, ipath_list) { + spin_unlock_irqrestore(&ipath_devs_lock, flags); + + if (dd->ipath_kregbase) + cleanup_device(dd); + + if (dd->pcidev) { + if (dd->pcidev->irq) { + ipath_cdbg(VERBOSE, + "unit %u free_irq of irq %x\n", + dd->ipath_unit, dd->pcidev->irq); + free_irq(dd->pcidev->irq, dd); + } else + ipath_dbg("irq is 0, not doing free_irq " + "for unit %u\n", dd->ipath_unit); + dd->pcidev = NULL; + } + + /* + * we check for NULL here, because it's outside the kregbase + * check, and we need to call it after the free_irq. Thus + * it's possible that the function pointers were never + * initialized. + */ + if (dd->ipath_f_cleanup) + /* clean up chip-specific stuff */ + dd->ipath_f_cleanup(dd); + + spin_lock_irqsave(&ipath_devs_lock, flags); + } + + spin_unlock_irqrestore(&ipath_devs_lock, flags); + + ipath_cdbg(VERBOSE, "Unregistering pci driver\n"); + pci_unregister_driver(&ipath_driver); + + idr_destroy(&unit_table); +} + +/** + * ipath_reset_device - reset the chip if possible + * @unit: the device to reset + * + * Whether or not reset is successful, we attempt to re-initialize the chip + * (that is, much like a driver unload/reload). We clear the INITTED flag + * so that the various entry points will fail until we reinitialize. For + * now, we only allow this if no user ports are open that use chip resources + */ +int ipath_reset_device(int unit) +{ + int ret, i; + struct ipath_devdata *dd = ipath_lookup(unit); + + if (!dd) { + ret = -ENODEV; + goto bail; + } + + dev_info(&dd->pcidev->dev, "Reset on unit %u requested\n", unit); + + if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT)) { + dev_info(&dd->pcidev->dev, "Invalid unit number %u or " + "not initialized or not present\n", unit); + ret = -ENXIO; + goto bail; + } + + if (dd->ipath_pd) + for (i = 1; i < dd->ipath_portcnt; i++) { + if (dd->ipath_pd[i] && dd->ipath_pd[i]->port_cnt) { + ipath_dbg("unit %u port %d is in use " + "(PID %u cmd %s), can't reset\n", + unit, i, + dd->ipath_pd[i]->port_pid, + dd->ipath_pd[i]->port_comm); + ret = -EBUSY; + goto bail; + } + } + + dd->ipath_flags &= ~IPATH_INITTED; + ret = dd->ipath_f_reset(dd); + if (ret != 1) + ipath_dbg("reset was not successful\n"); + ipath_dbg("Trying to reinitialize unit %u after reset attempt\n", + unit); + ret = ipath_init_chip(dd, 1); + if (ret) + ipath_dev_err(dd, "Reinitialize unit %u after " + "reset failed with %d\n", unit, ret); + else + dev_info(&dd->pcidev->dev, "Reinitialized unit %u after " + "resetting\n", unit); + +bail: + return ret; +} + +module_init(infinipath_init); +module_exit(infinipath_cleanup); diff --git a/drivers/infiniband/hw/ipath/ipath_eeprom.c b/drivers/infiniband/hw/ipath/ipath_eeprom.c new file mode 100644 index 0000000..f11a900e --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_eeprom.c @@ -0,0 +1,613 @@ +/* + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/delay.h> +#include <linux/pci.h> +#include <linux/vmalloc.h> + +#include "ipath_kernel.h" + +/* + * InfiniPath I2C driver for a serial eeprom. This is not a generic + * I2C interface. For a start, the device we're using (Atmel AT24C11) + * doesn't work like a regular I2C device. It looks like one + * electrically, but not logically. Normal I2C devices have a single + * 7-bit or 10-bit I2C address that they respond to. Valid 7-bit + * addresses range from 0x03 to 0x77. Addresses 0x00 to 0x02 and 0x78 + * to 0x7F are special reserved addresses (e.g. 0x00 is the "general + * call" address.) The Atmel device, on the other hand, responds to ALL + * 7-bit addresses. It's designed to be the only device on a given I2C + * bus. A 7-bit address corresponds to the memory address within the + * Atmel device itself. + * + * Also, the timing requirements mean more than simple software + * bitbanging, with readbacks from chip to ensure timing (simple udelay + * is not enough). + * + * This all means that accessing the device is specialized enough + * that using the standard kernel I2C bitbanging interface would be + * impossible. For example, the core I2C eeprom driver expects to find + * a device at one or more of a limited set of addresses only. It doesn't + * allow writing to an eeprom. It also doesn't provide any means of + * accessing eeprom contents from within the kernel, only via sysfs. + */ + +enum i2c_type { + i2c_line_scl = 0, + i2c_line_sda +}; + +enum i2c_state { + i2c_line_low = 0, + i2c_line_high +}; + +#define READ_CMD 1 +#define WRITE_CMD 0 + +static int eeprom_init; + +/* + * The gpioval manipulation really should be protected by spinlocks + * or be converted to use atomic operations. + */ + +/** + * i2c_gpio_set - set a GPIO line + * @dd: the infinipath device + * @line: the line to set + * @new_line_state: the state to set + * + * Returns 0 if the line was set to the new state successfully, non-zero + * on error. + */ +static int i2c_gpio_set(struct ipath_devdata *dd, + enum i2c_type line, + enum i2c_state new_line_state) +{ + u64 read_val, write_val, mask, *gpioval; + + gpioval = &dd->ipath_gpio_out; + read_val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extctrl); + if (line == i2c_line_scl) + mask = ipath_gpio_scl; + else + mask = ipath_gpio_sda; + + if (new_line_state == i2c_line_high) + /* tri-state the output rather than force high */ + write_val = read_val & ~mask; + else + /* config line to be an output */ + write_val = read_val | mask; + ipath_write_kreg(dd, dd->ipath_kregs->kr_extctrl, write_val); + + /* set high and verify */ + if (new_line_state == i2c_line_high) + write_val = 0x1UL; + else + write_val = 0x0UL; + + if (line == i2c_line_scl) { + write_val <<= ipath_gpio_scl_num; + *gpioval = *gpioval & ~(1UL << ipath_gpio_scl_num); + *gpioval |= write_val; + } else { + write_val <<= ipath_gpio_sda_num; + *gpioval = *gpioval & ~(1UL << ipath_gpio_sda_num); + *gpioval |= write_val; + } + ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_out, *gpioval); + + return 0; +} + +/** + * i2c_gpio_get - get a GPIO line state + * @dd: the infinipath device + * @line: the line to get + * @curr_statep: where to put the line state + * + * Returns 0 if the line was set to the new state successfully, non-zero + * on error. curr_state is not set on error. + */ +static int i2c_gpio_get(struct ipath_devdata *dd, + enum i2c_type line, + enum i2c_state *curr_statep) +{ + u64 read_val, write_val, mask; + int ret; + + /* check args */ + if (curr_statep == NULL) { + ret = 1; + goto bail; + } + + read_val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extctrl); + /* config line to be an input */ + if (line == i2c_line_scl) + mask = ipath_gpio_scl; + else + mask = ipath_gpio_sda; + write_val = read_val & ~mask; + ipath_write_kreg(dd, dd->ipath_kregs->kr_extctrl, write_val); + read_val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extstatus); + + if (read_val & mask) + *curr_statep = i2c_line_high; + else + *curr_statep = i2c_line_low; + + ret = 0; + +bail: + return ret; +} + +/** + * i2c_wait_for_writes - wait for a write + * @dd: the infinipath device + * + * We use this instead of udelay directly, so we can make sure + * that previous register writes have been flushed all the way + * to the chip. Since we are delaying anyway, the cost doesn't + * hurt, and makes the bit twiddling more regular + */ +static void i2c_wait_for_writes(struct ipath_devdata *dd) +{ + (void)ipath_read_kreg32(dd, dd->ipath_kregs->kr_scratch); +} + +static void scl_out(struct ipath_devdata *dd, u8 bit) +{ + i2c_gpio_set(dd, i2c_line_scl, bit ? i2c_line_high : i2c_line_low); + + i2c_wait_for_writes(dd); +} + +static void sda_out(struct ipath_devdata *dd, u8 bit) +{ + i2c_gpio_set(dd, i2c_line_sda, bit ? i2c_line_high : i2c_line_low); + + i2c_wait_for_writes(dd); +} + +static u8 sda_in(struct ipath_devdata *dd, int wait) +{ + enum i2c_state bit; + + if (i2c_gpio_get(dd, i2c_line_sda, &bit)) + ipath_dbg("get bit failed!\n"); + + if (wait) + i2c_wait_for_writes(dd); + + return bit == i2c_line_high ? 1U : 0; +} + +/** + * i2c_ackrcv - see if ack following write is true + * @dd: the infinipath device + */ +static int i2c_ackrcv(struct ipath_devdata *dd) +{ + u8 ack_received; + + /* AT ENTRY SCL = LOW */ + /* change direction, ignore data */ + ack_received = sda_in(dd, 1); + scl_out(dd, i2c_line_high); + ack_received = sda_in(dd, 1) == 0; + scl_out(dd, i2c_line_low); + return ack_received; +} + +/** + * wr_byte - write a byte, one bit at a time + * @dd: the infinipath device + * @data: the byte to write + * + * Returns 0 if we got the following ack, otherwise 1 + */ +static int wr_byte(struct ipath_devdata *dd, u8 data) +{ + int bit_cntr; + u8 bit; + + for (bit_cntr = 7; bit_cntr >= 0; bit_cntr--) { + bit = (data >> bit_cntr) & 1; + sda_out(dd, bit); + scl_out(dd, i2c_line_high); + scl_out(dd, i2c_line_low); + } + return (!i2c_ackrcv(dd)) ? 1 : 0; +} + +static void send_ack(struct ipath_devdata *dd) +{ + sda_out(dd, i2c_line_low); + scl_out(dd, i2c_line_high); + scl_out(dd, i2c_line_low); + sda_out(dd, i2c_line_high); +} + +/** + * i2c_startcmd - transmit the start condition, followed by address/cmd + * @dd: the infinipath device + * @offset_dir: direction byte + * + * (both clock/data high, clock high, data low while clock is high) + */ +static int i2c_startcmd(struct ipath_devdata *dd, u8 offset_dir) +{ + int res; + + /* issue start sequence */ + sda_out(dd, i2c_line_high); + scl_out(dd, i2c_line_high); + sda_out(dd, i2c_line_low); + scl_out(dd, i2c_line_low); + + /* issue length and direction byte */ + res = wr_byte(dd, offset_dir); + + if (res) + ipath_cdbg(VERBOSE, "No ack to complete start\n"); + + return res; +} + +/** + * stop_cmd - transmit the stop condition + * @dd: the infinipath device + * + * (both clock/data low, clock high, data high while clock is high) + */ +static void stop_cmd(struct ipath_devdata *dd) +{ + scl_out(dd, i2c_line_low); + sda_out(dd, i2c_line_low); + scl_out(dd, i2c_line_high); + sda_out(dd, i2c_line_high); + udelay(2); +} + +/** + * eeprom_reset - reset I2C communication + * @dd: the infinipath device + */ + +static int eeprom_reset(struct ipath_devdata *dd) +{ + int clock_cycles_left = 9; + u64 *gpioval = &dd->ipath_gpio_out; + int ret; + + eeprom_init = 1; + *gpioval = ipath_read_kreg64(dd, dd->ipath_kregs->kr_gpio_out); + ipath_cdbg(VERBOSE, "Resetting i2c eeprom; initial gpioout reg " + "is %llx\n", (unsigned long long) *gpioval); + + /* + * This is to get the i2c into a known state, by first going low, + * then tristate sda (and then tristate scl as first thing + * in loop) + */ + scl_out(dd, i2c_line_low); + sda_out(dd, i2c_line_high); + + while (clock_cycles_left--) { + scl_out(dd, i2c_line_high); + + if (sda_in(dd, 0)) { + sda_out(dd, i2c_line_low); + scl_out(dd, i2c_line_low); + ret = 0; + goto bail; + } + + scl_out(dd, i2c_line_low); + } + + ret = 1; + +bail: + return ret; +} + +/** + * ipath_eeprom_read - receives bytes from the eeprom via I2C + * @dd: the infinipath device + * @eeprom_offset: address to read from + * @buffer: where to store result + * @len: number of bytes to receive + */ + +int ipath_eeprom_read(struct ipath_devdata *dd, u8 eeprom_offset, + void *buffer, int len) +{ + /* compiler complains unless initialized */ + u8 single_byte = 0; + int bit_cntr; + int ret; + + if (!eeprom_init) + eeprom_reset(dd); + + eeprom_offset = (eeprom_offset << 1) | READ_CMD; + + if (i2c_startcmd(dd, eeprom_offset)) { + ipath_dbg("Failed startcmd\n"); + stop_cmd(dd); + ret = 1; + goto bail; + } + + /* + * eeprom keeps clocking data out as long as we ack, automatically + * incrementing the address. + */ + while (len-- > 0) { + /* get data */ + single_byte = 0; + for (bit_cntr = 8; bit_cntr; bit_cntr--) { + u8 bit; + scl_out(dd, i2c_line_high); + bit = sda_in(dd, 0); + single_byte |= bit << (bit_cntr - 1); + scl_out(dd, i2c_line_low); + } + + /* send ack if not the last byte */ + if (len) + send_ack(dd); + + *((u8 *) buffer) = single_byte; + buffer++; + } + + stop_cmd(dd); + + ret = 0; + +bail: + return ret; +} + +/** + * ipath_eeprom_write - writes data to the eeprom via I2C + * @dd: the infinipath device + * @eeprom_offset: where to place data + * @buffer: data to write + * @len: number of bytes to write + */ +int ipath_eeprom_write(struct ipath_devdata *dd, u8 eeprom_offset, + const void *buffer, int len) +{ + u8 single_byte; + int sub_len; + const u8 *bp = buffer; + int max_wait_time, i; + int ret; + + if (!eeprom_init) + eeprom_reset(dd); + + while (len > 0) { + if (i2c_startcmd(dd, (eeprom_offset << 1) | WRITE_CMD)) { + ipath_dbg("Failed to start cmd offset %u\n", + eeprom_offset); + goto failed_write; + } + + sub_len = min(len, 4); + eeprom_offset += sub_len; + len -= sub_len; + + for (i = 0; i < sub_len; i++) { + if (wr_byte(dd, *bp++)) { + ipath_dbg("no ack after byte %u/%u (%u " + "total remain)\n", i, sub_len, + len + sub_len - i); + goto failed_write; + } + } + + stop_cmd(dd); + + /* + * wait for write complete by waiting for a successful + * read (the chip replies with a zero after the write + * cmd completes, and before it writes to the eeprom. + * The startcmd for the read will fail the ack until + * the writes have completed. We do this inline to avoid + * the debug prints that are in the real read routine + * if the startcmd fails. + */ + max_wait_time = 100; + while (i2c_startcmd(dd, READ_CMD)) { + stop_cmd(dd); + if (!--max_wait_time) { + ipath_dbg("Did not get successful read to " + "complete write\n"); + goto failed_write; + } + } + /* now read the zero byte */ + for (i = single_byte = 0; i < 8; i++) { + u8 bit; + scl_out(dd, i2c_line_high); + bit = sda_in(dd, 0); + scl_out(dd, i2c_line_low); + single_byte <<= 1; + single_byte |= bit; + } + stop_cmd(dd); + } + + ret = 0; + goto bail; + +failed_write: + stop_cmd(dd); + ret = 1; + +bail: + return ret; +} + +static u8 flash_csum(struct ipath_flash *ifp, int adjust) +{ + u8 *ip = (u8 *) ifp; + u8 csum = 0, len; + + for (len = 0; len < ifp->if_length; len++) + csum += *ip++; + csum -= ifp->if_csum; + csum = ~csum; + if (adjust) + ifp->if_csum = csum; + + return csum; +} + +/** + * ipath_get_guid - get the GUID from the i2c device + * @dd: the infinipath device + * + * When we add the multi-chip support, we will probably have to add + * the ability to use the number of guids field, and get the guid from + * the first chip's flash, to use for all of them. + */ +void ipath_get_guid(struct ipath_devdata *dd) +{ + void *buf; + struct ipath_flash *ifp; + __be64 guid; + int len; + u8 csum, *bguid; + int t = dd->ipath_unit; + struct ipath_devdata *dd0 = ipath_lookup(0); + + if (t && dd0->ipath_nguid > 1 && t <= dd0->ipath_nguid) { + u8 *bguid, oguid; + dd->ipath_guid = dd0->ipath_guid; + bguid = (u8 *) & dd->ipath_guid; + + oguid = bguid[7]; + bguid[7] += t; + if (oguid > bguid[7]) { + if (bguid[6] == 0xff) { + if (bguid[5] == 0xff) { + ipath_dev_err( + dd, + "Can't set %s GUID from " + "base, wraps to OUI!\n", + ipath_get_unit_name(t)); + dd->ipath_guid = 0; + goto bail; + } + bguid[5]++; + } + bguid[6]++; + } + dd->ipath_nguid = 1; + + ipath_dbg("nguid %u, so adding %u to device 0 guid, " + "for %llx\n", + dd0->ipath_nguid, t, + (unsigned long long) be64_to_cpu(dd->ipath_guid)); + goto bail; + } + + len = offsetof(struct ipath_flash, if_future); + buf = vmalloc(len); + if (!buf) { + ipath_dev_err(dd, "Couldn't allocate memory to read %u " + "bytes from eeprom for GUID\n", len); + goto bail; + } + + if (ipath_eeprom_read(dd, 0, buf, len)) { + ipath_dev_err(dd, "Failed reading GUID from eeprom\n"); + goto done; + } + ifp = (struct ipath_flash *)buf; + + csum = flash_csum(ifp, 0); + if (csum != ifp->if_csum) { + dev_info(&dd->pcidev->dev, "Bad I2C flash checksum: " + "0x%x, not 0x%x\n", csum, ifp->if_csum); + goto done; + } + if (*(__be64 *) ifp->if_guid == 0ULL || + *(__be64 *) ifp->if_guid == __constant_cpu_to_be64(-1LL)) { + ipath_dev_err(dd, "Invalid GUID %llx from flash; " + "ignoring\n", + *(unsigned long long *) ifp->if_guid); + /* don't allow GUID if all 0 or all 1's */ + goto done; + } + + /* complain, but allow it */ + if (*(u64 *) ifp->if_guid == 0x100007511000000ULL) + dev_info(&dd->pcidev->dev, "Warning, GUID %llx is " + "default, probably not correct!\n", + *(unsigned long long *) ifp->if_guid); + + bguid = ifp->if_guid; + if (!bguid[0] && !bguid[1] && !bguid[2]) { + /* original incorrect GUID format in flash; fix in + * core copy, by shifting up 2 octets; don't need to + * change top octet, since both it and shifted are + * 0.. */ + bguid[1] = bguid[3]; + bguid[2] = bguid[4]; + bguid[3] = bguid[4] = 0; + guid = *(__be64 *) ifp->if_guid; + ipath_cdbg(VERBOSE, "Old GUID format in flash, top 3 zero, " + "shifting 2 octets\n"); + } else + guid = *(__be64 *) ifp->if_guid; + dd->ipath_guid = guid; + dd->ipath_nguid = ifp->if_numguid; + memcpy(dd->ipath_serial, ifp->if_serial, + sizeof(ifp->if_serial)); + ipath_cdbg(VERBOSE, "Initted GUID to %llx from eeprom\n", + (unsigned long long) be64_to_cpu(dd->ipath_guid)); + +done: + vfree(buf); + +bail:; +} diff --git a/drivers/infiniband/hw/ipath/ipath_file_ops.c b/drivers/infiniband/hw/ipath/ipath_file_ops.c new file mode 100644 index 0000000..c347191 --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_file_ops.c @@ -0,0 +1,1910 @@ +/* + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/pci.h> +#include <linux/poll.h> +#include <linux/cdev.h> +#include <linux/swap.h> +#include <linux/vmalloc.h> +#include <asm/pgtable.h> + +#include "ipath_kernel.h" +#include "ips_common.h" +#include "ipath_layer.h" + +static int ipath_open(struct inode *, struct file *); +static int ipath_close(struct inode *, struct file *); +static ssize_t ipath_write(struct file *, const char __user *, size_t, + loff_t *); +static unsigned int ipath_poll(struct file *, struct poll_table_struct *); +static int ipath_mmap(struct file *, struct vm_area_struct *); + +static struct file_operations ipath_file_ops = { + .owner = THIS_MODULE, + .write = ipath_write, + .open = ipath_open, + .release = ipath_close, + .poll = ipath_poll, + .mmap = ipath_mmap +}; + +static int ipath_get_base_info(struct ipath_portdata *pd, + void __user *ubase, size_t ubase_size) +{ + int ret = 0; + struct ipath_base_info *kinfo = NULL; + struct ipath_devdata *dd = pd->port_dd; + + if (ubase_size < sizeof(*kinfo)) { + ipath_cdbg(PROC, + "Base size %lu, need %lu (version mismatch?)\n", + (unsigned long) ubase_size, + (unsigned long) sizeof(*kinfo)); + ret = -EINVAL; + goto bail; + } + + kinfo = kzalloc(sizeof(*kinfo), GFP_KERNEL); + if (kinfo == NULL) { + ret = -ENOMEM; + goto bail; + } + + ret = dd->ipath_f_get_base_info(pd, kinfo); + if (ret < 0) + goto bail; + + kinfo->spi_rcvhdr_cnt = dd->ipath_rcvhdrcnt; + kinfo->spi_rcvhdrent_size = dd->ipath_rcvhdrentsize; + kinfo->spi_tidegrcnt = dd->ipath_rcvegrcnt; + kinfo->spi_rcv_egrbufsize = dd->ipath_rcvegrbufsize; + /* + * have to mmap whole thing + */ + kinfo->spi_rcv_egrbuftotlen = + pd->port_rcvegrbuf_chunks * pd->port_rcvegrbuf_size; + kinfo->spi_rcv_egrperchunk = pd->port_rcvegrbufs_perchunk; + kinfo->spi_rcv_egrchunksize = kinfo->spi_rcv_egrbuftotlen / + pd->port_rcvegrbuf_chunks; + kinfo->spi_tidcnt = dd->ipath_rcvtidcnt; + /* + * for this use, may be ipath_cfgports summed over all chips that + * are are configured and present + */ + kinfo->spi_nports = dd->ipath_cfgports; + /* unit (chip/board) our port is on */ + kinfo->spi_unit = dd->ipath_unit; + /* for now, only a single page */ + kinfo->spi_tid_maxsize = PAGE_SIZE; + + /* + * Doing this per port, and based on the skip value, etc. This has + * to be the actual buffer size, since the protocol code treats it + * as an array. + * + * These have to be set to user addresses in the user code via mmap. + * These values are used on return to user code for the mmap target + * addresses only. For 32 bit, same 44 bit address problem, so use + * the physical address, not virtual. Before 2.6.11, using the + * page_address() macro worked, but in 2.6.11, even that returns the + * full 64 bit address (upper bits all 1's). So far, using the + * physical addresses (or chip offsets, for chip mapping) works, but + * no doubt some future kernel release will chang that, and we'll be + * on to yet another method of dealing with this + */ + kinfo->spi_rcvhdr_base = (u64) pd->port_rcvhdrq_phys; + kinfo->spi_rcv_egrbufs = (u64) pd->port_rcvegr_phys; + kinfo->spi_pioavailaddr = (u64) dd->ipath_pioavailregs_phys; + kinfo->spi_status = (u64) kinfo->spi_pioavailaddr + + (void *) dd->ipath_statusp - + (void *) dd->ipath_pioavailregs_dma; + kinfo->spi_piobufbase = (u64) pd->port_piobufs; + kinfo->__spi_uregbase = + dd->ipath_uregbase + dd->ipath_palign * pd->port_port; + + kinfo->spi_pioindex = dd->ipath_pbufsport * (pd->port_port - 1); + kinfo->spi_piocnt = dd->ipath_pbufsport; + kinfo->spi_pioalign = dd->ipath_palign; + + kinfo->spi_qpair = IPATH_KD_QP; + kinfo->spi_piosize = dd->ipath_ibmaxlen; + kinfo->spi_mtu = dd->ipath_ibmaxlen; /* maxlen, not ibmtu */ + kinfo->spi_port = pd->port_port; + kinfo->spi_sw_version = IPATH_USER_SWVERSION; + kinfo->spi_hw_version = dd->ipath_revision; + + if (copy_to_user(ubase, kinfo, sizeof(*kinfo))) + ret = -EFAULT; + +bail: + kfree(kinfo); + return ret; +} + +/** + * ipath_tid_update - update a port TID + * @pd: the port + * @ti: the TID information + * + * The new implementation as of Oct 2004 is that the driver assigns + * the tid and returns it to the caller. To make it easier to + * catch bugs, and to reduce search time, we keep a cursor for + * each port, walking the shadow tid array to find one that's not + * in use. + * + * For now, if we can't allocate the full list, we fail, although + * in the long run, we'll allocate as many as we can, and the + * caller will deal with that by trying the remaining pages later. + * That means that when we fail, we have to mark the tids as not in + * use again, in our shadow copy. + * + * It's up to the caller to free the tids when they are done. + * We'll unlock the pages as they free them. + * + * Also, right now we are locking one page at a time, but since + * the intended use of this routine is for a single group of + * virtually contiguous pages, that should change to improve + * performance. + */ +static int ipath_tid_update(struct ipath_portdata *pd, + const struct ipath_tid_info *ti) +{ + int ret = 0, ntids; + u32 tid, porttid, cnt, i, tidcnt; + u16 *tidlist; + struct ipath_devdata *dd = pd->port_dd; + u64 physaddr; + unsigned long vaddr; + u64 __iomem *tidbase; + unsigned long tidmap[8]; + struct page **pagep = NULL; + + if (!dd->ipath_pageshadow) { + ret = -ENOMEM; + goto done; + } + + cnt = ti->tidcnt; + if (!cnt) { + ipath_dbg("After copyin, tidcnt 0, tidlist %llx\n", + (unsigned long long) ti->tidlist); + /* + * Should we treat as success? likely a bug + */ + ret = -EFAULT; + goto done; + } + tidcnt = dd->ipath_rcvtidcnt; + if (cnt >= tidcnt) { + /* make sure it all fits in port_tid_pg_list */ + dev_info(&dd->pcidev->dev, "Process tried to allocate %u " + "TIDs, only trying max (%u)\n", cnt, tidcnt); + cnt = tidcnt; + } + pagep = (struct page **)pd->port_tid_pg_list; + tidlist = (u16 *) (&pagep[cnt]); + + memset(tidmap, 0, sizeof(tidmap)); + tid = pd->port_tidcursor; + /* before decrement; chip actual # */ + porttid = pd->port_port * tidcnt; + ntids = tidcnt; + tidbase = (u64 __iomem *) (((char __iomem *) dd->ipath_kregbase) + + dd->ipath_rcvtidbase + + porttid * sizeof(*tidbase)); + + ipath_cdbg(VERBOSE, "Port%u %u tids, cursor %u, tidbase %p\n", + pd->port_port, cnt, tid, tidbase); + + /* virtual address of first page in transfer */ + vaddr = ti->tidvaddr; + if (!access_ok(VERIFY_WRITE, (void __user *) vaddr, + cnt * PAGE_SIZE)) { + ipath_dbg("Fail vaddr %p, %u pages, !access_ok\n", + (void *)vaddr, cnt); + ret = -EFAULT; + goto done; + } + ret = ipath_get_user_pages(vaddr, cnt, pagep); + if (ret) { + if (ret == -EBUSY) { + ipath_dbg("Failed to lock addr %p, %u pages " + "(already locked)\n", + (void *) vaddr, cnt); + /* + * for now, continue, and see what happens but with + * the new implementation, this should never happen, + * unless perhaps the user has mpin'ed the pages + * themselves (something we need to test) + */ + ret = 0; + } else { + dev_info(&dd->pcidev->dev, + "Failed to lock addr %p, %u pages: " + "errno %d\n", (void *) vaddr, cnt, -ret); + goto done; + } + } + for (i = 0; i < cnt; i++, vaddr += PAGE_SIZE) { + for (; ntids--; tid++) { + if (tid == tidcnt) + tid = 0; + if (!dd->ipath_pageshadow[porttid + tid]) + break; + } + if (ntids < 0) { + /* + * oops, wrapped all the way through their TIDs, + * and didn't have enough free; see comments at + * start of routine + */ + ipath_dbg("Not enough free TIDs for %u pages " + "(index %d), failing\n", cnt, i); + i--; /* last tidlist[i] not filled in */ + ret = -ENOMEM; + break; + } + tidlist[i] = tid; + ipath_cdbg(VERBOSE, "Updating idx %u to TID %u, " + "vaddr %lx\n", i, tid, vaddr); + /* we "know" system pages and TID pages are same size */ + dd->ipath_pageshadow[porttid + tid] = pagep[i]; + /* + * don't need atomic or it's overhead + */ + __set_bit(tid, tidmap); + physaddr = page_to_phys(pagep[i]); + ipath_stats.sps_pagelocks++; + ipath_cdbg(VERBOSE, + "TID %u, vaddr %lx, physaddr %llx pgp %p\n", + tid, vaddr, (unsigned long long) physaddr, + pagep[i]); + dd->ipath_f_put_tid(dd, &tidbase[tid], 1, physaddr); + /* + * don't check this tid in ipath_portshadow, since we + * just filled it in; start with the next one. + */ + tid++; + } + + if (ret) { + u32 limit; + cleanup: + /* jump here if copy out of updated info failed... */ + ipath_dbg("After failure (ret=%d), undo %d of %d entries\n", + -ret, i, cnt); + /* same code that's in ipath_free_tid() */ + limit = sizeof(tidmap) * BITS_PER_BYTE; + if (limit > tidcnt) + /* just in case size changes in future */ + limit = tidcnt; + tid = find_first_bit((const unsigned long *)tidmap, limit); + for (; tid < limit; tid++) { + if (!test_bit(tid, tidmap)) + continue; + if (dd->ipath_pageshadow[porttid + tid]) { + ipath_cdbg(VERBOSE, "Freeing TID %u\n", + tid); + dd->ipath_f_put_tid(dd, &tidbase[tid], 1, + dd->ipath_tidinvalid); + dd->ipath_pageshadow[porttid + tid] = NULL; + ipath_stats.sps_pageunlocks++; + } + } + ipath_release_user_pages(pagep, cnt); + } else { + /* + * Copy the updated array, with ipath_tid's filled in, back + * to user. Since we did the copy in already, this "should + * never fail" If it does, we have to clean up... + */ + if (copy_to_user((void __user *) + (unsigned long) ti->tidlist, + tidlist, cnt * sizeof(*tidlist))) { + ret = -EFAULT; + goto cleanup; + } + if (copy_to_user((void __user *) (unsigned long) ti->tidmap, + tidmap, sizeof tidmap)) { + ret = -EFAULT; + goto cleanup; + } + if (tid == tidcnt) + tid = 0; + pd->port_tidcursor = tid; + } + +done: + if (ret) + ipath_dbg("Failed to map %u TID pages, failing with %d\n", + ti->tidcnt, -ret); + return ret; +} + +/** + * ipath_tid_free - free a port TID + * @pd: the port + * @ti: the TID info + * + * right now we are unlocking one page at a time, but since + * the intended use of this routine is for a single group of + * virtually contiguous pages, that should change to improve + * performance. We check that the TID is in range for this port + * but otherwise don't check validity; if user has an error and + * frees the wrong tid, it's only their own data that can thereby + * be corrupted. We do check that the TID was in use, for sanity + * We always use our idea of the saved address, not the address that + * they pass in to us. + */ + +static int ipath_tid_free(struct ipath_portdata *pd, + const struct ipath_tid_info *ti) +{ + int ret = 0; + u32 tid, porttid, cnt, limit, tidcnt; + struct ipath_devdata *dd = pd->port_dd; + u64 __iomem *tidbase; + unsigned long tidmap[8]; + + if (!dd->ipath_pageshadow) { + ret = -ENOMEM; + goto done; + } + + if (copy_from_user(tidmap, (void __user *)(unsigned long)ti->tidmap, + sizeof tidmap)) { + ret = -EFAULT; + goto done; + } + + porttid = pd->port_port * dd->ipath_rcvtidcnt; + tidbase = (u64 __iomem *) ((char __iomem *)(dd->ipath_kregbase) + + dd->ipath_rcvtidbase + + porttid * sizeof(*tidbase)); + + tidcnt = dd->ipath_rcvtidcnt; + limit = sizeof(tidmap) * BITS_PER_BYTE; + if (limit > tidcnt) + /* just in case size changes in future */ + limit = tidcnt; + tid = find_first_bit(tidmap, limit); + ipath_cdbg(VERBOSE, "Port%u free %u tids; first bit (max=%d) " + "set is %d, porttid %u\n", pd->port_port, ti->tidcnt, + limit, tid, porttid); + for (cnt = 0; tid < limit; tid++) { + /* + * small optimization; if we detect a run of 3 or so without + * any set, use find_first_bit again. That's mainly to + * accelerate the case where we wrapped, so we have some at + * the beginning, and some at the end, and a big gap + * in the middle. + */ + if (!test_bit(tid, tidmap)) + continue; + cnt++; + if (dd->ipath_pageshadow[porttid + tid]) { + ipath_cdbg(VERBOSE, "PID %u freeing TID %u\n", + pd->port_pid, tid); + dd->ipath_f_put_tid(dd, &tidbase[tid], 1, + dd->ipath_tidinvalid); + ipath_release_user_pages( + &dd->ipath_pageshadow[porttid + tid], 1); + dd->ipath_pageshadow[porttid + tid] = NULL; + ipath_stats.sps_pageunlocks++; + } else + ipath_dbg("Unused tid %u, ignoring\n", tid); + } + if (cnt != ti->tidcnt) + ipath_dbg("passed in tidcnt %d, only %d bits set in map\n", + ti->tidcnt, cnt); +done: + if (ret) + ipath_dbg("Failed to unmap %u TID pages, failing with %d\n", + ti->tidcnt, -ret); + return ret; +} + +/** + * ipath_set_part_key - set a partition key + * @pd: the port + * @key: the key + * + * We can have up to 4 active at a time (other than the default, which is + * always allowed). This is somewhat tricky, since multiple ports may set + * the same key, so we reference count them, and clean up at exit. All 4 + * partition keys are packed into a single infinipath register. It's an + * error for a process to set the same pkey multiple times. We provide no + * mechanism to de-allocate a pkey at this time, we may eventually need to + * do that. I've used the atomic operations, and no locking, and only make + * a single pass through what's available. This should be more than + * adequate for some time. I'll think about spinlocks or the like if and as + * it's necessary. + */ +static int ipath_set_part_key(struct ipath_portdata *pd, u16 key) +{ + struct ipath_devdata *dd = pd->port_dd; + int i, any = 0, pidx = -1; + u16 lkey = key & 0x7FFF; + int ret; + + if (lkey == (IPS_DEFAULT_P_KEY & 0x7FFF)) { + /* nothing to do; this key always valid */ + ret = 0; + goto bail; + } + + ipath_cdbg(VERBOSE, "p%u try to set pkey %hx, current keys " + "%hx:%x %hx:%x %hx:%x %hx:%x\n", + pd->port_port, key, dd->ipath_pkeys[0], + atomic_read(&dd->ipath_pkeyrefs[0]), dd->ipath_pkeys[1], + atomic_read(&dd->ipath_pkeyrefs[1]), dd->ipath_pkeys[2], + atomic_read(&dd->ipath_pkeyrefs[2]), dd->ipath_pkeys[3], + atomic_read(&dd->ipath_pkeyrefs[3])); + + if (!lkey) { + ipath_cdbg(PROC, "p%u tries to set key 0, not allowed\n", + pd->port_port); + ret = -EINVAL; + goto bail; + } + + /* + * Set the full membership bit, because it has to be + * set in the register or the packet, and it seems + * cleaner to set in the register than to force all + * callers to set it. (see bug 4331) + */ + key |= 0x8000; + + for (i = 0; i < ARRAY_SIZE(pd->port_pkeys); i++) { + if (!pd->port_pkeys[i] && pidx == -1) + pidx = i; + if (pd->port_pkeys[i] == key) { + ipath_cdbg(VERBOSE, "p%u tries to set same pkey " + "(%x) more than once\n", + pd->port_port, key); + ret = -EEXIST; + goto bail; + } + } + if (pidx == -1) { + ipath_dbg("All pkeys for port %u already in use, " + "can't set %x\n", pd->port_port, key); + ret = -EBUSY; + goto bail; + } + for (any = i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) { + if (!dd->ipath_pkeys[i]) { + any++; + continue; + } + if (dd->ipath_pkeys[i] == key) { + atomic_t *pkrefs = &dd->ipath_pkeyrefs[i]; + + if (atomic_inc_return(pkrefs) > 1) { + pd->port_pkeys[pidx] = key; + ipath_cdbg(VERBOSE, "p%u set key %x " + "matches #%d, count now %d\n", + pd->port_port, key, i, + atomic_read(pkrefs)); + ret = 0; + goto bail; + } else { + /* + * lost race, decrement count, catch below + */ + atomic_dec(pkrefs); + ipath_cdbg(VERBOSE, "Lost race, count was " + "0, after dec, it's %d\n", + atomic_read(pkrefs)); + any++; + } + } + if ((dd->ipath_pkeys[i] & 0x7FFF) == lkey) { + /* + * It makes no sense to have both the limited and + * full membership PKEY set at the same time since + * the unlimited one will disable the limited one. + */ + ret = -EEXIST; + goto bail; + } + } + if (!any) { + ipath_dbg("port %u, all pkeys already in use, " + "can't set %x\n", pd->port_port, key); + ret = -EBUSY; + goto bail; + } + for (any = i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) { + if (!dd->ipath_pkeys[i] && + atomic_inc_return(&dd->ipath_pkeyrefs[i]) == 1) { + u64 pkey; + + /* for ipathstats, etc. */ + ipath_stats.sps_pkeys[i] = lkey; + pd->port_pkeys[pidx] = dd->ipath_pkeys[i] = key; + pkey = + (u64) dd->ipath_pkeys[0] | + ((u64) dd->ipath_pkeys[1] << 16) | + ((u64) dd->ipath_pkeys[2] << 32) | + ((u64) dd->ipath_pkeys[3] << 48); + ipath_cdbg(PROC, "p%u set key %x in #%d, " + "portidx %d, new pkey reg %llx\n", + pd->port_port, key, i, pidx, + (unsigned long long) pkey); + ipath_write_kreg( + dd, dd->ipath_kregs->kr_partitionkey, pkey); + + ret = 0; + goto bail; + } + } + ipath_dbg("port %u, all pkeys already in use 2nd pass, " + "can't set %x\n", pd->port_port, key); + ret = -EBUSY; + +bail: + return ret; +} + +/** + * ipath_manage_rcvq - manage a port's receive queue + * @pd: the port + * @start_stop: action to carry out + * + * start_stop == 0 disables receive on the port, for use in queue + * overflow conditions. start_stop==1 re-enables, to be used to + * re-init the software copy of the head register + */ +static int ipath_manage_rcvq(struct ipath_portdata *pd, int start_stop) +{ + struct ipath_devdata *dd = pd->port_dd; + u64 tval; + + ipath_cdbg(PROC, "%sabling rcv for unit %u port %u\n", + start_stop ? "en" : "dis", dd->ipath_unit, + pd->port_port); + /* atomically clear receive enable port. */ + if (start_stop) { + /* + * On enable, force in-memory copy of the tail register to + * 0, so that protocol code doesn't have to worry about + * whether or not the chip has yet updated the in-memory + * copy or not on return from the system call. The chip + * always resets it's tail register back to 0 on a + * transition from disabled to enabled. This could cause a + * problem if software was broken, and did the enable w/o + * the disable, but eventually the in-memory copy will be + * updated and correct itself, even in the face of software + * bugs. + */ + *pd->port_rcvhdrtail_kvaddr = 0; + set_bit(INFINIPATH_R_PORTENABLE_SHIFT + pd->port_port, + &dd->ipath_rcvctrl); + } else + clear_bit(INFINIPATH_R_PORTENABLE_SHIFT + pd->port_port, + &dd->ipath_rcvctrl); + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, + dd->ipath_rcvctrl); + /* now be sure chip saw it before we return */ + tval = ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + if (start_stop) { + /* + * And try to be sure that tail reg update has happened too. + * This should in theory interlock with the RXE changes to + * the tail register. Don't assign it to the tail register + * in memory copy, since we could overwrite an update by the + * chip if we did. + */ + tval = ipath_read_ureg32(dd, ur_rcvhdrtail, pd->port_port); + } + /* always; new head should be equal to new tail; see above */ + return 0; +} + +static void ipath_clean_part_key(struct ipath_portdata *pd, + struct ipath_devdata *dd) +{ + int i, j, pchanged = 0; + u64 oldpkey; + + /* for debugging only */ + oldpkey = (u64) dd->ipath_pkeys[0] | + ((u64) dd->ipath_pkeys[1] << 16) | + ((u64) dd->ipath_pkeys[2] << 32) | + ((u64) dd->ipath_pkeys[3] << 48); + + for (i = 0; i < ARRAY_SIZE(pd->port_pkeys); i++) { + if (!pd->port_pkeys[i]) + continue; + ipath_cdbg(VERBOSE, "look for key[%d] %hx in pkeys\n", i, + pd->port_pkeys[i]); + for (j = 0; j < ARRAY_SIZE(dd->ipath_pkeys); j++) { + /* check for match independent of the global bit */ + if ((dd->ipath_pkeys[j] & 0x7fff) != + (pd->port_pkeys[i] & 0x7fff)) + continue; + if (atomic_dec_and_test(&dd->ipath_pkeyrefs[j])) { + ipath_cdbg(VERBOSE, "p%u clear key " + "%x matches #%d\n", + pd->port_port, + pd->port_pkeys[i], j); + ipath_stats.sps_pkeys[j] = + dd->ipath_pkeys[j] = 0; + pchanged++; + } + else ipath_cdbg( + VERBOSE, "p%u key %x matches #%d, " + "but ref still %d\n", pd->port_port, + pd->port_pkeys[i], j, + atomic_read(&dd->ipath_pkeyrefs[j])); + break; + } + pd->port_pkeys[i] = 0; + } + if (pchanged) { + u64 pkey = (u64) dd->ipath_pkeys[0] | + ((u64) dd->ipath_pkeys[1] << 16) | + ((u64) dd->ipath_pkeys[2] << 32) | + ((u64) dd->ipath_pkeys[3] << 48); + ipath_cdbg(VERBOSE, "p%u old pkey reg %llx, " + "new pkey reg %llx\n", pd->port_port, + (unsigned long long) oldpkey, + (unsigned long long) pkey); + ipath_write_kreg(dd, dd->ipath_kregs->kr_partitionkey, + pkey); + } +} + +/** + * ipath_create_user_egr - allocate eager TID buffers + * @pd: the port to allocate TID buffers for + * + * This routine is now quite different for user and kernel, because + * the kernel uses skb's, for the accelerated network performance + * This is the user port version + * + * Allocate the eager TID buffers and program them into infinipath + * They are no longer completely contiguous, we do multiple allocation + * calls. + */ +static int ipath_create_user_egr(struct ipath_portdata *pd) +{ + struct ipath_devdata *dd = pd->port_dd; + unsigned e, egrcnt, alloced, egrperchunk, chunk, egrsize, egroff; + size_t size; + int ret; + + egrcnt = dd->ipath_rcvegrcnt; + /* TID number offset for this port */ + egroff = pd->port_port * egrcnt; + egrsize = dd->ipath_rcvegrbufsize; + ipath_cdbg(VERBOSE, "Allocating %d egr buffers, at egrtid " + "offset %x, egrsize %u\n", egrcnt, egroff, egrsize); + + /* + * to avoid wasting a lot of memory, we allocate 32KB chunks of + * physically contiguous memory, advance through it until used up + * and then allocate more. Of course, we need memory to store those + * extra pointers, now. Started out with 256KB, but under heavy + * memory pressure (creating large files and then copying them over + * NFS while doing lots of MPI jobs), we hit some allocation + * failures, even though we can sleep... (2.6.10) Still get + * failures at 64K. 32K is the lowest we can go without waiting + * more memory again. It seems likely that the coalescing in + * free_pages, etc. still has issues (as it has had previously + * during 2.6.x development). + */ + size = 0x8000; + alloced = ALIGN(egrsize * egrcnt, size); + egrperchunk = size / egrsize; + chunk = (egrcnt + egrperchunk - 1) / egrperchunk; + pd->port_rcvegrbuf_chunks = chunk; + pd->port_rcvegrbufs_perchunk = egrperchunk; + pd->port_rcvegrbuf_size = size; + pd->port_rcvegrbuf = vmalloc(chunk * sizeof(pd->port_rcvegrbuf[0])); + if (!pd->port_rcvegrbuf) { + ret = -ENOMEM; + goto bail; + } + pd->port_rcvegrbuf_phys = + vmalloc(chunk * sizeof(pd->port_rcvegrbuf_phys[0])); + if (!pd->port_rcvegrbuf_phys) { + ret = -ENOMEM; + goto bail_rcvegrbuf; + } + for (e = 0; e < pd->port_rcvegrbuf_chunks; e++) { + /* + * GFP_USER, but without GFP_FS, so buffer cache can be + * coalesced (we hope); otherwise, even at order 4, + * heavy filesystem activity makes these fail + */ + gfp_t gfp_flags = __GFP_WAIT | __GFP_IO | __GFP_COMP; + + pd->port_rcvegrbuf[e] = dma_alloc_coherent( + &dd->pcidev->dev, size, &pd->port_rcvegrbuf_phys[e], + gfp_flags); + + if (!pd->port_rcvegrbuf[e]) { + ret = -ENOMEM; + goto bail_rcvegrbuf_phys; + } + } + + pd->port_rcvegr_phys = pd->port_rcvegrbuf_phys[0]; + + for (e = chunk = 0; chunk < pd->port_rcvegrbuf_chunks; chunk++) { + dma_addr_t pa = pd->port_rcvegrbuf_phys[chunk]; + unsigned i; + + for (i = 0; e < egrcnt && i < egrperchunk; e++, i++) { + dd->ipath_f_put_tid(dd, e + egroff + + (u64 __iomem *) + ((char __iomem *) + dd->ipath_kregbase + + dd->ipath_rcvegrbase), 0, pa); + pa += egrsize; + } + cond_resched(); /* don't hog the cpu */ + } + + ret = 0; + goto bail; + +bail_rcvegrbuf_phys: + for (e = 0; e < pd->port_rcvegrbuf_chunks && + pd->port_rcvegrbuf[e]; e++) + dma_free_coherent(&dd->pcidev->dev, size, + pd->port_rcvegrbuf[e], + pd->port_rcvegrbuf_phys[e]); + + vfree(pd->port_rcvegrbuf_phys); + pd->port_rcvegrbuf_phys = NULL; +bail_rcvegrbuf: + vfree(pd->port_rcvegrbuf); + pd->port_rcvegrbuf = NULL; +bail: + return ret; +} + +static int ipath_do_user_init(struct ipath_portdata *pd, + const struct ipath_user_info *uinfo) +{ + int ret = 0; + struct ipath_devdata *dd = pd->port_dd; + u64 physaddr, uaddr, off, atmp; + struct page *pagep; + u32 head32; + u64 head; + + /* for now, if major version is different, bail */ + if ((uinfo->spu_userversion >> 16) != IPATH_USER_SWMAJOR) { + dev_info(&dd->pcidev->dev, + "User major version %d not same as driver " + "major %d\n", uinfo->spu_userversion >> 16, + IPATH_USER_SWMAJOR); + ret = -ENODEV; + goto done; + } + + if ((uinfo->spu_userversion & 0xffff) != IPATH_USER_SWMINOR) + ipath_dbg("User minor version %d not same as driver " + "minor %d\n", uinfo->spu_userversion & 0xffff, + IPATH_USER_SWMINOR); + + if (uinfo->spu_rcvhdrsize) { + ret = ipath_setrcvhdrsize(dd, uinfo->spu_rcvhdrsize); + if (ret) + goto done; + } + + /* for now we do nothing with rcvhdrcnt: uinfo->spu_rcvhdrcnt */ + + /* set up for the rcvhdr Q tail register writeback to user memory */ + if (!uinfo->spu_rcvhdraddr || + !access_ok(VERIFY_WRITE, (u64 __user *) (unsigned long) + uinfo->spu_rcvhdraddr, sizeof(u64))) { + ipath_dbg("Port %d rcvhdrtail addr %llx not valid\n", + pd->port_port, + (unsigned long long) uinfo->spu_rcvhdraddr); + ret = -EINVAL; + goto done; + } + + off = offset_in_page(uinfo->spu_rcvhdraddr); + uaddr = PAGE_MASK & (unsigned long) uinfo->spu_rcvhdraddr; + ret = ipath_get_user_pages_nocopy(uaddr, &pagep); + if (ret) { + dev_info(&dd->pcidev->dev, "Failed to lookup and lock " + "address %llx for rcvhdrtail: errno %d\n", + (unsigned long long) uinfo->spu_rcvhdraddr, -ret); + goto done; + } + ipath_stats.sps_pagelocks++; + pd->port_rcvhdrtail_uaddr = uaddr; + pd->port_rcvhdrtail_pagep = pagep; + pd->port_rcvhdrtail_kvaddr = + page_address(pagep); + pd->port_rcvhdrtail_kvaddr += off; + physaddr = page_to_phys(pagep) + off; + ipath_cdbg(VERBOSE, "port %d user addr %llx hdrtailaddr, %llx " + "physical (off=%llx)\n", + pd->port_port, + (unsigned long long) uinfo->spu_rcvhdraddr, + (unsigned long long) physaddr, (unsigned long long) off); + ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdrtailaddr, + pd->port_port, physaddr); + atmp = ipath_read_kreg64_port(dd, + dd->ipath_kregs->kr_rcvhdrtailaddr, + pd->port_port); + if (physaddr != atmp) { + ipath_dev_err(dd, + "Catastrophic software error, " + "RcvHdrTailAddr%u written as %llx, " + "read back as %llx\n", pd->port_port, + (unsigned long long) physaddr, + (unsigned long long) atmp); + ret = -EINVAL; + goto done; + } + + /* for right now, kernel piobufs are at end, so port 1 is at 0 */ + pd->port_piobufs = dd->ipath_piobufbase + + dd->ipath_pbufsport * (pd->port_port - + 1) * dd->ipath_palign; + ipath_cdbg(VERBOSE, "Set base of piobufs for port %u to 0x%x\n", + pd->port_port, pd->port_piobufs); + + /* + * Now allocate the rcvhdr Q and eager TIDs; skip the TID + * array for time being. If pd->port_port > chip-supported, + * we need to do extra stuff here to handle by handling overflow + * through port 0, someday + */ + ret = ipath_create_rcvhdrq(dd, pd); + if (!ret) + ret = ipath_create_user_egr(pd); + if (ret) + goto done; + /* enable receives now */ + /* atomically set enable bit for this port */ + set_bit(INFINIPATH_R_PORTENABLE_SHIFT + pd->port_port, + &dd->ipath_rcvctrl); + + /* + * set the head registers for this port to the current values + * of the tail pointers, since we don't know if they were + * updated on last use of the port. + */ + head32 = ipath_read_ureg32(dd, ur_rcvhdrtail, pd->port_port); + head = (u64) head32; + ipath_write_ureg(dd, ur_rcvhdrhead, head, pd->port_port); + head32 = ipath_read_ureg32(dd, ur_rcvegrindextail, pd->port_port); + ipath_write_ureg(dd, ur_rcvegrindexhead, head32, pd->port_port); + dd->ipath_lastegrheads[pd->port_port] = -1; + dd->ipath_lastrcvhdrqtails[pd->port_port] = -1; + ipath_cdbg(VERBOSE, "Wrote port%d head %llx, egrhead %x from " + "tail regs\n", pd->port_port, + (unsigned long long) head, head32); + pd->port_tidcursor = 0; /* start at beginning after open */ + /* + * now enable the port; the tail registers will be written to memory + * by the chip as soon as it sees the write to + * dd->ipath_kregs->kr_rcvctrl. The update only happens on + * transition from 0 to 1, so clear it first, then set it as part of + * enabling the port. This will (very briefly) affect any other + * open ports, but it shouldn't be long enough to be an issue. + */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, + dd->ipath_rcvctrl & ~INFINIPATH_R_TAILUPD); + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, + dd->ipath_rcvctrl); + +done: + return ret; +} + +static int mmap_ureg(struct vm_area_struct *vma, struct ipath_devdata *dd, + u64 ureg) +{ + unsigned long phys; + int ret; + + /* it's the real hardware, so io_remap works */ + + if ((vma->vm_end - vma->vm_start) > PAGE_SIZE) { + dev_info(&dd->pcidev->dev, "FAIL mmap userreg: reqlen " + "%lx > PAGE\n", vma->vm_end - vma->vm_start); + ret = -EFAULT; + } else { + phys = dd->ipath_physaddr + ureg; + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + + vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND; + ret = io_remap_pfn_range(vma, vma->vm_start, + phys >> PAGE_SHIFT, + vma->vm_end - vma->vm_start, + vma->vm_page_prot); + } + return ret; +} + +static int mmap_piobufs(struct vm_area_struct *vma, + struct ipath_devdata *dd, + struct ipath_portdata *pd) +{ + unsigned long phys; + int ret; + + /* + * When we map the PIO buffers, we want to map them as writeonly, no + * read possible. + */ + + if ((vma->vm_end - vma->vm_start) > + (dd->ipath_pbufsport * dd->ipath_palign)) { + dev_info(&dd->pcidev->dev, "FAIL mmap piobufs: " + "reqlen %lx > PAGE\n", + vma->vm_end - vma->vm_start); + ret = -EFAULT; + goto bail; + } + + phys = dd->ipath_physaddr + pd->port_piobufs; + /* + * Do *NOT* mark this as non-cached (PWT bit), or we don't get the + * write combining behavior we want on the PIO buffers! + * vma->vm_page_prot = + * pgprot_noncached(vma->vm_page_prot); + */ + + if (vma->vm_flags & VM_READ) { + dev_info(&dd->pcidev->dev, + "Can't map piobufs as readable (flags=%lx)\n", + vma->vm_flags); + ret = -EPERM; + goto bail; + } + + /* don't allow them to later change to readable with mprotect */ + + vma->vm_flags &= ~VM_MAYWRITE; + vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND; + + ret = io_remap_pfn_range(vma, vma->vm_start, phys >> PAGE_SHIFT, + vma->vm_end - vma->vm_start, + vma->vm_page_prot); +bail: + return ret; +} + +static int mmap_rcvegrbufs(struct vm_area_struct *vma, + struct ipath_portdata *pd) +{ + struct ipath_devdata *dd = pd->port_dd; + unsigned long start, size; + size_t total_size, i; + dma_addr_t *phys; + int ret; + + if (!pd->port_rcvegrbuf) { + ret = -EFAULT; + goto bail; + } + + size = pd->port_rcvegrbuf_size; + total_size = pd->port_rcvegrbuf_chunks * size; + if ((vma->vm_end - vma->vm_start) > total_size) { + dev_info(&dd->pcidev->dev, "FAIL on egr bufs: " + "reqlen %lx > actual %lx\n", + vma->vm_end - vma->vm_start, + (unsigned long) total_size); + ret = -EFAULT; + goto bail; + } + + if (vma->vm_flags & VM_WRITE) { + dev_info(&dd->pcidev->dev, "Can't map eager buffers as " + "writable (flags=%lx)\n", vma->vm_flags); + ret = -EPERM; + goto bail; + } + + start = vma->vm_start; + phys = pd->port_rcvegrbuf_phys; + + /* don't allow them to later change to writeable with mprotect */ + vma->vm_flags &= ~VM_MAYWRITE; + + for (i = 0; i < pd->port_rcvegrbuf_chunks; i++, start += size) { + ret = remap_pfn_range(vma, start, phys[i] >> PAGE_SHIFT, + size, vma->vm_page_prot); + if (ret < 0) + goto bail; + } + ret = 0; + +bail: + return ret; +} + +static int mmap_rcvhdrq(struct vm_area_struct *vma, + struct ipath_portdata *pd) +{ + struct ipath_devdata *dd = pd->port_dd; + size_t total_size; + int ret; + + /* + * kmalloc'ed memory, physically contiguous; this is from + * spi_rcvhdr_base; we allow user to map read-write so they can + * write hdrq entries to allow protocol code to directly poll + * whether a hdrq entry has been written. + */ + total_size = ALIGN(dd->ipath_rcvhdrcnt * dd->ipath_rcvhdrentsize * + sizeof(u32), PAGE_SIZE); + if ((vma->vm_end - vma->vm_start) > total_size) { + dev_info(&dd->pcidev->dev, + "FAIL on rcvhdrq: reqlen %lx > actual %lx\n", + vma->vm_end - vma->vm_start, + (unsigned long) total_size); + ret = -EFAULT; + goto bail; + } + + ret = remap_pfn_range(vma, vma->vm_start, + pd->port_rcvhdrq_phys >> PAGE_SHIFT, + vma->vm_end - vma->vm_start, + vma->vm_page_prot); +bail: + return ret; +} + +static int mmap_pioavailregs(struct vm_area_struct *vma, + struct ipath_portdata *pd) +{ + struct ipath_devdata *dd = pd->port_dd; + int ret; + + /* + * when we map the PIO bufferavail registers, we want to map them as + * readonly, no write possible. + * + * kmalloc'ed memory, physically contiguous, one page only, readonly + */ + + if ((vma->vm_end - vma->vm_start) > PAGE_SIZE) { + dev_info(&dd->pcidev->dev, "FAIL on pioavailregs_dma: " + "reqlen %lx > actual %lx\n", + vma->vm_end - vma->vm_start, + (unsigned long) PAGE_SIZE); + ret = -EFAULT; + goto bail; + } + + if (vma->vm_flags & VM_WRITE) { + dev_info(&dd->pcidev->dev, + "Can't map pioavailregs as writable (flags=%lx)\n", + vma->vm_flags); + ret = -EPERM; + goto bail; + } + + /* don't allow them to later change with mprotect */ + vma->vm_flags &= ~VM_MAYWRITE; + + ret = remap_pfn_range(vma, vma->vm_start, + dd->ipath_pioavailregs_phys >> PAGE_SHIFT, + PAGE_SIZE, vma->vm_page_prot); +bail: + return ret; +} + +/** + * ipath_mmap - mmap various structures into user space + * @fp: the file pointer + * @vma: the VM area + * + * We use this to have a shared buffer between the kernel and the user code + * for the rcvhdr queue, egr buffers, and the per-port user regs and pio + * buffers in the chip. We have the open and close entries so we can bump + * the ref count and keep the driver from being unloaded while still mapped. + */ +static int ipath_mmap(struct file *fp, struct vm_area_struct *vma) +{ + struct ipath_portdata *pd; + struct ipath_devdata *dd; + u64 pgaddr, ureg; + int ret; + + pd = port_fp(fp); + dd = pd->port_dd; + /* + * This is the ipath_do_user_init() code, mapping the shared buffers + * into the user process. The address referred to by vm_pgoff is the + * virtual, not physical, address; we only do one mmap for each + * space mapped. + */ + pgaddr = vma->vm_pgoff << PAGE_SHIFT; + + /* + * note that ureg does *NOT* have the kregvirt as part of it, to be + * sure that for 32 bit programs, we don't end up trying to map a > + * 44 address. Has to match ipath_get_base_info() code that sets + * __spi_uregbase + */ + + ureg = dd->ipath_uregbase + dd->ipath_palign * pd->port_port; + + ipath_cdbg(MM, "ushare: pgaddr %llx vm_start=%lx, vmlen %lx\n", + (unsigned long long) pgaddr, vma->vm_start, + vma->vm_end - vma->vm_start); + + if (pgaddr == ureg) + ret = mmap_ureg(vma, dd, ureg); + else if (pgaddr == pd->port_piobufs) + ret = mmap_piobufs(vma, dd, pd); + else if (pgaddr == (u64) pd->port_rcvegr_phys) + ret = mmap_rcvegrbufs(vma, pd); + else if (pgaddr == (u64) pd->port_rcvhdrq_phys) + ret = mmap_rcvhdrq(vma, pd); + else if (pgaddr == dd->ipath_pioavailregs_phys) + ret = mmap_pioavailregs(vma, pd); + else + ret = -EINVAL; + + vma->vm_private_data = NULL; + + if (ret < 0) + dev_info(&dd->pcidev->dev, + "Failure %d on addr %lx, off %lx\n", + -ret, vma->vm_start, vma->vm_pgoff); + + return ret; +} + +static unsigned int ipath_poll(struct file *fp, + struct poll_table_struct *pt) +{ + struct ipath_portdata *pd; + u32 head, tail; + int bit; + struct ipath_devdata *dd; + + pd = port_fp(fp); + dd = pd->port_dd; + + bit = pd->port_port + INFINIPATH_R_INTRAVAIL_SHIFT; + set_bit(bit, &dd->ipath_rcvctrl); + + /* + * Before blocking, make sure that head is still == tail, + * reading from the chip, so we can be sure the interrupt + * enable has made it to the chip. If not equal, disable + * interrupt again and return immediately. This avoids races, + * and the overhead of the chip read doesn't matter much at + * this point, since we are waiting for something anyway. + */ + + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, + dd->ipath_rcvctrl); + + head = ipath_read_ureg32(dd, ur_rcvhdrhead, pd->port_port); + tail = ipath_read_ureg32(dd, ur_rcvhdrtail, pd->port_port); + + if (tail == head) { + set_bit(IPATH_PORT_WAITING_RCV, &pd->port_flag); + poll_wait(fp, &pd->port_wait, pt); + + if (test_bit(IPATH_PORT_WAITING_RCV, &pd->port_flag)) { + /* timed out, no packets received */ + clear_bit(IPATH_PORT_WAITING_RCV, &pd->port_flag); + pd->port_rcvwait_to++; + } + } + else { + /* it's already happened; don't do wait_event overhead */ + pd->port_rcvnowait++; + } + + clear_bit(bit, &dd->ipath_rcvctrl); + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, + dd->ipath_rcvctrl); + + return 0; +} + +static int try_alloc_port(struct ipath_devdata *dd, int port, + struct file *fp) +{ + int ret; + + if (!dd->ipath_pd[port]) { + void *p, *ptmp; + + p = kzalloc(sizeof(struct ipath_portdata), GFP_KERNEL); + + /* + * Allocate memory for use in ipath_tid_update() just once + * at open, not per call. Reduces cost of expected send + * setup. + */ + ptmp = kmalloc(dd->ipath_rcvtidcnt * sizeof(u16) + + dd->ipath_rcvtidcnt * sizeof(struct page **), + GFP_KERNEL); + if (!p || !ptmp) { + ipath_dev_err(dd, "Unable to allocate portdata " + "memory, failing open\n"); + ret = -ENOMEM; + kfree(p); + kfree(ptmp); + goto bail; + } + dd->ipath_pd[port] = p; + dd->ipath_pd[port]->port_port = port; + dd->ipath_pd[port]->port_dd = dd; + dd->ipath_pd[port]->port_tid_pg_list = ptmp; + init_waitqueue_head(&dd->ipath_pd[port]->port_wait); + } + if (!dd->ipath_pd[port]->port_cnt) { + dd->ipath_pd[port]->port_cnt = 1; + fp->private_data = (void *) dd->ipath_pd[port]; + ipath_cdbg(PROC, "%s[%u] opened unit:port %u:%u\n", + current->comm, current->pid, dd->ipath_unit, + port); + dd->ipath_pd[port]->port_pid = current->pid; + strncpy(dd->ipath_pd[port]->port_comm, current->comm, + sizeof(dd->ipath_pd[port]->port_comm)); + ipath_stats.sps_ports++; + ret = 0; + goto bail; + } + ret = -EBUSY; + +bail: + return ret; +} + +static inline int usable(struct ipath_devdata *dd) +{ + return dd && + (dd->ipath_flags & IPATH_PRESENT) && + dd->ipath_kregbase && + dd->ipath_lid && + !(dd->ipath_flags & (IPATH_LINKDOWN | IPATH_DISABLED + | IPATH_LINKUNK)); +} + +static int find_free_port(int unit, struct file *fp) +{ + struct ipath_devdata *dd = ipath_lookup(unit); + int ret, i; + + if (!dd) { + ret = -ENODEV; + goto bail; + } + + if (!usable(dd)) { + ret = -ENETDOWN; + goto bail; + } + + for (i = 0; i < dd->ipath_cfgports; i++) { + ret = try_alloc_port(dd, i, fp); + if (ret != -EBUSY) + goto bail; + } + ret = -EBUSY; + +bail: + return ret; +} + +static int find_best_unit(struct file *fp) +{ + int ret = 0, i, prefunit = -1, devmax; + int maxofallports, npresent, nup; + int ndev; + + (void) ipath_count_units(&npresent, &nup, &maxofallports); + + /* + * This code is present to allow a knowledgeable person to + * specify the layout of processes to processors before opening + * this driver, and then we'll assign the process to the "closest" + * HT-400 to that processor (we assume reasonable connectivity, + * for now). This code assumes that if affinity has been set + * before this point, that at most one cpu is set; for now this + * is reasonable. I check for both cpus_empty() and cpus_full(), + * in case some kernel variant sets none of the bits when no + * affinity is set. 2.6.11 and 12 kernels have all present + * cpus set. Some day we'll have to fix it up further to handle + * a cpu subset. This algorithm fails for two HT-400's connected + * in tunnel fashion. Eventually this needs real topology + * information. There may be some issues with dual core numbering + * as well. This needs more work prior to release. + */ + if (!cpus_empty(current->cpus_allowed) && + !cpus_full(current->cpus_allowed)) { + int ncpus = num_online_cpus(), curcpu = -1; + for (i = 0; i < ncpus; i++) + if (cpu_isset(i, current->cpus_allowed)) { + ipath_cdbg(PROC, "%s[%u] affinity set for " + "cpu %d\n", current->comm, + current->pid, i); + curcpu = i; + } + if (curcpu != -1) { + if (npresent) { + prefunit = curcpu / (ncpus / npresent); + ipath_dbg("%s[%u] %d chips, %d cpus, " + "%d cpus/chip, select unit %d\n", + current->comm, current->pid, + npresent, ncpus, ncpus / npresent, + prefunit); + } + } + } + + /* + * user ports start at 1, kernel port is 0 + * For now, we do round-robin access across all chips + */ + + if (prefunit != -1) + devmax = prefunit + 1; + else + devmax = ipath_count_units(NULL, NULL, NULL); +recheck: + for (i = 1; i < maxofallports; i++) { + for (ndev = prefunit != -1 ? prefunit : 0; ndev < devmax; + ndev++) { + struct ipath_devdata *dd = ipath_lookup(ndev); + + if (!usable(dd)) + continue; /* can't use this unit */ + if (i >= dd->ipath_cfgports) + /* + * Maxed out on users of this unit. Try + * next. + */ + continue; + ret = try_alloc_port(dd, i, fp); + if (!ret) + goto done; + } + } + + if (npresent) { + if (nup == 0) { + ret = -ENETDOWN; + ipath_dbg("No ports available (none initialized " + "and ready)\n"); + } else { + if (prefunit > 0) { + /* if started above 0, retry from 0 */ + ipath_cdbg(PROC, + "%s[%u] no ports on prefunit " + "%d, clear and re-check\n", + current->comm, current->pid, + prefunit); + devmax = ipath_count_units(NULL, NULL, + NULL); + prefunit = -1; + goto recheck; + } + ret = -EBUSY; + ipath_dbg("No ports available\n"); + } + } else { + ret = -ENXIO; + ipath_dbg("No boards found\n"); + } + +done: + return ret; +} + +static int ipath_open(struct inode *in, struct file *fp) +{ + int ret, minor; + + mutex_lock(&ipath_mutex); + + minor = iminor(in); + ipath_cdbg(VERBOSE, "open on dev %lx (minor %d)\n", + (long)in->i_rdev, minor); + + if (minor) + ret = find_free_port(minor - 1, fp); + else + ret = find_best_unit(fp); + + mutex_unlock(&ipath_mutex); + return ret; +} + +/** + * unlock_exptid - unlock any expected TID entries port still had in use + * @pd: port + * + * We don't actually update the chip here, because we do a bulk update + * below, using ipath_f_clear_tids. + */ +static void unlock_expected_tids(struct ipath_portdata *pd) +{ + struct ipath_devdata *dd = pd->port_dd; + int port_tidbase = pd->port_port * dd->ipath_rcvtidcnt; + int i, cnt = 0, maxtid = port_tidbase + dd->ipath_rcvtidcnt; + + ipath_cdbg(VERBOSE, "Port %u unlocking any locked expTID pages\n", + pd->port_port); + for (i = port_tidbase; i < maxtid; i++) { + if (!dd->ipath_pageshadow[i]) + continue; + + ipath_release_user_pages_on_close(&dd->ipath_pageshadow[i], + 1); + dd->ipath_pageshadow[i] = NULL; + cnt++; + ipath_stats.sps_pageunlocks++; + } + if (cnt) + ipath_cdbg(VERBOSE, "Port %u locked %u expTID entries\n", + pd->port_port, cnt); + + if (ipath_stats.sps_pagelocks || ipath_stats.sps_pageunlocks) + ipath_cdbg(VERBOSE, "%llu pages locked, %llu unlocked\n", + (unsigned long long) ipath_stats.sps_pagelocks, + (unsigned long long) + ipath_stats.sps_pageunlocks); +} + +static int ipath_close(struct inode *in, struct file *fp) +{ + int ret = 0; + struct ipath_portdata *pd; + struct ipath_devdata *dd; + unsigned port; + + ipath_cdbg(VERBOSE, "close on dev %lx, private data %p\n", + (long)in->i_rdev, fp->private_data); + + mutex_lock(&ipath_mutex); + + pd = port_fp(fp); + port = pd->port_port; + fp->private_data = NULL; + dd = pd->port_dd; + + if (pd->port_hdrqfull) { + ipath_cdbg(PROC, "%s[%u] had %u rcvhdrqfull errors " + "during run\n", pd->port_comm, pd->port_pid, + pd->port_hdrqfull); + pd->port_hdrqfull = 0; + } + + if (pd->port_rcvwait_to || pd->port_piowait_to + || pd->port_rcvnowait || pd->port_pionowait) { + ipath_cdbg(VERBOSE, "port%u, %u rcv, %u pio wait timeo; " + "%u rcv %u, pio already\n", + pd->port_port, pd->port_rcvwait_to, + pd->port_piowait_to, pd->port_rcvnowait, + pd->port_pionowait); + pd->port_rcvwait_to = pd->port_piowait_to = + pd->port_rcvnowait = pd->port_pionowait = 0; + } + if (pd->port_flag) { + ipath_dbg("port %u port_flag still set to 0x%lx\n", + pd->port_port, pd->port_flag); + pd->port_flag = 0; + } + + if (dd->ipath_kregbase) { + if (pd->port_rcvhdrtail_uaddr) { + pd->port_rcvhdrtail_uaddr = 0; + pd->port_rcvhdrtail_kvaddr = NULL; + ipath_release_user_pages_on_close( + &pd->port_rcvhdrtail_pagep, 1); + pd->port_rcvhdrtail_pagep = NULL; + ipath_stats.sps_pageunlocks++; + } + ipath_write_kreg_port( + dd, dd->ipath_kregs->kr_rcvhdrtailaddr, + port, 0ULL); + ipath_write_kreg_port( + dd, dd->ipath_kregs->kr_rcvhdraddr, + pd->port_port, 0); + + /* clean up the pkeys for this port user */ + ipath_clean_part_key(pd, dd); + + if (port < dd->ipath_cfgports) { + int i = dd->ipath_pbufsport * (port - 1); + ipath_disarm_piobufs(dd, i, dd->ipath_pbufsport); + + /* atomically clear receive enable port. */ + clear_bit(INFINIPATH_R_PORTENABLE_SHIFT + port, + &dd->ipath_rcvctrl); + ipath_write_kreg( + dd, + dd->ipath_kregs->kr_rcvctrl, + dd->ipath_rcvctrl); + + if (dd->ipath_pageshadow) + unlock_expected_tids(pd); + ipath_stats.sps_ports--; + ipath_cdbg(PROC, "%s[%u] closed port %u:%u\n", + pd->port_comm, pd->port_pid, + dd->ipath_unit, port); + } + } + + pd->port_cnt = 0; + pd->port_pid = 0; + + dd->ipath_f_clear_tids(dd, pd->port_port); + + ipath_free_pddata(dd, pd->port_port, 0); + + mutex_unlock(&ipath_mutex); + + return ret; +} + +static int ipath_port_info(struct ipath_portdata *pd, + struct ipath_port_info __user *uinfo) +{ + struct ipath_port_info info; + int nup; + int ret; + + (void) ipath_count_units(NULL, &nup, NULL); + info.num_active = nup; + info.unit = pd->port_dd->ipath_unit; + info.port = pd->port_port; + + if (copy_to_user(uinfo, &info, sizeof(info))) { + ret = -EFAULT; + goto bail; + } + ret = 0; + +bail: + return ret; +} + +static ssize_t ipath_write(struct file *fp, const char __user *data, + size_t count, loff_t *off) +{ + const struct ipath_cmd __user *ucmd; + struct ipath_portdata *pd; + const void __user *src; + size_t consumed, copy; + struct ipath_cmd cmd; + ssize_t ret = 0; + void *dest; + + if (count < sizeof(cmd.type)) { + ret = -EINVAL; + goto bail; + } + + ucmd = (const struct ipath_cmd __user *) data; + + if (copy_from_user(&cmd.type, &ucmd->type, sizeof(cmd.type))) { + ret = -EFAULT; + goto bail; + } + + consumed = sizeof(cmd.type); + + switch (cmd.type) { + case IPATH_CMD_USER_INIT: + copy = sizeof(cmd.cmd.user_info); + dest = &cmd.cmd.user_info; + src = &ucmd->cmd.user_info; + break; + case IPATH_CMD_RECV_CTRL: + copy = sizeof(cmd.cmd.recv_ctrl); + dest = &cmd.cmd.recv_ctrl; + src = &ucmd->cmd.recv_ctrl; + break; + case IPATH_CMD_PORT_INFO: + copy = sizeof(cmd.cmd.port_info); + dest = &cmd.cmd.port_info; + src = &ucmd->cmd.port_info; + break; + case IPATH_CMD_TID_UPDATE: + case IPATH_CMD_TID_FREE: + copy = sizeof(cmd.cmd.tid_info); + dest = &cmd.cmd.tid_info; + src = &ucmd->cmd.tid_info; + break; + case IPATH_CMD_SET_PART_KEY: + copy = sizeof(cmd.cmd.part_key); + dest = &cmd.cmd.part_key; + src = &ucmd->cmd.part_key; + break; + default: + ret = -EINVAL; + goto bail; + } + + if ((count - consumed) < copy) { + ret = -EINVAL; + goto bail; + } + + if (copy_from_user(dest, src, copy)) { + ret = -EFAULT; + goto bail; + } + + consumed += copy; + pd = port_fp(fp); + + switch (cmd.type) { + case IPATH_CMD_USER_INIT: + ret = ipath_do_user_init(pd, &cmd.cmd.user_info); + if (ret < 0) + goto bail; + ret = ipath_get_base_info( + pd, (void __user *) (unsigned long) + cmd.cmd.user_info.spu_base_info, + cmd.cmd.user_info.spu_base_info_size); + break; + case IPATH_CMD_RECV_CTRL: + ret = ipath_manage_rcvq(pd, cmd.cmd.recv_ctrl); + break; + case IPATH_CMD_PORT_INFO: + ret = ipath_port_info(pd, + (struct ipath_port_info __user *) + (unsigned long) cmd.cmd.port_info); + break; + case IPATH_CMD_TID_UPDATE: + ret = ipath_tid_update(pd, &cmd.cmd.tid_info); + break; + case IPATH_CMD_TID_FREE: + ret = ipath_tid_free(pd, &cmd.cmd.tid_info); + break; + case IPATH_CMD_SET_PART_KEY: + ret = ipath_set_part_key(pd, cmd.cmd.part_key); + break; + } + + if (ret >= 0) + ret = consumed; + +bail: + return ret; +} + +static struct class *ipath_class; + +static int init_cdev(int minor, char *name, struct file_operations *fops, + struct cdev **cdevp, struct class_device **class_devp) +{ + const dev_t dev = MKDEV(IPATH_MAJOR, minor); + struct cdev *cdev = NULL; + struct class_device *class_dev = NULL; + int ret; + + cdev = cdev_alloc(); + if (!cdev) { + printk(KERN_ERR IPATH_DRV_NAME + ": Could not allocate cdev for minor %d, %s\n", + minor, name); + ret = -ENOMEM; + goto done; + } + + cdev->owner = THIS_MODULE; + cdev->ops = fops; + kobject_set_name(&cdev->kobj, name); + + ret = cdev_add(cdev, dev, 1); + if (ret < 0) { + printk(KERN_ERR IPATH_DRV_NAME + ": Could not add cdev for minor %d, %s (err %d)\n", + minor, name, -ret); + goto err_cdev; + } + + class_dev = class_device_create(ipath_class, NULL, dev, NULL, name); + + if (IS_ERR(class_dev)) { + ret = PTR_ERR(class_dev); + printk(KERN_ERR IPATH_DRV_NAME ": Could not create " + "class_dev for minor %d, %s (err %d)\n", + minor, name, -ret); + goto err_cdev; + } + + goto done; + +err_cdev: + cdev_del(cdev); + cdev = NULL; + +done: + if (ret >= 0) { + *cdevp = cdev; + *class_devp = class_dev; + } else { + *cdevp = NULL; + *class_devp = NULL; + } + + return ret; +} + +int ipath_cdev_init(int minor, char *name, struct file_operations *fops, + struct cdev **cdevp, struct class_device **class_devp) +{ + return init_cdev(minor, name, fops, cdevp, class_devp); +} + +static void cleanup_cdev(struct cdev **cdevp, + struct class_device **class_devp) +{ + struct class_device *class_dev = *class_devp; + + if (class_dev) { + class_device_unregister(class_dev); + *class_devp = NULL; + } + + if (*cdevp) { + cdev_del(*cdevp); + *cdevp = NULL; + } +} + +void ipath_cdev_cleanup(struct cdev **cdevp, + struct class_device **class_devp) +{ + cleanup_cdev(cdevp, class_devp); +} + +static struct cdev *wildcard_cdev; +static struct class_device *wildcard_class_dev; + +static const dev_t dev = MKDEV(IPATH_MAJOR, 0); + +static int user_init(void) +{ + int ret; + + ret = register_chrdev_region(dev, IPATH_NMINORS, IPATH_DRV_NAME); + if (ret < 0) { + printk(KERN_ERR IPATH_DRV_NAME ": Could not register " + "chrdev region (err %d)\n", -ret); + goto done; + } + + ipath_class = class_create(THIS_MODULE, IPATH_DRV_NAME); + + if (IS_ERR(ipath_class)) { + ret = PTR_ERR(ipath_class); + printk(KERN_ERR IPATH_DRV_NAME ": Could not create " + "device class (err %d)\n", -ret); + goto bail; + } + + goto done; +bail: + unregister_chrdev_region(dev, IPATH_NMINORS); +done: + return ret; +} + +static void user_cleanup(void) +{ + if (ipath_class) { + class_destroy(ipath_class); + ipath_class = NULL; + } + + unregister_chrdev_region(dev, IPATH_NMINORS); +} + +static atomic_t user_count = ATOMIC_INIT(0); +static atomic_t user_setup = ATOMIC_INIT(0); + +int ipath_user_add(struct ipath_devdata *dd) +{ + char name[10]; + int ret; + + if (atomic_inc_return(&user_count) == 1) { + ret = user_init(); + if (ret < 0) { + ipath_dev_err(dd, "Unable to set up user support: " + "error %d\n", -ret); + goto bail; + } + ret = ipath_diag_init(); + if (ret < 0) { + ipath_dev_err(dd, "Unable to set up diag support: " + "error %d\n", -ret); + goto bail_sma; + } + + ret = init_cdev(0, "ipath", &ipath_file_ops, &wildcard_cdev, + &wildcard_class_dev); + if (ret < 0) { + ipath_dev_err(dd, "Could not create wildcard " + "minor: error %d\n", -ret); + goto bail_diag; + } + + atomic_set(&user_setup, 1); + } + + snprintf(name, sizeof(name), "ipath%d", dd->ipath_unit); + + ret = init_cdev(dd->ipath_unit + 1, name, &ipath_file_ops, + &dd->cdev, &dd->class_dev); + if (ret < 0) + ipath_dev_err(dd, "Could not create user minor %d, %s\n", + dd->ipath_unit + 1, name); + + goto bail; + +bail_diag: + ipath_diag_cleanup(); +bail_sma: + user_cleanup(); +bail: + return ret; +} + +void ipath_user_del(struct ipath_devdata *dd) +{ + cleanup_cdev(&dd->cdev, &dd->class_dev); + + if (atomic_dec_return(&user_count) == 0) { + if (atomic_read(&user_setup) == 0) + goto bail; + + cleanup_cdev(&wildcard_cdev, &wildcard_class_dev); + ipath_diag_cleanup(); + user_cleanup(); + + atomic_set(&user_setup, 0); + } +bail: + return; +} diff --git a/drivers/infiniband/hw/ipath/ipath_fs.c b/drivers/infiniband/hw/ipath/ipath_fs.c new file mode 100644 index 0000000..e274120 --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_fs.c @@ -0,0 +1,605 @@ +/* + * Copyright (c) 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/version.h> +#include <linux/config.h> +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/mount.h> +#include <linux/pagemap.h> +#include <linux/init.h> +#include <linux/namei.h> +#include <linux/pci.h> + +#include "ipath_kernel.h" + +#define IPATHFS_MAGIC 0x726a77 + +static struct super_block *ipath_super; + +static int ipathfs_mknod(struct inode *dir, struct dentry *dentry, + int mode, struct file_operations *fops, + void *data) +{ + int error; + struct inode *inode = new_inode(dir->i_sb); + + if (!inode) { + error = -EPERM; + goto bail; + } + + inode->i_mode = mode; + inode->i_uid = 0; + inode->i_gid = 0; + inode->i_blksize = PAGE_CACHE_SIZE; + inode->i_blocks = 0; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->u.generic_ip = data; + if ((mode & S_IFMT) == S_IFDIR) { + inode->i_op = &simple_dir_inode_operations; + inode->i_nlink++; + dir->i_nlink++; + } + + inode->i_fop = fops; + + d_instantiate(dentry, inode); + error = 0; + +bail: + return error; +} + +static int create_file(const char *name, mode_t mode, + struct dentry *parent, struct dentry **dentry, + struct file_operations *fops, void *data) +{ + int error; + + *dentry = NULL; + mutex_lock(&parent->d_inode->i_mutex); + *dentry = lookup_one_len(name, parent, strlen(name)); + if (!IS_ERR(dentry)) + error = ipathfs_mknod(parent->d_inode, *dentry, + mode, fops, data); + else + error = PTR_ERR(dentry); + mutex_unlock(&parent->d_inode->i_mutex); + + return error; +} + +static ssize_t atomic_stats_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return simple_read_from_buffer(buf, count, ppos, &ipath_stats, + sizeof ipath_stats); +} + +static struct file_operations atomic_stats_ops = { + .read = atomic_stats_read, +}; + +#define NUM_COUNTERS sizeof(struct infinipath_counters) / sizeof(u64) + +static ssize_t atomic_counters_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + u64 counters[NUM_COUNTERS]; + u16 i; + struct ipath_devdata *dd; + + dd = file->f_dentry->d_inode->u.generic_ip; + + for (i = 0; i < NUM_COUNTERS; i++) + counters[i] = ipath_snap_cntr(dd, i); + + return simple_read_from_buffer(buf, count, ppos, counters, + sizeof counters); +} + +static struct file_operations atomic_counters_ops = { + .read = atomic_counters_read, +}; + +static ssize_t atomic_node_info_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + u32 nodeinfo[10]; + struct ipath_devdata *dd; + u64 guid; + + dd = file->f_dentry->d_inode->u.generic_ip; + + guid = be64_to_cpu(dd->ipath_guid); + + nodeinfo[0] = /* BaseVersion is SMA */ + /* ClassVersion is SMA */ + (1 << 8) /* NodeType */ + | (1 << 0); /* NumPorts */ + nodeinfo[1] = (u32) (guid >> 32); + nodeinfo[2] = (u32) (guid & 0xffffffff); + /* PortGUID == SystemImageGUID for us */ + nodeinfo[3] = nodeinfo[1]; + /* PortGUID == SystemImageGUID for us */ + nodeinfo[4] = nodeinfo[2]; + /* PortGUID == NodeGUID for us */ + nodeinfo[5] = nodeinfo[3]; + /* PortGUID == NodeGUID for us */ + nodeinfo[6] = nodeinfo[4]; + nodeinfo[7] = (4 << 16) /* we support 4 pkeys */ + | (dd->ipath_deviceid << 0); + /* our chip version as 16 bits major, 16 bits minor */ + nodeinfo[8] = dd->ipath_minrev | (dd->ipath_majrev << 16); + nodeinfo[9] = (dd->ipath_unit << 24) | (dd->ipath_vendorid << 0); + + return simple_read_from_buffer(buf, count, ppos, nodeinfo, + sizeof nodeinfo); +} + +static struct file_operations atomic_node_info_ops = { + .read = atomic_node_info_read, +}; + +static ssize_t atomic_port_info_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + u32 portinfo[13]; + u32 tmp, tmp2; + struct ipath_devdata *dd; + + dd = file->f_dentry->d_inode->u.generic_ip; + + /* so we only initialize non-zero fields. */ + memset(portinfo, 0, sizeof portinfo); + + /* + * Notimpl yet M_Key (64) + * Notimpl yet GID (64) + */ + + portinfo[4] = (dd->ipath_lid << 16); + + /* + * Notimpl yet SMLID (should we store this in the driver, in case + * SMA dies?) CapabilityMask is 0, we don't support any of these + * DiagCode is 0; we don't store any diag info for now Notimpl yet + * M_KeyLeasePeriod (we don't support M_Key) + */ + + /* LocalPortNum is whichever port number they ask for */ + portinfo[7] = (dd->ipath_unit << 24) + /* LinkWidthEnabled */ + | (2 << 16) + /* LinkWidthSupported (really 2, but not IB valid) */ + | (3 << 8) + /* LinkWidthActive */ + | (2 << 0); + tmp = dd->ipath_lastibcstat & IPATH_IBSTATE_MASK; + tmp2 = 5; + if (tmp == IPATH_IBSTATE_INIT) + tmp = 2; + else if (tmp == IPATH_IBSTATE_ARM) + tmp = 3; + else if (tmp == IPATH_IBSTATE_ACTIVE) + tmp = 4; + else { + tmp = 0; /* down */ + tmp2 = tmp & 0xf; + } + + portinfo[8] = (1 << 28) /* LinkSpeedSupported */ + | (tmp << 24) /* PortState */ + | (tmp2 << 20) /* PortPhysicalState */ + | (2 << 16) + + /* LinkDownDefaultState */ + /* M_KeyProtectBits == 0 */ + /* NotImpl yet LMC == 0 (we can support all values) */ + | (1 << 4) /* LinkSpeedActive */ + | (1 << 0); /* LinkSpeedEnabled */ + switch (dd->ipath_ibmtu) { + case 4096: + tmp = 5; + break; + case 2048: + tmp = 4; + break; + case 1024: + tmp = 3; + break; + case 512: + tmp = 2; + break; + case 256: + tmp = 1; + break; + default: /* oops, something is wrong */ + ipath_dbg("Problem, ipath_ibmtu 0x%x not a valid IB MTU, " + "treat as 2048\n", dd->ipath_ibmtu); + tmp = 4; + break; + } + portinfo[9] = (tmp << 28) + /* NeighborMTU */ + /* Notimpl MasterSMSL */ + | (1 << 20) + + /* VLCap */ + /* Notimpl InitType (actually, an SMA decision) */ + /* VLHighLimit is 0 (only one VL) */ + ; /* VLArbitrationHighCap is 0 (only one VL) */ + portinfo[10] = /* VLArbitrationLowCap is 0 (only one VL) */ + /* InitTypeReply is SMA decision */ + (5 << 16) /* MTUCap 4096 */ + | (7 << 13) /* VLStallCount */ + | (0x1f << 8) /* HOQLife */ + | (1 << 4) + + /* OperationalVLs 0 */ + /* PartitionEnforcementInbound */ + /* PartitionEnforcementOutbound not enforced */ + /* FilterRawinbound not enforced */ + ; /* FilterRawOutbound not enforced */ + /* M_KeyViolations are not counted by hardware, SMA can count */ + tmp = ipath_read_creg32(dd, dd->ipath_cregs->cr_errpkey); + /* P_KeyViolations are counted by hardware. */ + portinfo[11] = ((tmp & 0xffff) << 0); + portinfo[12] = + /* Q_KeyViolations are not counted by hardware */ + (1 << 8) + + /* GUIDCap */ + /* SubnetTimeOut handled by SMA */ + /* RespTimeValue handled by SMA */ + ; + /* LocalPhyErrors are programmed to max */ + portinfo[12] |= (0xf << 20) + | (0xf << 16) /* OverRunErrors are programmed to max */ + ; + + return simple_read_from_buffer(buf, count, ppos, portinfo, + sizeof portinfo); +} + +static struct file_operations atomic_port_info_ops = { + .read = atomic_port_info_read, +}; + +static ssize_t flash_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct ipath_devdata *dd; + ssize_t ret; + loff_t pos; + char *tmp; + + pos = *ppos; + + if ( pos < 0) { + ret = -EINVAL; + goto bail; + } + + if (pos >= sizeof(struct ipath_flash)) { + ret = 0; + goto bail; + } + + if (count > sizeof(struct ipath_flash) - pos) + count = sizeof(struct ipath_flash) - pos; + + tmp = kmalloc(count, GFP_KERNEL); + if (!tmp) { + ret = -ENOMEM; + goto bail; + } + + dd = file->f_dentry->d_inode->u.generic_ip; + if (ipath_eeprom_read(dd, pos, tmp, count)) { + ipath_dev_err(dd, "failed to read from flash\n"); + ret = -ENXIO; + goto bail_tmp; + } + + if (copy_to_user(buf, tmp, count)) { + ret = -EFAULT; + goto bail_tmp; + } + + *ppos = pos + count; + ret = count; + +bail_tmp: + kfree(tmp); + +bail: + return ret; +} + +static ssize_t flash_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct ipath_devdata *dd; + ssize_t ret; + loff_t pos; + char *tmp; + + pos = *ppos; + + if ( pos < 0) { + ret = -EINVAL; + goto bail; + } + + if (pos >= sizeof(struct ipath_flash)) { + ret = 0; + goto bail; + } + + if (count > sizeof(struct ipath_flash) - pos) + count = sizeof(struct ipath_flash) - pos; + + tmp = kmalloc(count, GFP_KERNEL); + if (!tmp) { + ret = -ENOMEM; + goto bail; + } + + if (copy_from_user(tmp, buf, count)) { + ret = -EFAULT; + goto bail_tmp; + } + + dd = file->f_dentry->d_inode->u.generic_ip; + if (ipath_eeprom_write(dd, pos, tmp, count)) { + ret = -ENXIO; + ipath_dev_err(dd, "failed to write to flash\n"); + goto bail_tmp; + } + + *ppos = pos + count; + ret = count; + +bail_tmp: + kfree(tmp); + +bail: + return ret; +} + +static struct file_operations flash_ops = { + .read = flash_read, + .write = flash_write, +}; + +static int create_device_files(struct super_block *sb, + struct ipath_devdata *dd) +{ + struct dentry *dir, *tmp; + char unit[10]; + int ret; + + snprintf(unit, sizeof unit, "%02d", dd->ipath_unit); + ret = create_file(unit, S_IFDIR|S_IRUGO|S_IXUGO, sb->s_root, &dir, + (struct file_operations *) &simple_dir_operations, + dd); + if (ret) { + printk(KERN_ERR "create_file(%s) failed: %d\n", unit, ret); + goto bail; + } + + ret = create_file("atomic_counters", S_IFREG|S_IRUGO, dir, &tmp, + &atomic_counters_ops, dd); + if (ret) { + printk(KERN_ERR "create_file(%s/atomic_counters) " + "failed: %d\n", unit, ret); + goto bail; + } + + ret = create_file("node_info", S_IFREG|S_IRUGO, dir, &tmp, + &atomic_node_info_ops, dd); + if (ret) { + printk(KERN_ERR "create_file(%s/node_info) " + "failed: %d\n", unit, ret); + goto bail; + } + + ret = create_file("port_info", S_IFREG|S_IRUGO, dir, &tmp, + &atomic_port_info_ops, dd); + if (ret) { + printk(KERN_ERR "create_file(%s/port_info) " + "failed: %d\n", unit, ret); + goto bail; + } + + ret = create_file("flash", S_IFREG|S_IWUSR|S_IRUGO, dir, &tmp, + &flash_ops, dd); + if (ret) { + printk(KERN_ERR "create_file(%s/flash) " + "failed: %d\n", unit, ret); + goto bail; + } + +bail: + return ret; +} + +static void remove_file(struct dentry *parent, char *name) +{ + struct dentry *tmp; + + tmp = lookup_one_len(name, parent, strlen(name)); + + spin_lock(&dcache_lock); + spin_lock(&tmp->d_lock); + if (!(d_unhashed(tmp) && tmp->d_inode)) { + dget_locked(tmp); + __d_drop(tmp); + spin_unlock(&tmp->d_lock); + spin_unlock(&dcache_lock); + simple_unlink(parent->d_inode, tmp); + } else { + spin_unlock(&tmp->d_lock); + spin_unlock(&dcache_lock); + } +} + +static int remove_device_files(struct super_block *sb, + struct ipath_devdata *dd) +{ + struct dentry *dir, *root; + char unit[10]; + int ret; + + root = dget(sb->s_root); + mutex_lock(&root->d_inode->i_mutex); + snprintf(unit, sizeof unit, "%02d", dd->ipath_unit); + dir = lookup_one_len(unit, root, strlen(unit)); + + if (IS_ERR(dir)) { + ret = PTR_ERR(dir); + printk(KERN_ERR "Lookup of %s failed\n", unit); + goto bail; + } + + remove_file(dir, "flash"); + remove_file(dir, "port_info"); + remove_file(dir, "node_info"); + remove_file(dir, "atomic_counters"); + d_delete(dir); + ret = simple_rmdir(root->d_inode, dir); + +bail: + mutex_unlock(&root->d_inode->i_mutex); + dput(root); + return ret; +} + +static int ipathfs_fill_super(struct super_block *sb, void *data, + int silent) +{ + struct ipath_devdata *dd, *tmp; + unsigned long flags; + int ret; + + static struct tree_descr files[] = { + [1] = {"atomic_stats", &atomic_stats_ops, S_IRUGO}, + {""}, + }; + + ret = simple_fill_super(sb, IPATHFS_MAGIC, files); + if (ret) { + printk(KERN_ERR "simple_fill_super failed: %d\n", ret); + goto bail; + } + + spin_lock_irqsave(&ipath_devs_lock, flags); + + list_for_each_entry_safe(dd, tmp, &ipath_dev_list, ipath_list) { + spin_unlock_irqrestore(&ipath_devs_lock, flags); + ret = create_device_files(sb, dd); + if (ret) { + deactivate_super(sb); + goto bail; + } + spin_lock_irqsave(&ipath_devs_lock, flags); + } + + spin_unlock_irqrestore(&ipath_devs_lock, flags); + +bail: + return ret; +} + +static struct super_block *ipathfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data) +{ + ipath_super = get_sb_single(fs_type, flags, data, + ipathfs_fill_super); + return ipath_super; +} + +static void ipathfs_kill_super(struct super_block *s) +{ + kill_litter_super(s); + ipath_super = NULL; +} + +int ipathfs_add_device(struct ipath_devdata *dd) +{ + int ret; + + if (ipath_super == NULL) { + ret = 0; + goto bail; + } + + ret = create_device_files(ipath_super, dd); + +bail: + return ret; +} + +int ipathfs_remove_device(struct ipath_devdata *dd) +{ + int ret; + + if (ipath_super == NULL) { + ret = 0; + goto bail; + } + + ret = remove_device_files(ipath_super, dd); + +bail: + return ret; +} + +static struct file_system_type ipathfs_fs_type = { + .owner = THIS_MODULE, + .name = "ipathfs", + .get_sb = ipathfs_get_sb, + .kill_sb = ipathfs_kill_super, +}; + +int __init ipath_init_ipathfs(void) +{ + return register_filesystem(&ipathfs_fs_type); +} + +void __exit ipath_exit_ipathfs(void) +{ + unregister_filesystem(&ipathfs_fs_type); +} diff --git a/drivers/infiniband/hw/ipath/ipath_ht400.c b/drivers/infiniband/hw/ipath/ipath_ht400.c new file mode 100644 index 0000000..4652435 --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_ht400.c @@ -0,0 +1,1586 @@ +/* + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * This file contains all of the code that is specific to the InfiniPath + * HT-400 chip. + */ + +#include <linux/pci.h> +#include <linux/delay.h> + +#include "ipath_kernel.h" +#include "ipath_registers.h" + +/* + * This lists the InfiniPath HT400 registers, in the actual chip layout. + * This structure should never be directly accessed. + * + * The names are in InterCap form because they're taken straight from + * the chip specification. Since they're only used in this file, they + * don't pollute the rest of the source. +*/ + +struct _infinipath_do_not_use_kernel_regs { + unsigned long long Revision; + unsigned long long Control; + unsigned long long PageAlign; + unsigned long long PortCnt; + unsigned long long DebugPortSelect; + unsigned long long DebugPort; + unsigned long long SendRegBase; + unsigned long long UserRegBase; + unsigned long long CounterRegBase; + unsigned long long Scratch; + unsigned long long ReservedMisc1; + unsigned long long InterruptConfig; + unsigned long long IntBlocked; + unsigned long long IntMask; + unsigned long long IntStatus; + unsigned long long IntClear; + unsigned long long ErrorMask; + unsigned long long ErrorStatus; + unsigned long long ErrorClear; + unsigned long long HwErrMask; + unsigned long long HwErrStatus; + unsigned long long HwErrClear; + unsigned long long HwDiagCtrl; + unsigned long long MDIO; + unsigned long long IBCStatus; + unsigned long long IBCCtrl; + unsigned long long ExtStatus; + unsigned long long ExtCtrl; + unsigned long long GPIOOut; + unsigned long long GPIOMask; + unsigned long long GPIOStatus; + unsigned long long GPIOClear; + unsigned long long RcvCtrl; + unsigned long long RcvBTHQP; + unsigned long long RcvHdrSize; + unsigned long long RcvHdrCnt; + unsigned long long RcvHdrEntSize; + unsigned long long RcvTIDBase; + unsigned long long RcvTIDCnt; + unsigned long long RcvEgrBase; + unsigned long long RcvEgrCnt; + unsigned long long RcvBufBase; + unsigned long long RcvBufSize; + unsigned long long RxIntMemBase; + unsigned long long RxIntMemSize; + unsigned long long RcvPartitionKey; + unsigned long long ReservedRcv[10]; + unsigned long long SendCtrl; + unsigned long long SendPIOBufBase; + unsigned long long SendPIOSize; + unsigned long long SendPIOBufCnt; + unsigned long long SendPIOAvailAddr; + unsigned long long TxIntMemBase; + unsigned long long TxIntMemSize; + unsigned long long ReservedSend[9]; + unsigned long long SendBufferError; + unsigned long long SendBufferErrorCONT1; + unsigned long long SendBufferErrorCONT2; + unsigned long long SendBufferErrorCONT3; + unsigned long long ReservedSBE[4]; + unsigned long long RcvHdrAddr0; + unsigned long long RcvHdrAddr1; + unsigned long long RcvHdrAddr2; + unsigned long long RcvHdrAddr3; + unsigned long long RcvHdrAddr4; + unsigned long long RcvHdrAddr5; + unsigned long long RcvHdrAddr6; + unsigned long long RcvHdrAddr7; + unsigned long long RcvHdrAddr8; + unsigned long long ReservedRHA[7]; + unsigned long long RcvHdrTailAddr0; + unsigned long long RcvHdrTailAddr1; + unsigned long long RcvHdrTailAddr2; + unsigned long long RcvHdrTailAddr3; + unsigned long long RcvHdrTailAddr4; + unsigned long long RcvHdrTailAddr5; + unsigned long long RcvHdrTailAddr6; + unsigned long long RcvHdrTailAddr7; + unsigned long long RcvHdrTailAddr8; + unsigned long long ReservedRHTA[7]; + unsigned long long Sync; /* Software only */ + unsigned long long Dump; /* Software only */ + unsigned long long SimVer; /* Software only */ + unsigned long long ReservedSW[5]; + unsigned long long SerdesConfig0; + unsigned long long SerdesConfig1; + unsigned long long SerdesStatus; + unsigned long long XGXSConfig; + unsigned long long ReservedSW2[4]; +}; + +#define IPATH_KREG_OFFSET(field) (offsetof(struct \ + _infinipath_do_not_use_kernel_regs, field) / sizeof(u64)) +#define IPATH_CREG_OFFSET(field) (offsetof( \ + struct infinipath_counters, field) / sizeof(u64)) + +static const struct ipath_kregs ipath_ht_kregs = { + .kr_control = IPATH_KREG_OFFSET(Control), + .kr_counterregbase = IPATH_KREG_OFFSET(CounterRegBase), + .kr_debugport = IPATH_KREG_OFFSET(DebugPort), + .kr_debugportselect = IPATH_KREG_OFFSET(DebugPortSelect), + .kr_errorclear = IPATH_KREG_OFFSET(ErrorClear), + .kr_errormask = IPATH_KREG_OFFSET(ErrorMask), + .kr_errorstatus = IPATH_KREG_OFFSET(ErrorStatus), + .kr_extctrl = IPATH_KREG_OFFSET(ExtCtrl), + .kr_extstatus = IPATH_KREG_OFFSET(ExtStatus), + .kr_gpio_clear = IPATH_KREG_OFFSET(GPIOClear), + .kr_gpio_mask = IPATH_KREG_OFFSET(GPIOMask), + .kr_gpio_out = IPATH_KREG_OFFSET(GPIOOut), + .kr_gpio_status = IPATH_KREG_OFFSET(GPIOStatus), + .kr_hwdiagctrl = IPATH_KREG_OFFSET(HwDiagCtrl), + .kr_hwerrclear = IPATH_KREG_OFFSET(HwErrClear), + .kr_hwerrmask = IPATH_KREG_OFFSET(HwErrMask), + .kr_hwerrstatus = IPATH_KREG_OFFSET(HwErrStatus), + .kr_ibcctrl = IPATH_KREG_OFFSET(IBCCtrl), + .kr_ibcstatus = IPATH_KREG_OFFSET(IBCStatus), + .kr_intblocked = IPATH_KREG_OFFSET(IntBlocked), + .kr_intclear = IPATH_KREG_OFFSET(IntClear), + .kr_interruptconfig = IPATH_KREG_OFFSET(InterruptConfig), + .kr_intmask = IPATH_KREG_OFFSET(IntMask), + .kr_intstatus = IPATH_KREG_OFFSET(IntStatus), + .kr_mdio = IPATH_KREG_OFFSET(MDIO), + .kr_pagealign = IPATH_KREG_OFFSET(PageAlign), + .kr_partitionkey = IPATH_KREG_OFFSET(RcvPartitionKey), + .kr_portcnt = IPATH_KREG_OFFSET(PortCnt), + .kr_rcvbthqp = IPATH_KREG_OFFSET(RcvBTHQP), + .kr_rcvbufbase = IPATH_KREG_OFFSET(RcvBufBase), + .kr_rcvbufsize = IPATH_KREG_OFFSET(RcvBufSize), + .kr_rcvctrl = IPATH_KREG_OFFSET(RcvCtrl), + .kr_rcvegrbase = IPATH_KREG_OFFSET(RcvEgrBase), + .kr_rcvegrcnt = IPATH_KREG_OFFSET(RcvEgrCnt), + .kr_rcvhdrcnt = IPATH_KREG_OFFSET(RcvHdrCnt), + .kr_rcvhdrentsize = IPATH_KREG_OFFSET(RcvHdrEntSize), + .kr_rcvhdrsize = IPATH_KREG_OFFSET(RcvHdrSize), + .kr_rcvintmembase = IPATH_KREG_OFFSET(RxIntMemBase), + .kr_rcvintmemsize = IPATH_KREG_OFFSET(RxIntMemSize), + .kr_rcvtidbase = IPATH_KREG_OFFSET(RcvTIDBase), + .kr_rcvtidcnt = IPATH_KREG_OFFSET(RcvTIDCnt), + .kr_revision = IPATH_KREG_OFFSET(Revision), + .kr_scratch = IPATH_KREG_OFFSET(Scratch), + .kr_sendbuffererror = IPATH_KREG_OFFSET(SendBufferError), + .kr_sendctrl = IPATH_KREG_OFFSET(SendCtrl), + .kr_sendpioavailaddr = IPATH_KREG_OFFSET(SendPIOAvailAddr), + .kr_sendpiobufbase = IPATH_KREG_OFFSET(SendPIOBufBase), + .kr_sendpiobufcnt = IPATH_KREG_OFFSET(SendPIOBufCnt), + .kr_sendpiosize = IPATH_KREG_OFFSET(SendPIOSize), + .kr_sendregbase = IPATH_KREG_OFFSET(SendRegBase), + .kr_txintmembase = IPATH_KREG_OFFSET(TxIntMemBase), + .kr_txintmemsize = IPATH_KREG_OFFSET(TxIntMemSize), + .kr_userregbase = IPATH_KREG_OFFSET(UserRegBase), + .kr_serdesconfig0 = IPATH_KREG_OFFSET(SerdesConfig0), + .kr_serdesconfig1 = IPATH_KREG_OFFSET(SerdesConfig1), + .kr_serdesstatus = IPATH_KREG_OFFSET(SerdesStatus), + .kr_xgxsconfig = IPATH_KREG_OFFSET(XGXSConfig), + /* + * These should not be used directly via ipath_read_kreg64(), + * use them with ipath_read_kreg64_port(), + */ + .kr_rcvhdraddr = IPATH_KREG_OFFSET(RcvHdrAddr0), + .kr_rcvhdrtailaddr = IPATH_KREG_OFFSET(RcvHdrTailAddr0) +}; + +static const struct ipath_cregs ipath_ht_cregs = { + .cr_badformatcnt = IPATH_CREG_OFFSET(RxBadFormatCnt), + .cr_erricrccnt = IPATH_CREG_OFFSET(RxICRCErrCnt), + .cr_errlinkcnt = IPATH_CREG_OFFSET(RxLinkProblemCnt), + .cr_errlpcrccnt = IPATH_CREG_OFFSET(RxLPCRCErrCnt), + .cr_errpkey = IPATH_CREG_OFFSET(RxPKeyMismatchCnt), + .cr_errrcvflowctrlcnt = IPATH_CREG_OFFSET(RxFlowCtrlErrCnt), + .cr_err_rlencnt = IPATH_CREG_OFFSET(RxLenErrCnt), + .cr_errslencnt = IPATH_CREG_OFFSET(TxLenErrCnt), + .cr_errtidfull = IPATH_CREG_OFFSET(RxTIDFullErrCnt), + .cr_errtidvalid = IPATH_CREG_OFFSET(RxTIDValidErrCnt), + .cr_errvcrccnt = IPATH_CREG_OFFSET(RxVCRCErrCnt), + .cr_ibstatuschange = IPATH_CREG_OFFSET(IBStatusChangeCnt), + /* calc from Reg_CounterRegBase + offset */ + .cr_intcnt = IPATH_CREG_OFFSET(LBIntCnt), + .cr_invalidrlencnt = IPATH_CREG_OFFSET(RxMaxMinLenErrCnt), + .cr_invalidslencnt = IPATH_CREG_OFFSET(TxMaxMinLenErrCnt), + .cr_lbflowstallcnt = IPATH_CREG_OFFSET(LBFlowStallCnt), + .cr_pktrcvcnt = IPATH_CREG_OFFSET(RxDataPktCnt), + .cr_pktrcvflowctrlcnt = IPATH_CREG_OFFSET(RxFlowPktCnt), + .cr_pktsendcnt = IPATH_CREG_OFFSET(TxDataPktCnt), + .cr_pktsendflowcnt = IPATH_CREG_OFFSET(TxFlowPktCnt), + .cr_portovflcnt = IPATH_CREG_OFFSET(RxP0HdrEgrOvflCnt), + .cr_rcvebpcnt = IPATH_CREG_OFFSET(RxEBPCnt), + .cr_rcvovflcnt = IPATH_CREG_OFFSET(RxBufOvflCnt), + .cr_senddropped = IPATH_CREG_OFFSET(TxDroppedPktCnt), + .cr_sendstallcnt = IPATH_CREG_OFFSET(TxFlowStallCnt), + .cr_sendunderruncnt = IPATH_CREG_OFFSET(TxUnderrunCnt), + .cr_wordrcvcnt = IPATH_CREG_OFFSET(RxDwordCnt), + .cr_wordsendcnt = IPATH_CREG_OFFSET(TxDwordCnt), + .cr_unsupvlcnt = IPATH_CREG_OFFSET(TxUnsupVLErrCnt), + .cr_rxdroppktcnt = IPATH_CREG_OFFSET(RxDroppedPktCnt), + .cr_iblinkerrrecovcnt = IPATH_CREG_OFFSET(IBLinkErrRecoveryCnt), + .cr_iblinkdowncnt = IPATH_CREG_OFFSET(IBLinkDownedCnt), + .cr_ibsymbolerrcnt = IPATH_CREG_OFFSET(IBSymbolErrCnt) +}; + +/* kr_intstatus, kr_intclear, kr_intmask bits */ +#define INFINIPATH_I_RCVURG_MASK 0x1FF +#define INFINIPATH_I_RCVAVAIL_MASK 0x1FF + +/* kr_hwerrclear, kr_hwerrmask, kr_hwerrstatus, bits */ +#define INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT 0 +#define INFINIPATH_HWE_HTCMEMPARITYERR_MASK 0x3FFFFFULL +#define INFINIPATH_HWE_HTCLNKABYTE0CRCERR 0x0000000000800000ULL +#define INFINIPATH_HWE_HTCLNKABYTE1CRCERR 0x0000000001000000ULL +#define INFINIPATH_HWE_HTCLNKBBYTE0CRCERR 0x0000000002000000ULL +#define INFINIPATH_HWE_HTCLNKBBYTE1CRCERR 0x0000000004000000ULL +#define INFINIPATH_HWE_HTCMISCERR4 0x0000000008000000ULL +#define INFINIPATH_HWE_HTCMISCERR5 0x0000000010000000ULL +#define INFINIPATH_HWE_HTCMISCERR6 0x0000000020000000ULL +#define INFINIPATH_HWE_HTCMISCERR7 0x0000000040000000ULL +#define INFINIPATH_HWE_HTCBUSTREQPARITYERR 0x0000000080000000ULL +#define INFINIPATH_HWE_HTCBUSTRESPPARITYERR 0x0000000100000000ULL +#define INFINIPATH_HWE_HTCBUSIREQPARITYERR 0x0000000200000000ULL +#define INFINIPATH_HWE_COREPLL_FBSLIP 0x0080000000000000ULL +#define INFINIPATH_HWE_COREPLL_RFSLIP 0x0100000000000000ULL +#define INFINIPATH_HWE_HTBPLL_FBSLIP 0x0200000000000000ULL +#define INFINIPATH_HWE_HTBPLL_RFSLIP 0x0400000000000000ULL +#define INFINIPATH_HWE_HTAPLL_FBSLIP 0x0800000000000000ULL +#define INFINIPATH_HWE_HTAPLL_RFSLIP 0x1000000000000000ULL +#define INFINIPATH_HWE_SERDESPLLFAILED 0x2000000000000000ULL + +/* kr_extstatus bits */ +#define INFINIPATH_EXTS_FREQSEL 0x2 +#define INFINIPATH_EXTS_SERDESSEL 0x4 +#define INFINIPATH_EXTS_MEMBIST_ENDTEST 0x0000000000004000 +#define INFINIPATH_EXTS_MEMBIST_CORRECT 0x0000000000008000 + +/* + * masks and bits that are different in different chips, or present only + * in one + */ +static const ipath_err_t infinipath_hwe_htcmemparityerr_mask = + INFINIPATH_HWE_HTCMEMPARITYERR_MASK; +static const ipath_err_t infinipath_hwe_htcmemparityerr_shift = + INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT; + +static const ipath_err_t infinipath_hwe_htclnkabyte0crcerr = + INFINIPATH_HWE_HTCLNKABYTE0CRCERR; +static const ipath_err_t infinipath_hwe_htclnkabyte1crcerr = + INFINIPATH_HWE_HTCLNKABYTE1CRCERR; +static const ipath_err_t infinipath_hwe_htclnkbbyte0crcerr = + INFINIPATH_HWE_HTCLNKBBYTE0CRCERR; +static const ipath_err_t infinipath_hwe_htclnkbbyte1crcerr = + INFINIPATH_HWE_HTCLNKBBYTE1CRCERR; + +#define _IPATH_GPIO_SDA_NUM 1 +#define _IPATH_GPIO_SCL_NUM 0 + +#define IPATH_GPIO_SDA \ + (1ULL << (_IPATH_GPIO_SDA_NUM+INFINIPATH_EXTC_GPIOOE_SHIFT)) +#define IPATH_GPIO_SCL \ + (1ULL << (_IPATH_GPIO_SCL_NUM+INFINIPATH_EXTC_GPIOOE_SHIFT)) + +/* keep the code below somewhat more readonable; not used elsewhere */ +#define _IPATH_HTLINK0_CRCBITS (infinipath_hwe_htclnkabyte0crcerr | \ + infinipath_hwe_htclnkabyte1crcerr) +#define _IPATH_HTLINK1_CRCBITS (infinipath_hwe_htclnkbbyte0crcerr | \ + infinipath_hwe_htclnkbbyte1crcerr) +#define _IPATH_HTLANE0_CRCBITS (infinipath_hwe_htclnkabyte0crcerr | \ + infinipath_hwe_htclnkbbyte0crcerr) +#define _IPATH_HTLANE1_CRCBITS (infinipath_hwe_htclnkabyte1crcerr | \ + infinipath_hwe_htclnkbbyte1crcerr) + +static void hwerr_crcbits(struct ipath_devdata *dd, ipath_err_t hwerrs, + char *msg, size_t msgl) +{ + char bitsmsg[64]; + ipath_err_t crcbits = hwerrs & + (_IPATH_HTLINK0_CRCBITS | _IPATH_HTLINK1_CRCBITS); + /* don't check if 8bit HT */ + if (dd->ipath_flags & IPATH_8BIT_IN_HT0) + crcbits &= ~infinipath_hwe_htclnkabyte1crcerr; + /* don't check if 8bit HT */ + if (dd->ipath_flags & IPATH_8BIT_IN_HT1) + crcbits &= ~infinipath_hwe_htclnkbbyte1crcerr; + /* + * we'll want to ignore link errors on link that is + * not in use, if any. For now, complain about both + */ + if (crcbits) { + u16 ctrl0, ctrl1; + snprintf(bitsmsg, sizeof bitsmsg, + "[HT%s lane %s CRC (%llx); ignore till reload]", + !(crcbits & _IPATH_HTLINK1_CRCBITS) ? + "0 (A)" : (!(crcbits & _IPATH_HTLINK0_CRCBITS) + ? "1 (B)" : "0+1 (A+B)"), + !(crcbits & _IPATH_HTLANE1_CRCBITS) ? "0" + : (!(crcbits & _IPATH_HTLANE0_CRCBITS) ? "1" : + "0+1"), (unsigned long long) crcbits); + strlcat(msg, bitsmsg, msgl); + + /* + * print extra info for debugging. slave/primary + * config word 4, 8 (link control 0, 1) + */ + + if (pci_read_config_word(dd->pcidev, + dd->ipath_ht_slave_off + 0x4, + &ctrl0)) + dev_info(&dd->pcidev->dev, "Couldn't read " + "linkctrl0 of slave/primary " + "config block\n"); + else if (!(ctrl0 & 1 << 6)) + /* not if EOC bit set */ + ipath_dbg("HT linkctrl0 0x%x%s%s\n", ctrl0, + ((ctrl0 >> 8) & 7) ? " CRC" : "", + ((ctrl0 >> 4) & 1) ? "linkfail" : + ""); + if (pci_read_config_word(dd->pcidev, + dd->ipath_ht_slave_off + 0x8, + &ctrl1)) + dev_info(&dd->pcidev->dev, "Couldn't read " + "linkctrl1 of slave/primary " + "config block\n"); + else if (!(ctrl1 & 1 << 6)) + /* not if EOC bit set */ + ipath_dbg("HT linkctrl1 0x%x%s%s\n", ctrl1, + ((ctrl1 >> 8) & 7) ? " CRC" : "", + ((ctrl1 >> 4) & 1) ? "linkfail" : + ""); + + /* disable until driver reloaded */ + dd->ipath_hwerrmask &= ~crcbits; + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask, + dd->ipath_hwerrmask); + ipath_dbg("HT crc errs: %s\n", msg); + } else + ipath_dbg("ignoring HT crc errors 0x%llx, " + "not in use\n", (unsigned long long) + (hwerrs & (_IPATH_HTLINK0_CRCBITS | + _IPATH_HTLINK1_CRCBITS))); +} + +/** + * ipath_ht_handle_hwerrors - display hardware errors + * @dd: the infinipath device + * @msg: the output buffer + * @msgl: the size of the output buffer + * + * Use same msg buffer as regular errors to avoid + * excessive stack use. Most hardware errors are catastrophic, but for + * right now, we'll print them and continue. + * We reuse the same message buffer as ipath_handle_errors() to avoid + * excessive stack usage. + */ +static void ipath_ht_handle_hwerrors(struct ipath_devdata *dd, char *msg, + size_t msgl) +{ + ipath_err_t hwerrs; + u32 bits, ctrl; + int isfatal = 0; + char bitsmsg[64]; + + hwerrs = ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwerrstatus); + + if (!hwerrs) { + ipath_cdbg(VERBOSE, "Called but no hardware errors set\n"); + /* + * better than printing cofusing messages + * This seems to be related to clearing the crc error, or + * the pll error during init. + */ + goto bail; + } else if (hwerrs == -1LL) { + ipath_dev_err(dd, "Read of hardware error status failed " + "(all bits set); ignoring\n"); + goto bail; + } + ipath_stats.sps_hwerrs++; + + /* Always clear the error status register, except MEMBISTFAIL, + * regardless of whether we continue or stop using the chip. + * We want that set so we know it failed, even across driver reload. + * We'll still ignore it in the hwerrmask. We do this partly for + * diagnostics, but also for support */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear, + hwerrs&~INFINIPATH_HWE_MEMBISTFAILED); + + hwerrs &= dd->ipath_hwerrmask; + + /* + * make sure we get this much out, unless told to be quiet, + * or it's occurred within the last 5 seconds + */ + if ((hwerrs & ~dd->ipath_lasthwerror) || + (ipath_debug & __IPATH_VERBDBG)) + dev_info(&dd->pcidev->dev, "Hardware error: hwerr=0x%llx " + "(cleared)\n", (unsigned long long) hwerrs); + dd->ipath_lasthwerror |= hwerrs; + + if (hwerrs & ~infinipath_hwe_bitsextant) + ipath_dev_err(dd, "hwerror interrupt with unknown errors " + "%llx set\n", (unsigned long long) + (hwerrs & ~infinipath_hwe_bitsextant)); + + ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control); + if (ctrl & INFINIPATH_C_FREEZEMODE) { + if (hwerrs) { + /* + * if any set that we aren't ignoring; only + * make the complaint once, in case it's stuck + * or recurring, and we get here multiple + * times. + */ + if (dd->ipath_flags & IPATH_INITTED) { + ipath_dev_err(dd, "Fatal Error (freeze " + "mode), no longer usable\n"); + isfatal = 1; + } + *dd->ipath_statusp &= ~IPATH_STATUS_IB_READY; + /* mark as having had error */ + *dd->ipath_statusp |= IPATH_STATUS_HWERROR; + /* + * mark as not usable, at a minimum until driver + * is reloaded, probably until reboot, since no + * other reset is possible. + */ + dd->ipath_flags &= ~IPATH_INITTED; + } else { + ipath_dbg("Clearing freezemode on ignored hardware " + "error\n"); + ctrl &= ~INFINIPATH_C_FREEZEMODE; + ipath_write_kreg(dd, dd->ipath_kregs->kr_control, + ctrl); + } + } + + *msg = '\0'; + + /* + * may someday want to decode into which bits are which + * functional area for parity errors, etc. + */ + if (hwerrs & (infinipath_hwe_htcmemparityerr_mask + << INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT)) { + bits = (u32) ((hwerrs >> + INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT) & + INFINIPATH_HWE_HTCMEMPARITYERR_MASK); + snprintf(bitsmsg, sizeof bitsmsg, "[HTC Parity Errs %x] ", + bits); + strlcat(msg, bitsmsg, msgl); + } + if (hwerrs & (INFINIPATH_HWE_RXEMEMPARITYERR_MASK + << INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT)) { + bits = (u32) ((hwerrs >> + INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) & + INFINIPATH_HWE_RXEMEMPARITYERR_MASK); + snprintf(bitsmsg, sizeof bitsmsg, "[RXE Parity Errs %x] ", + bits); + strlcat(msg, bitsmsg, msgl); + } + if (hwerrs & (INFINIPATH_HWE_TXEMEMPARITYERR_MASK + << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT)) { + bits = (u32) ((hwerrs >> + INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT) & + INFINIPATH_HWE_TXEMEMPARITYERR_MASK); + snprintf(bitsmsg, sizeof bitsmsg, "[TXE Parity Errs %x] ", + bits); + strlcat(msg, bitsmsg, msgl); + } + if (hwerrs & INFINIPATH_HWE_IBCBUSTOSPCPARITYERR) + strlcat(msg, "[IB2IPATH Parity]", msgl); + if (hwerrs & INFINIPATH_HWE_IBCBUSFRSPCPARITYERR) + strlcat(msg, "[IPATH2IB Parity]", msgl); + if (hwerrs & INFINIPATH_HWE_HTCBUSIREQPARITYERR) + strlcat(msg, "[HTC Ireq Parity]", msgl); + if (hwerrs & INFINIPATH_HWE_HTCBUSTREQPARITYERR) + strlcat(msg, "[HTC Treq Parity]", msgl); + if (hwerrs & INFINIPATH_HWE_HTCBUSTRESPPARITYERR) + strlcat(msg, "[HTC Tresp Parity]", msgl); + + if (hwerrs & (_IPATH_HTLINK0_CRCBITS | _IPATH_HTLINK1_CRCBITS)) + hwerr_crcbits(dd, hwerrs, msg, msgl); + + if (hwerrs & INFINIPATH_HWE_HTCMISCERR5) + strlcat(msg, "[HT core Misc5]", msgl); + if (hwerrs & INFINIPATH_HWE_HTCMISCERR6) + strlcat(msg, "[HT core Misc6]", msgl); + if (hwerrs & INFINIPATH_HWE_HTCMISCERR7) + strlcat(msg, "[HT core Misc7]", msgl); + if (hwerrs & INFINIPATH_HWE_MEMBISTFAILED) { + strlcat(msg, "[Memory BIST test failed, HT-400 unusable]", + msgl); + /* ignore from now on, so disable until driver reloaded */ + dd->ipath_hwerrmask &= ~INFINIPATH_HWE_MEMBISTFAILED; + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask, + dd->ipath_hwerrmask); + } +#define _IPATH_PLL_FAIL (INFINIPATH_HWE_COREPLL_FBSLIP | \ + INFINIPATH_HWE_COREPLL_RFSLIP | \ + INFINIPATH_HWE_HTBPLL_FBSLIP | \ + INFINIPATH_HWE_HTBPLL_RFSLIP | \ + INFINIPATH_HWE_HTAPLL_FBSLIP | \ + INFINIPATH_HWE_HTAPLL_RFSLIP) + + if (hwerrs & _IPATH_PLL_FAIL) { + snprintf(bitsmsg, sizeof bitsmsg, + "[PLL failed (%llx), HT-400 unusable]", + (unsigned long long) (hwerrs & _IPATH_PLL_FAIL)); + strlcat(msg, bitsmsg, msgl); + /* ignore from now on, so disable until driver reloaded */ + dd->ipath_hwerrmask &= ~(hwerrs & _IPATH_PLL_FAIL); + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask, + dd->ipath_hwerrmask); + } + + if (hwerrs & INFINIPATH_HWE_SERDESPLLFAILED) { + /* + * If it occurs, it is left masked since the eternal + * interface is unused + */ + dd->ipath_hwerrmask &= ~INFINIPATH_HWE_SERDESPLLFAILED; + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask, + dd->ipath_hwerrmask); + } + + if (hwerrs & INFINIPATH_HWE_RXDSYNCMEMPARITYERR) + strlcat(msg, "[Rx Dsync]", msgl); + if (hwerrs & INFINIPATH_HWE_SERDESPLLFAILED) + strlcat(msg, "[SerDes PLL]", msgl); + + ipath_dev_err(dd, "%s hardware error\n", msg); + if (isfatal && !ipath_diag_inuse && dd->ipath_freezemsg) + /* + * for status file; if no trailing brace is copied, + * we'll know it was truncated. + */ + snprintf(dd->ipath_freezemsg, + dd->ipath_freezelen, "{%s}", msg); + +bail:; +} + +/** + * ipath_ht_boardname - fill in the board name + * @dd: the infinipath device + * @name: the output buffer + * @namelen: the size of the output buffer + * + * fill in the board name, based on the board revision register + */ +static int ipath_ht_boardname(struct ipath_devdata *dd, char *name, + size_t namelen) +{ + char *n = NULL; + u8 boardrev = dd->ipath_boardrev; + int ret; + + switch (boardrev) { + case 4: /* Ponderosa is one of the bringup boards */ + n = "Ponderosa"; + break; + case 5: /* HT-460 original production board */ + n = "InfiniPath_HT-460"; + break; + case 6: + n = "OEM_Board_3"; + break; + case 7: + /* HT-460 small form factor production board */ + n = "InfiniPath_HT-465"; + break; + case 8: + n = "LS/X-1"; + break; + case 9: /* Comstock bringup test board */ + n = "Comstock"; + break; + case 10: + n = "OEM_Board_2"; + break; + case 11: + n = "InfiniPath_HT-470"; + break; + case 12: + n = "OEM_Board_4"; + break; + default: /* don't know, just print the number */ + ipath_dev_err(dd, "Don't yet know about board " + "with ID %u\n", boardrev); + snprintf(name, namelen, "Unknown_InfiniPath_HT-4xx_%u", + boardrev); + break; + } + if (n) + snprintf(name, namelen, "%s", n); + + if (dd->ipath_majrev != 3 || dd->ipath_minrev != 2) { + /* + * This version of the driver only supports the HT-400 + * Rev 3.2 + */ + ipath_dev_err(dd, + "Unsupported HT-400 revision %u.%u!\n", + dd->ipath_majrev, dd->ipath_minrev); + ret = 1; + goto bail; + } + /* + * pkt/word counters are 32 bit, and therefore wrap fast enough + * that we snapshot them from a timer, and maintain 64 bit shadow + * copies + */ + dd->ipath_flags |= IPATH_32BITCOUNTERS; + if (dd->ipath_htspeed != 800) + ipath_dev_err(dd, + "Incorrectly configured for HT @ %uMHz\n", + dd->ipath_htspeed); + if (dd->ipath_boardrev == 7 || dd->ipath_boardrev == 11 || + dd->ipath_boardrev == 6) + dd->ipath_flags |= IPATH_GPIO_INTR; + else + dd->ipath_flags |= IPATH_POLL_RX_INTR; + if (dd->ipath_boardrev == 8) { /* LS/X-1 */ + u64 val; + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extstatus); + if (val & INFINIPATH_EXTS_SERDESSEL) { + /* + * hardware disabled + * + * This means that the chip is hardware disabled, + * and will not be able to bring up the link, + * in any case. We special case this and abort + * early, to avoid later messages. We also set + * the DISABLED status bit + */ + ipath_dbg("Unit %u is hardware-disabled\n", + dd->ipath_unit); + *dd->ipath_statusp |= IPATH_STATUS_DISABLED; + /* this value is handled differently */ + ret = 2; + goto bail; + } + } + ret = 0; + +bail: + return ret; +} + +static void ipath_check_htlink(struct ipath_devdata *dd) +{ + u8 linkerr, link_off, i; + + for (i = 0; i < 2; i++) { + link_off = dd->ipath_ht_slave_off + i * 4 + 0xd; + if (pci_read_config_byte(dd->pcidev, link_off, &linkerr)) + dev_info(&dd->pcidev->dev, "Couldn't read " + "linkerror%d of HT slave/primary block\n", + i); + else if (linkerr & 0xf0) { + ipath_cdbg(VERBOSE, "HT linkerr%d bits 0x%x set, " + "clearing\n", linkerr >> 4, i); + /* + * writing the linkerr bits that are set should + * clear them + */ + if (pci_write_config_byte(dd->pcidev, link_off, + linkerr)) + ipath_dbg("Failed write to clear HT " + "linkerror%d\n", i); + if (pci_read_config_byte(dd->pcidev, link_off, + &linkerr)) + dev_info(&dd->pcidev->dev, + "Couldn't reread linkerror%d of " + "HT slave/primary block\n", i); + else if (linkerr & 0xf0) + dev_info(&dd->pcidev->dev, + "HT linkerror%d bits 0x%x " + "couldn't be cleared\n", + i, linkerr >> 4); + } + } +} + +static int ipath_setup_ht_reset(struct ipath_devdata *dd) +{ + ipath_dbg("No reset possible for HT-400\n"); + return 0; +} + +#define HT_CAPABILITY_ID 0x08 /* HT capabilities not defined in kernel */ +#define HT_INTR_DISC_CONFIG 0x80 /* HT interrupt and discovery cap */ +#define HT_INTR_REG_INDEX 2 /* intconfig requires indirect accesses */ + +/* + * Bits 13-15 of command==0 is slave/primary block. Clear any HT CRC + * errors. We only bother to do this at load time, because it's OK if + * it happened before we were loaded (first time after boot/reset), + * but any time after that, it's fatal anyway. Also need to not check + * for for upper byte errors if we are in 8 bit mode, so figure out + * our width. For now, at least, also complain if it's 8 bit. + */ +static void slave_or_pri_blk(struct ipath_devdata *dd, struct pci_dev *pdev, + int pos, u8 cap_type) +{ + u8 linkwidth = 0, linkerr, link_a_b_off, link_off; + u16 linkctrl = 0; + int i; + + dd->ipath_ht_slave_off = pos; + /* command word, master_host bit */ + /* master host || slave */ + if ((cap_type >> 2) & 1) + link_a_b_off = 4; + else + link_a_b_off = 0; + ipath_cdbg(VERBOSE, "HT%u (Link %c) connected to processor\n", + link_a_b_off ? 1 : 0, + link_a_b_off ? 'B' : 'A'); + + link_a_b_off += pos; + + /* + * check both link control registers; clear both HT CRC sets if + * necessary. + */ + for (i = 0; i < 2; i++) { + link_off = pos + i * 4 + 0x4; + if (pci_read_config_word(pdev, link_off, &linkctrl)) + ipath_dev_err(dd, "Couldn't read HT link control%d " + "register\n", i); + else if (linkctrl & (0xf << 8)) { + ipath_cdbg(VERBOSE, "Clear linkctrl%d CRC Error " + "bits %x\n", i, linkctrl & (0xf << 8)); + /* + * now write them back to clear the error. + */ + pci_write_config_byte(pdev, link_off, + linkctrl & (0xf << 8)); + } + } + + /* + * As with HT CRC bits, same for protocol errors that might occur + * during boot. + */ + for (i = 0; i < 2; i++) { + link_off = pos + i * 4 + 0xd; + if (pci_read_config_byte(pdev, link_off, &linkerr)) + dev_info(&pdev->dev, "Couldn't read linkerror%d " + "of HT slave/primary block\n", i); + else if (linkerr & 0xf0) { + ipath_cdbg(VERBOSE, "HT linkerr%d bits 0x%x set, " + "clearing\n", linkerr >> 4, i); + /* + * writing the linkerr bits that are set will clear + * them + */ + if (pci_write_config_byte + (pdev, link_off, linkerr)) + ipath_dbg("Failed write to clear HT " + "linkerror%d\n", i); + if (pci_read_config_byte(pdev, link_off, &linkerr)) + dev_info(&pdev->dev, "Couldn't reread " + "linkerror%d of HT slave/primary " + "block\n", i); + else if (linkerr & 0xf0) + dev_info(&pdev->dev, "HT linkerror%d bits " + "0x%x couldn't be cleared\n", + i, linkerr >> 4); + } + } + + /* + * this is just for our link to the host, not devices connected + * through tunnel. + */ + + if (pci_read_config_byte(pdev, link_a_b_off + 7, &linkwidth)) + ipath_dev_err(dd, "Couldn't read HT link width " + "config register\n"); + else { + u32 width; + switch (linkwidth & 7) { + case 5: + width = 4; + break; + case 4: + width = 2; + break; + case 3: + width = 32; + break; + case 1: + width = 16; + break; + case 0: + default: /* if wrong, assume 8 bit */ + width = 8; + break; + } + + dd->ipath_htwidth = width; + + if (linkwidth != 0x11) { + ipath_dev_err(dd, "Not configured for 16 bit HT " + "(%x)\n", linkwidth); + if (!(linkwidth & 0xf)) { + ipath_dbg("Will ignore HT lane1 errors\n"); + dd->ipath_flags |= IPATH_8BIT_IN_HT0; + } + } + } + + /* + * this is just for our link to the host, not devices connected + * through tunnel. + */ + if (pci_read_config_byte(pdev, link_a_b_off + 0xd, &linkwidth)) + ipath_dev_err(dd, "Couldn't read HT link frequency " + "config register\n"); + else { + u32 speed; + switch (linkwidth & 0xf) { + case 6: + speed = 1000; + break; + case 5: + speed = 800; + break; + case 4: + speed = 600; + break; + case 3: + speed = 500; + break; + case 2: + speed = 400; + break; + case 1: + speed = 300; + break; + default: + /* + * assume reserved and vendor-specific are 200... + */ + case 0: + speed = 200; + break; + } + dd->ipath_htspeed = speed; + } +} + +static int set_int_handler(struct ipath_devdata *dd, struct pci_dev *pdev, + int pos) +{ + u32 int_handler_addr_lower; + u32 int_handler_addr_upper; + u64 ihandler; + u32 intvec; + + /* use indirection register to get the intr handler */ + pci_write_config_byte(pdev, pos + HT_INTR_REG_INDEX, 0x10); + pci_read_config_dword(pdev, pos + 4, &int_handler_addr_lower); + pci_write_config_byte(pdev, pos + HT_INTR_REG_INDEX, 0x11); + pci_read_config_dword(pdev, pos + 4, &int_handler_addr_upper); + + ihandler = (u64) int_handler_addr_lower | + ((u64) int_handler_addr_upper << 32); + + /* + * kernels with CONFIG_PCI_MSI set the vector in the irq field of + * struct pci_device, so we use that to program the HT-400 internal + * interrupt register (not config space) with that value. The BIOS + * must still have done the basic MSI setup. + */ + intvec = pdev->irq; + /* + * clear any vector bits there; normally not set but we'll overload + * this for some debug purposes (setting the HTC debug register + * value from software, rather than GPIOs), so it might be set on a + * driver reload. + */ + ihandler &= ~0xff0000; + /* x86 vector goes in intrinfo[23:16] */ + ihandler |= intvec << 16; + ipath_cdbg(VERBOSE, "ihandler lower %x, upper %x, intvec %x, " + "interruptconfig %llx\n", int_handler_addr_lower, + int_handler_addr_upper, intvec, + (unsigned long long) ihandler); + + /* can't program yet, so save for interrupt setup */ + dd->ipath_intconfig = ihandler; + /* keep going, so we find link control stuff also */ + + return ihandler != 0; +} + +/** + * ipath_setup_ht_config - setup the interruptconfig register + * @dd: the infinipath device + * @pdev: the PCI device + * + * setup the interruptconfig register from the HT config info. + * Also clear CRC errors in HT linkcontrol, if necessary. + * This is done only for the real hardware. It is done before + * chip address space is initted, so can't touch infinipath registers + */ +static int ipath_setup_ht_config(struct ipath_devdata *dd, + struct pci_dev *pdev) +{ + int pos, ret = 0; + int ihandler = 0; + + /* + * Read the capability info to find the interrupt info, and also + * handle clearing CRC errors in linkctrl register if necessary. We + * do this early, before we ever enable errors or hardware errors, + * mostly to avoid causing the chip to enter freeze mode. + */ + pos = pci_find_capability(pdev, HT_CAPABILITY_ID); + if (!pos) { + ipath_dev_err(dd, "Couldn't find HyperTransport " + "capability; no interrupts\n"); + ret = -ENODEV; + goto bail; + } + do { + u8 cap_type; + + /* the HT capability type byte is 3 bytes after the + * capability byte. + */ + if (pci_read_config_byte(pdev, pos + 3, &cap_type)) { + dev_info(&pdev->dev, "Couldn't read config " + "command @ %d\n", pos); + continue; + } + if (!(cap_type & 0xE0)) + slave_or_pri_blk(dd, pdev, pos, cap_type); + else if (cap_type == HT_INTR_DISC_CONFIG) + ihandler = set_int_handler(dd, pdev, pos); + } while ((pos = pci_find_next_capability(pdev, pos, + HT_CAPABILITY_ID))); + + if (!ihandler) { + ipath_dev_err(dd, "Couldn't find interrupt handler in " + "config space\n"); + ret = -ENODEV; + } + +bail: + return ret; +} + +/** + * ipath_setup_ht_cleanup - clean up any per-chip chip-specific stuff + * @dd: the infinipath device + * + * Called during driver unload. + * This is currently a nop for the HT-400, not for all chips + */ +static void ipath_setup_ht_cleanup(struct ipath_devdata *dd) +{ +} + +/** + * ipath_setup_ht_setextled - set the state of the two external LEDs + * @dd: the infinipath device + * @lst: the L state + * @ltst: the LT state + * + * Set the state of the two external LEDs, to indicate physical and + * logical state of IB link. For this chip (at least with recommended + * board pinouts), LED1 is Green (physical state), and LED2 is Yellow + * (logical state) + * + * Note: We try to match the Mellanox HCA LED behavior as best + * we can. Green indicates physical link state is OK (something is + * plugged in, and we can train). + * Amber indicates the link is logically up (ACTIVE). + * Mellanox further blinks the amber LED to indicate data packet + * activity, but we have no hardware support for that, so it would + * require waking up every 10-20 msecs and checking the counters + * on the chip, and then turning the LED off if appropriate. That's + * visible overhead, so not something we will do. + * + */ +static void ipath_setup_ht_setextled(struct ipath_devdata *dd, + u64 lst, u64 ltst) +{ + u64 extctl; + + /* the diags use the LED to indicate diag info, so we leave + * the external LED alone when the diags are running */ + if (ipath_diag_inuse) + return; + + /* + * start by setting both LED control bits to off, then turn + * on the appropriate bit(s). + */ + if (dd->ipath_boardrev == 8) { /* LS/X-1 uses different pins */ + /* + * major difference is that INFINIPATH_EXTC_LEDGBLERR_OFF + * is inverted, because it is normally used to indicate + * a hardware fault at reset, if there were errors + */ + extctl = (dd->ipath_extctrl & ~INFINIPATH_EXTC_LEDGBLOK_ON) + | INFINIPATH_EXTC_LEDGBLERR_OFF; + if (ltst == INFINIPATH_IBCS_LT_STATE_LINKUP) + extctl &= ~INFINIPATH_EXTC_LEDGBLERR_OFF; + if (lst == INFINIPATH_IBCS_L_STATE_ACTIVE) + extctl |= INFINIPATH_EXTC_LEDGBLOK_ON; + } + else { + extctl = dd->ipath_extctrl & + ~(INFINIPATH_EXTC_LED1PRIPORT_ON | + INFINIPATH_EXTC_LED2PRIPORT_ON); + if (ltst == INFINIPATH_IBCS_LT_STATE_LINKUP) + extctl |= INFINIPATH_EXTC_LED1PRIPORT_ON; + if (lst == INFINIPATH_IBCS_L_STATE_ACTIVE) + extctl |= INFINIPATH_EXTC_LED2PRIPORT_ON; + } + dd->ipath_extctrl = extctl; + ipath_write_kreg(dd, dd->ipath_kregs->kr_extctrl, extctl); +} + +static void ipath_init_ht_variables(void) +{ + ipath_gpio_sda_num = _IPATH_GPIO_SDA_NUM; + ipath_gpio_scl_num = _IPATH_GPIO_SCL_NUM; + ipath_gpio_sda = IPATH_GPIO_SDA; + ipath_gpio_scl = IPATH_GPIO_SCL; + + infinipath_i_bitsextant = + (INFINIPATH_I_RCVURG_MASK << INFINIPATH_I_RCVURG_SHIFT) | + (INFINIPATH_I_RCVAVAIL_MASK << + INFINIPATH_I_RCVAVAIL_SHIFT) | + INFINIPATH_I_ERROR | INFINIPATH_I_SPIOSENT | + INFINIPATH_I_SPIOBUFAVAIL | INFINIPATH_I_GPIO; + + infinipath_e_bitsextant = + INFINIPATH_E_RFORMATERR | INFINIPATH_E_RVCRC | + INFINIPATH_E_RICRC | INFINIPATH_E_RMINPKTLEN | + INFINIPATH_E_RMAXPKTLEN | INFINIPATH_E_RLONGPKTLEN | + INFINIPATH_E_RSHORTPKTLEN | INFINIPATH_E_RUNEXPCHAR | + INFINIPATH_E_RUNSUPVL | INFINIPATH_E_REBP | + INFINIPATH_E_RIBFLOW | INFINIPATH_E_RBADVERSION | + INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL | + INFINIPATH_E_RBADTID | INFINIPATH_E_RHDRLEN | + INFINIPATH_E_RHDR | INFINIPATH_E_RIBLOSTLINK | + INFINIPATH_E_SMINPKTLEN | INFINIPATH_E_SMAXPKTLEN | + INFINIPATH_E_SUNDERRUN | INFINIPATH_E_SPKTLEN | + INFINIPATH_E_SDROPPEDSMPPKT | INFINIPATH_E_SDROPPEDDATAPKT | + INFINIPATH_E_SPIOARMLAUNCH | INFINIPATH_E_SUNEXPERRPKTNUM | + INFINIPATH_E_SUNSUPVL | INFINIPATH_E_IBSTATUSCHANGED | + INFINIPATH_E_INVALIDADDR | INFINIPATH_E_RESET | + INFINIPATH_E_HARDWARE; + + infinipath_hwe_bitsextant = + (INFINIPATH_HWE_HTCMEMPARITYERR_MASK << + INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT) | + (INFINIPATH_HWE_TXEMEMPARITYERR_MASK << + INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT) | + (INFINIPATH_HWE_RXEMEMPARITYERR_MASK << + INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) | + INFINIPATH_HWE_HTCLNKABYTE0CRCERR | + INFINIPATH_HWE_HTCLNKABYTE1CRCERR | + INFINIPATH_HWE_HTCLNKBBYTE0CRCERR | + INFINIPATH_HWE_HTCLNKBBYTE1CRCERR | + INFINIPATH_HWE_HTCMISCERR4 | + INFINIPATH_HWE_HTCMISCERR5 | INFINIPATH_HWE_HTCMISCERR6 | + INFINIPATH_HWE_HTCMISCERR7 | + INFINIPATH_HWE_HTCBUSTREQPARITYERR | + INFINIPATH_HWE_HTCBUSTRESPPARITYERR | + INFINIPATH_HWE_HTCBUSIREQPARITYERR | + INFINIPATH_HWE_RXDSYNCMEMPARITYERR | + INFINIPATH_HWE_MEMBISTFAILED | + INFINIPATH_HWE_COREPLL_FBSLIP | + INFINIPATH_HWE_COREPLL_RFSLIP | + INFINIPATH_HWE_HTBPLL_FBSLIP | + INFINIPATH_HWE_HTBPLL_RFSLIP | + INFINIPATH_HWE_HTAPLL_FBSLIP | + INFINIPATH_HWE_HTAPLL_RFSLIP | + INFINIPATH_HWE_SERDESPLLFAILED | + INFINIPATH_HWE_IBCBUSTOSPCPARITYERR | + INFINIPATH_HWE_IBCBUSFRSPCPARITYERR; + + infinipath_i_rcvavail_mask = INFINIPATH_I_RCVAVAIL_MASK; + infinipath_i_rcvurg_mask = INFINIPATH_I_RCVURG_MASK; +} + +/** + * ipath_ht_init_hwerrors - enable hardware errors + * @dd: the infinipath device + * + * now that we have finished initializing everything that might reasonably + * cause a hardware error, and cleared those errors bits as they occur, + * we can enable hardware errors in the mask (potentially enabling + * freeze mode), and enable hardware errors as errors (along with + * everything else) in errormask + */ +static void ipath_ht_init_hwerrors(struct ipath_devdata *dd) +{ + ipath_err_t val; + u64 extsval; + + extsval = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extstatus); + + if (!(extsval & INFINIPATH_EXTS_MEMBIST_ENDTEST)) + ipath_dev_err(dd, "MemBIST did not complete!\n"); + + ipath_check_htlink(dd); + + /* barring bugs, all hwerrors become interrupts, which can */ + val = -1LL; + /* don't look at crc lane1 if 8 bit */ + if (dd->ipath_flags & IPATH_8BIT_IN_HT0) + val &= ~infinipath_hwe_htclnkabyte1crcerr; + /* don't look at crc lane1 if 8 bit */ + if (dd->ipath_flags & IPATH_8BIT_IN_HT1) + val &= ~infinipath_hwe_htclnkbbyte1crcerr; + + /* + * disable RXDSYNCMEMPARITY because external serdes is unused, + * and therefore the logic will never be used or initialized, + * and uninitialized state will normally result in this error + * being asserted. Similarly for the external serdess pll + * lock signal. + */ + val &= ~(INFINIPATH_HWE_SERDESPLLFAILED | + INFINIPATH_HWE_RXDSYNCMEMPARITYERR); + + /* + * Disable MISCERR4 because of an inversion in the HT core + * logic checking for errors that cause this bit to be set. + * The errata can also cause the protocol error bit to be set + * in the HT config space linkerror register(s). + */ + val &= ~INFINIPATH_HWE_HTCMISCERR4; + + /* + * PLL ignored because MDIO interface has a logic problem + * for reads, on Comstock and Ponderosa. BRINGUP + */ + if (dd->ipath_boardrev == 4 || dd->ipath_boardrev == 9) + val &= ~INFINIPATH_HWE_SERDESPLLFAILED; + dd->ipath_hwerrmask = val; +} + +/** + * ipath_ht_bringup_serdes - bring up the serdes + * @dd: the infinipath device + */ +static int ipath_ht_bringup_serdes(struct ipath_devdata *dd) +{ + u64 val, config1; + int ret = 0, change = 0; + + ipath_dbg("Trying to bringup serdes\n"); + + if (ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwerrstatus) & + INFINIPATH_HWE_SERDESPLLFAILED) + { + ipath_dbg("At start, serdes PLL failed bit set in " + "hwerrstatus, clearing and continuing\n"); + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear, + INFINIPATH_HWE_SERDESPLLFAILED); + } + + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0); + config1 = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig1); + + ipath_cdbg(VERBOSE, "Initial serdes status is config0=%llx " + "config1=%llx, sstatus=%llx xgxs %llx\n", + (unsigned long long) val, (unsigned long long) config1, + (unsigned long long) + ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesstatus), + (unsigned long long) + ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig)); + + /* force reset on */ + val |= INFINIPATH_SERDC0_RESET_PLL + /* | INFINIPATH_SERDC0_RESET_MASK */ + ; + ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0, val); + udelay(15); /* need pll reset set at least for a bit */ + + if (val & INFINIPATH_SERDC0_RESET_PLL) { + u64 val2 = val &= ~INFINIPATH_SERDC0_RESET_PLL; + /* set lane resets, and tx idle, during pll reset */ + val2 |= INFINIPATH_SERDC0_RESET_MASK | + INFINIPATH_SERDC0_TXIDLE; + ipath_cdbg(VERBOSE, "Clearing serdes PLL reset (writing " + "%llx)\n", (unsigned long long) val2); + ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0, + val2); + /* + * be sure chip saw it + */ + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + /* + * need pll reset clear at least 11 usec before lane + * resets cleared; give it a few more + */ + udelay(15); + val = val2; /* for check below */ + } + + if (val & (INFINIPATH_SERDC0_RESET_PLL | + INFINIPATH_SERDC0_RESET_MASK | + INFINIPATH_SERDC0_TXIDLE)) { + val &= ~(INFINIPATH_SERDC0_RESET_PLL | + INFINIPATH_SERDC0_RESET_MASK | + INFINIPATH_SERDC0_TXIDLE); + /* clear them */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0, + val); + } + + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig); + if (((val >> INFINIPATH_XGXS_MDIOADDR_SHIFT) & + INFINIPATH_XGXS_MDIOADDR_MASK) != 3) { + val &= ~(INFINIPATH_XGXS_MDIOADDR_MASK << + INFINIPATH_XGXS_MDIOADDR_SHIFT); + /* + * we use address 3 + */ + val |= 3ULL << INFINIPATH_XGXS_MDIOADDR_SHIFT; + change = 1; + } + if (val & INFINIPATH_XGXS_RESET) { + /* normally true after boot */ + val &= ~INFINIPATH_XGXS_RESET; + change = 1; + } + if (change) + ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, val); + + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0); + + /* clear current and de-emphasis bits */ + config1 &= ~0x0ffffffff00ULL; + /* set current to 20ma */ + config1 |= 0x00000000000ULL; + /* set de-emphasis to -5.68dB */ + config1 |= 0x0cccc000000ULL; + ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig1, config1); + + ipath_cdbg(VERBOSE, "After setup: serdes status is config0=%llx " + "config1=%llx, sstatus=%llx xgxs %llx\n", + (unsigned long long) val, (unsigned long long) config1, + (unsigned long long) + ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesstatus), + (unsigned long long) + ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig)); + + if (!ipath_waitfor_mdio_cmdready(dd)) { + ipath_write_kreg(dd, dd->ipath_kregs->kr_mdio, + ipath_mdio_req(IPATH_MDIO_CMD_READ, 31, + IPATH_MDIO_CTRL_XGXS_REG_8, + 0)); + if (ipath_waitfor_complete(dd, dd->ipath_kregs->kr_mdio, + IPATH_MDIO_DATAVALID, &val)) + ipath_dbg("Never got MDIO data for XGXS status " + "read\n"); + else + ipath_cdbg(VERBOSE, "MDIO Read reg8, " + "'bank' 31 %x\n", (u32) val); + } else + ipath_dbg("Never got MDIO cmdready for XGXS status read\n"); + + return ret; /* for now, say we always succeeded */ +} + +/** + * ipath_ht_quiet_serdes - set serdes to txidle + * @dd: the infinipath device + * driver is being unloaded + */ +static void ipath_ht_quiet_serdes(struct ipath_devdata *dd) +{ + u64 val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0); + + val |= INFINIPATH_SERDC0_TXIDLE; + ipath_dbg("Setting TxIdleEn on serdes (config0 = %llx)\n", + (unsigned long long) val); + ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0, val); +} + +static int ipath_ht_intconfig(struct ipath_devdata *dd) +{ + int ret; + + if (!dd->ipath_intconfig) { + ipath_dev_err(dd, "No interrupts enabled, couldn't setup " + "interrupt address\n"); + ret = 1; + goto bail; + } + + ipath_write_kreg(dd, dd->ipath_kregs->kr_interruptconfig, + dd->ipath_intconfig); /* interrupt address */ + ret = 0; + +bail: + return ret; +} + +/** + * ipath_pe_put_tid - write a TID in chip + * @dd: the infinipath device + * @tidptr: pointer to the expected TID (in chip) to udpate + * @tidtype: 0 for eager, 1 for expected + * @pa: physical address of in memory buffer; ipath_tidinvalid if freeing + * + * This exists as a separate routine to allow for special locking etc. + * It's used for both the full cleanup on exit, as well as the normal + * setup and teardown. + */ +static void ipath_ht_put_tid(struct ipath_devdata *dd, + u64 __iomem *tidptr, u32 type, + unsigned long pa) +{ + if (pa != dd->ipath_tidinvalid) { + if (unlikely((pa & ~INFINIPATH_RT_ADDR_MASK))) { + dev_info(&dd->pcidev->dev, + "physaddr %lx has more than " + "40 bits, using only 40!!!\n", pa); + pa &= INFINIPATH_RT_ADDR_MASK; + } + if (type == 0) + pa |= dd->ipath_tidtemplate; + else { + /* in words (fixed, full page). */ + u64 lenvalid = PAGE_SIZE >> 2; + lenvalid <<= INFINIPATH_RT_BUFSIZE_SHIFT; + pa |= lenvalid | INFINIPATH_RT_VALID; + } + } + if (dd->ipath_kregbase) + writeq(pa, tidptr); +} + +/** + * ipath_ht_clear_tid - clear all TID entries for a port, expected and eager + * @dd: the infinipath device + * @port: the port + * + * Used from ipath_close(), and at chip initialization. + */ +static void ipath_ht_clear_tids(struct ipath_devdata *dd, unsigned port) +{ + u64 __iomem *tidbase; + int i; + + if (!dd->ipath_kregbase) + return; + + ipath_cdbg(VERBOSE, "Invalidate TIDs for port %u\n", port); + + /* + * need to invalidate all of the expected TID entries for this + * port, so we don't have valid entries that might somehow get + * used (early in next use of this port, or through some bug) + */ + tidbase = (u64 __iomem *) ((char __iomem *)(dd->ipath_kregbase) + + dd->ipath_rcvtidbase + + port * dd->ipath_rcvtidcnt * + sizeof(*tidbase)); + for (i = 0; i < dd->ipath_rcvtidcnt; i++) + ipath_ht_put_tid(dd, &tidbase[i], 1, dd->ipath_tidinvalid); + + tidbase = (u64 __iomem *) ((char __iomem *)(dd->ipath_kregbase) + + dd->ipath_rcvegrbase + + port * dd->ipath_rcvegrcnt * + sizeof(*tidbase)); + + for (i = 0; i < dd->ipath_rcvegrcnt; i++) + ipath_ht_put_tid(dd, &tidbase[i], 0, dd->ipath_tidinvalid); +} + +/** + * ipath_ht_tidtemplate - setup constants for TID updates + * @dd: the infinipath device + * + * We setup stuff that we use a lot, to avoid calculating each time + */ +static void ipath_ht_tidtemplate(struct ipath_devdata *dd) +{ + dd->ipath_tidtemplate = dd->ipath_ibmaxlen >> 2; + dd->ipath_tidtemplate <<= INFINIPATH_RT_BUFSIZE_SHIFT; + dd->ipath_tidtemplate |= INFINIPATH_RT_VALID; + + /* + * work around chip errata bug 7358, by marking invalid tids + * as having max length + */ + dd->ipath_tidinvalid = (-1LL & INFINIPATH_RT_BUFSIZE_MASK) << + INFINIPATH_RT_BUFSIZE_SHIFT; +} + +static int ipath_ht_early_init(struct ipath_devdata *dd) +{ + u32 __iomem *piobuf; + u32 pioincr, val32, egrsize; + int i; + + /* + * one cache line; long IB headers will spill over into received + * buffer + */ + dd->ipath_rcvhdrentsize = 16; + dd->ipath_rcvhdrsize = IPATH_DFLT_RCVHDRSIZE; + + /* + * For HT-400, we allocate a somewhat overly large eager buffer, + * such that we can guarantee that we can receive the largest + * packet that we can send out. To truly support a 4KB MTU, + * we need to bump this to a large value. To date, other than + * testing, we have never encountered an HCA that can really + * send 4KB MTU packets, so we do not handle that (we'll get + * errors interrupts if we ever see one). + */ + dd->ipath_rcvegrbufsize = dd->ipath_piosize2k; + egrsize = dd->ipath_rcvegrbufsize; + + /* + * the min() check here is currently a nop, but it may not + * always be, depending on just how we do ipath_rcvegrbufsize + */ + dd->ipath_ibmaxlen = min(dd->ipath_piosize2k, + dd->ipath_rcvegrbufsize); + dd->ipath_init_ibmaxlen = dd->ipath_ibmaxlen; + ipath_ht_tidtemplate(dd); + + /* + * zero all the TID entries at startup. We do this for sanity, + * in case of a previous driver crash of some kind, and also + * because the chip powers up with these memories in an unknown + * state. Use portcnt, not cfgports, since this is for the + * full chip, not for current (possibly different) configuration + * value. + * Chip Errata bug 6447 + */ + for (val32 = 0; val32 < dd->ipath_portcnt; val32++) + ipath_ht_clear_tids(dd, val32); + + /* + * write the pbc of each buffer, to be sure it's initialized, then + * cancel all the buffers, and also abort any packets that might + * have been in flight for some reason (the latter is for driver + * unload/reload, but isn't a bad idea at first init). PIO send + * isn't enabled at this point, so there is no danger of sending + * these out on the wire. + * Chip Errata bug 6610 + */ + piobuf = (u32 __iomem *) (((char __iomem *)(dd->ipath_kregbase)) + + dd->ipath_piobufbase); + pioincr = dd->ipath_palign / sizeof(*piobuf); + for (i = 0; i < dd->ipath_piobcnt2k; i++) { + /* + * reasonable word count, just to init pbc + */ + writel(16, piobuf); + piobuf += pioincr; + } + /* + * self-clearing + */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + INFINIPATH_S_ABORT); + return 0; +} + +/** + * ipath_init_ht_get_base_info - set chip-specific flags for user code + * @dd: the infinipath device + * @kbase: ipath_base_info pointer + * + * We set the PCIE flag because the lower bandwidth on PCIe vs + * HyperTransport can affect some user packet algorithims. + */ +static int ipath_ht_get_base_info(struct ipath_portdata *pd, void *kbase) +{ + struct ipath_base_info *kinfo = kbase; + + kinfo->spi_runtime_flags |= IPATH_RUNTIME_HT | + IPATH_RUNTIME_RCVHDR_COPY; + + return 0; +} + +/** + * ipath_init_ht400_funcs - set up the chip-specific function pointers + * @dd: the infinipath device + * + * This is global, and is called directly at init to set up the + * chip-specific function pointers for later use. + */ +void ipath_init_ht400_funcs(struct ipath_devdata *dd) +{ + dd->ipath_f_intrsetup = ipath_ht_intconfig; + dd->ipath_f_bus = ipath_setup_ht_config; + dd->ipath_f_reset = ipath_setup_ht_reset; + dd->ipath_f_get_boardname = ipath_ht_boardname; + dd->ipath_f_init_hwerrors = ipath_ht_init_hwerrors; + dd->ipath_f_init_hwerrors = ipath_ht_init_hwerrors; + dd->ipath_f_early_init = ipath_ht_early_init; + dd->ipath_f_handle_hwerrors = ipath_ht_handle_hwerrors; + dd->ipath_f_quiet_serdes = ipath_ht_quiet_serdes; + dd->ipath_f_bringup_serdes = ipath_ht_bringup_serdes; + dd->ipath_f_clear_tids = ipath_ht_clear_tids; + dd->ipath_f_put_tid = ipath_ht_put_tid; + dd->ipath_f_cleanup = ipath_setup_ht_cleanup; + dd->ipath_f_setextled = ipath_setup_ht_setextled; + dd->ipath_f_get_base_info = ipath_ht_get_base_info; + + /* + * initialize chip-specific variables + */ + dd->ipath_f_tidtemplate = ipath_ht_tidtemplate; + + /* + * setup the register offsets, since they are different for each + * chip + */ + dd->ipath_kregs = &ipath_ht_kregs; + dd->ipath_cregs = &ipath_ht_cregs; + + /* + * do very early init that is needed before ipath_f_bus is + * called + */ + ipath_init_ht_variables(); +} diff --git a/drivers/infiniband/hw/ipath/ipath_init_chip.c b/drivers/infiniband/hw/ipath/ipath_init_chip.c new file mode 100644 index 0000000..2823ff9 --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_init_chip.c @@ -0,0 +1,951 @@ +/* + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/pci.h> +#include <linux/netdevice.h> +#include <linux/vmalloc.h> + +#include "ipath_kernel.h" +#include "ips_common.h" + +/* + * min buffers we want to have per port, after driver + */ +#define IPATH_MIN_USER_PORT_BUFCNT 8 + +/* + * Number of ports we are configured to use (to allow for more pio + * buffers per port, etc.) Zero means use chip value. + */ +static ushort ipath_cfgports; + +module_param_named(cfgports, ipath_cfgports, ushort, S_IRUGO); +MODULE_PARM_DESC(cfgports, "Set max number of ports to use"); + +/* + * Number of buffers reserved for driver (layered drivers and SMA + * send). Reserved at end of buffer list. + */ +static ushort ipath_kpiobufs = 32; + +static int ipath_set_kpiobufs(const char *val, struct kernel_param *kp); + +module_param_call(kpiobufs, ipath_set_kpiobufs, param_get_uint, + &ipath_kpiobufs, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(kpiobufs, "Set number of PIO buffers for driver"); + +/** + * create_port0_egr - allocate the eager TID buffers + * @dd: the infinipath device + * + * This code is now quite different for user and kernel, because + * the kernel uses skb's, for the accelerated network performance. + * This is the kernel (port0) version. + * + * Allocate the eager TID buffers and program them into infinipath. + * We use the network layer alloc_skb() allocator to allocate the + * memory, and either use the buffers as is for things like SMA + * packets, or pass the buffers up to the ipath layered driver and + * thence the network layer, replacing them as we do so (see + * ipath_rcv_layer()). + */ +static int create_port0_egr(struct ipath_devdata *dd) +{ + unsigned e, egrcnt; + struct sk_buff **skbs; + int ret; + + egrcnt = dd->ipath_rcvegrcnt; + + skbs = vmalloc(sizeof(*dd->ipath_port0_skbs) * egrcnt); + if (skbs == NULL) { + ipath_dev_err(dd, "allocation error for eager TID " + "skb array\n"); + ret = -ENOMEM; + goto bail; + } + for (e = 0; e < egrcnt; e++) { + /* + * This is a bit tricky in that we allocate extra + * space for 2 bytes of the 14 byte ethernet header. + * These two bytes are passed in the ipath header so + * the rest of the data is word aligned. We allocate + * 4 bytes so that the data buffer stays word aligned. + * See ipath_kreceive() for more details. + */ + skbs[e] = ipath_alloc_skb(dd, GFP_KERNEL); + if (!skbs[e]) { + ipath_dev_err(dd, "SKB allocation error for " + "eager TID %u\n", e); + while (e != 0) + dev_kfree_skb(skbs[--e]); + ret = -ENOMEM; + goto bail; + } + } + /* + * After loop above, so we can test non-NULL to see if ready + * to use at receive, etc. + */ + dd->ipath_port0_skbs = skbs; + + for (e = 0; e < egrcnt; e++) { + unsigned long phys = + virt_to_phys(dd->ipath_port0_skbs[e]->data); + dd->ipath_f_put_tid(dd, e + (u64 __iomem *) + ((char __iomem *) dd->ipath_kregbase + + dd->ipath_rcvegrbase), 0, phys); + } + + ret = 0; + +bail: + return ret; +} + +static int bringup_link(struct ipath_devdata *dd) +{ + u64 val, ibc; + int ret = 0; + + /* hold IBC in reset */ + dd->ipath_control &= ~INFINIPATH_C_LINKENABLE; + ipath_write_kreg(dd, dd->ipath_kregs->kr_control, + dd->ipath_control); + + /* + * Note that prior to try 14 or 15 of IB, the credit scaling + * wasn't working, because it was swapped for writes with the + * 1 bit default linkstate field + */ + + /* ignore pbc and align word */ + val = dd->ipath_piosize2k - 2 * sizeof(u32); + /* + * for ICRC, which we only send in diag test pkt mode, and we + * don't need to worry about that for mtu + */ + val += 1; + /* + * Set the IBC maxpktlength to the size of our pio buffers the + * maxpktlength is in words. This is *not* the IB data MTU. + */ + ibc = (val / sizeof(u32)) << INFINIPATH_IBCC_MAXPKTLEN_SHIFT; + /* in KB */ + ibc |= 0x5ULL << INFINIPATH_IBCC_FLOWCTRLWATERMARK_SHIFT; + /* + * How often flowctrl sent. More or less in usecs; balance against + * watermark value, so that in theory senders always get a flow + * control update in time to not let the IB link go idle. + */ + ibc |= 0x3ULL << INFINIPATH_IBCC_FLOWCTRLPERIOD_SHIFT; + /* max error tolerance */ + ibc |= 0xfULL << INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT; + /* use "real" buffer space for */ + ibc |= 4ULL << INFINIPATH_IBCC_CREDITSCALE_SHIFT; + /* IB credit flow control. */ + ibc |= 0xfULL << INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT; + /* initially come up waiting for TS1, without sending anything. */ + dd->ipath_ibcctrl = ibc; + /* + * Want to start out with both LINKCMD and LINKINITCMD in NOP + * (0 and 0). Don't put linkinitcmd in ipath_ibcctrl, want that + * to stay a NOP + */ + ibc |= INFINIPATH_IBCC_LINKINITCMD_DISABLE << + INFINIPATH_IBCC_LINKINITCMD_SHIFT; + ipath_cdbg(VERBOSE, "Writing 0x%llx to ibcctrl\n", + (unsigned long long) ibc); + ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl, ibc); + + // be sure chip saw it + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + + ret = dd->ipath_f_bringup_serdes(dd); + + if (ret) + dev_info(&dd->pcidev->dev, "Could not initialize SerDes, " + "not usable\n"); + else { + /* enable IBC */ + dd->ipath_control |= INFINIPATH_C_LINKENABLE; + ipath_write_kreg(dd, dd->ipath_kregs->kr_control, + dd->ipath_control); + } + + return ret; +} + +static int init_chip_first(struct ipath_devdata *dd, + struct ipath_portdata **pdp) +{ + struct ipath_portdata *pd = NULL; + int ret = 0; + u64 val; + + /* + * skip cfgports stuff because we are not allocating memory, + * and we don't want problems if the portcnt changed due to + * cfgports. We do still check and report a difference, if + * not same (should be impossible). + */ + dd->ipath_portcnt = + ipath_read_kreg32(dd, dd->ipath_kregs->kr_portcnt); + if (!ipath_cfgports) + dd->ipath_cfgports = dd->ipath_portcnt; + else if (ipath_cfgports <= dd->ipath_portcnt) { + dd->ipath_cfgports = ipath_cfgports; + ipath_dbg("Configured to use %u ports out of %u in chip\n", + dd->ipath_cfgports, dd->ipath_portcnt); + } else { + dd->ipath_cfgports = dd->ipath_portcnt; + ipath_dbg("Tried to configured to use %u ports; chip " + "only supports %u\n", ipath_cfgports, + dd->ipath_portcnt); + } + dd->ipath_pd = kzalloc(sizeof(*dd->ipath_pd) * dd->ipath_cfgports, + GFP_KERNEL); + + if (!dd->ipath_pd) { + ipath_dev_err(dd, "Unable to allocate portdata array, " + "failing\n"); + ret = -ENOMEM; + goto done; + } + + dd->ipath_lastegrheads = kzalloc(sizeof(*dd->ipath_lastegrheads) + * dd->ipath_cfgports, + GFP_KERNEL); + dd->ipath_lastrcvhdrqtails = + kzalloc(sizeof(*dd->ipath_lastrcvhdrqtails) + * dd->ipath_cfgports, GFP_KERNEL); + + if (!dd->ipath_lastegrheads || !dd->ipath_lastrcvhdrqtails) { + ipath_dev_err(dd, "Unable to allocate head arrays, " + "failing\n"); + ret = -ENOMEM; + goto done; + } + + dd->ipath_pd[0] = kzalloc(sizeof(*pd), GFP_KERNEL); + + if (!dd->ipath_pd[0]) { + ipath_dev_err(dd, "Unable to allocate portdata for port " + "0, failing\n"); + ret = -ENOMEM; + goto done; + } + pd = dd->ipath_pd[0]; + pd->port_dd = dd; + pd->port_port = 0; + pd->port_cnt = 1; + /* The port 0 pkey table is used by the layer interface. */ + pd->port_pkeys[0] = IPS_DEFAULT_P_KEY; + dd->ipath_rcvtidcnt = + ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidcnt); + dd->ipath_rcvtidbase = + ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidbase); + dd->ipath_rcvegrcnt = + ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrcnt); + dd->ipath_rcvegrbase = + ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrbase); + dd->ipath_palign = + ipath_read_kreg32(dd, dd->ipath_kregs->kr_pagealign); + dd->ipath_piobufbase = + ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpiobufbase); + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpiosize); + dd->ipath_piosize2k = val & ~0U; + dd->ipath_piosize4k = val >> 32; + dd->ipath_ibmtu = 4096; /* default to largest legal MTU */ + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpiobufcnt); + dd->ipath_piobcnt2k = val & ~0U; + dd->ipath_piobcnt4k = val >> 32; + dd->ipath_pio2kbase = + (u32 __iomem *) (((char __iomem *) dd->ipath_kregbase) + + (dd->ipath_piobufbase & 0xffffffff)); + if (dd->ipath_piobcnt4k) { + dd->ipath_pio4kbase = (u32 __iomem *) + (((char __iomem *) dd->ipath_kregbase) + + (dd->ipath_piobufbase >> 32)); + /* + * 4K buffers take 2 pages; we use roundup just to be + * paranoid; we calculate it once here, rather than on + * ever buf allocate + */ + dd->ipath_4kalign = ALIGN(dd->ipath_piosize4k, + dd->ipath_palign); + ipath_dbg("%u 2k(%x) piobufs @ %p, %u 4k(%x) @ %p " + "(%x aligned)\n", + dd->ipath_piobcnt2k, dd->ipath_piosize2k, + dd->ipath_pio2kbase, dd->ipath_piobcnt4k, + dd->ipath_piosize4k, dd->ipath_pio4kbase, + dd->ipath_4kalign); + } + else ipath_dbg("%u 2k piobufs @ %p\n", + dd->ipath_piobcnt2k, dd->ipath_pio2kbase); + + spin_lock_init(&dd->ipath_tid_lock); + +done: + *pdp = pd; + return ret; +} + +/** + * init_chip_reset - re-initialize after a reset, or enable + * @dd: the infinipath device + * @pdp: output for port data + * + * sanity check at least some of the values after reset, and + * ensure no receive or transmit (explictly, in case reset + * failed + */ +static int init_chip_reset(struct ipath_devdata *dd, + struct ipath_portdata **pdp) +{ + struct ipath_portdata *pd; + u32 rtmp; + + *pdp = pd = dd->ipath_pd[0]; + /* ensure chip does no sends or receives while we re-initialize */ + dd->ipath_control = dd->ipath_sendctrl = dd->ipath_rcvctrl = 0U; + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, 0); + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, 0); + ipath_write_kreg(dd, dd->ipath_kregs->kr_control, 0); + + rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_portcnt); + if (dd->ipath_portcnt != rtmp) + dev_info(&dd->pcidev->dev, "portcnt was %u before " + "reset, now %u, using original\n", + dd->ipath_portcnt, rtmp); + rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidcnt); + if (rtmp != dd->ipath_rcvtidcnt) + dev_info(&dd->pcidev->dev, "tidcnt was %u before " + "reset, now %u, using original\n", + dd->ipath_rcvtidcnt, rtmp); + rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidbase); + if (rtmp != dd->ipath_rcvtidbase) + dev_info(&dd->pcidev->dev, "tidbase was %u before " + "reset, now %u, using original\n", + dd->ipath_rcvtidbase, rtmp); + rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrcnt); + if (rtmp != dd->ipath_rcvegrcnt) + dev_info(&dd->pcidev->dev, "egrcnt was %u before " + "reset, now %u, using original\n", + dd->ipath_rcvegrcnt, rtmp); + rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrbase); + if (rtmp != dd->ipath_rcvegrbase) + dev_info(&dd->pcidev->dev, "egrbase was %u before " + "reset, now %u, using original\n", + dd->ipath_rcvegrbase, rtmp); + + return 0; +} + +static int init_pioavailregs(struct ipath_devdata *dd) +{ + int ret; + + dd->ipath_pioavailregs_dma = dma_alloc_coherent( + &dd->pcidev->dev, PAGE_SIZE, &dd->ipath_pioavailregs_phys, + GFP_KERNEL); + if (!dd->ipath_pioavailregs_dma) { + ipath_dev_err(dd, "failed to allocate PIOavail reg area " + "in memory\n"); + ret = -ENOMEM; + goto done; + } + + /* + * we really want L2 cache aligned, but for current CPUs of + * interest, they are the same. + */ + dd->ipath_statusp = (u64 *) + ((char *)dd->ipath_pioavailregs_dma + + ((2 * L1_CACHE_BYTES + + dd->ipath_pioavregs * sizeof(u64)) & ~L1_CACHE_BYTES)); + /* copy the current value now that it's really allocated */ + *dd->ipath_statusp = dd->_ipath_status; + /* + * setup buffer to hold freeze msg, accessible to apps, + * following statusp + */ + dd->ipath_freezemsg = (char *)&dd->ipath_statusp[1]; + /* and its length */ + dd->ipath_freezelen = L1_CACHE_BYTES - sizeof(dd->ipath_statusp[0]); + + if (dd->ipath_unit * 64 > (IPATH_PORT0_RCVHDRTAIL_SIZE - 64)) { + ipath_dev_err(dd, "unit %u too large for port 0 " + "rcvhdrtail buffer size\n", dd->ipath_unit); + ret = -ENODEV; + } + else + ret = 0; + + /* so we can get current tail in ipath_kreceive(), per chip */ + dd->ipath_hdrqtailptr = &ipath_port0_rcvhdrtail[ + dd->ipath_unit * (64 / sizeof(*ipath_port0_rcvhdrtail))]; +done: + return ret; +} + +/** + * init_shadow_tids - allocate the shadow TID array + * @dd: the infinipath device + * + * allocate the shadow TID array, so we can ipath_munlock previous + * entries. It may make more sense to move the pageshadow to the + * port data structure, so we only allocate memory for ports actually + * in use, since we at 8k per port, now. + */ +static void init_shadow_tids(struct ipath_devdata *dd) +{ + dd->ipath_pageshadow = (struct page **) + vmalloc(dd->ipath_cfgports * dd->ipath_rcvtidcnt * + sizeof(struct page *)); + if (!dd->ipath_pageshadow) + ipath_dev_err(dd, "failed to allocate shadow page * " + "array, no expected sends!\n"); + else + memset(dd->ipath_pageshadow, 0, + dd->ipath_cfgports * dd->ipath_rcvtidcnt * + sizeof(struct page *)); +} + +static void enable_chip(struct ipath_devdata *dd, + struct ipath_portdata *pd, int reinit) +{ + u32 val; + int i; + + if (!reinit) { + init_waitqueue_head(&ipath_sma_state_wait); + } + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, + dd->ipath_rcvctrl); + + /* Enable PIO send, and update of PIOavail regs to memory. */ + dd->ipath_sendctrl = INFINIPATH_S_PIOENABLE | + INFINIPATH_S_PIOBUFAVAILUPD; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl); + + /* + * enable port 0 receive, and receive interrupt. other ports + * done as user opens and inits them. + */ + dd->ipath_rcvctrl = INFINIPATH_R_TAILUPD | + (1ULL << INFINIPATH_R_PORTENABLE_SHIFT) | + (1ULL << INFINIPATH_R_INTRAVAIL_SHIFT); + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, + dd->ipath_rcvctrl); + + /* + * now ready for use. this should be cleared whenever we + * detect a reset, or initiate one. + */ + dd->ipath_flags |= IPATH_INITTED; + + /* + * init our shadow copies of head from tail values, and write + * head values to match. + */ + val = ipath_read_ureg32(dd, ur_rcvegrindextail, 0); + (void)ipath_write_ureg(dd, ur_rcvegrindexhead, val, 0); + dd->ipath_port0head = ipath_read_ureg32(dd, ur_rcvhdrtail, 0); + + /* Initialize so we interrupt on next packet received */ + (void)ipath_write_ureg(dd, ur_rcvhdrhead, + dd->ipath_rhdrhead_intr_off | + dd->ipath_port0head, 0); + + /* + * by now pioavail updates to memory should have occurred, so + * copy them into our working/shadow registers; this is in + * case something went wrong with abort, but mostly to get the + * initial values of the generation bit correct. + */ + for (i = 0; i < dd->ipath_pioavregs; i++) { + __le64 val; + + /* + * Chip Errata bug 6641; even and odd qwords>3 are swapped. + */ + if (i > 3) { + if (i & 1) + val = dd->ipath_pioavailregs_dma[i - 1]; + else + val = dd->ipath_pioavailregs_dma[i + 1]; + } + else + val = dd->ipath_pioavailregs_dma[i]; + dd->ipath_pioavailshadow[i] = le64_to_cpu(val); + } + /* can get counters, stats, etc. */ + dd->ipath_flags |= IPATH_PRESENT; +} + +static int init_housekeeping(struct ipath_devdata *dd, + struct ipath_portdata **pdp, int reinit) +{ + char boardn[32]; + int ret = 0; + + /* + * have to clear shadow copies of registers at init that are + * not otherwise set here, or all kinds of bizarre things + * happen with driver on chip reset + */ + dd->ipath_rcvhdrsize = 0; + + /* + * Don't clear ipath_flags as 8bit mode was set before + * entering this func. However, we do set the linkstate to + * unknown, so we can watch for a transition. + */ + dd->ipath_flags |= IPATH_LINKUNK; + dd->ipath_flags &= ~(IPATH_LINKACTIVE | IPATH_LINKARMED | + IPATH_LINKDOWN | IPATH_LINKINIT); + + ipath_cdbg(VERBOSE, "Try to read spc chip revision\n"); + dd->ipath_revision = + ipath_read_kreg64(dd, dd->ipath_kregs->kr_revision); + + /* + * set up fundamental info we need to use the chip; we assume + * if the revision reg and these regs are OK, we don't need to + * special case the rest + */ + dd->ipath_sregbase = + ipath_read_kreg32(dd, dd->ipath_kregs->kr_sendregbase); + dd->ipath_cregbase = + ipath_read_kreg32(dd, dd->ipath_kregs->kr_counterregbase); + dd->ipath_uregbase = + ipath_read_kreg32(dd, dd->ipath_kregs->kr_userregbase); + ipath_cdbg(VERBOSE, "ipath_kregbase %p, sendbase %x usrbase %x, " + "cntrbase %x\n", dd->ipath_kregbase, dd->ipath_sregbase, + dd->ipath_uregbase, dd->ipath_cregbase); + if ((dd->ipath_revision & 0xffffffff) == 0xffffffff + || (dd->ipath_sregbase & 0xffffffff) == 0xffffffff + || (dd->ipath_cregbase & 0xffffffff) == 0xffffffff + || (dd->ipath_uregbase & 0xffffffff) == 0xffffffff) { + ipath_dev_err(dd, "Register read failures from chip, " + "giving up initialization\n"); + ret = -ENODEV; + goto done; + } + + /* clear the initial reset flag, in case first driver load */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, + INFINIPATH_E_RESET); + + if (reinit) + ret = init_chip_reset(dd, pdp); + else + ret = init_chip_first(dd, pdp); + + if (ret) + goto done; + + ipath_cdbg(VERBOSE, "Revision %llx (PCI %x), %u ports, %u tids, " + "%u egrtids\n", (unsigned long long) dd->ipath_revision, + dd->ipath_pcirev, dd->ipath_portcnt, dd->ipath_rcvtidcnt, + dd->ipath_rcvegrcnt); + + if (((dd->ipath_revision >> INFINIPATH_R_SOFTWARE_SHIFT) & + INFINIPATH_R_SOFTWARE_MASK) != IPATH_CHIP_SWVERSION) { + ipath_dev_err(dd, "Driver only handles version %d, " + "chip swversion is %d (%llx), failng\n", + IPATH_CHIP_SWVERSION, + (int)(dd->ipath_revision >> + INFINIPATH_R_SOFTWARE_SHIFT) & + INFINIPATH_R_SOFTWARE_MASK, + (unsigned long long) dd->ipath_revision); + ret = -ENOSYS; + goto done; + } + dd->ipath_majrev = (u8) ((dd->ipath_revision >> + INFINIPATH_R_CHIPREVMAJOR_SHIFT) & + INFINIPATH_R_CHIPREVMAJOR_MASK); + dd->ipath_minrev = (u8) ((dd->ipath_revision >> + INFINIPATH_R_CHIPREVMINOR_SHIFT) & + INFINIPATH_R_CHIPREVMINOR_MASK); + dd->ipath_boardrev = (u8) ((dd->ipath_revision >> + INFINIPATH_R_BOARDID_SHIFT) & + INFINIPATH_R_BOARDID_MASK); + + ret = dd->ipath_f_get_boardname(dd, boardn, sizeof boardn); + + snprintf(dd->ipath_boardversion, sizeof(dd->ipath_boardversion), + "Driver %u.%u, %s, InfiniPath%u %u.%u, PCI %u, " + "SW Compat %u\n", + IPATH_CHIP_VERS_MAJ, IPATH_CHIP_VERS_MIN, boardn, + (unsigned)(dd->ipath_revision >> INFINIPATH_R_ARCH_SHIFT) & + INFINIPATH_R_ARCH_MASK, + dd->ipath_majrev, dd->ipath_minrev, dd->ipath_pcirev, + (unsigned)(dd->ipath_revision >> + INFINIPATH_R_SOFTWARE_SHIFT) & + INFINIPATH_R_SOFTWARE_MASK); + + ipath_dbg("%s", dd->ipath_boardversion); + +done: + return ret; +} + + +/** + * ipath_init_chip - do the actual initialization sequence on the chip + * @dd: the infinipath device + * @reinit: reinitializing, so don't allocate new memory + * + * Do the actual initialization sequence on the chip. This is done + * both from the init routine called from the PCI infrastructure, and + * when we reset the chip, or detect that it was reset internally, + * or it's administratively re-enabled. + * + * Memory allocation here and in called routines is only done in + * the first case (reinit == 0). We have to be careful, because even + * without memory allocation, we need to re-write all the chip registers + * TIDs, etc. after the reset or enable has completed. + */ +int ipath_init_chip(struct ipath_devdata *dd, int reinit) +{ + int ret = 0, i; + u32 val32, kpiobufs; + u64 val, atmp; + struct ipath_portdata *pd = NULL; /* keep gcc4 happy */ + + ret = init_housekeeping(dd, &pd, reinit); + if (ret) + goto done; + + /* + * we ignore most issues after reporting them, but have to specially + * handle hardware-disabled chips. + */ + if (ret == 2) { + /* unique error, known to ipath_init_one */ + ret = -EPERM; + goto done; + } + + /* + * We could bump this to allow for full rcvegrcnt + rcvtidcnt, + * but then it no longer nicely fits power of two, and since + * we now use routines that backend onto __get_free_pages, the + * rest would be wasted. + */ + dd->ipath_rcvhdrcnt = dd->ipath_rcvegrcnt; + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrcnt, + dd->ipath_rcvhdrcnt); + + /* + * Set up the shadow copies of the piobufavail registers, + * which we compare against the chip registers for now, and + * the in memory DMA'ed copies of the registers. This has to + * be done early, before we calculate lastport, etc. + */ + val = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k; + /* + * calc number of pioavail registers, and save it; we have 2 + * bits per buffer. + */ + dd->ipath_pioavregs = ALIGN(val, sizeof(u64) * BITS_PER_BYTE / 2) + / (sizeof(u64) * BITS_PER_BYTE / 2); + if (!ipath_kpiobufs) /* have to have at least 1, for SMA */ + kpiobufs = ipath_kpiobufs = 1; + else if ((dd->ipath_piobcnt2k + dd->ipath_piobcnt4k) < + (dd->ipath_cfgports * IPATH_MIN_USER_PORT_BUFCNT)) { + dev_info(&dd->pcidev->dev, "Too few PIO buffers (%u) " + "for %u ports to have %u each!\n", + dd->ipath_piobcnt2k + dd->ipath_piobcnt4k, + dd->ipath_cfgports, IPATH_MIN_USER_PORT_BUFCNT); + kpiobufs = 1; /* reserve just the minimum for SMA/ether */ + } else + kpiobufs = ipath_kpiobufs; + + if (kpiobufs > + (dd->ipath_piobcnt2k + dd->ipath_piobcnt4k - + (dd->ipath_cfgports * IPATH_MIN_USER_PORT_BUFCNT))) { + i = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k - + (dd->ipath_cfgports * IPATH_MIN_USER_PORT_BUFCNT); + if (i < 0) + i = 0; + dev_info(&dd->pcidev->dev, "Allocating %d PIO bufs for " + "kernel leaves too few for %d user ports " + "(%d each); using %u\n", kpiobufs, + dd->ipath_cfgports - 1, + IPATH_MIN_USER_PORT_BUFCNT, i); + /* + * shouldn't change ipath_kpiobufs, because could be + * different for different devices... + */ + kpiobufs = i; + } + dd->ipath_lastport_piobuf = + dd->ipath_piobcnt2k + dd->ipath_piobcnt4k - kpiobufs; + dd->ipath_pbufsport = dd->ipath_cfgports > 1 + ? dd->ipath_lastport_piobuf / (dd->ipath_cfgports - 1) + : 0; + val32 = dd->ipath_lastport_piobuf - + (dd->ipath_pbufsport * (dd->ipath_cfgports - 1)); + if (val32 > 0) { + ipath_dbg("allocating %u pbufs/port leaves %u unused, " + "add to kernel\n", dd->ipath_pbufsport, val32); + dd->ipath_lastport_piobuf -= val32; + ipath_dbg("%u pbufs/port leaves %u unused, add to kernel\n", + dd->ipath_pbufsport, val32); + } + dd->ipath_lastpioindex = dd->ipath_lastport_piobuf; + ipath_cdbg(VERBOSE, "%d PIO bufs for kernel out of %d total %u " + "each for %u user ports\n", kpiobufs, + dd->ipath_piobcnt2k + dd->ipath_piobcnt4k, + dd->ipath_pbufsport, dd->ipath_cfgports - 1); + + dd->ipath_f_early_init(dd); + + /* early_init sets rcvhdrentsize and rcvhdrsize, so this must be + * done after early_init */ + dd->ipath_hdrqlast = + dd->ipath_rcvhdrentsize * (dd->ipath_rcvhdrcnt - 1); + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrentsize, + dd->ipath_rcvhdrentsize); + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrsize, + dd->ipath_rcvhdrsize); + + if (!reinit) { + ret = init_pioavailregs(dd); + init_shadow_tids(dd); + if (ret) + goto done; + } + + (void)ipath_write_kreg(dd, dd->ipath_kregs->kr_sendpioavailaddr, + dd->ipath_pioavailregs_phys); + /* + * this is to detect s/w errors, which the h/w works around by + * ignoring the low 6 bits of address, if it wasn't aligned. + */ + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpioavailaddr); + if (val != dd->ipath_pioavailregs_phys) { + ipath_dev_err(dd, "Catastrophic software error, " + "SendPIOAvailAddr written as %lx, " + "read back as %llx\n", + (unsigned long) dd->ipath_pioavailregs_phys, + (unsigned long long) val); + ret = -EINVAL; + goto done; + } + + val = ipath_port0_rcvhdrtail_dma + dd->ipath_unit * 64; + + /* verify that the alignment requirement was met */ + ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdrtailaddr, + 0, val); + atmp = ipath_read_kreg64_port( + dd, dd->ipath_kregs->kr_rcvhdrtailaddr, 0); + if (val != atmp) { + ipath_dev_err(dd, "Catastrophic software error, " + "RcvHdrTailAddr0 written as %llx, " + "read back as %llx from %x\n", + (unsigned long long) val, + (unsigned long long) atmp, + dd->ipath_kregs->kr_rcvhdrtailaddr); + ret = -EINVAL; + goto done; + } + + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvbthqp, IPATH_KD_QP); + + /* + * make sure we are not in freeze, and PIO send enabled, so + * writes to pbc happen + */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask, 0ULL); + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear, + ~0ULL&~INFINIPATH_HWE_MEMBISTFAILED); + ipath_write_kreg(dd, dd->ipath_kregs->kr_control, 0ULL); + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + INFINIPATH_S_PIOENABLE); + + /* + * before error clears, since we expect serdes pll errors during + * this, the first time after reset + */ + if (bringup_link(dd)) { + dev_info(&dd->pcidev->dev, "Failed to bringup IB link\n"); + ret = -ENETDOWN; + goto done; + } + + /* + * clear any "expected" hwerrs from reset and/or initialization + * clear any that aren't enabled (at least this once), and then + * set the enable mask + */ + dd->ipath_f_init_hwerrors(dd); + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear, + ~0ULL&~INFINIPATH_HWE_MEMBISTFAILED); + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask, + dd->ipath_hwerrmask); + + dd->ipath_maskederrs = dd->ipath_ignorederrs; + /* clear all */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, -1LL); + /* enable errors that are masked, at least this first time. */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, + ~dd->ipath_maskederrs); + /* clear any interrups up to this point (ints still not enabled) */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, -1LL); + + ipath_stats.sps_lid[dd->ipath_unit] = dd->ipath_lid; + + /* + * Set up the port 0 (kernel) rcvhdr q and egr TIDs. If doing + * re-init, the simplest way to handle this is to free + * existing, and re-allocate. + */ + if (reinit) + ipath_free_pddata(dd, 0, 0); + dd->ipath_f_tidtemplate(dd); + ret = ipath_create_rcvhdrq(dd, pd); + if (!ret) + ret = create_port0_egr(dd); + if (ret) + ipath_dev_err(dd, "failed to allocate port 0 (kernel) " + "rcvhdrq and/or egr bufs\n"); + else + enable_chip(dd, pd, reinit); + + /* + * cause retrigger of pending interrupts ignored during init, + * even if we had errors + */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, 0ULL); + + if(!dd->ipath_stats_timer_active) { + /* + * first init, or after an admin disable/enable + * set up stats retrieval timer, even if we had errors + * in last portion of setup + */ + init_timer(&dd->ipath_stats_timer); + dd->ipath_stats_timer.function = ipath_get_faststats; + dd->ipath_stats_timer.data = (unsigned long) dd; + /* every 5 seconds; */ + dd->ipath_stats_timer.expires = jiffies + 5 * HZ; + /* takes ~16 seconds to overflow at full IB 4x bandwdith */ + add_timer(&dd->ipath_stats_timer); + dd->ipath_stats_timer_active = 1; + } + +done: + if (!ret) { + ipath_get_guid(dd); + *dd->ipath_statusp |= IPATH_STATUS_CHIP_PRESENT; + if (!dd->ipath_f_intrsetup(dd)) { + /* now we can enable all interrupts from the chip */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, + -1LL); + /* force re-interrupt of any pending interrupts. */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, + 0ULL); + /* chip is usable; mark it as initialized */ + *dd->ipath_statusp |= IPATH_STATUS_INITTED; + } else + ipath_dev_err(dd, "No interrupts enabled, couldn't " + "setup interrupt address\n"); + + if (dd->ipath_cfgports > ipath_stats.sps_nports) + /* + * sps_nports is a global, so, we set it to + * the highest number of ports of any of the + * chips we find; we never decrement it, at + * least for now. Since this might have changed + * over disable/enable or prior to reset, always + * do the check and potentially adjust. + */ + ipath_stats.sps_nports = dd->ipath_cfgports; + } else + ipath_dbg("Failed (%d) to initialize chip\n", ret); + + /* if ret is non-zero, we probably should do some cleanup + here... */ + return ret; +} + +static int ipath_set_kpiobufs(const char *str, struct kernel_param *kp) +{ + struct ipath_devdata *dd; + unsigned long flags; + unsigned short val; + int ret; + + ret = ipath_parse_ushort(str, &val); + + spin_lock_irqsave(&ipath_devs_lock, flags); + + if (ret < 0) + goto bail; + + if (val == 0) { + ret = -EINVAL; + goto bail; + } + + list_for_each_entry(dd, &ipath_dev_list, ipath_list) { + if (dd->ipath_kregbase) + continue; + if (val > (dd->ipath_piobcnt2k + dd->ipath_piobcnt4k - + (dd->ipath_cfgports * + IPATH_MIN_USER_PORT_BUFCNT))) + { + ipath_dev_err( + dd, + "Allocating %d PIO bufs for kernel leaves " + "too few for %d user ports (%d each)\n", + val, dd->ipath_cfgports - 1, + IPATH_MIN_USER_PORT_BUFCNT); + ret = -EINVAL; + goto bail; + } + dd->ipath_lastport_piobuf = + dd->ipath_piobcnt2k + dd->ipath_piobcnt4k - val; + } + + ret = 0; +bail: + spin_unlock_irqrestore(&ipath_devs_lock, flags); + + return ret; +} diff --git a/drivers/infiniband/hw/ipath/ipath_intr.c b/drivers/infiniband/hw/ipath/ipath_intr.c new file mode 100644 index 0000000..0bcb428 --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_intr.c @@ -0,0 +1,841 @@ +/* + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/pci.h> + +#include "ipath_kernel.h" +#include "ips_common.h" +#include "ipath_layer.h" + +#define E_SUM_PKTERRS \ + (INFINIPATH_E_RHDRLEN | INFINIPATH_E_RBADTID | \ + INFINIPATH_E_RBADVERSION | INFINIPATH_E_RHDR | \ + INFINIPATH_E_RLONGPKTLEN | INFINIPATH_E_RSHORTPKTLEN | \ + INFINIPATH_E_RMAXPKTLEN | INFINIPATH_E_RMINPKTLEN | \ + INFINIPATH_E_RFORMATERR | INFINIPATH_E_RUNSUPVL | \ + INFINIPATH_E_RUNEXPCHAR | INFINIPATH_E_REBP) + +#define E_SUM_ERRS \ + (INFINIPATH_E_SPIOARMLAUNCH | INFINIPATH_E_SUNEXPERRPKTNUM | \ + INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_SDROPPEDSMPPKT | \ + INFINIPATH_E_SMAXPKTLEN | INFINIPATH_E_SUNSUPVL | \ + INFINIPATH_E_SMINPKTLEN | INFINIPATH_E_SPKTLEN | \ + INFINIPATH_E_INVALIDADDR) + +static u64 handle_e_sum_errs(struct ipath_devdata *dd, ipath_err_t errs) +{ + unsigned long sbuf[4]; + u64 ignore_this_time = 0; + u32 piobcnt; + + /* if possible that sendbuffererror could be valid */ + piobcnt = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k; + /* read these before writing errorclear */ + sbuf[0] = ipath_read_kreg64( + dd, dd->ipath_kregs->kr_sendbuffererror); + sbuf[1] = ipath_read_kreg64( + dd, dd->ipath_kregs->kr_sendbuffererror + 1); + if (piobcnt > 128) { + sbuf[2] = ipath_read_kreg64( + dd, dd->ipath_kregs->kr_sendbuffererror + 2); + sbuf[3] = ipath_read_kreg64( + dd, dd->ipath_kregs->kr_sendbuffererror + 3); + } + + if (sbuf[0] || sbuf[1] || (piobcnt > 128 && (sbuf[2] || sbuf[3]))) { + int i; + + ipath_cdbg(PKT, "SendbufErrs %lx %lx ", sbuf[0], sbuf[1]); + if (ipath_debug & __IPATH_PKTDBG && piobcnt > 128) + printk("%lx %lx ", sbuf[2], sbuf[3]); + for (i = 0; i < piobcnt; i++) { + if (test_bit(i, sbuf)) { + u32 __iomem *piobuf; + if (i < dd->ipath_piobcnt2k) + piobuf = (u32 __iomem *) + (dd->ipath_pio2kbase + + i * dd->ipath_palign); + else + piobuf = (u32 __iomem *) + (dd->ipath_pio4kbase + + (i - dd->ipath_piobcnt2k) * + dd->ipath_4kalign); + + ipath_cdbg(PKT, + "PIObuf[%u] @%p pbc is %x; ", + i, piobuf, readl(piobuf)); + + ipath_disarm_piobufs(dd, i, 1); + } + } + if (ipath_debug & __IPATH_PKTDBG) + printk("\n"); + } + if ((errs & (INFINIPATH_E_SDROPPEDDATAPKT | + INFINIPATH_E_SDROPPEDSMPPKT | + INFINIPATH_E_SMINPKTLEN)) && + !(dd->ipath_flags & IPATH_LINKACTIVE)) { + /* + * This can happen when SMA is trying to bring the link + * up, but the IB link changes state at the "wrong" time. + * The IB logic then complains that the packet isn't + * valid. We don't want to confuse people, so we just + * don't print them, except at debug + */ + ipath_dbg("Ignoring pktsend errors %llx, because not " + "yet active\n", (unsigned long long) errs); + ignore_this_time = INFINIPATH_E_SDROPPEDDATAPKT | + INFINIPATH_E_SDROPPEDSMPPKT | + INFINIPATH_E_SMINPKTLEN; + } + + return ignore_this_time; +} + +/* return the strings for the most common link states */ +static char *ib_linkstate(u32 linkstate) +{ + char *ret; + + switch (linkstate) { + case IPATH_IBSTATE_INIT: + ret = "Init"; + break; + case IPATH_IBSTATE_ARM: + ret = "Arm"; + break; + case IPATH_IBSTATE_ACTIVE: + ret = "Active"; + break; + default: + ret = "Down"; + } + + return ret; +} + +static void handle_e_ibstatuschanged(struct ipath_devdata *dd, + ipath_err_t errs, int noprint) +{ + u64 val; + u32 ltstate, lstate; + + /* + * even if diags are enabled, we want to notice LINKINIT, etc. + * We just don't want to change the LED state, or + * dd->ipath_kregs->kr_ibcctrl + */ + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus); + lstate = val & IPATH_IBSTATE_MASK; + if (lstate == IPATH_IBSTATE_INIT || lstate == IPATH_IBSTATE_ARM || + lstate == IPATH_IBSTATE_ACTIVE) { + /* + * only print at SMA if there is a change, debug if not + * (sometimes we want to know that, usually not). + */ + if (lstate == ((unsigned) dd->ipath_lastibcstat + & IPATH_IBSTATE_MASK)) { + ipath_dbg("Status change intr but no change (%s)\n", + ib_linkstate(lstate)); + } + else + ipath_cdbg(SMA, "Unit %u link state %s, last " + "was %s\n", dd->ipath_unit, + ib_linkstate(lstate), + ib_linkstate((unsigned) + dd->ipath_lastibcstat + & IPATH_IBSTATE_MASK)); + } + else { + lstate = dd->ipath_lastibcstat & IPATH_IBSTATE_MASK; + if (lstate == IPATH_IBSTATE_INIT || + lstate == IPATH_IBSTATE_ARM || + lstate == IPATH_IBSTATE_ACTIVE) + ipath_cdbg(SMA, "Unit %u link state down" + " (state 0x%x), from %s\n", + dd->ipath_unit, + (u32)val & IPATH_IBSTATE_MASK, + ib_linkstate(lstate)); + else + ipath_cdbg(VERBOSE, "Unit %u link state changed " + "to 0x%x from down (%x)\n", + dd->ipath_unit, (u32) val, lstate); + } + ltstate = (val >> INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) & + INFINIPATH_IBCS_LINKTRAININGSTATE_MASK; + lstate = (val >> INFINIPATH_IBCS_LINKSTATE_SHIFT) & + INFINIPATH_IBCS_LINKSTATE_MASK; + + if (ltstate == INFINIPATH_IBCS_LT_STATE_POLLACTIVE || + ltstate == INFINIPATH_IBCS_LT_STATE_POLLQUIET) { + u32 last_ltstate; + + /* + * Ignore cycling back and forth from Polling.Active + * to Polling.Quiet while waiting for the other end of + * the link to come up. We will cycle back and forth + * between them if no cable is plugged in, + * the other device is powered off or disabled, etc. + */ + last_ltstate = (dd->ipath_lastibcstat >> + INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) + & INFINIPATH_IBCS_LINKTRAININGSTATE_MASK; + if (last_ltstate == INFINIPATH_IBCS_LT_STATE_POLLACTIVE + || last_ltstate == + INFINIPATH_IBCS_LT_STATE_POLLQUIET) { + if (dd->ipath_ibpollcnt > 40) { + dd->ipath_flags |= IPATH_NOCABLE; + *dd->ipath_statusp |= + IPATH_STATUS_IB_NOCABLE; + } else + dd->ipath_ibpollcnt++; + goto skip_ibchange; + } + } + dd->ipath_ibpollcnt = 0; /* some state other than 2 or 3 */ + ipath_stats.sps_iblink++; + if (ltstate != INFINIPATH_IBCS_LT_STATE_LINKUP) { + dd->ipath_flags |= IPATH_LINKDOWN; + dd->ipath_flags &= ~(IPATH_LINKUNK | IPATH_LINKINIT + | IPATH_LINKACTIVE | + IPATH_LINKARMED); + *dd->ipath_statusp &= ~IPATH_STATUS_IB_READY; + if (!noprint) { + if (((dd->ipath_lastibcstat >> + INFINIPATH_IBCS_LINKSTATE_SHIFT) & + INFINIPATH_IBCS_LINKSTATE_MASK) + == INFINIPATH_IBCS_L_STATE_ACTIVE) + /* if from up to down be more vocal */ + ipath_cdbg(SMA, + "Unit %u link now down (%s)\n", + dd->ipath_unit, + ipath_ibcstatus_str[ltstate]); + else + ipath_cdbg(VERBOSE, "Unit %u link is " + "down (%s)\n", dd->ipath_unit, + ipath_ibcstatus_str[ltstate]); + } + + dd->ipath_f_setextled(dd, lstate, ltstate); + } else if ((val & IPATH_IBSTATE_MASK) == IPATH_IBSTATE_ACTIVE) { + dd->ipath_flags |= IPATH_LINKACTIVE; + dd->ipath_flags &= + ~(IPATH_LINKUNK | IPATH_LINKINIT | IPATH_LINKDOWN | + IPATH_LINKARMED | IPATH_NOCABLE); + *dd->ipath_statusp &= ~IPATH_STATUS_IB_NOCABLE; + *dd->ipath_statusp |= + IPATH_STATUS_IB_READY | IPATH_STATUS_IB_CONF; + dd->ipath_f_setextled(dd, lstate, ltstate); + + __ipath_layer_intr(dd, IPATH_LAYER_INT_IF_UP); + } else if ((val & IPATH_IBSTATE_MASK) == IPATH_IBSTATE_INIT) { + /* + * set INIT and DOWN. Down is checked by most of the other + * code, but INIT is useful to know in a few places. + */ + dd->ipath_flags |= IPATH_LINKINIT | IPATH_LINKDOWN; + dd->ipath_flags &= + ~(IPATH_LINKUNK | IPATH_LINKACTIVE | IPATH_LINKARMED + | IPATH_NOCABLE); + *dd->ipath_statusp &= ~(IPATH_STATUS_IB_NOCABLE + | IPATH_STATUS_IB_READY); + dd->ipath_f_setextled(dd, lstate, ltstate); + } else if ((val & IPATH_IBSTATE_MASK) == IPATH_IBSTATE_ARM) { + dd->ipath_flags |= IPATH_LINKARMED; + dd->ipath_flags &= + ~(IPATH_LINKUNK | IPATH_LINKDOWN | IPATH_LINKINIT | + IPATH_LINKACTIVE | IPATH_NOCABLE); + *dd->ipath_statusp &= ~(IPATH_STATUS_IB_NOCABLE + | IPATH_STATUS_IB_READY); + dd->ipath_f_setextled(dd, lstate, ltstate); + } else { + if (!noprint) + ipath_dbg("IBstatuschange unit %u: %s (%x)\n", + dd->ipath_unit, + ipath_ibcstatus_str[ltstate], ltstate); + } +skip_ibchange: + dd->ipath_lastibcstat = val; +} + +static void handle_supp_msgs(struct ipath_devdata *dd, + unsigned supp_msgs, char msg[512]) +{ + /* + * Print the message unless it's ibc status change only, which + * happens so often we never want to count it. + */ + if (dd->ipath_lasterror & ~INFINIPATH_E_IBSTATUSCHANGED) { + ipath_decode_err(msg, sizeof msg, dd->ipath_lasterror & + ~INFINIPATH_E_IBSTATUSCHANGED); + if (dd->ipath_lasterror & + ~(INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL)) + ipath_dev_err(dd, "Suppressed %u messages for " + "fast-repeating errors (%s) (%llx)\n", + supp_msgs, msg, + (unsigned long long) + dd->ipath_lasterror); + else { + /* + * rcvegrfull and rcvhdrqfull are "normal", for some + * types of processes (mostly benchmarks) that send + * huge numbers of messages, while not processing + * them. So only complain about these at debug + * level. + */ + ipath_dbg("Suppressed %u messages for %s\n", + supp_msgs, msg); + } + } +} + +static unsigned handle_frequent_errors(struct ipath_devdata *dd, + ipath_err_t errs, char msg[512], + int *noprint) +{ + unsigned long nc; + static unsigned long nextmsg_time; + static unsigned nmsgs, supp_msgs; + + /* + * Throttle back "fast" messages to no more than 10 per 5 seconds. + * This isn't perfect, but it's a reasonable heuristic. If we get + * more than 10, give a 6x longer delay. + */ + nc = jiffies; + if (nmsgs > 10) { + if (time_before(nc, nextmsg_time)) { + *noprint = 1; + if (!supp_msgs++) + nextmsg_time = nc + HZ * 3; + } + else if (supp_msgs) { + handle_supp_msgs(dd, supp_msgs, msg); + supp_msgs = 0; + nmsgs = 0; + } + } + else if (!nmsgs++ || time_after(nc, nextmsg_time)) + nextmsg_time = nc + HZ / 2; + + return supp_msgs; +} + +static void handle_errors(struct ipath_devdata *dd, ipath_err_t errs) +{ + char msg[512]; + u64 ignore_this_time = 0; + int i; + int chkerrpkts = 0, noprint = 0; + unsigned supp_msgs; + + supp_msgs = handle_frequent_errors(dd, errs, msg, &noprint); + + /* + * don't report errors that are masked (includes those always + * ignored) + */ + errs &= ~dd->ipath_maskederrs; + + /* do these first, they are most important */ + if (errs & INFINIPATH_E_HARDWARE) { + /* reuse same msg buf */ + dd->ipath_f_handle_hwerrors(dd, msg, sizeof msg); + } + + if (!noprint && (errs & ~infinipath_e_bitsextant)) + ipath_dev_err(dd, "error interrupt with unknown errors " + "%llx set\n", (unsigned long long) + (errs & ~infinipath_e_bitsextant)); + + if (errs & E_SUM_ERRS) + ignore_this_time = handle_e_sum_errs(dd, errs); + + if (supp_msgs == 250000) { + /* + * It's not entirely reasonable assuming that the errors set + * in the last clear period are all responsible for the + * problem, but the alternative is to assume it's the only + * ones on this particular interrupt, which also isn't great + */ + dd->ipath_maskederrs |= dd->ipath_lasterror | errs; + ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, + ~dd->ipath_maskederrs); + ipath_decode_err(msg, sizeof msg, + (dd->ipath_maskederrs & ~dd-> + ipath_ignorederrs)); + + if ((dd->ipath_maskederrs & ~dd->ipath_ignorederrs) & + ~(INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL)) + ipath_dev_err(dd, "Disabling error(s) %llx because " + "occuring too frequently (%s)\n", + (unsigned long long) + (dd->ipath_maskederrs & + ~dd->ipath_ignorederrs), msg); + else { + /* + * rcvegrfull and rcvhdrqfull are "normal", + * for some types of processes (mostly benchmarks) + * that send huge numbers of messages, while not + * processing them. So only complain about + * these at debug level. + */ + ipath_dbg("Disabling frequent queue full errors " + "(%s)\n", msg); + } + + /* + * Re-enable the masked errors after around 3 minutes. in + * ipath_get_faststats(). If we have a series of fast + * repeating but different errors, the interval will keep + * stretching out, but that's OK, as that's pretty + * catastrophic. + */ + dd->ipath_unmasktime = jiffies + HZ * 180; + } + + ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, errs); + if (ignore_this_time) + errs &= ~ignore_this_time; + if (errs & ~dd->ipath_lasterror) { + errs &= ~dd->ipath_lasterror; + /* never suppress duplicate hwerrors or ibstatuschange */ + dd->ipath_lasterror |= errs & + ~(INFINIPATH_E_HARDWARE | + INFINIPATH_E_IBSTATUSCHANGED); + } + if (!errs) + return; + + if (!noprint) + /* + * the ones we mask off are handled specially below or above + */ + ipath_decode_err(msg, sizeof msg, + errs & ~(INFINIPATH_E_IBSTATUSCHANGED | + INFINIPATH_E_RRCVEGRFULL | + INFINIPATH_E_RRCVHDRFULL | + INFINIPATH_E_HARDWARE)); + else + /* so we don't need if (!noprint) at strlcat's below */ + *msg = 0; + + if (errs & E_SUM_PKTERRS) { + ipath_stats.sps_pkterrs++; + chkerrpkts = 1; + } + if (errs & E_SUM_ERRS) + ipath_stats.sps_errs++; + + if (errs & (INFINIPATH_E_RICRC | INFINIPATH_E_RVCRC)) { + ipath_stats.sps_crcerrs++; + chkerrpkts = 1; + } + + /* + * We don't want to print these two as they happen, or we can make + * the situation even worse, because it takes so long to print + * messages to serial consoles. Kernel ports get printed from + * fast_stats, no more than every 5 seconds, user ports get printed + * on close + */ + if (errs & INFINIPATH_E_RRCVHDRFULL) { + int any; + u32 hd, tl; + ipath_stats.sps_hdrqfull++; + for (any = i = 0; i < dd->ipath_cfgports; i++) { + struct ipath_portdata *pd = dd->ipath_pd[i]; + if (i == 0) { + hd = dd->ipath_port0head; + tl = (u32) le64_to_cpu( + *dd->ipath_hdrqtailptr); + } else if (pd && pd->port_cnt && + pd->port_rcvhdrtail_kvaddr) { + /* + * don't report same point multiple times, + * except kernel + */ + tl = (u32) * pd->port_rcvhdrtail_kvaddr; + if (tl == dd->ipath_lastrcvhdrqtails[i]) + continue; + hd = ipath_read_ureg32(dd, ur_rcvhdrhead, + i); + } else + continue; + if (hd == (tl + 1) || + (!hd && tl == dd->ipath_hdrqlast)) { + dd->ipath_lastrcvhdrqtails[i] = tl; + pd->port_hdrqfull++; + if (i == 0) + chkerrpkts = 1; + } + } + } + if (errs & INFINIPATH_E_RRCVEGRFULL) { + /* + * since this is of less importance and not likely to + * happen without also getting hdrfull, only count + * occurrences; don't check each port (or even the kernel + * vs user) + */ + ipath_stats.sps_etidfull++; + if (dd->ipath_port0head != + (u32) le64_to_cpu(*dd->ipath_hdrqtailptr)) + chkerrpkts = 1; + } + + /* + * do this before IBSTATUSCHANGED, in case both bits set in a single + * interrupt; we want the STATUSCHANGE to "win", so we do our + * internal copy of state machine correctly + */ + if (errs & INFINIPATH_E_RIBLOSTLINK) { + /* + * force through block below + */ + errs |= INFINIPATH_E_IBSTATUSCHANGED; + ipath_stats.sps_iblink++; + dd->ipath_flags |= IPATH_LINKDOWN; + dd->ipath_flags &= ~(IPATH_LINKUNK | IPATH_LINKINIT + | IPATH_LINKARMED | IPATH_LINKACTIVE); + *dd->ipath_statusp &= ~IPATH_STATUS_IB_READY; + if (!noprint) { + u64 st = ipath_read_kreg64( + dd, dd->ipath_kregs->kr_ibcstatus); + + ipath_dbg("Lost link, link now down (%s)\n", + ipath_ibcstatus_str[st & 0xf]); + } + } + if (errs & INFINIPATH_E_IBSTATUSCHANGED) + handle_e_ibstatuschanged(dd, errs, noprint); + + if (errs & INFINIPATH_E_RESET) { + if (!noprint) + ipath_dev_err(dd, "Got reset, requires re-init " + "(unload and reload driver)\n"); + dd->ipath_flags &= ~IPATH_INITTED; /* needs re-init */ + /* mark as having had error */ + *dd->ipath_statusp |= IPATH_STATUS_HWERROR; + *dd->ipath_statusp &= ~IPATH_STATUS_IB_CONF; + } + + if (!noprint && *msg) + ipath_dev_err(dd, "%s error\n", msg); + if (dd->ipath_sma_state_wanted & dd->ipath_flags) { + ipath_cdbg(VERBOSE, "sma wanted state %x, iflags now %x, " + "waking\n", dd->ipath_sma_state_wanted, + dd->ipath_flags); + wake_up_interruptible(&ipath_sma_state_wait); + } + + if (chkerrpkts) + /* process possible error packets in hdrq */ + ipath_kreceive(dd); +} + +/* this is separate to allow for better optimization of ipath_intr() */ + +static void ipath_bad_intr(struct ipath_devdata *dd, u32 * unexpectp) +{ + /* + * sometimes happen during driver init and unload, don't want + * to process any interrupts at that point + */ + + /* this is just a bandaid, not a fix, if something goes badly + * wrong */ + if (++*unexpectp > 100) { + if (++*unexpectp > 105) { + /* + * ok, we must be taking somebody else's interrupts, + * due to a messed up mptable and/or PIRQ table, so + * unregister the interrupt. We've seen this during + * linuxbios development work, and it may happen in + * the future again. + */ + if (dd->pcidev && dd->pcidev->irq) { + ipath_dev_err(dd, "Now %u unexpected " + "interrupts, unregistering " + "interrupt handler\n", + *unexpectp); + ipath_dbg("free_irq of irq %x\n", + dd->pcidev->irq); + free_irq(dd->pcidev->irq, dd); + } + } + if (ipath_read_kreg32(dd, dd->ipath_kregs->kr_intmask)) { + ipath_dev_err(dd, "%u unexpected interrupts, " + "disabling interrupts completely\n", + *unexpectp); + /* + * disable all interrupts, something is very wrong + */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, + 0ULL); + } + } else if (*unexpectp > 1) + ipath_dbg("Interrupt when not ready, should not happen, " + "ignoring\n"); +} + +static void ipath_bad_regread(struct ipath_devdata *dd) +{ + static int allbits; + + /* separate routine, for better optimization of ipath_intr() */ + + /* + * We print the message and disable interrupts, in hope of + * having a better chance of debugging the problem. + */ + ipath_dev_err(dd, + "Read of interrupt status failed (all bits set)\n"); + if (allbits++) { + /* disable all interrupts, something is very wrong */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL); + if (allbits == 2) { + ipath_dev_err(dd, "Still bad interrupt status, " + "unregistering interrupt\n"); + free_irq(dd->pcidev->irq, dd); + } else if (allbits > 2) { + if ((allbits % 10000) == 0) + printk("."); + } else + ipath_dev_err(dd, "Disabling interrupts, " + "multiple errors\n"); + } +} + +static void handle_port_pioavail(struct ipath_devdata *dd) +{ + u32 i; + /* + * start from port 1, since for now port 0 is never using + * wait_event for PIO + */ + for (i = 1; dd->ipath_portpiowait && i < dd->ipath_cfgports; i++) { + struct ipath_portdata *pd = dd->ipath_pd[i]; + + if (pd && pd->port_cnt && + dd->ipath_portpiowait & (1U << i)) { + clear_bit(i, &dd->ipath_portpiowait); + if (test_bit(IPATH_PORT_WAITING_PIO, + &pd->port_flag)) { + clear_bit(IPATH_PORT_WAITING_PIO, + &pd->port_flag); + wake_up_interruptible(&pd->port_wait); + } + } + } +} + +static void handle_layer_pioavail(struct ipath_devdata *dd) +{ + int ret; + + ret = __ipath_layer_intr(dd, IPATH_LAYER_INT_SEND_CONTINUE); + if (ret > 0) + goto clear; + + ret = __ipath_verbs_piobufavail(dd); + if (ret > 0) + goto clear; + + return; +clear: + set_bit(IPATH_S_PIOINTBUFAVAIL, &dd->ipath_sendctrl); + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl); +} + +static void handle_rcv(struct ipath_devdata *dd, u32 istat) +{ + u64 portr; + int i; + int rcvdint = 0; + + portr = ((istat >> INFINIPATH_I_RCVAVAIL_SHIFT) & + infinipath_i_rcvavail_mask) + | ((istat >> INFINIPATH_I_RCVURG_SHIFT) & + infinipath_i_rcvurg_mask); + for (i = 0; i < dd->ipath_cfgports; i++) { + struct ipath_portdata *pd = dd->ipath_pd[i]; + if (portr & (1 << i) && pd && + pd->port_cnt) { + if (i == 0) + ipath_kreceive(dd); + else if (test_bit(IPATH_PORT_WAITING_RCV, + &pd->port_flag)) { + int rcbit; + clear_bit(IPATH_PORT_WAITING_RCV, + &pd->port_flag); + rcbit = i + INFINIPATH_R_INTRAVAIL_SHIFT; + clear_bit(1UL << rcbit, &dd->ipath_rcvctrl); + wake_up_interruptible(&pd->port_wait); + rcvdint = 1; + } + } + } + if (rcvdint) { + /* only want to take one interrupt, so turn off the rcv + * interrupt for all the ports that we did the wakeup on + * (but never for kernel port) + */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, + dd->ipath_rcvctrl); + } +} + +irqreturn_t ipath_intr(int irq, void *data, struct pt_regs *regs) +{ + struct ipath_devdata *dd = data; + u32 istat = ipath_read_kreg32(dd, dd->ipath_kregs->kr_intstatus); + ipath_err_t estat = 0; + static unsigned unexpected = 0; + irqreturn_t ret; + + if (unlikely(!istat)) { + ipath_stats.sps_nullintr++; + ret = IRQ_NONE; /* not our interrupt, or already handled */ + goto bail; + } + if (unlikely(istat == -1)) { + ipath_bad_regread(dd); + /* don't know if it was our interrupt or not */ + ret = IRQ_NONE; + goto bail; + } + + ipath_stats.sps_ints++; + + /* + * this needs to be flags&initted, not statusp, so we keep + * taking interrupts even after link goes down, etc. + * Also, we *must* clear the interrupt at some point, or we won't + * take it again, which can be real bad for errors, etc... + */ + + if (!(dd->ipath_flags & IPATH_INITTED)) { + ipath_bad_intr(dd, &unexpected); + ret = IRQ_NONE; + goto bail; + } + if (unexpected) + unexpected = 0; + + ipath_cdbg(VERBOSE, "intr stat=0x%x\n", istat); + + if (istat & ~infinipath_i_bitsextant) + ipath_dev_err(dd, + "interrupt with unknown interrupts %x set\n", + istat & (u32) ~ infinipath_i_bitsextant); + + if (istat & INFINIPATH_I_ERROR) { + ipath_stats.sps_errints++; + estat = ipath_read_kreg64(dd, + dd->ipath_kregs->kr_errorstatus); + if (!estat) + dev_info(&dd->pcidev->dev, "error interrupt (%x), " + "but no error bits set!\n", istat); + else if (estat == -1LL) + /* + * should we try clearing all, or hope next read + * works? + */ + ipath_dev_err(dd, "Read of error status failed " + "(all bits set); ignoring\n"); + else + handle_errors(dd, estat); + } + + if (istat & INFINIPATH_I_GPIO) { + if (unlikely(!(dd->ipath_flags & IPATH_GPIO_INTR))) { + u32 gpiostatus; + gpiostatus = ipath_read_kreg32( + dd, dd->ipath_kregs->kr_gpio_status); + ipath_dbg("Unexpected GPIO interrupt bits %x\n", + gpiostatus); + ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_clear, + gpiostatus); + } + else { + /* Clear GPIO status bit 2 */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_clear, + (u64) (1 << 2)); + + /* + * Packets are available in the port 0 rcv queue. + * Eventually this needs to be generalized to check + * IPATH_GPIO_INTR, and the specific GPIO bit, if + * GPIO interrupts are used for anything else. + */ + ipath_kreceive(dd); + } + } + + /* + * clear the ones we will deal with on this round + * We clear it early, mostly for receive interrupts, so we + * know the chip will have seen this by the time we process + * the queue, and will re-interrupt if necessary. The processor + * itself won't take the interrupt again until we return. + */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, istat); + + if (istat & INFINIPATH_I_SPIOBUFAVAIL) { + clear_bit(IPATH_S_PIOINTBUFAVAIL, &dd->ipath_sendctrl); + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl); + + if (dd->ipath_portpiowait) + handle_port_pioavail(dd); + + handle_layer_pioavail(dd); + } + + /* + * we check for both transition from empty to non-empty, and urgent + * packets (those with the interrupt bit set in the header) + */ + + if (istat & ((infinipath_i_rcvavail_mask << + INFINIPATH_I_RCVAVAIL_SHIFT) + | (infinipath_i_rcvurg_mask << + INFINIPATH_I_RCVURG_SHIFT))) + handle_rcv(dd, istat); + + ret = IRQ_HANDLED; + +bail: + return ret; +} diff --git a/drivers/infiniband/hw/ipath/ipath_kernel.h b/drivers/infiniband/hw/ipath/ipath_kernel.h new file mode 100644 index 0000000..0ce5f19 --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_kernel.h @@ -0,0 +1,883 @@ +#ifndef _IPATH_KERNEL_H +#define _IPATH_KERNEL_H +/* + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * This header file is the base header file for infinipath kernel code + * ipath_user.h serves a similar purpose for user code. + */ + +#include <linux/interrupt.h> +#include <asm/io.h> + +#include "ipath_common.h" +#include "ipath_debug.h" +#include "ipath_registers.h" + +/* only s/w major version of InfiniPath we can handle */ +#define IPATH_CHIP_VERS_MAJ 2U + +/* don't care about this except printing */ +#define IPATH_CHIP_VERS_MIN 0U + +/* temporary, maybe always */ +extern struct infinipath_stats ipath_stats; + +#define IPATH_CHIP_SWVERSION IPATH_CHIP_VERS_MAJ + +struct ipath_portdata { + void **port_rcvegrbuf; + dma_addr_t *port_rcvegrbuf_phys; + /* rcvhdrq base, needs mmap before useful */ + void *port_rcvhdrq; + /* kernel virtual address where hdrqtail is updated */ + u64 *port_rcvhdrtail_kvaddr; + /* page * used for uaddr */ + struct page *port_rcvhdrtail_pagep; + /* + * temp buffer for expected send setup, allocated at open, instead + * of each setup call + */ + void *port_tid_pg_list; + /* when waiting for rcv or pioavail */ + wait_queue_head_t port_wait; + /* + * rcvegr bufs base, physical, must fit + * in 44 bits so 32 bit programs mmap64 44 bit works) + */ + dma_addr_t port_rcvegr_phys; + /* mmap of hdrq, must fit in 44 bits */ + dma_addr_t port_rcvhdrq_phys; + /* + * the actual user address that we ipath_mlock'ed, so we can + * ipath_munlock it at close + */ + unsigned long port_rcvhdrtail_uaddr; + /* + * number of opens on this instance (0 or 1; ignoring forks, dup, + * etc. for now) + */ + int port_cnt; + /* + * how much space to leave at start of eager TID entries for + * protocol use, on each TID + */ + /* instead of calculating it */ + unsigned port_port; + /* chip offset of PIO buffers for this port */ + u32 port_piobufs; + /* how many alloc_pages() chunks in port_rcvegrbuf_pages */ + u32 port_rcvegrbuf_chunks; + /* how many egrbufs per chunk */ + u32 port_rcvegrbufs_perchunk; + /* order for port_rcvegrbuf_pages */ + size_t port_rcvegrbuf_size; + /* rcvhdrq size (for freeing) */ + size_t port_rcvhdrq_size; + /* next expected TID to check when looking for free */ + u32 port_tidcursor; + /* next expected TID to check */ + unsigned long port_flag; + /* WAIT_RCV that timed out, no interrupt */ + u32 port_rcvwait_to; + /* WAIT_PIO that timed out, no interrupt */ + u32 port_piowait_to; + /* WAIT_RCV already happened, no wait */ + u32 port_rcvnowait; + /* WAIT_PIO already happened, no wait */ + u32 port_pionowait; + /* total number of rcvhdrqfull errors */ + u32 port_hdrqfull; + /* pid of process using this port */ + pid_t port_pid; + /* same size as task_struct .comm[] */ + char port_comm[16]; + /* pkeys set by this use of this port */ + u16 port_pkeys[4]; + /* so file ops can get at unit */ + struct ipath_devdata *port_dd; +}; + +struct sk_buff; + +/* + * control information for layered drivers + */ +struct _ipath_layer { + void *l_arg; +}; + +/* Verbs layer interface */ +struct _verbs_layer { + void *l_arg; + struct timer_list l_timer; +}; + +struct ipath_devdata { + struct list_head ipath_list; + + struct ipath_kregs const *ipath_kregs; + struct ipath_cregs const *ipath_cregs; + + /* mem-mapped pointer to base of chip regs */ + u64 __iomem *ipath_kregbase; + /* end of mem-mapped chip space; range checking */ + u64 __iomem *ipath_kregend; + /* physical address of chip for io_remap, etc. */ + unsigned long ipath_physaddr; + /* base of memory alloced for ipath_kregbase, for free */ + u64 *ipath_kregalloc; + /* + * version of kregbase that doesn't have high bits set (for 32 bit + * programs, so mmap64 44 bit works) + */ + u64 __iomem *ipath_kregvirt; + /* + * virtual address where port0 rcvhdrqtail updated for this unit. + * only written to by the chip, not the driver. + */ + volatile __le64 *ipath_hdrqtailptr; + dma_addr_t ipath_dma_addr; + /* ipath_cfgports pointers */ + struct ipath_portdata **ipath_pd; + /* sk_buffs used by port 0 eager receive queue */ + struct sk_buff **ipath_port0_skbs; + /* kvirt address of 1st 2k pio buffer */ + void __iomem *ipath_pio2kbase; + /* kvirt address of 1st 4k pio buffer */ + void __iomem *ipath_pio4kbase; + /* + * points to area where PIOavail registers will be DMA'ed. + * Has to be on a page of it's own, because the page will be + * mapped into user program space. This copy is *ONLY* ever + * written by DMA, not by the driver! Need a copy per device + * when we get to multiple devices + */ + volatile __le64 *ipath_pioavailregs_dma; + /* physical address where updates occur */ + dma_addr_t ipath_pioavailregs_phys; + struct _ipath_layer ipath_layer; + /* setup intr */ + int (*ipath_f_intrsetup)(struct ipath_devdata *); + /* setup on-chip bus config */ + int (*ipath_f_bus)(struct ipath_devdata *, struct pci_dev *); + /* hard reset chip */ + int (*ipath_f_reset)(struct ipath_devdata *); + int (*ipath_f_get_boardname)(struct ipath_devdata *, char *, + size_t); + void (*ipath_f_init_hwerrors)(struct ipath_devdata *); + void (*ipath_f_handle_hwerrors)(struct ipath_devdata *, char *, + size_t); + void (*ipath_f_quiet_serdes)(struct ipath_devdata *); + int (*ipath_f_bringup_serdes)(struct ipath_devdata *); + int (*ipath_f_early_init)(struct ipath_devdata *); + void (*ipath_f_clear_tids)(struct ipath_devdata *, unsigned); + void (*ipath_f_put_tid)(struct ipath_devdata *, u64 __iomem*, + u32, unsigned long); + void (*ipath_f_tidtemplate)(struct ipath_devdata *); + void (*ipath_f_cleanup)(struct ipath_devdata *); + void (*ipath_f_setextled)(struct ipath_devdata *, u64, u64); + /* fill out chip-specific fields */ + int (*ipath_f_get_base_info)(struct ipath_portdata *, void *); + struct _verbs_layer verbs_layer; + /* total dwords sent (summed from counter) */ + u64 ipath_sword; + /* total dwords rcvd (summed from counter) */ + u64 ipath_rword; + /* total packets sent (summed from counter) */ + u64 ipath_spkts; + /* total packets rcvd (summed from counter) */ + u64 ipath_rpkts; + /* ipath_statusp initially points to this. */ + u64 _ipath_status; + /* GUID for this interface, in network order */ + __be64 ipath_guid; + /* + * aggregrate of error bits reported since last cleared, for + * limiting of error reporting + */ + ipath_err_t ipath_lasterror; + /* + * aggregrate of error bits reported since last cleared, for + * limiting of hwerror reporting + */ + ipath_err_t ipath_lasthwerror; + /* + * errors masked because they occur too fast, also includes errors + * that are always ignored (ipath_ignorederrs) + */ + ipath_err_t ipath_maskederrs; + /* time in jiffies at which to re-enable maskederrs */ + unsigned long ipath_unmasktime; + /* + * errors always ignored (masked), at least for a given + * chip/device, because they are wrong or not useful + */ + ipath_err_t ipath_ignorederrs; + /* count of egrfull errors, combined for all ports */ + u64 ipath_last_tidfull; + /* for ipath_qcheck() */ + u64 ipath_lastport0rcv_cnt; + /* template for writing TIDs */ + u64 ipath_tidtemplate; + /* value to write to free TIDs */ + u64 ipath_tidinvalid; + /* PE-800 rcv interrupt setup */ + u64 ipath_rhdrhead_intr_off; + + /* size of memory at ipath_kregbase */ + u32 ipath_kregsize; + /* number of registers used for pioavail */ + u32 ipath_pioavregs; + /* IPATH_POLL, etc. */ + u32 ipath_flags; + /* ipath_flags sma is waiting for */ + u32 ipath_sma_state_wanted; + /* last buffer for user use, first buf for kernel use is this + * index. */ + u32 ipath_lastport_piobuf; + /* is a stats timer active */ + u32 ipath_stats_timer_active; + /* dwords sent read from counter */ + u32 ipath_lastsword; + /* dwords received read from counter */ + u32 ipath_lastrword; + /* sent packets read from counter */ + u32 ipath_lastspkts; + /* received packets read from counter */ + u32 ipath_lastrpkts; + /* pio bufs allocated per port */ + u32 ipath_pbufsport; + /* + * number of ports configured as max; zero is set to number chip + * supports, less gives more pio bufs/port, etc. + */ + u32 ipath_cfgports; + /* port0 rcvhdrq head offset */ + u32 ipath_port0head; + /* count of port 0 hdrqfull errors */ + u32 ipath_p0_hdrqfull; + + /* + * (*cfgports) used to suppress multiple instances of same + * port staying stuck at same point + */ + u32 *ipath_lastrcvhdrqtails; + /* + * (*cfgports) used to suppress multiple instances of same + * port staying stuck at same point + */ + u32 *ipath_lastegrheads; + /* + * index of last piobuffer we used. Speeds up searching, by + * starting at this point. Doesn't matter if multiple cpu's use and + * update, last updater is only write that matters. Whenever it + * wraps, we update shadow copies. Need a copy per device when we + * get to multiple devices + */ + u32 ipath_lastpioindex; + /* max length of freezemsg */ + u32 ipath_freezelen; + /* + * consecutive times we wanted a PIO buffer but were unable to + * get one + */ + u32 ipath_consec_nopiobuf; + /* + * hint that we should update ipath_pioavailshadow before + * looking for a PIO buffer + */ + u32 ipath_upd_pio_shadow; + /* so we can rewrite it after a chip reset */ + u32 ipath_pcibar0; + /* so we can rewrite it after a chip reset */ + u32 ipath_pcibar1; + /* sequential tries for SMA send and no bufs */ + u32 ipath_nosma_bufs; + /* duration (seconds) ipath_nosma_bufs set */ + u32 ipath_nosma_secs; + + /* HT/PCI Vendor ID (here for NodeInfo) */ + u16 ipath_vendorid; + /* HT/PCI Device ID (here for NodeInfo) */ + u16 ipath_deviceid; + /* offset in HT config space of slave/primary interface block */ + u8 ipath_ht_slave_off; + /* for write combining settings */ + unsigned long ipath_wc_cookie; + /* ref count for each pkey */ + atomic_t ipath_pkeyrefs[4]; + /* shadow copy of all exptids physaddr; used only by funcsim */ + u64 *ipath_tidsimshadow; + /* shadow copy of struct page *'s for exp tid pages */ + struct page **ipath_pageshadow; + /* lock to workaround chip bug 9437 */ + spinlock_t ipath_tid_lock; + + /* + * IPATH_STATUS_*, + * this address is mapped readonly into user processes so they can + * get status cheaply, whenever they want. + */ + u64 *ipath_statusp; + /* freeze msg if hw error put chip in freeze */ + char *ipath_freezemsg; + /* pci access data structure */ + struct pci_dev *pcidev; + struct cdev *cdev; + struct class_device *class_dev; + /* timer used to prevent stats overflow, error throttling, etc. */ + struct timer_list ipath_stats_timer; + /* check for stale messages in rcv queue */ + /* only allow one intr at a time. */ + unsigned long ipath_rcv_pending; + + /* + * Shadow copies of registers; size indicates read access size. + * Most of them are readonly, but some are write-only register, + * where we manipulate the bits in the shadow copy, and then write + * the shadow copy to infinipath. + * + * We deliberately make most of these 32 bits, since they have + * restricted range. For any that we read, we won't to generate 32 + * bit accesses, since Opteron will generate 2 separate 32 bit HT + * transactions for a 64 bit read, and we want to avoid unnecessary + * HT transactions. + */ + + /* This is the 64 bit group */ + + /* + * shadow of pioavail, check to be sure it's large enough at + * init time. + */ + unsigned long ipath_pioavailshadow[8]; + /* shadow of kr_gpio_out, for rmw ops */ + u64 ipath_gpio_out; + /* kr_revision shadow */ + u64 ipath_revision; + /* + * shadow of ibcctrl, for interrupt handling of link changes, + * etc. + */ + u64 ipath_ibcctrl; + /* + * last ibcstatus, to suppress "duplicate" status change messages, + * mostly from 2 to 3 + */ + u64 ipath_lastibcstat; + /* hwerrmask shadow */ + ipath_err_t ipath_hwerrmask; + /* interrupt config reg shadow */ + u64 ipath_intconfig; + /* kr_sendpiobufbase value */ + u64 ipath_piobufbase; + + /* these are the "32 bit" regs */ + + /* + * number of GUIDs in the flash for this interface; may need some + * rethinking for setting on other ifaces + */ + u32 ipath_nguid; + /* + * the following two are 32-bit bitmasks, but {test,clear,set}_bit + * all expect bit fields to be "unsigned long" + */ + /* shadow kr_rcvctrl */ + unsigned long ipath_rcvctrl; + /* shadow kr_sendctrl */ + unsigned long ipath_sendctrl; + + /* value we put in kr_rcvhdrcnt */ + u32 ipath_rcvhdrcnt; + /* value we put in kr_rcvhdrsize */ + u32 ipath_rcvhdrsize; + /* value we put in kr_rcvhdrentsize */ + u32 ipath_rcvhdrentsize; + /* offset of last entry in rcvhdrq */ + u32 ipath_hdrqlast; + /* kr_portcnt value */ + u32 ipath_portcnt; + /* kr_pagealign value */ + u32 ipath_palign; + /* number of "2KB" PIO buffers */ + u32 ipath_piobcnt2k; + /* size in bytes of "2KB" PIO buffers */ + u32 ipath_piosize2k; + /* number of "4KB" PIO buffers */ + u32 ipath_piobcnt4k; + /* size in bytes of "4KB" PIO buffers */ + u32 ipath_piosize4k; + /* kr_rcvegrbase value */ + u32 ipath_rcvegrbase; + /* kr_rcvegrcnt value */ + u32 ipath_rcvegrcnt; + /* kr_rcvtidbase value */ + u32 ipath_rcvtidbase; + /* kr_rcvtidcnt value */ + u32 ipath_rcvtidcnt; + /* kr_sendregbase */ + u32 ipath_sregbase; + /* kr_userregbase */ + u32 ipath_uregbase; + /* kr_counterregbase */ + u32 ipath_cregbase; + /* shadow the control register contents */ + u32 ipath_control; + /* shadow the gpio output contents */ + u32 ipath_extctrl; + /* PCI revision register (HTC rev on FPGA) */ + u32 ipath_pcirev; + + /* chip address space used by 4k pio buffers */ + u32 ipath_4kalign; + /* The MTU programmed for this unit */ + u32 ipath_ibmtu; + /* + * The max size IB packet, included IB headers that we can send. + * Starts same as ipath_piosize, but is affected when ibmtu is + * changed, or by size of eager buffers + */ + u32 ipath_ibmaxlen; + /* + * ibmaxlen at init time, limited by chip and by receive buffer + * size. Not changed after init. + */ + u32 ipath_init_ibmaxlen; + /* size of each rcvegrbuffer */ + u32 ipath_rcvegrbufsize; + /* width (2,4,8,16,32) from HT config reg */ + u32 ipath_htwidth; + /* HT speed (200,400,800,1000) from HT config */ + u32 ipath_htspeed; + /* ports waiting for PIOavail intr */ + unsigned long ipath_portpiowait; + /* + * number of sequential ibcstatus change for polling active/quiet + * (i.e., link not coming up). + */ + u32 ipath_ibpollcnt; + /* low and high portions of MSI capability/vector */ + u32 ipath_msi_lo; + /* saved after PCIe init for restore after reset */ + u32 ipath_msi_hi; + /* MSI data (vector) saved for restore */ + u16 ipath_msi_data; + /* MLID programmed for this instance */ + u16 ipath_mlid; + /* LID programmed for this instance */ + u16 ipath_lid; + /* list of pkeys programmed; 0 if not set */ + u16 ipath_pkeys[4]; + /* ASCII serial number, from flash */ + u8 ipath_serial[12]; + /* human readable board version */ + u8 ipath_boardversion[80]; + /* chip major rev, from ipath_revision */ + u8 ipath_majrev; + /* chip minor rev, from ipath_revision */ + u8 ipath_minrev; + /* board rev, from ipath_revision */ + u8 ipath_boardrev; + /* unit # of this chip, if present */ + int ipath_unit; + /* saved for restore after reset */ + u8 ipath_pci_cacheline; + /* LID mask control */ + u8 ipath_lmc; +}; + +extern volatile __le64 *ipath_port0_rcvhdrtail; +extern dma_addr_t ipath_port0_rcvhdrtail_dma; + +#define IPATH_PORT0_RCVHDRTAIL_SIZE PAGE_SIZE + +extern struct list_head ipath_dev_list; +extern spinlock_t ipath_devs_lock; +extern struct ipath_devdata *ipath_lookup(int unit); + +extern u16 ipath_layer_rcv_opcode; +extern int __ipath_layer_intr(struct ipath_devdata *, u32); +extern int ipath_layer_intr(struct ipath_devdata *, u32); +extern int __ipath_layer_rcv(struct ipath_devdata *, void *, + struct sk_buff *); +extern int __ipath_layer_rcv_lid(struct ipath_devdata *, void *); +extern int __ipath_verbs_piobufavail(struct ipath_devdata *); +extern int __ipath_verbs_rcv(struct ipath_devdata *, void *, void *, u32); + +void ipath_layer_add(struct ipath_devdata *); +void ipath_layer_del(struct ipath_devdata *); + +int ipath_init_chip(struct ipath_devdata *, int); +int ipath_enable_wc(struct ipath_devdata *dd); +void ipath_disable_wc(struct ipath_devdata *dd); +int ipath_count_units(int *npresentp, int *nupp, u32 *maxportsp); +void ipath_shutdown_device(struct ipath_devdata *); + +struct file_operations; +int ipath_cdev_init(int minor, char *name, struct file_operations *fops, + struct cdev **cdevp, struct class_device **class_devp); +void ipath_cdev_cleanup(struct cdev **cdevp, + struct class_device **class_devp); + +int ipath_diag_init(void); +void ipath_diag_cleanup(void); +void ipath_diag_bringup_link(struct ipath_devdata *); + +extern wait_queue_head_t ipath_sma_state_wait; + +int ipath_user_add(struct ipath_devdata *dd); +void ipath_user_del(struct ipath_devdata *dd); + +struct sk_buff *ipath_alloc_skb(struct ipath_devdata *dd, gfp_t); + +extern int ipath_diag_inuse; + +irqreturn_t ipath_intr(int irq, void *devid, struct pt_regs *regs); +void ipath_decode_err(char *buf, size_t blen, ipath_err_t err); +#if __IPATH_INFO || __IPATH_DBG +extern const char *ipath_ibcstatus_str[]; +#endif + +/* clean up any per-chip chip-specific stuff */ +void ipath_chip_cleanup(struct ipath_devdata *); +/* clean up any chip type-specific stuff */ +void ipath_chip_done(void); + +/* check to see if we have to force ordering for write combining */ +int ipath_unordered_wc(void); + +void ipath_disarm_piobufs(struct ipath_devdata *, unsigned first, + unsigned cnt); + +int ipath_create_rcvhdrq(struct ipath_devdata *, struct ipath_portdata *); +void ipath_free_pddata(struct ipath_devdata *, u32, int); + +int ipath_parse_ushort(const char *str, unsigned short *valp); + +int ipath_wait_linkstate(struct ipath_devdata *, u32, int); +void ipath_set_ib_lstate(struct ipath_devdata *, int); +void ipath_kreceive(struct ipath_devdata *); +int ipath_setrcvhdrsize(struct ipath_devdata *, unsigned); +int ipath_reset_device(int); +void ipath_get_faststats(unsigned long); + +/* for use in system calls, where we want to know device type, etc. */ +#define port_fp(fp) ((struct ipath_portdata *) (fp)->private_data) + +/* + * values for ipath_flags + */ +/* The chip is up and initted */ +#define IPATH_INITTED 0x2 + /* set if any user code has set kr_rcvhdrsize */ +#define IPATH_RCVHDRSZ_SET 0x4 + /* The chip is present and valid for accesses */ +#define IPATH_PRESENT 0x8 + /* HT link0 is only 8 bits wide, ignore upper byte crc + * errors, etc. */ +#define IPATH_8BIT_IN_HT0 0x10 + /* HT link1 is only 8 bits wide, ignore upper byte crc + * errors, etc. */ +#define IPATH_8BIT_IN_HT1 0x20 + /* The link is down */ +#define IPATH_LINKDOWN 0x40 + /* The link level is up (0x11) */ +#define IPATH_LINKINIT 0x80 + /* The link is in the armed (0x21) state */ +#define IPATH_LINKARMED 0x100 + /* The link is in the active (0x31) state */ +#define IPATH_LINKACTIVE 0x200 + /* link current state is unknown */ +#define IPATH_LINKUNK 0x400 + /* no IB cable, or no device on IB cable */ +#define IPATH_NOCABLE 0x4000 + /* Supports port zero per packet receive interrupts via + * GPIO */ +#define IPATH_GPIO_INTR 0x8000 + /* uses the coded 4byte TID, not 8 byte */ +#define IPATH_4BYTE_TID 0x10000 + /* packet/word counters are 32 bit, else those 4 counters + * are 64bit */ +#define IPATH_32BITCOUNTERS 0x20000 + /* can miss port0 rx interrupts */ +#define IPATH_POLL_RX_INTR 0x40000 +#define IPATH_DISABLED 0x80000 /* administratively disabled */ + +/* portdata flag bit offsets */ + /* waiting for a packet to arrive */ +#define IPATH_PORT_WAITING_RCV 2 + /* waiting for a PIO buffer to be available */ +#define IPATH_PORT_WAITING_PIO 3 + +/* free up any allocated data at closes */ +void ipath_free_data(struct ipath_portdata *dd); +int ipath_waitfor_mdio_cmdready(struct ipath_devdata *); +int ipath_waitfor_complete(struct ipath_devdata *, ipath_kreg, u64, u64 *); +u32 __iomem *ipath_getpiobuf(struct ipath_devdata *, u32 *); +/* init PE-800-specific func */ +void ipath_init_pe800_funcs(struct ipath_devdata *); +/* init HT-400-specific func */ +void ipath_init_ht400_funcs(struct ipath_devdata *); +void ipath_get_guid(struct ipath_devdata *); +u64 ipath_snap_cntr(struct ipath_devdata *, ipath_creg); + +/* + * number of words used for protocol header if not set by ipath_userinit(); + */ +#define IPATH_DFLT_RCVHDRSIZE 9 + +#define IPATH_MDIO_CMD_WRITE 1 +#define IPATH_MDIO_CMD_READ 2 +#define IPATH_MDIO_CLD_DIV 25 /* to get 2.5 Mhz mdio clock */ +#define IPATH_MDIO_CMDVALID 0x40000000 /* bit 30 */ +#define IPATH_MDIO_DATAVALID 0x80000000 /* bit 31 */ +#define IPATH_MDIO_CTRL_STD 0x0 + +static inline u64 ipath_mdio_req(int cmd, int dev, int reg, int data) +{ + return (((u64) IPATH_MDIO_CLD_DIV) << 32) | + (cmd << 26) | + (dev << 21) | + (reg << 16) | + (data & 0xFFFF); +} + + /* signal and fifo status, in bank 31 */ +#define IPATH_MDIO_CTRL_XGXS_REG_8 0x8 + /* controls loopback, redundancy */ +#define IPATH_MDIO_CTRL_8355_REG_1 0x10 + /* premph, encdec, etc. */ +#define IPATH_MDIO_CTRL_8355_REG_2 0x11 + /* Kchars, etc. */ +#define IPATH_MDIO_CTRL_8355_REG_6 0x15 +#define IPATH_MDIO_CTRL_8355_REG_9 0x18 +#define IPATH_MDIO_CTRL_8355_REG_10 0x1D + +int ipath_get_user_pages(unsigned long, size_t, struct page **); +int ipath_get_user_pages_nocopy(unsigned long, struct page **); +void ipath_release_user_pages(struct page **, size_t); +void ipath_release_user_pages_on_close(struct page **, size_t); +int ipath_eeprom_read(struct ipath_devdata *, u8, void *, int); +int ipath_eeprom_write(struct ipath_devdata *, u8, const void *, int); + +/* these are used for the registers that vary with port */ +void ipath_write_kreg_port(const struct ipath_devdata *, ipath_kreg, + unsigned, u64); +u64 ipath_read_kreg64_port(const struct ipath_devdata *, ipath_kreg, + unsigned); + +/* + * We could have a single register get/put routine, that takes a group type, + * but this is somewhat clearer and cleaner. It also gives us some error + * checking. 64 bit register reads should always work, but are inefficient + * on opteron (the northbridge always generates 2 separate HT 32 bit reads), + * so we use kreg32 wherever possible. User register and counter register + * reads are always 32 bit reads, so only one form of those routines. + */ + +/* + * At the moment, none of the s-registers are writable, so no + * ipath_write_sreg(), and none of the c-registers are writable, so no + * ipath_write_creg(). + */ + +/** + * ipath_read_ureg32 - read 32-bit virtualized per-port register + * @dd: device + * @regno: register number + * @port: port number + * + * Return the contents of a register that is virtualized to be per port. + * Prints a debug message and returns -1 on errors (not distinguishable from + * valid contents at runtime; we may add a separate error variable at some + * point). + * + * This is normally not used by the kernel, but may be for debugging, and + * has a different implementation than user mode, which is why it's not in + * _common.h. + */ +static inline u32 ipath_read_ureg32(const struct ipath_devdata *dd, + ipath_ureg regno, int port) +{ + if (!dd->ipath_kregbase) + return 0; + + return readl(regno + (u64 __iomem *) + (dd->ipath_uregbase + + (char __iomem *)dd->ipath_kregbase + + dd->ipath_palign * port)); +} + +/** + * ipath_write_ureg - write 32-bit virtualized per-port register + * @dd: device + * @regno: register number + * @value: value + * @port: port + * + * Write the contents of a register that is virtualized to be per port. + */ +static inline void ipath_write_ureg(const struct ipath_devdata *dd, + ipath_ureg regno, u64 value, int port) +{ + u64 __iomem *ubase = (u64 __iomem *) + (dd->ipath_uregbase + (char __iomem *) dd->ipath_kregbase + + dd->ipath_palign * port); + if (dd->ipath_kregbase) + writeq(value, &ubase[regno]); +} + +static inline u32 ipath_read_kreg32(const struct ipath_devdata *dd, + ipath_kreg regno) +{ + if (!dd->ipath_kregbase) + return -1; + return readl((u32 __iomem *) & dd->ipath_kregbase[regno]); +} + +static inline u64 ipath_read_kreg64(const struct ipath_devdata *dd, + ipath_kreg regno) +{ + if (!dd->ipath_kregbase) + return -1; + + return readq(&dd->ipath_kregbase[regno]); +} + +static inline void ipath_write_kreg(const struct ipath_devdata *dd, + ipath_kreg regno, u64 value) +{ + if (dd->ipath_kregbase) + writeq(value, &dd->ipath_kregbase[regno]); +} + +static inline u64 ipath_read_creg(const struct ipath_devdata *dd, + ipath_sreg regno) +{ + if (!dd->ipath_kregbase) + return 0; + + return readq(regno + (u64 __iomem *) + (dd->ipath_cregbase + + (char __iomem *)dd->ipath_kregbase)); +} + +static inline u32 ipath_read_creg32(const struct ipath_devdata *dd, + ipath_sreg regno) +{ + if (!dd->ipath_kregbase) + return 0; + return readl(regno + (u64 __iomem *) + (dd->ipath_cregbase + + (char __iomem *)dd->ipath_kregbase)); +} + +/* + * sysfs interface. + */ + +struct device_driver; + +extern const char ipath_core_version[]; + +int ipath_driver_create_group(struct device_driver *); +void ipath_driver_remove_group(struct device_driver *); + +int ipath_device_create_group(struct device *, struct ipath_devdata *); +void ipath_device_remove_group(struct device *, struct ipath_devdata *); +int ipath_expose_reset(struct device *); + +int ipath_init_ipathfs(void); +void ipath_exit_ipathfs(void); +int ipathfs_add_device(struct ipath_devdata *); +int ipathfs_remove_device(struct ipath_devdata *); + +/* + * Flush write combining store buffers (if present) and perform a write + * barrier. + */ +#if defined(CONFIG_X86_64) +#define ipath_flush_wc() asm volatile("sfence" ::: "memory") +#else +#define ipath_flush_wc() wmb() +#endif + +extern unsigned ipath_debug; /* debugging bit mask */ + +const char *ipath_get_unit_name(int unit); + +extern struct mutex ipath_mutex; + +#define IPATH_DRV_NAME "ipath_core" +#define IPATH_MAJOR 233 +#define IPATH_SMA_MINOR 128 +#define IPATH_DIAG_MINOR 129 +#define IPATH_NMINORS 130 + +#define ipath_dev_err(dd,fmt,...) \ + do { \ + const struct ipath_devdata *__dd = (dd); \ + if (__dd->pcidev) \ + dev_err(&__dd->pcidev->dev, "%s: " fmt, \ + ipath_get_unit_name(__dd->ipath_unit), \ + ##__VA_ARGS__); \ + else \ + printk(KERN_ERR IPATH_DRV_NAME ": %s: " fmt, \ + ipath_get_unit_name(__dd->ipath_unit), \ + ##__VA_ARGS__); \ + } while (0) + +#if _IPATH_DEBUGGING + +# define __IPATH_DBG_WHICH(which,fmt,...) \ + do { \ + if(unlikely(ipath_debug&(which))) \ + printk(KERN_DEBUG IPATH_DRV_NAME ": %s: " fmt, \ + __func__,##__VA_ARGS__); \ + } while(0) + +# define ipath_dbg(fmt,...) \ + __IPATH_DBG_WHICH(__IPATH_DBG,fmt,##__VA_ARGS__) +# define ipath_cdbg(which,fmt,...) \ + __IPATH_DBG_WHICH(__IPATH_##which##DBG,fmt,##__VA_ARGS__) + +#else /* ! _IPATH_DEBUGGING */ + +# define ipath_dbg(fmt,...) +# define ipath_cdbg(which,fmt,...) + +#endif /* _IPATH_DEBUGGING */ + +#endif /* _IPATH_KERNEL_H */ diff --git a/drivers/infiniband/hw/ipath/ipath_keys.c b/drivers/infiniband/hw/ipath/ipath_keys.c new file mode 100644 index 0000000..aa33b0e --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_keys.c @@ -0,0 +1,236 @@ +/* + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <asm/io.h> + +#include "ipath_verbs.h" + +/** + * ipath_alloc_lkey - allocate an lkey + * @rkt: lkey table in which to allocate the lkey + * @mr: memory region that this lkey protects + * + * Returns 1 if successful, otherwise returns 0. + */ + +int ipath_alloc_lkey(struct ipath_lkey_table *rkt, struct ipath_mregion *mr) +{ + unsigned long flags; + u32 r; + u32 n; + int ret; + + spin_lock_irqsave(&rkt->lock, flags); + + /* Find the next available LKEY */ + r = n = rkt->next; + for (;;) { + if (rkt->table[r] == NULL) + break; + r = (r + 1) & (rkt->max - 1); + if (r == n) { + spin_unlock_irqrestore(&rkt->lock, flags); + _VERBS_INFO("LKEY table full\n"); + ret = 0; + goto bail; + } + } + rkt->next = (r + 1) & (rkt->max - 1); + /* + * Make sure lkey is never zero which is reserved to indicate an + * unrestricted LKEY. + */ + rkt->gen++; + mr->lkey = (r << (32 - ib_ipath_lkey_table_size)) | + ((((1 << (24 - ib_ipath_lkey_table_size)) - 1) & rkt->gen) + << 8); + if (mr->lkey == 0) { + mr->lkey |= 1 << 8; + rkt->gen++; + } + rkt->table[r] = mr; + spin_unlock_irqrestore(&rkt->lock, flags); + + ret = 1; + +bail: + return ret; +} + +/** + * ipath_free_lkey - free an lkey + * @rkt: table from which to free the lkey + * @lkey: lkey id to free + */ +void ipath_free_lkey(struct ipath_lkey_table *rkt, u32 lkey) +{ + unsigned long flags; + u32 r; + + if (lkey == 0) + return; + r = lkey >> (32 - ib_ipath_lkey_table_size); + spin_lock_irqsave(&rkt->lock, flags); + rkt->table[r] = NULL; + spin_unlock_irqrestore(&rkt->lock, flags); +} + +/** + * ipath_lkey_ok - check IB SGE for validity and initialize + * @rkt: table containing lkey to check SGE against + * @isge: outgoing internal SGE + * @sge: SGE to check + * @acc: access flags + * + * Return 1 if valid and successful, otherwise returns 0. + * + * Check the IB SGE for validity and initialize our internal version + * of it. + */ +int ipath_lkey_ok(struct ipath_lkey_table *rkt, struct ipath_sge *isge, + struct ib_sge *sge, int acc) +{ + struct ipath_mregion *mr; + size_t off; + int ret; + + /* + * We use LKEY == zero to mean a physical kmalloc() address. + * This is a bit of a hack since we rely on dma_map_single() + * being reversible by calling bus_to_virt(). + */ + if (sge->lkey == 0) { + isge->mr = NULL; + isge->vaddr = bus_to_virt(sge->addr); + isge->length = sge->length; + isge->sge_length = sge->length; + ret = 1; + goto bail; + } + spin_lock(&rkt->lock); + mr = rkt->table[(sge->lkey >> (32 - ib_ipath_lkey_table_size))]; + spin_unlock(&rkt->lock); + if (unlikely(mr == NULL || mr->lkey != sge->lkey)) { + ret = 0; + goto bail; + } + + off = sge->addr - mr->user_base; + if (unlikely(sge->addr < mr->user_base || + off + sge->length > mr->length || + (mr->access_flags & acc) != acc)) { + ret = 0; + goto bail; + } + + off += mr->offset; + isge->mr = mr; + isge->m = 0; + isge->n = 0; + while (off >= mr->map[isge->m]->segs[isge->n].length) { + off -= mr->map[isge->m]->segs[isge->n].length; + isge->n++; + if (isge->n >= IPATH_SEGSZ) { + isge->m++; + isge->n = 0; + } + } + isge->vaddr = mr->map[isge->m]->segs[isge->n].vaddr + off; + isge->length = mr->map[isge->m]->segs[isge->n].length - off; + isge->sge_length = sge->length; + + ret = 1; + +bail: + return ret; +} + +/** + * ipath_rkey_ok - check the IB virtual address, length, and RKEY + * @dev: infiniband device + * @ss: SGE state + * @len: length of data + * @vaddr: virtual address to place data + * @rkey: rkey to check + * @acc: access flags + * + * Return 1 if successful, otherwise 0. + * + * The QP r_rq.lock should be held. + */ +int ipath_rkey_ok(struct ipath_ibdev *dev, struct ipath_sge_state *ss, + u32 len, u64 vaddr, u32 rkey, int acc) +{ + struct ipath_lkey_table *rkt = &dev->lk_table; + struct ipath_sge *sge = &ss->sge; + struct ipath_mregion *mr; + size_t off; + int ret; + + spin_lock(&rkt->lock); + mr = rkt->table[(rkey >> (32 - ib_ipath_lkey_table_size))]; + spin_unlock(&rkt->lock); + if (unlikely(mr == NULL || mr->lkey != rkey)) { + ret = 0; + goto bail; + } + + off = vaddr - mr->iova; + if (unlikely(vaddr < mr->iova || off + len > mr->length || + (mr->access_flags & acc) == 0)) { + ret = 0; + goto bail; + } + + off += mr->offset; + sge->mr = mr; + sge->m = 0; + sge->n = 0; + while (off >= mr->map[sge->m]->segs[sge->n].length) { + off -= mr->map[sge->m]->segs[sge->n].length; + sge->n++; + if (sge->n >= IPATH_SEGSZ) { + sge->m++; + sge->n = 0; + } + } + sge->vaddr = mr->map[sge->m]->segs[sge->n].vaddr + off; + sge->length = mr->map[sge->m]->segs[sge->n].length - off; + sge->sge_length = len; + ss->sg_list = NULL; + ss->num_sge = 1; + + ret = 1; + +bail: + return ret; +} diff --git a/drivers/infiniband/hw/ipath/ipath_layer.c b/drivers/infiniband/hw/ipath/ipath_layer.c new file mode 100644 index 0000000..69ed110 --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_layer.c @@ -0,0 +1,1515 @@ +/* + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * These are the routines used by layered drivers, currently just the + * layered ethernet driver and verbs layer. + */ + +#include <linux/io.h> +#include <linux/pci.h> +#include <asm/byteorder.h> + +#include "ipath_kernel.h" +#include "ips_common.h" +#include "ipath_layer.h" + +/* Acquire before ipath_devs_lock. */ +static DEFINE_MUTEX(ipath_layer_mutex); + +u16 ipath_layer_rcv_opcode; +static int (*layer_intr)(void *, u32); +static int (*layer_rcv)(void *, void *, struct sk_buff *); +static int (*layer_rcv_lid)(void *, void *); +static int (*verbs_piobufavail)(void *); +static void (*verbs_rcv)(void *, void *, void *, u32); +static int ipath_verbs_registered; + +static void *(*layer_add_one)(int, struct ipath_devdata *); +static void (*layer_remove_one)(void *); +static void *(*verbs_add_one)(int, struct ipath_devdata *); +static void (*verbs_remove_one)(void *); +static void (*verbs_timer_cb)(void *); + +int __ipath_layer_intr(struct ipath_devdata *dd, u32 arg) +{ + int ret = -ENODEV; + + if (dd->ipath_layer.l_arg && layer_intr) + ret = layer_intr(dd->ipath_layer.l_arg, arg); + + return ret; +} + +int ipath_layer_intr(struct ipath_devdata *dd, u32 arg) +{ + int ret; + + mutex_lock(&ipath_layer_mutex); + + ret = __ipath_layer_intr(dd, arg); + + mutex_unlock(&ipath_layer_mutex); + + return ret; +} + +int __ipath_layer_rcv(struct ipath_devdata *dd, void *hdr, + struct sk_buff *skb) +{ + int ret = -ENODEV; + + if (dd->ipath_layer.l_arg && layer_rcv) + ret = layer_rcv(dd->ipath_layer.l_arg, hdr, skb); + + return ret; +} + +int __ipath_layer_rcv_lid(struct ipath_devdata *dd, void *hdr) +{ + int ret = -ENODEV; + + if (dd->ipath_layer.l_arg && layer_rcv_lid) + ret = layer_rcv_lid(dd->ipath_layer.l_arg, hdr); + + return ret; +} + +int __ipath_verbs_piobufavail(struct ipath_devdata *dd) +{ + int ret = -ENODEV; + + if (dd->verbs_layer.l_arg && verbs_piobufavail) + ret = verbs_piobufavail(dd->verbs_layer.l_arg); + + return ret; +} + +int __ipath_verbs_rcv(struct ipath_devdata *dd, void *rc, void *ebuf, + u32 tlen) +{ + int ret = -ENODEV; + + if (dd->verbs_layer.l_arg && verbs_rcv) { + verbs_rcv(dd->verbs_layer.l_arg, rc, ebuf, tlen); + ret = 0; + } + + return ret; +} + +int ipath_layer_set_linkstate(struct ipath_devdata *dd, u8 newstate) +{ + u32 lstate; + int ret; + + switch (newstate) { + case IPATH_IB_LINKDOWN: + ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKINITCMD_POLL << + INFINIPATH_IBCC_LINKINITCMD_SHIFT); + /* don't wait */ + ret = 0; + goto bail; + + case IPATH_IB_LINKDOWN_SLEEP: + ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKINITCMD_SLEEP << + INFINIPATH_IBCC_LINKINITCMD_SHIFT); + /* don't wait */ + ret = 0; + goto bail; + + case IPATH_IB_LINKDOWN_DISABLE: + ipath_set_ib_lstate(dd, + INFINIPATH_IBCC_LINKINITCMD_DISABLE << + INFINIPATH_IBCC_LINKINITCMD_SHIFT); + /* don't wait */ + ret = 0; + goto bail; + + case IPATH_IB_LINKINIT: + if (dd->ipath_flags & IPATH_LINKINIT) { + ret = 0; + goto bail; + } + ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_INIT << + INFINIPATH_IBCC_LINKCMD_SHIFT); + lstate = IPATH_LINKINIT; + break; + + case IPATH_IB_LINKARM: + if (dd->ipath_flags & IPATH_LINKARMED) { + ret = 0; + goto bail; + } + if (!(dd->ipath_flags & + (IPATH_LINKINIT | IPATH_LINKACTIVE))) { + ret = -EINVAL; + goto bail; + } + ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_ARMED << + INFINIPATH_IBCC_LINKCMD_SHIFT); + /* + * Since the port can transition to ACTIVE by receiving + * a non VL 15 packet, wait for either state. + */ + lstate = IPATH_LINKARMED | IPATH_LINKACTIVE; + break; + + case IPATH_IB_LINKACTIVE: + if (dd->ipath_flags & IPATH_LINKACTIVE) { + ret = 0; + goto bail; + } + if (!(dd->ipath_flags & IPATH_LINKARMED)) { + ret = -EINVAL; + goto bail; + } + ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_ACTIVE << + INFINIPATH_IBCC_LINKCMD_SHIFT); + lstate = IPATH_LINKACTIVE; + break; + + default: + ipath_dbg("Invalid linkstate 0x%x requested\n", newstate); + ret = -EINVAL; + goto bail; + } + ret = ipath_wait_linkstate(dd, lstate, 2000); + +bail: + return ret; +} + +EXPORT_SYMBOL_GPL(ipath_layer_set_linkstate); + +/** + * ipath_layer_set_mtu - set the MTU + * @dd: the infinipath device + * @arg: the new MTU + * + * we can handle "any" incoming size, the issue here is whether we + * need to restrict our outgoing size. For now, we don't do any + * sanity checking on this, and we don't deal with what happens to + * programs that are already running when the size changes. + * NOTE: changing the MTU will usually cause the IBC to go back to + * link initialize (IPATH_IBSTATE_INIT) state... + */ +int ipath_layer_set_mtu(struct ipath_devdata *dd, u16 arg) +{ + u32 piosize; + int changed = 0; + int ret; + + /* + * mtu is IB data payload max. It's the largest power of 2 less + * than piosize (or even larger, since it only really controls the + * largest we can receive; we can send the max of the mtu and + * piosize). We check that it's one of the valid IB sizes. + */ + if (arg != 256 && arg != 512 && arg != 1024 && arg != 2048 && + arg != 4096) { + ipath_dbg("Trying to set invalid mtu %u, failing\n", arg); + ret = -EINVAL; + goto bail; + } + if (dd->ipath_ibmtu == arg) { + ret = 0; /* same as current */ + goto bail; + } + + piosize = dd->ipath_ibmaxlen; + dd->ipath_ibmtu = arg; + + if (arg >= (piosize - IPATH_PIO_MAXIBHDR)) { + /* Only if it's not the initial value (or reset to it) */ + if (piosize != dd->ipath_init_ibmaxlen) { + dd->ipath_ibmaxlen = piosize; + changed = 1; + } + } else if ((arg + IPATH_PIO_MAXIBHDR) != dd->ipath_ibmaxlen) { + piosize = arg + IPATH_PIO_MAXIBHDR; + ipath_cdbg(VERBOSE, "ibmaxlen was 0x%x, setting to 0x%x " + "(mtu 0x%x)\n", dd->ipath_ibmaxlen, piosize, + arg); + dd->ipath_ibmaxlen = piosize; + changed = 1; + } + + if (changed) { + /* + * set the IBC maxpktlength to the size of our pio + * buffers in words + */ + u64 ibc = dd->ipath_ibcctrl; + ibc &= ~(INFINIPATH_IBCC_MAXPKTLEN_MASK << + INFINIPATH_IBCC_MAXPKTLEN_SHIFT); + + piosize = piosize - 2 * sizeof(u32); /* ignore pbc */ + dd->ipath_ibmaxlen = piosize; + piosize /= sizeof(u32); /* in words */ + /* + * for ICRC, which we only send in diag test pkt mode, and + * we don't need to worry about that for mtu + */ + piosize += 1; + + ibc |= piosize << INFINIPATH_IBCC_MAXPKTLEN_SHIFT; + dd->ipath_ibcctrl = ibc; + ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl, + dd->ipath_ibcctrl); + dd->ipath_f_tidtemplate(dd); + } + + ret = 0; + +bail: + return ret; +} + +EXPORT_SYMBOL_GPL(ipath_layer_set_mtu); + +int ipath_set_sps_lid(struct ipath_devdata *dd, u32 arg, u8 lmc) +{ + ipath_stats.sps_lid[dd->ipath_unit] = arg; + dd->ipath_lid = arg; + dd->ipath_lmc = lmc; + + mutex_lock(&ipath_layer_mutex); + + if (dd->ipath_layer.l_arg && layer_intr) + layer_intr(dd->ipath_layer.l_arg, IPATH_LAYER_INT_LID); + + mutex_unlock(&ipath_layer_mutex); + + return 0; +} + +EXPORT_SYMBOL_GPL(ipath_set_sps_lid); + +int ipath_layer_set_guid(struct ipath_devdata *dd, __be64 guid) +{ + /* XXX - need to inform anyone who cares this just happened. */ + dd->ipath_guid = guid; + return 0; +} + +EXPORT_SYMBOL_GPL(ipath_layer_set_guid); + +__be64 ipath_layer_get_guid(struct ipath_devdata *dd) +{ + return dd->ipath_guid; +} + +EXPORT_SYMBOL_GPL(ipath_layer_get_guid); + +u32 ipath_layer_get_nguid(struct ipath_devdata *dd) +{ + return dd->ipath_nguid; +} + +EXPORT_SYMBOL_GPL(ipath_layer_get_nguid); + +int ipath_layer_query_device(struct ipath_devdata *dd, u32 * vendor, + u32 * boardrev, u32 * majrev, u32 * minrev) +{ + *vendor = dd->ipath_vendorid; + *boardrev = dd->ipath_boardrev; + *majrev = dd->ipath_majrev; + *minrev = dd->ipath_minrev; + + return 0; +} + +EXPORT_SYMBOL_GPL(ipath_layer_query_device); + +u32 ipath_layer_get_flags(struct ipath_devdata *dd) +{ + return dd->ipath_flags; +} + +EXPORT_SYMBOL_GPL(ipath_layer_get_flags); + +struct device *ipath_layer_get_device(struct ipath_devdata *dd) +{ + return &dd->pcidev->dev; +} + +EXPORT_SYMBOL_GPL(ipath_layer_get_device); + +u16 ipath_layer_get_deviceid(struct ipath_devdata *dd) +{ + return dd->ipath_deviceid; +} + +EXPORT_SYMBOL_GPL(ipath_layer_get_deviceid); + +u64 ipath_layer_get_lastibcstat(struct ipath_devdata *dd) +{ + return dd->ipath_lastibcstat; +} + +EXPORT_SYMBOL_GPL(ipath_layer_get_lastibcstat); + +u32 ipath_layer_get_ibmtu(struct ipath_devdata *dd) +{ + return dd->ipath_ibmtu; +} + +EXPORT_SYMBOL_GPL(ipath_layer_get_ibmtu); + +void ipath_layer_add(struct ipath_devdata *dd) +{ + mutex_lock(&ipath_layer_mutex); + + if (layer_add_one) + dd->ipath_layer.l_arg = + layer_add_one(dd->ipath_unit, dd); + + if (verbs_add_one) + dd->verbs_layer.l_arg = + verbs_add_one(dd->ipath_unit, dd); + + mutex_unlock(&ipath_layer_mutex); +} + +void ipath_layer_del(struct ipath_devdata *dd) +{ + mutex_lock(&ipath_layer_mutex); + + if (dd->ipath_layer.l_arg && layer_remove_one) { + layer_remove_one(dd->ipath_layer.l_arg); + dd->ipath_layer.l_arg = NULL; + } + + if (dd->verbs_layer.l_arg && verbs_remove_one) { + verbs_remove_one(dd->verbs_layer.l_arg); + dd->verbs_layer.l_arg = NULL; + } + + mutex_unlock(&ipath_layer_mutex); +} + +int ipath_layer_register(void *(*l_add)(int, struct ipath_devdata *), + void (*l_remove)(void *), + int (*l_intr)(void *, u32), + int (*l_rcv)(void *, void *, struct sk_buff *), + u16 l_rcv_opcode, + int (*l_rcv_lid)(void *, void *)) +{ + struct ipath_devdata *dd, *tmp; + unsigned long flags; + + mutex_lock(&ipath_layer_mutex); + + layer_add_one = l_add; + layer_remove_one = l_remove; + layer_intr = l_intr; + layer_rcv = l_rcv; + layer_rcv_lid = l_rcv_lid; + ipath_layer_rcv_opcode = l_rcv_opcode; + + spin_lock_irqsave(&ipath_devs_lock, flags); + + list_for_each_entry_safe(dd, tmp, &ipath_dev_list, ipath_list) { + if (!(dd->ipath_flags & IPATH_INITTED)) + continue; + + if (dd->ipath_layer.l_arg) + continue; + + if (!(*dd->ipath_statusp & IPATH_STATUS_SMA)) + *dd->ipath_statusp |= IPATH_STATUS_OIB_SMA; + + spin_unlock_irqrestore(&ipath_devs_lock, flags); + dd->ipath_layer.l_arg = l_add(dd->ipath_unit, dd); + spin_lock_irqsave(&ipath_devs_lock, flags); + } + + spin_unlock_irqrestore(&ipath_devs_lock, flags); + mutex_unlock(&ipath_layer_mutex); + + return 0; +} + +EXPORT_SYMBOL_GPL(ipath_layer_register); + +void ipath_layer_unregister(void) +{ + struct ipath_devdata *dd, *tmp; + unsigned long flags; + + mutex_lock(&ipath_layer_mutex); + spin_lock_irqsave(&ipath_devs_lock, flags); + + list_for_each_entry_safe(dd, tmp, &ipath_dev_list, ipath_list) { + if (dd->ipath_layer.l_arg && layer_remove_one) { + spin_unlock_irqrestore(&ipath_devs_lock, flags); + layer_remove_one(dd->ipath_layer.l_arg); + spin_lock_irqsave(&ipath_devs_lock, flags); + dd->ipath_layer.l_arg = NULL; + } + } + + spin_unlock_irqrestore(&ipath_devs_lock, flags); + + layer_add_one = NULL; + layer_remove_one = NULL; + layer_intr = NULL; + layer_rcv = NULL; + layer_rcv_lid = NULL; + + mutex_unlock(&ipath_layer_mutex); +} + +EXPORT_SYMBOL_GPL(ipath_layer_unregister); + +static void __ipath_verbs_timer(unsigned long arg) +{ + struct ipath_devdata *dd = (struct ipath_devdata *) arg; + + /* + * If port 0 receive packet interrupts are not available, or + * can be missed, poll the receive queue + */ + if (dd->ipath_flags & IPATH_POLL_RX_INTR) + ipath_kreceive(dd); + + /* Handle verbs layer timeouts. */ + if (dd->verbs_layer.l_arg && verbs_timer_cb) + verbs_timer_cb(dd->verbs_layer.l_arg); + + mod_timer(&dd->verbs_layer.l_timer, jiffies + 1); +} + +/** + * ipath_verbs_register - verbs layer registration + * @l_piobufavail: callback for when PIO buffers become available + * @l_rcv: callback for receiving a packet + * @l_timer_cb: timer callback + * @ipath_devdata: device data structure is put here + */ +int ipath_verbs_register(void *(*l_add)(int, struct ipath_devdata *), + void (*l_remove)(void *arg), + int (*l_piobufavail) (void *arg), + void (*l_rcv) (void *arg, void *rhdr, + void *data, u32 tlen), + void (*l_timer_cb) (void *arg)) +{ + struct ipath_devdata *dd, *tmp; + unsigned long flags; + + mutex_lock(&ipath_layer_mutex); + + verbs_add_one = l_add; + verbs_remove_one = l_remove; + verbs_piobufavail = l_piobufavail; + verbs_rcv = l_rcv; + verbs_timer_cb = l_timer_cb; + + spin_lock_irqsave(&ipath_devs_lock, flags); + + list_for_each_entry_safe(dd, tmp, &ipath_dev_list, ipath_list) { + if (!(dd->ipath_flags & IPATH_INITTED)) + continue; + + if (dd->verbs_layer.l_arg) + continue; + + spin_unlock_irqrestore(&ipath_devs_lock, flags); + dd->verbs_layer.l_arg = l_add(dd->ipath_unit, dd); + spin_lock_irqsave(&ipath_devs_lock, flags); + } + + spin_unlock_irqrestore(&ipath_devs_lock, flags); + mutex_unlock(&ipath_layer_mutex); + + ipath_verbs_registered = 1; + + return 0; +} + +EXPORT_SYMBOL_GPL(ipath_verbs_register); + +void ipath_verbs_unregister(void) +{ + struct ipath_devdata *dd, *tmp; + unsigned long flags; + + mutex_lock(&ipath_layer_mutex); + spin_lock_irqsave(&ipath_devs_lock, flags); + + list_for_each_entry_safe(dd, tmp, &ipath_dev_list, ipath_list) { + *dd->ipath_statusp &= ~IPATH_STATUS_OIB_SMA; + + if (dd->verbs_layer.l_arg && verbs_remove_one) { + spin_unlock_irqrestore(&ipath_devs_lock, flags); + verbs_remove_one(dd->verbs_layer.l_arg); + spin_lock_irqsave(&ipath_devs_lock, flags); + dd->verbs_layer.l_arg = NULL; + } + } + + spin_unlock_irqrestore(&ipath_devs_lock, flags); + + verbs_add_one = NULL; + verbs_remove_one = NULL; + verbs_piobufavail = NULL; + verbs_rcv = NULL; + verbs_timer_cb = NULL; + + mutex_unlock(&ipath_layer_mutex); +} + +EXPORT_SYMBOL_GPL(ipath_verbs_unregister); + +int ipath_layer_open(struct ipath_devdata *dd, u32 * pktmax) +{ + int ret; + u32 intval = 0; + + mutex_lock(&ipath_layer_mutex); + + if (!dd->ipath_layer.l_arg) { + ret = -EINVAL; + goto bail; + } + + ret = ipath_setrcvhdrsize(dd, NUM_OF_EXTRA_WORDS_IN_HEADER_QUEUE); + + if (ret < 0) + goto bail; + + *pktmax = dd->ipath_ibmaxlen; + + if (*dd->ipath_statusp & IPATH_STATUS_IB_READY) + intval |= IPATH_LAYER_INT_IF_UP; + if (ipath_stats.sps_lid[dd->ipath_unit]) + intval |= IPATH_LAYER_INT_LID; + if (ipath_stats.sps_mlid[dd->ipath_unit]) + intval |= IPATH_LAYER_INT_BCAST; + /* + * do this on open, in case low level is already up and + * just layered driver was reloaded, etc. + */ + if (intval) + layer_intr(dd->ipath_layer.l_arg, intval); + + ret = 0; +bail: + mutex_unlock(&ipath_layer_mutex); + + return ret; +} + +EXPORT_SYMBOL_GPL(ipath_layer_open); + +u16 ipath_layer_get_lid(struct ipath_devdata *dd) +{ + return dd->ipath_lid; +} + +EXPORT_SYMBOL_GPL(ipath_layer_get_lid); + +/** + * ipath_layer_get_mac - get the MAC address + * @dd: the infinipath device + * @mac: the MAC is put here + * + * This is the EUID-64 OUI octets (top 3), then + * skip the next 2 (which should both be zero or 0xff). + * The returned MAC is in network order + * mac points to at least 6 bytes of buffer + * We assume that by the time the LID is set, that the GUID is as valid + * as it's ever going to be, rather than adding yet another status bit. + */ + +int ipath_layer_get_mac(struct ipath_devdata *dd, u8 * mac) +{ + u8 *guid; + + guid = (u8 *) &dd->ipath_guid; + + mac[0] = guid[0]; + mac[1] = guid[1]; + mac[2] = guid[2]; + mac[3] = guid[5]; + mac[4] = guid[6]; + mac[5] = guid[7]; + if ((guid[3] || guid[4]) && !(guid[3] == 0xff && guid[4] == 0xff)) + ipath_dbg("Warning, guid bytes 3 and 4 not 0 or 0xffff: " + "%x %x\n", guid[3], guid[4]); + return 0; +} + +EXPORT_SYMBOL_GPL(ipath_layer_get_mac); + +u16 ipath_layer_get_bcast(struct ipath_devdata *dd) +{ + return dd->ipath_mlid; +} + +EXPORT_SYMBOL_GPL(ipath_layer_get_bcast); + +u32 ipath_layer_get_cr_errpkey(struct ipath_devdata *dd) +{ + return ipath_read_creg32(dd, dd->ipath_cregs->cr_errpkey); +} + +EXPORT_SYMBOL_GPL(ipath_layer_get_cr_errpkey); + +static void update_sge(struct ipath_sge_state *ss, u32 length) +{ + struct ipath_sge *sge = &ss->sge; + + sge->vaddr += length; + sge->length -= length; + sge->sge_length -= length; + if (sge->sge_length == 0) { + if (--ss->num_sge) + *sge = *ss->sg_list++; + } else if (sge->length == 0 && sge->mr != NULL) { + if (++sge->n >= IPATH_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + return; + sge->n = 0; + } + sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = sge->mr->map[sge->m]->segs[sge->n].length; + } +} + +#ifdef __LITTLE_ENDIAN +static inline u32 get_upper_bits(u32 data, u32 shift) +{ + return data >> shift; +} + +static inline u32 set_upper_bits(u32 data, u32 shift) +{ + return data << shift; +} + +static inline u32 clear_upper_bytes(u32 data, u32 n, u32 off) +{ + data <<= ((sizeof(u32) - n) * BITS_PER_BYTE); + data >>= ((sizeof(u32) - n - off) * BITS_PER_BYTE); + return data; +} +#else +static inline u32 get_upper_bits(u32 data, u32 shift) +{ + return data << shift; +} + +static inline u32 set_upper_bits(u32 data, u32 shift) +{ + return data >> shift; +} + +static inline u32 clear_upper_bytes(u32 data, u32 n, u32 off) +{ + data >>= ((sizeof(u32) - n) * BITS_PER_BYTE); + data <<= ((sizeof(u32) - n - off) * BITS_PER_BYTE); + return data; +} +#endif + +static void copy_io(u32 __iomem *piobuf, struct ipath_sge_state *ss, + u32 length) +{ + u32 extra = 0; + u32 data = 0; + u32 last; + + while (1) { + u32 len = ss->sge.length; + u32 off; + + BUG_ON(len == 0); + if (len > length) + len = length; + if (len > ss->sge.sge_length) + len = ss->sge.sge_length; + /* If the source address is not aligned, try to align it. */ + off = (unsigned long)ss->sge.vaddr & (sizeof(u32) - 1); + if (off) { + u32 *addr = (u32 *)((unsigned long)ss->sge.vaddr & + ~(sizeof(u32) - 1)); + u32 v = get_upper_bits(*addr, off * BITS_PER_BYTE); + u32 y; + + y = sizeof(u32) - off; + if (len > y) + len = y; + if (len + extra >= sizeof(u32)) { + data |= set_upper_bits(v, extra * + BITS_PER_BYTE); + len = sizeof(u32) - extra; + if (len == length) { + last = data; + break; + } + __raw_writel(data, piobuf); + piobuf++; + extra = 0; + data = 0; + } else { + /* Clear unused upper bytes */ + data |= clear_upper_bytes(v, len, extra); + if (len == length) { + last = data; + break; + } + extra += len; + } + } else if (extra) { + /* Source address is aligned. */ + u32 *addr = (u32 *) ss->sge.vaddr; + int shift = extra * BITS_PER_BYTE; + int ushift = 32 - shift; + u32 l = len; + + while (l >= sizeof(u32)) { + u32 v = *addr; + + data |= set_upper_bits(v, shift); + __raw_writel(data, piobuf); + data = get_upper_bits(v, ushift); + piobuf++; + addr++; + l -= sizeof(u32); + } + /* + * We still have 'extra' number of bytes leftover. + */ + if (l) { + u32 v = *addr; + + if (l + extra >= sizeof(u32)) { + data |= set_upper_bits(v, shift); + len -= l + extra - sizeof(u32); + if (len == length) { + last = data; + break; + } + __raw_writel(data, piobuf); + piobuf++; + extra = 0; + data = 0; + } else { + /* Clear unused upper bytes */ + data |= clear_upper_bytes(v, l, + extra); + if (len == length) { + last = data; + break; + } + extra += l; + } + } else if (len == length) { + last = data; + break; + } + } else if (len == length) { + u32 w; + + /* + * Need to round up for the last dword in the + * packet. + */ + w = (len + 3) >> 2; + __iowrite32_copy(piobuf, ss->sge.vaddr, w - 1); + piobuf += w - 1; + last = ((u32 *) ss->sge.vaddr)[w - 1]; + break; + } else { + u32 w = len >> 2; + + __iowrite32_copy(piobuf, ss->sge.vaddr, w); + piobuf += w; + + extra = len & (sizeof(u32) - 1); + if (extra) { + u32 v = ((u32 *) ss->sge.vaddr)[w]; + + /* Clear unused upper bytes */ + data = clear_upper_bytes(v, extra, 0); + } + } + update_sge(ss, len); + length -= len; + } + /* must flush early everything before trigger word */ + ipath_flush_wc(); + __raw_writel(last, piobuf); + /* be sure trigger word is written */ + ipath_flush_wc(); + update_sge(ss, length); +} + +/** + * ipath_verbs_send - send a packet from the verbs layer + * @dd: the infinipath device + * @hdrwords: the number of works in the header + * @hdr: the packet header + * @len: the length of the packet in bytes + * @ss: the SGE to send + * + * This is like ipath_sma_send_pkt() in that we need to be able to send + * packets after the chip is initialized (MADs) but also like + * ipath_layer_send_hdr() since its used by the verbs layer. + */ +int ipath_verbs_send(struct ipath_devdata *dd, u32 hdrwords, + u32 *hdr, u32 len, struct ipath_sge_state *ss) +{ + u32 __iomem *piobuf; + u32 plen; + int ret; + + /* +1 is for the qword padding of pbc */ + plen = hdrwords + ((len + 3) >> 2) + 1; + if (unlikely((plen << 2) > dd->ipath_ibmaxlen)) { + ipath_dbg("packet len 0x%x too long, failing\n", plen); + ret = -EINVAL; + goto bail; + } + + /* Get a PIO buffer to use. */ + piobuf = ipath_getpiobuf(dd, NULL); + if (unlikely(piobuf == NULL)) { + ret = -EBUSY; + goto bail; + } + + /* + * Write len to control qword, no flags. + * We have to flush after the PBC for correctness on some cpus + * or WC buffer can be written out of order. + */ + writeq(plen, piobuf); + ipath_flush_wc(); + piobuf += 2; + if (len == 0) { + /* + * If there is just the header portion, must flush before + * writing last word of header for correctness, and after + * the last header word (trigger word). + */ + __iowrite32_copy(piobuf, hdr, hdrwords - 1); + ipath_flush_wc(); + __raw_writel(hdr[hdrwords - 1], piobuf + hdrwords - 1); + ipath_flush_wc(); + ret = 0; + goto bail; + } + + __iowrite32_copy(piobuf, hdr, hdrwords); + piobuf += hdrwords; + + /* The common case is aligned and contained in one segment. */ + if (likely(ss->num_sge == 1 && len <= ss->sge.length && + !((unsigned long)ss->sge.vaddr & (sizeof(u32) - 1)))) { + u32 w; + + /* Need to round up for the last dword in the packet. */ + w = (len + 3) >> 2; + __iowrite32_copy(piobuf, ss->sge.vaddr, w - 1); + /* must flush early everything before trigger word */ + ipath_flush_wc(); + __raw_writel(((u32 *) ss->sge.vaddr)[w - 1], + piobuf + w - 1); + /* be sure trigger word is written */ + ipath_flush_wc(); + update_sge(ss, len); + ret = 0; + goto bail; + } + copy_io(piobuf, ss, len); + ret = 0; + +bail: + return ret; +} + +EXPORT_SYMBOL_GPL(ipath_verbs_send); + +int ipath_layer_snapshot_counters(struct ipath_devdata *dd, u64 *swords, + u64 *rwords, u64 *spkts, u64 *rpkts, + u64 *xmit_wait) +{ + int ret; + + if (!(dd->ipath_flags & IPATH_INITTED)) { + /* no hardware, freeze, etc. */ + ipath_dbg("unit %u not usable\n", dd->ipath_unit); + ret = -EINVAL; + goto bail; + } + *swords = ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordsendcnt); + *rwords = ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordrcvcnt); + *spkts = ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktsendcnt); + *rpkts = ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktrcvcnt); + *xmit_wait = ipath_snap_cntr(dd, dd->ipath_cregs->cr_sendstallcnt); + + ret = 0; + +bail: + return ret; +} + +EXPORT_SYMBOL_GPL(ipath_layer_snapshot_counters); + +/** + * ipath_layer_get_counters - get various chip counters + * @dd: the infinipath device + * @cntrs: counters are placed here + * + * Return the counters needed by recv_pma_get_portcounters(). + */ +int ipath_layer_get_counters(struct ipath_devdata *dd, + struct ipath_layer_counters *cntrs) +{ + int ret; + + if (!(dd->ipath_flags & IPATH_INITTED)) { + /* no hardware, freeze, etc. */ + ipath_dbg("unit %u not usable\n", dd->ipath_unit); + ret = -EINVAL; + goto bail; + } + cntrs->symbol_error_counter = + ipath_snap_cntr(dd, dd->ipath_cregs->cr_ibsymbolerrcnt); + cntrs->link_error_recovery_counter = + ipath_snap_cntr(dd, dd->ipath_cregs->cr_iblinkerrrecovcnt); + cntrs->link_downed_counter = + ipath_snap_cntr(dd, dd->ipath_cregs->cr_iblinkdowncnt); + cntrs->port_rcv_errors = + ipath_snap_cntr(dd, dd->ipath_cregs->cr_rxdroppktcnt) + + ipath_snap_cntr(dd, dd->ipath_cregs->cr_rcvovflcnt) + + ipath_snap_cntr(dd, dd->ipath_cregs->cr_portovflcnt) + + ipath_snap_cntr(dd, dd->ipath_cregs->cr_errrcvflowctrlcnt) + + ipath_snap_cntr(dd, dd->ipath_cregs->cr_err_rlencnt) + + ipath_snap_cntr(dd, dd->ipath_cregs->cr_invalidrlencnt) + + ipath_snap_cntr(dd, dd->ipath_cregs->cr_erricrccnt) + + ipath_snap_cntr(dd, dd->ipath_cregs->cr_errvcrccnt) + + ipath_snap_cntr(dd, dd->ipath_cregs->cr_errlpcrccnt) + + ipath_snap_cntr(dd, dd->ipath_cregs->cr_errlinkcnt) + + ipath_snap_cntr(dd, dd->ipath_cregs->cr_badformatcnt); + cntrs->port_rcv_remphys_errors = + ipath_snap_cntr(dd, dd->ipath_cregs->cr_rcvebpcnt); + cntrs->port_xmit_discards = + ipath_snap_cntr(dd, dd->ipath_cregs->cr_unsupvlcnt); + cntrs->port_xmit_data = + ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordsendcnt); + cntrs->port_rcv_data = + ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordrcvcnt); + cntrs->port_xmit_packets = + ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktsendcnt); + cntrs->port_rcv_packets = + ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktrcvcnt); + + ret = 0; + +bail: + return ret; +} + +EXPORT_SYMBOL_GPL(ipath_layer_get_counters); + +int ipath_layer_want_buffer(struct ipath_devdata *dd) +{ + set_bit(IPATH_S_PIOINTBUFAVAIL, &dd->ipath_sendctrl); + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl); + + return 0; +} + +EXPORT_SYMBOL_GPL(ipath_layer_want_buffer); + +int ipath_layer_send_hdr(struct ipath_devdata *dd, struct ether_header *hdr) +{ + int ret = 0; + u32 __iomem *piobuf; + u32 plen, *uhdr; + size_t count; + __be16 vlsllnh; + + if (!(dd->ipath_flags & IPATH_RCVHDRSZ_SET)) { + ipath_dbg("send while not open\n"); + ret = -EINVAL; + } else + if ((dd->ipath_flags & (IPATH_LINKUNK | IPATH_LINKDOWN)) || + dd->ipath_lid == 0) { + /* + * lid check is for when sma hasn't yet configured + */ + ret = -ENETDOWN; + ipath_cdbg(VERBOSE, "send while not ready, " + "mylid=%u, flags=0x%x\n", + dd->ipath_lid, dd->ipath_flags); + } + + vlsllnh = *((__be16 *) hdr); + if (vlsllnh != htons(IPS_LRH_BTH)) { + ipath_dbg("Warning: lrh[0] wrong (%x, not %x); " + "not sending\n", be16_to_cpu(vlsllnh), + IPS_LRH_BTH); + ret = -EINVAL; + } + if (ret) + goto done; + + /* Get a PIO buffer to use. */ + piobuf = ipath_getpiobuf(dd, NULL); + if (piobuf == NULL) { + ret = -EBUSY; + goto done; + } + + plen = (sizeof(*hdr) >> 2); /* actual length */ + ipath_cdbg(EPKT, "0x%x+1w pio %p\n", plen, piobuf); + + writeq(plen+1, piobuf); /* len (+1 for pad) to pbc, no flags */ + ipath_flush_wc(); + piobuf += 2; + uhdr = (u32 *)hdr; + count = plen-1; /* amount we can copy before trigger word */ + __iowrite32_copy(piobuf, uhdr, count); + ipath_flush_wc(); + __raw_writel(uhdr[count], piobuf + count); + ipath_flush_wc(); /* ensure it's sent, now */ + + ipath_stats.sps_ether_spkts++; /* ether packet sent */ + +done: + return ret; +} + +EXPORT_SYMBOL_GPL(ipath_layer_send_hdr); + +int ipath_layer_set_piointbufavail_int(struct ipath_devdata *dd) +{ + set_bit(IPATH_S_PIOINTBUFAVAIL, &dd->ipath_sendctrl); + + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl); + return 0; +} + +EXPORT_SYMBOL_GPL(ipath_layer_set_piointbufavail_int); + +int ipath_layer_enable_timer(struct ipath_devdata *dd) +{ + /* + * HT-400 has a design flaw where the chip and kernel idea + * of the tail register don't always agree, and therefore we won't + * get an interrupt on the next packet received. + * If the board supports per packet receive interrupts, use it. + * Otherwise, the timer function periodically checks for packets + * to cover this case. + * Either way, the timer is needed for verbs layer related + * processing. + */ + if (dd->ipath_flags & IPATH_GPIO_INTR) { + ipath_write_kreg(dd, dd->ipath_kregs->kr_debugportselect, + 0x2074076542310ULL); + /* Enable GPIO bit 2 interrupt */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask, + (u64) (1 << 2)); + } + + init_timer(&dd->verbs_layer.l_timer); + dd->verbs_layer.l_timer.function = __ipath_verbs_timer; + dd->verbs_layer.l_timer.data = (unsigned long)dd; + dd->verbs_layer.l_timer.expires = jiffies + 1; + add_timer(&dd->verbs_layer.l_timer); + + return 0; +} + +EXPORT_SYMBOL_GPL(ipath_layer_enable_timer); + +int ipath_layer_disable_timer(struct ipath_devdata *dd) +{ + /* Disable GPIO bit 2 interrupt */ + if (dd->ipath_flags & IPATH_GPIO_INTR) + ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask, 0); + + del_timer_sync(&dd->verbs_layer.l_timer); + + return 0; +} + +EXPORT_SYMBOL_GPL(ipath_layer_disable_timer); + +/** + * ipath_layer_set_verbs_flags - set the verbs layer flags + * @dd: the infinipath device + * @flags: the flags to set + */ +int ipath_layer_set_verbs_flags(struct ipath_devdata *dd, unsigned flags) +{ + struct ipath_devdata *ss; + unsigned long lflags; + + spin_lock_irqsave(&ipath_devs_lock, lflags); + + list_for_each_entry(ss, &ipath_dev_list, ipath_list) { + if (!(ss->ipath_flags & IPATH_INITTED)) + continue; + if ((flags & IPATH_VERBS_KERNEL_SMA) && + !(*ss->ipath_statusp & IPATH_STATUS_SMA)) + *ss->ipath_statusp |= IPATH_STATUS_OIB_SMA; + else + *ss->ipath_statusp &= ~IPATH_STATUS_OIB_SMA; + } + + spin_unlock_irqrestore(&ipath_devs_lock, lflags); + + return 0; +} + +EXPORT_SYMBOL_GPL(ipath_layer_set_verbs_flags); + +/** + * ipath_layer_get_npkeys - return the size of the PKEY table for port 0 + * @dd: the infinipath device + */ +unsigned ipath_layer_get_npkeys(struct ipath_devdata *dd) +{ + return ARRAY_SIZE(dd->ipath_pd[0]->port_pkeys); +} + +EXPORT_SYMBOL_GPL(ipath_layer_get_npkeys); + +/** + * ipath_layer_get_pkey - return the indexed PKEY from the port 0 PKEY table + * @dd: the infinipath device + * @index: the PKEY index + */ +unsigned ipath_layer_get_pkey(struct ipath_devdata *dd, unsigned index) +{ + unsigned ret; + + if (index >= ARRAY_SIZE(dd->ipath_pd[0]->port_pkeys)) + ret = 0; + else + ret = dd->ipath_pd[0]->port_pkeys[index]; + + return ret; +} + +EXPORT_SYMBOL_GPL(ipath_layer_get_pkey); + +/** + * ipath_layer_get_pkeys - return the PKEY table for port 0 + * @dd: the infinipath device + * @pkeys: the pkey table is placed here + */ +int ipath_layer_get_pkeys(struct ipath_devdata *dd, u16 * pkeys) +{ + struct ipath_portdata *pd = dd->ipath_pd[0]; + + memcpy(pkeys, pd->port_pkeys, sizeof(pd->port_pkeys)); + + return 0; +} + +EXPORT_SYMBOL_GPL(ipath_layer_get_pkeys); + +/** + * rm_pkey - decrecment the reference count for the given PKEY + * @dd: the infinipath device + * @key: the PKEY index + * + * Return true if this was the last reference and the hardware table entry + * needs to be changed. + */ +static int rm_pkey(struct ipath_devdata *dd, u16 key) +{ + int i; + int ret; + + for (i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) { + if (dd->ipath_pkeys[i] != key) + continue; + if (atomic_dec_and_test(&dd->ipath_pkeyrefs[i])) { + dd->ipath_pkeys[i] = 0; + ret = 1; + goto bail; + } + break; + } + + ret = 0; + +bail: + return ret; +} + +/** + * add_pkey - add the given PKEY to the hardware table + * @dd: the infinipath device + * @key: the PKEY + * + * Return an error code if unable to add the entry, zero if no change, + * or 1 if the hardware PKEY register needs to be updated. + */ +static int add_pkey(struct ipath_devdata *dd, u16 key) +{ + int i; + u16 lkey = key & 0x7FFF; + int any = 0; + int ret; + + if (lkey == 0x7FFF) { + ret = 0; + goto bail; + } + + /* Look for an empty slot or a matching PKEY. */ + for (i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) { + if (!dd->ipath_pkeys[i]) { + any++; + continue; + } + /* If it matches exactly, try to increment the ref count */ + if (dd->ipath_pkeys[i] == key) { + if (atomic_inc_return(&dd->ipath_pkeyrefs[i]) > 1) { + ret = 0; + goto bail; + } + /* Lost the race. Look for an empty slot below. */ + atomic_dec(&dd->ipath_pkeyrefs[i]); + any++; + } + /* + * It makes no sense to have both the limited and unlimited + * PKEY set at the same time since the unlimited one will + * disable the limited one. + */ + if ((dd->ipath_pkeys[i] & 0x7FFF) == lkey) { + ret = -EEXIST; + goto bail; + } + } + if (!any) { + ret = -EBUSY; + goto bail; + } + for (i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) { + if (!dd->ipath_pkeys[i] && + atomic_inc_return(&dd->ipath_pkeyrefs[i]) == 1) { + /* for ipathstats, etc. */ + ipath_stats.sps_pkeys[i] = lkey; + dd->ipath_pkeys[i] = key; + ret = 1; + goto bail; + } + } + ret = -EBUSY; + +bail: + return ret; +} + +/** + * ipath_layer_set_pkeys - set the PKEY table for port 0 + * @dd: the infinipath device + * @pkeys: the PKEY table + */ +int ipath_layer_set_pkeys(struct ipath_devdata *dd, u16 * pkeys) +{ + struct ipath_portdata *pd; + int i; + int changed = 0; + + pd = dd->ipath_pd[0]; + + for (i = 0; i < ARRAY_SIZE(pd->port_pkeys); i++) { + u16 key = pkeys[i]; + u16 okey = pd->port_pkeys[i]; + + if (key == okey) + continue; + /* + * The value of this PKEY table entry is changing. + * Remove the old entry in the hardware's array of PKEYs. + */ + if (okey & 0x7FFF) + changed |= rm_pkey(dd, okey); + if (key & 0x7FFF) { + int ret = add_pkey(dd, key); + + if (ret < 0) + key = 0; + else + changed |= ret; + } + pd->port_pkeys[i] = key; + } + if (changed) { + u64 pkey; + + pkey = (u64) dd->ipath_pkeys[0] | + ((u64) dd->ipath_pkeys[1] << 16) | + ((u64) dd->ipath_pkeys[2] << 32) | + ((u64) dd->ipath_pkeys[3] << 48); + ipath_cdbg(VERBOSE, "p0 new pkey reg %llx\n", + (unsigned long long) pkey); + ipath_write_kreg(dd, dd->ipath_kregs->kr_partitionkey, + pkey); + } + return 0; +} + +EXPORT_SYMBOL_GPL(ipath_layer_set_pkeys); + +/** + * ipath_layer_get_linkdowndefaultstate - get the default linkdown state + * @dd: the infinipath device + * + * Returns zero if the default is POLL, 1 if the default is SLEEP. + */ +int ipath_layer_get_linkdowndefaultstate(struct ipath_devdata *dd) +{ + return !!(dd->ipath_ibcctrl & INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE); +} + +EXPORT_SYMBOL_GPL(ipath_layer_get_linkdowndefaultstate); + +/** + * ipath_layer_set_linkdowndefaultstate - set the default linkdown state + * @dd: the infinipath device + * @sleep: the new state + * + * Note that this will only take effect when the link state changes. + */ +int ipath_layer_set_linkdowndefaultstate(struct ipath_devdata *dd, + int sleep) +{ + if (sleep) + dd->ipath_ibcctrl |= INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE; + else + dd->ipath_ibcctrl &= ~INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE; + ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl, + dd->ipath_ibcctrl); + return 0; +} + +EXPORT_SYMBOL_GPL(ipath_layer_set_linkdowndefaultstate); + +int ipath_layer_get_phyerrthreshold(struct ipath_devdata *dd) +{ + return (dd->ipath_ibcctrl >> + INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT) & + INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK; +} + +EXPORT_SYMBOL_GPL(ipath_layer_get_phyerrthreshold); + +/** + * ipath_layer_set_phyerrthreshold - set the physical error threshold + * @dd: the infinipath device + * @n: the new threshold + * + * Note that this will only take effect when the link state changes. + */ +int ipath_layer_set_phyerrthreshold(struct ipath_devdata *dd, unsigned n) +{ + unsigned v; + + v = (dd->ipath_ibcctrl >> INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT) & + INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK; + if (v != n) { + dd->ipath_ibcctrl &= + ~(INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK << + INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT); + dd->ipath_ibcctrl |= + (u64) n << INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT; + ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl, + dd->ipath_ibcctrl); + } + return 0; +} + +EXPORT_SYMBOL_GPL(ipath_layer_set_phyerrthreshold); + +int ipath_layer_get_overrunthreshold(struct ipath_devdata *dd) +{ + return (dd->ipath_ibcctrl >> + INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT) & + INFINIPATH_IBCC_OVERRUNTHRESHOLD_MASK; +} + +EXPORT_SYMBOL_GPL(ipath_layer_get_overrunthreshold); + +/** + * ipath_layer_set_overrunthreshold - set the overrun threshold + * @dd: the infinipath device + * @n: the new threshold + * + * Note that this will only take effect when the link state changes. + */ +int ipath_layer_set_overrunthreshold(struct ipath_devdata *dd, unsigned n) +{ + unsigned v; + + v = (dd->ipath_ibcctrl >> INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT) & + INFINIPATH_IBCC_OVERRUNTHRESHOLD_MASK; + if (v != n) { + dd->ipath_ibcctrl &= + ~(INFINIPATH_IBCC_OVERRUNTHRESHOLD_MASK << + INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT); + dd->ipath_ibcctrl |= + (u64) n << INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT; + ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl, + dd->ipath_ibcctrl); + } + return 0; +} + +EXPORT_SYMBOL_GPL(ipath_layer_set_overrunthreshold); + +int ipath_layer_get_boardname(struct ipath_devdata *dd, char *name, + size_t namelen) +{ + return dd->ipath_f_get_boardname(dd, name, namelen); +} +EXPORT_SYMBOL_GPL(ipath_layer_get_boardname); + +u32 ipath_layer_get_rcvhdrentsize(struct ipath_devdata *dd) +{ + return dd->ipath_rcvhdrentsize; +} +EXPORT_SYMBOL_GPL(ipath_layer_get_rcvhdrentsize); diff --git a/drivers/infiniband/hw/ipath/ipath_layer.h b/drivers/infiniband/hw/ipath/ipath_layer.h new file mode 100644 index 0000000..6fefd15 --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_layer.h @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _IPATH_LAYER_H +#define _IPATH_LAYER_H + +/* + * This header file is for symbols shared between the infinipath driver + * and drivers layered upon it (such as ipath). + */ + +struct sk_buff; +struct ipath_sge_state; +struct ipath_devdata; +struct ether_header; + +struct ipath_layer_counters { + u64 symbol_error_counter; + u64 link_error_recovery_counter; + u64 link_downed_counter; + u64 port_rcv_errors; + u64 port_rcv_remphys_errors; + u64 port_xmit_discards; + u64 port_xmit_data; + u64 port_rcv_data; + u64 port_xmit_packets; + u64 port_rcv_packets; +}; + +/* + * A segment is a linear region of low physical memory. + * XXX Maybe we should use phys addr here and kmap()/kunmap(). + * Used by the verbs layer. + */ +struct ipath_seg { + void *vaddr; + size_t length; +}; + +/* The number of ipath_segs that fit in a page. */ +#define IPATH_SEGSZ (PAGE_SIZE / sizeof (struct ipath_seg)) + +struct ipath_segarray { + struct ipath_seg segs[IPATH_SEGSZ]; +}; + +struct ipath_mregion { + u64 user_base; /* User's address for this region */ + u64 iova; /* IB start address of this region */ + size_t length; + u32 lkey; + u32 offset; /* offset (bytes) to start of region */ + int access_flags; + u32 max_segs; /* number of ipath_segs in all the arrays */ + u32 mapsz; /* size of the map array */ + struct ipath_segarray *map[0]; /* the segments */ +}; + +/* + * These keep track of the copy progress within a memory region. + * Used by the verbs layer. + */ +struct ipath_sge { + struct ipath_mregion *mr; + void *vaddr; /* current pointer into the segment */ + u32 sge_length; /* length of the SGE */ + u32 length; /* remaining length of the segment */ + u16 m; /* current index: mr->map[m] */ + u16 n; /* current index: mr->map[m]->segs[n] */ +}; + +struct ipath_sge_state { + struct ipath_sge *sg_list; /* next SGE to be used if any */ + struct ipath_sge sge; /* progress state for the current SGE */ + u8 num_sge; +}; + +int ipath_layer_register(void *(*l_add)(int, struct ipath_devdata *), + void (*l_remove)(void *), + int (*l_intr)(void *, u32), + int (*l_rcv)(void *, void *, + struct sk_buff *), + u16 rcv_opcode, + int (*l_rcv_lid)(void *, void *)); +int ipath_verbs_register(void *(*l_add)(int, struct ipath_devdata *), + void (*l_remove)(void *arg), + int (*l_piobufavail)(void *arg), + void (*l_rcv)(void *arg, void *rhdr, + void *data, u32 tlen), + void (*l_timer_cb)(void *arg)); +void ipath_layer_unregister(void); +void ipath_verbs_unregister(void); +int ipath_layer_open(struct ipath_devdata *, u32 * pktmax); +u16 ipath_layer_get_lid(struct ipath_devdata *dd); +int ipath_layer_get_mac(struct ipath_devdata *dd, u8 *); +u16 ipath_layer_get_bcast(struct ipath_devdata *dd); +u32 ipath_layer_get_cr_errpkey(struct ipath_devdata *dd); +int ipath_layer_set_linkstate(struct ipath_devdata *dd, u8 state); +int ipath_layer_set_mtu(struct ipath_devdata *, u16); +int ipath_set_sps_lid(struct ipath_devdata *, u32, u8); +int ipath_layer_send_hdr(struct ipath_devdata *dd, + struct ether_header *hdr); +int ipath_verbs_send(struct ipath_devdata *dd, u32 hdrwords, + u32 * hdr, u32 len, struct ipath_sge_state *ss); +int ipath_layer_set_piointbufavail_int(struct ipath_devdata *dd); +int ipath_layer_get_boardname(struct ipath_devdata *dd, char *name, + size_t namelen); +int ipath_layer_snapshot_counters(struct ipath_devdata *dd, u64 *swords, + u64 *rwords, u64 *spkts, u64 *rpkts, + u64 *xmit_wait); +int ipath_layer_get_counters(struct ipath_devdata *dd, + struct ipath_layer_counters *cntrs); +int ipath_layer_want_buffer(struct ipath_devdata *dd); +int ipath_layer_set_guid(struct ipath_devdata *, __be64 guid); +__be64 ipath_layer_get_guid(struct ipath_devdata *); +u32 ipath_layer_get_nguid(struct ipath_devdata *); +int ipath_layer_query_device(struct ipath_devdata *, u32 * vendor, + u32 * boardrev, u32 * majrev, u32 * minrev); +u32 ipath_layer_get_flags(struct ipath_devdata *dd); +struct device *ipath_layer_get_device(struct ipath_devdata *dd); +u16 ipath_layer_get_deviceid(struct ipath_devdata *dd); +u64 ipath_layer_get_lastibcstat(struct ipath_devdata *dd); +u32 ipath_layer_get_ibmtu(struct ipath_devdata *dd); +int ipath_layer_enable_timer(struct ipath_devdata *dd); +int ipath_layer_disable_timer(struct ipath_devdata *dd); +int ipath_layer_set_verbs_flags(struct ipath_devdata *dd, unsigned flags); +unsigned ipath_layer_get_npkeys(struct ipath_devdata *dd); +unsigned ipath_layer_get_pkey(struct ipath_devdata *dd, unsigned index); +int ipath_layer_get_pkeys(struct ipath_devdata *dd, u16 *pkeys); +int ipath_layer_set_pkeys(struct ipath_devdata *dd, u16 *pkeys); +int ipath_layer_get_linkdowndefaultstate(struct ipath_devdata *dd); +int ipath_layer_set_linkdowndefaultstate(struct ipath_devdata *dd, + int sleep); +int ipath_layer_get_phyerrthreshold(struct ipath_devdata *dd); +int ipath_layer_set_phyerrthreshold(struct ipath_devdata *dd, unsigned n); +int ipath_layer_get_overrunthreshold(struct ipath_devdata *dd); +int ipath_layer_set_overrunthreshold(struct ipath_devdata *dd, unsigned n); +u32 ipath_layer_get_rcvhdrentsize(struct ipath_devdata *dd); + +/* ipath_ether interrupt values */ +#define IPATH_LAYER_INT_IF_UP 0x2 +#define IPATH_LAYER_INT_IF_DOWN 0x4 +#define IPATH_LAYER_INT_LID 0x8 +#define IPATH_LAYER_INT_SEND_CONTINUE 0x10 +#define IPATH_LAYER_INT_BCAST 0x40 + +/* _verbs_layer.l_flags */ +#define IPATH_VERBS_KERNEL_SMA 0x1 + +extern unsigned ipath_debug; /* debugging bit mask */ + +#endif /* _IPATH_LAYER_H */ diff --git a/drivers/infiniband/hw/ipath/ipath_mad.c b/drivers/infiniband/hw/ipath/ipath_mad.c new file mode 100644 index 0000000..f7f8391 --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_mad.c @@ -0,0 +1,1352 @@ +/* + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <rdma/ib_smi.h> + +#include "ipath_kernel.h" +#include "ipath_verbs.h" +#include "ips_common.h" + +#define IB_SMP_UNSUP_VERSION __constant_htons(0x0004) +#define IB_SMP_UNSUP_METHOD __constant_htons(0x0008) +#define IB_SMP_UNSUP_METH_ATTR __constant_htons(0x000C) +#define IB_SMP_INVALID_FIELD __constant_htons(0x001C) + +static int reply(struct ib_smp *smp) +{ + /* + * The verbs framework will handle the directed/LID route + * packet changes. + */ + smp->method = IB_MGMT_METHOD_GET_RESP; + if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + smp->status |= IB_SMP_DIRECTION; + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; +} + +static int recv_subn_get_nodedescription(struct ib_smp *smp, + struct ib_device *ibdev) +{ + if (smp->attr_mod) + smp->status |= IB_SMP_INVALID_FIELD; + + strncpy(smp->data, ibdev->node_desc, sizeof(smp->data)); + + return reply(smp); +} + +struct nodeinfo { + u8 base_version; + u8 class_version; + u8 node_type; + u8 num_ports; + __be64 sys_guid; + __be64 node_guid; + __be64 port_guid; + __be16 partition_cap; + __be16 device_id; + __be32 revision; + u8 local_port_num; + u8 vendor_id[3]; +} __attribute__ ((packed)); + +static int recv_subn_get_nodeinfo(struct ib_smp *smp, + struct ib_device *ibdev, u8 port) +{ + struct nodeinfo *nip = (struct nodeinfo *)&smp->data; + struct ipath_devdata *dd = to_idev(ibdev)->dd; + u32 vendor, boardid, majrev, minrev; + + if (smp->attr_mod) + smp->status |= IB_SMP_INVALID_FIELD; + + nip->base_version = 1; + nip->class_version = 1; + nip->node_type = 1; /* channel adapter */ + /* + * XXX The num_ports value will need a layer function to get + * the value if we ever have more than one IB port on a chip. + * We will also need to get the GUID for the port. + */ + nip->num_ports = ibdev->phys_port_cnt; + /* This is already in network order */ + nip->sys_guid = to_idev(ibdev)->sys_image_guid; + nip->node_guid = ipath_layer_get_guid(dd); + nip->port_guid = nip->sys_guid; + nip->partition_cap = cpu_to_be16(ipath_layer_get_npkeys(dd)); + nip->device_id = cpu_to_be16(ipath_layer_get_deviceid(dd)); + ipath_layer_query_device(dd, &vendor, &boardid, &majrev, &minrev); + nip->revision = cpu_to_be32((majrev << 16) | minrev); + nip->local_port_num = port; + nip->vendor_id[0] = 0; + nip->vendor_id[1] = vendor >> 8; + nip->vendor_id[2] = vendor; + + return reply(smp); +} + +static int recv_subn_get_guidinfo(struct ib_smp *smp, + struct ib_device *ibdev) +{ + u32 startgx = 8 * be32_to_cpu(smp->attr_mod); + __be64 *p = (__be64 *) smp->data; + + /* 32 blocks of 8 64-bit GUIDs per block */ + + memset(smp->data, 0, sizeof(smp->data)); + + /* + * We only support one GUID for now. If this changes, the + * portinfo.guid_cap field needs to be updated too. + */ + if (startgx == 0) + /* The first is a copy of the read-only HW GUID. */ + *p = ipath_layer_get_guid(to_idev(ibdev)->dd); + else + smp->status |= IB_SMP_INVALID_FIELD; + + return reply(smp); +} + +struct port_info { + __be64 mkey; + __be64 gid_prefix; + __be16 lid; + __be16 sm_lid; + __be32 cap_mask; + __be16 diag_code; + __be16 mkey_lease_period; + u8 local_port_num; + u8 link_width_enabled; + u8 link_width_supported; + u8 link_width_active; + u8 linkspeed_portstate; /* 4 bits, 4 bits */ + u8 portphysstate_linkdown; /* 4 bits, 4 bits */ + u8 mkeyprot_resv_lmc; /* 2 bits, 3, 3 */ + u8 linkspeedactive_enabled; /* 4 bits, 4 bits */ + u8 neighbormtu_mastersmsl; /* 4 bits, 4 bits */ + u8 vlcap_inittype; /* 4 bits, 4 bits */ + u8 vl_high_limit; + u8 vl_arb_high_cap; + u8 vl_arb_low_cap; + u8 inittypereply_mtucap; /* 4 bits, 4 bits */ + u8 vlstallcnt_hoqlife; /* 3 bits, 5 bits */ + u8 operationalvl_pei_peo_fpi_fpo; /* 4 bits, 1, 1, 1, 1 */ + __be16 mkey_violations; + __be16 pkey_violations; + __be16 qkey_violations; + u8 guid_cap; + u8 clientrereg_resv_subnetto; /* 1 bit, 2 bits, 5 */ + u8 resv_resptimevalue; /* 3 bits, 5 bits */ + u8 localphyerrors_overrunerrors; /* 4 bits, 4 bits */ + __be16 max_credit_hint; + u8 resv; + u8 link_roundtrip_latency[3]; +} __attribute__ ((packed)); + +static int recv_subn_get_portinfo(struct ib_smp *smp, + struct ib_device *ibdev, u8 port) +{ + struct ipath_ibdev *dev; + struct port_info *pip = (struct port_info *)smp->data; + u16 lid; + u8 ibcstat; + u8 mtu; + int ret; + + if (be32_to_cpu(smp->attr_mod) > ibdev->phys_port_cnt) { + smp->status |= IB_SMP_INVALID_FIELD; + ret = reply(smp); + goto bail; + } + + dev = to_idev(ibdev); + + /* Clear all fields. Only set the non-zero fields. */ + memset(smp->data, 0, sizeof(smp->data)); + + /* Only return the mkey if the protection field allows it. */ + if (smp->method == IB_MGMT_METHOD_SET || dev->mkey == smp->mkey || + (dev->mkeyprot_resv_lmc >> 6) == 0) + pip->mkey = dev->mkey; + pip->gid_prefix = dev->gid_prefix; + lid = ipath_layer_get_lid(dev->dd); + pip->lid = lid ? cpu_to_be16(lid) : IB_LID_PERMISSIVE; + pip->sm_lid = cpu_to_be16(dev->sm_lid); + pip->cap_mask = cpu_to_be32(dev->port_cap_flags); + /* pip->diag_code; */ + pip->mkey_lease_period = cpu_to_be16(dev->mkey_lease_period); + pip->local_port_num = port; + pip->link_width_enabled = dev->link_width_enabled; + pip->link_width_supported = 3; /* 1x or 4x */ + pip->link_width_active = 2; /* 4x */ + pip->linkspeed_portstate = 0x10; /* 2.5Gbps */ + ibcstat = ipath_layer_get_lastibcstat(dev->dd); + pip->linkspeed_portstate |= ((ibcstat >> 4) & 0x3) + 1; + pip->portphysstate_linkdown = + (ipath_cvt_physportstate[ibcstat & 0xf] << 4) | + (ipath_layer_get_linkdowndefaultstate(dev->dd) ? 1 : 2); + pip->mkeyprot_resv_lmc = dev->mkeyprot_resv_lmc; + pip->linkspeedactive_enabled = 0x11; /* 2.5Gbps, 2.5Gbps */ + switch (ipath_layer_get_ibmtu(dev->dd)) { + case 4096: + mtu = IB_MTU_4096; + break; + case 2048: + mtu = IB_MTU_2048; + break; + case 1024: + mtu = IB_MTU_1024; + break; + case 512: + mtu = IB_MTU_512; + break; + case 256: + mtu = IB_MTU_256; + break; + default: /* oops, something is wrong */ + mtu = IB_MTU_2048; + break; + } + pip->neighbormtu_mastersmsl = (mtu << 4) | dev->sm_sl; + pip->vlcap_inittype = 0x10; /* VLCap = VL0, InitType = 0 */ + pip->vl_high_limit = dev->vl_high_limit; + /* pip->vl_arb_high_cap; // only one VL */ + /* pip->vl_arb_low_cap; // only one VL */ + /* InitTypeReply = 0 */ + pip->inittypereply_mtucap = IB_MTU_4096; + // HCAs ignore VLStallCount and HOQLife + /* pip->vlstallcnt_hoqlife; */ + pip->operationalvl_pei_peo_fpi_fpo = 0x10; /* OVLs = 1 */ + pip->mkey_violations = cpu_to_be16(dev->mkey_violations); + /* P_KeyViolations are counted by hardware. */ + pip->pkey_violations = + cpu_to_be16((ipath_layer_get_cr_errpkey(dev->dd) - + dev->n_pkey_violations) & 0xFFFF); + pip->qkey_violations = cpu_to_be16(dev->qkey_violations); + /* Only the hardware GUID is supported for now */ + pip->guid_cap = 1; + pip->clientrereg_resv_subnetto = dev->subnet_timeout; + /* 32.768 usec. response time (guessing) */ + pip->resv_resptimevalue = 3; + pip->localphyerrors_overrunerrors = + (ipath_layer_get_phyerrthreshold(dev->dd) << 4) | + ipath_layer_get_overrunthreshold(dev->dd); + /* pip->max_credit_hint; */ + /* pip->link_roundtrip_latency[3]; */ + + ret = reply(smp); + +bail: + return ret; +} + +static int recv_subn_get_pkeytable(struct ib_smp *smp, + struct ib_device *ibdev) +{ + u32 startpx = 32 * (be32_to_cpu(smp->attr_mod) & 0xffff); + u16 *p = (u16 *) smp->data; + __be16 *q = (__be16 *) smp->data; + + /* 64 blocks of 32 16-bit P_Key entries */ + + memset(smp->data, 0, sizeof(smp->data)); + if (startpx == 0) { + struct ipath_ibdev *dev = to_idev(ibdev); + unsigned i, n = ipath_layer_get_npkeys(dev->dd); + + ipath_layer_get_pkeys(dev->dd, p); + + for (i = 0; i < n; i++) + q[i] = cpu_to_be16(p[i]); + } else + smp->status |= IB_SMP_INVALID_FIELD; + + return reply(smp); +} + +static int recv_subn_set_guidinfo(struct ib_smp *smp, + struct ib_device *ibdev) +{ + /* The only GUID we support is the first read-only entry. */ + return recv_subn_get_guidinfo(smp, ibdev); +} + +/** + * recv_subn_set_portinfo - set port information + * @smp: the incoming SM packet + * @ibdev: the infiniband device + * @port: the port on the device + * + * Set Portinfo (see ch. 14.2.5.6). + */ +static int recv_subn_set_portinfo(struct ib_smp *smp, + struct ib_device *ibdev, u8 port) +{ + struct port_info *pip = (struct port_info *)smp->data; + struct ib_event event; + struct ipath_ibdev *dev; + u32 flags; + char clientrereg = 0; + u16 lid, smlid; + u8 lwe; + u8 lse; + u8 state; + u16 lstate; + u32 mtu; + int ret; + + if (be32_to_cpu(smp->attr_mod) > ibdev->phys_port_cnt) + goto err; + + dev = to_idev(ibdev); + event.device = ibdev; + event.element.port_num = port; + + dev->mkey = pip->mkey; + dev->gid_prefix = pip->gid_prefix; + dev->mkey_lease_period = be16_to_cpu(pip->mkey_lease_period); + + lid = be16_to_cpu(pip->lid); + if (lid != ipath_layer_get_lid(dev->dd)) { + /* Must be a valid unicast LID address. */ + if (lid == 0 || lid >= IPS_MULTICAST_LID_BASE) + goto err; + ipath_set_sps_lid(dev->dd, lid, pip->mkeyprot_resv_lmc & 7); + event.event = IB_EVENT_LID_CHANGE; + ib_dispatch_event(&event); + } + + smlid = be16_to_cpu(pip->sm_lid); + if (smlid != dev->sm_lid) { + /* Must be a valid unicast LID address. */ + if (smlid == 0 || smlid >= IPS_MULTICAST_LID_BASE) + goto err; + dev->sm_lid = smlid; + event.event = IB_EVENT_SM_CHANGE; + ib_dispatch_event(&event); + } + + /* Only 4x supported but allow 1x or 4x to be set (see 14.2.6.6). */ + lwe = pip->link_width_enabled; + if ((lwe >= 4 && lwe <= 8) || (lwe >= 0xC && lwe <= 0xFE)) + goto err; + if (lwe == 0xFF) + dev->link_width_enabled = 3; /* 1x or 4x */ + else if (lwe) + dev->link_width_enabled = lwe; + + /* Only 2.5 Gbs supported. */ + lse = pip->linkspeedactive_enabled & 0xF; + if (lse >= 2 && lse <= 0xE) + goto err; + + /* Set link down default state. */ + switch (pip->portphysstate_linkdown & 0xF) { + case 0: /* NOP */ + break; + case 1: /* SLEEP */ + if (ipath_layer_set_linkdowndefaultstate(dev->dd, 1)) + goto err; + break; + case 2: /* POLL */ + if (ipath_layer_set_linkdowndefaultstate(dev->dd, 0)) + goto err; + break; + default: + goto err; + } + + dev->mkeyprot_resv_lmc = pip->mkeyprot_resv_lmc; + dev->vl_high_limit = pip->vl_high_limit; + + switch ((pip->neighbormtu_mastersmsl >> 4) & 0xF) { + case IB_MTU_256: + mtu = 256; + break; + case IB_MTU_512: + mtu = 512; + break; + case IB_MTU_1024: + mtu = 1024; + break; + case IB_MTU_2048: + mtu = 2048; + break; + case IB_MTU_4096: + mtu = 4096; + break; + default: + /* XXX We have already partially updated our state! */ + goto err; + } + ipath_layer_set_mtu(dev->dd, mtu); + + dev->sm_sl = pip->neighbormtu_mastersmsl & 0xF; + + /* We only support VL0 */ + if (((pip->operationalvl_pei_peo_fpi_fpo >> 4) & 0xF) > 1) + goto err; + + if (pip->mkey_violations == 0) + dev->mkey_violations = 0; + + /* + * Hardware counter can't be reset so snapshot and subtract + * later. + */ + if (pip->pkey_violations == 0) + dev->n_pkey_violations = + ipath_layer_get_cr_errpkey(dev->dd); + + if (pip->qkey_violations == 0) + dev->qkey_violations = 0; + + if (ipath_layer_set_phyerrthreshold( + dev->dd, + (pip->localphyerrors_overrunerrors >> 4) & 0xF)) + goto err; + + if (ipath_layer_set_overrunthreshold( + dev->dd, + (pip->localphyerrors_overrunerrors & 0xF))) + goto err; + + dev->subnet_timeout = pip->clientrereg_resv_subnetto & 0x1F; + + if (pip->clientrereg_resv_subnetto & 0x80) { + clientrereg = 1; + event.event = IB_EVENT_LID_CHANGE; + ib_dispatch_event(&event); + } + + /* + * Do the port state change now that the other link parameters + * have been set. + * Changing the port physical state only makes sense if the link + * is down or is being set to down. + */ + state = pip->linkspeed_portstate & 0xF; + flags = ipath_layer_get_flags(dev->dd); + lstate = (pip->portphysstate_linkdown >> 4) & 0xF; + if (lstate && !(state == IB_PORT_DOWN || state == IB_PORT_NOP)) + goto err; + + /* + * Only state changes of DOWN, ARM, and ACTIVE are valid + * and must be in the correct state to take effect (see 7.2.6). + */ + switch (state) { + case IB_PORT_NOP: + if (lstate == 0) + break; + /* FALLTHROUGH */ + case IB_PORT_DOWN: + if (lstate == 0) + if (ipath_layer_get_linkdowndefaultstate(dev->dd)) + lstate = IPATH_IB_LINKDOWN_SLEEP; + else + lstate = IPATH_IB_LINKDOWN; + else if (lstate == 1) + lstate = IPATH_IB_LINKDOWN_SLEEP; + else if (lstate == 2) + lstate = IPATH_IB_LINKDOWN; + else if (lstate == 3) + lstate = IPATH_IB_LINKDOWN_DISABLE; + else + goto err; + ipath_layer_set_linkstate(dev->dd, lstate); + if (flags & IPATH_LINKACTIVE) { + event.event = IB_EVENT_PORT_ERR; + ib_dispatch_event(&event); + } + break; + case IB_PORT_ARMED: + if (!(flags & (IPATH_LINKINIT | IPATH_LINKACTIVE))) + break; + ipath_layer_set_linkstate(dev->dd, IPATH_IB_LINKARM); + if (flags & IPATH_LINKACTIVE) { + event.event = IB_EVENT_PORT_ERR; + ib_dispatch_event(&event); + } + break; + case IB_PORT_ACTIVE: + if (!(flags & IPATH_LINKARMED)) + break; + ipath_layer_set_linkstate(dev->dd, IPATH_IB_LINKACTIVE); + event.event = IB_EVENT_PORT_ACTIVE; + ib_dispatch_event(&event); + break; + default: + /* XXX We have already partially updated our state! */ + goto err; + } + + ret = recv_subn_get_portinfo(smp, ibdev, port); + + if (clientrereg) + pip->clientrereg_resv_subnetto |= 0x80; + + goto done; + +err: + smp->status |= IB_SMP_INVALID_FIELD; + ret = recv_subn_get_portinfo(smp, ibdev, port); + +done: + return ret; +} + +static int recv_subn_set_pkeytable(struct ib_smp *smp, + struct ib_device *ibdev) +{ + u32 startpx = 32 * (be32_to_cpu(smp->attr_mod) & 0xffff); + __be16 *p = (__be16 *) smp->data; + u16 *q = (u16 *) smp->data; + struct ipath_ibdev *dev = to_idev(ibdev); + unsigned i, n = ipath_layer_get_npkeys(dev->dd); + + for (i = 0; i < n; i++) + q[i] = be16_to_cpu(p[i]); + + if (startpx != 0 || + ipath_layer_set_pkeys(dev->dd, q) != 0) + smp->status |= IB_SMP_INVALID_FIELD; + + return recv_subn_get_pkeytable(smp, ibdev); +} + +#define IB_PMA_CLASS_PORT_INFO __constant_htons(0x0001) +#define IB_PMA_PORT_SAMPLES_CONTROL __constant_htons(0x0010) +#define IB_PMA_PORT_SAMPLES_RESULT __constant_htons(0x0011) +#define IB_PMA_PORT_COUNTERS __constant_htons(0x0012) +#define IB_PMA_PORT_COUNTERS_EXT __constant_htons(0x001D) +#define IB_PMA_PORT_SAMPLES_RESULT_EXT __constant_htons(0x001E) + +struct ib_perf { + u8 base_version; + u8 mgmt_class; + u8 class_version; + u8 method; + __be16 status; + __be16 unused; + __be64 tid; + __be16 attr_id; + __be16 resv; + __be32 attr_mod; + u8 reserved[40]; + u8 data[192]; +} __attribute__ ((packed)); + +struct ib_pma_classportinfo { + u8 base_version; + u8 class_version; + __be16 cap_mask; + u8 reserved[3]; + u8 resp_time_value; /* only lower 5 bits */ + union ib_gid redirect_gid; + __be32 redirect_tc_sl_fl; /* 8, 4, 20 bits respectively */ + __be16 redirect_lid; + __be16 redirect_pkey; + __be32 redirect_qp; /* only lower 24 bits */ + __be32 redirect_qkey; + union ib_gid trap_gid; + __be32 trap_tc_sl_fl; /* 8, 4, 20 bits respectively */ + __be16 trap_lid; + __be16 trap_pkey; + __be32 trap_hl_qp; /* 8, 24 bits respectively */ + __be32 trap_qkey; +} __attribute__ ((packed)); + +struct ib_pma_portsamplescontrol { + u8 opcode; + u8 port_select; + u8 tick; + u8 counter_width; /* only lower 3 bits */ + __be32 counter_mask0_9; /* 2, 10 * 3, bits */ + __be16 counter_mask10_14; /* 1, 5 * 3, bits */ + u8 sample_mechanisms; + u8 sample_status; /* only lower 2 bits */ + __be64 option_mask; + __be64 vendor_mask; + __be32 sample_start; + __be32 sample_interval; + __be16 tag; + __be16 counter_select[15]; +} __attribute__ ((packed)); + +struct ib_pma_portsamplesresult { + __be16 tag; + __be16 sample_status; /* only lower 2 bits */ + __be32 counter[15]; +} __attribute__ ((packed)); + +struct ib_pma_portsamplesresult_ext { + __be16 tag; + __be16 sample_status; /* only lower 2 bits */ + __be32 extended_width; /* only upper 2 bits */ + __be64 counter[15]; +} __attribute__ ((packed)); + +struct ib_pma_portcounters { + u8 reserved; + u8 port_select; + __be16 counter_select; + __be16 symbol_error_counter; + u8 link_error_recovery_counter; + u8 link_downed_counter; + __be16 port_rcv_errors; + __be16 port_rcv_remphys_errors; + __be16 port_rcv_switch_relay_errors; + __be16 port_xmit_discards; + u8 port_xmit_constraint_errors; + u8 port_rcv_constraint_errors; + u8 reserved1; + u8 lli_ebor_errors; /* 4, 4, bits */ + __be16 reserved2; + __be16 vl15_dropped; + __be32 port_xmit_data; + __be32 port_rcv_data; + __be32 port_xmit_packets; + __be32 port_rcv_packets; +} __attribute__ ((packed)); + +#define IB_PMA_SEL_SYMBOL_ERROR __constant_htons(0x0001) +#define IB_PMA_SEL_LINK_ERROR_RECOVERY __constant_htons(0x0002) +#define IB_PMA_SEL_LINK_DOWNED __constant_htons(0x0004) +#define IB_PMA_SEL_PORT_RCV_ERRORS __constant_htons(0x0008) +#define IB_PMA_SEL_PORT_RCV_REMPHYS_ERRORS __constant_htons(0x0010) +#define IB_PMA_SEL_PORT_XMIT_DISCARDS __constant_htons(0x0040) +#define IB_PMA_SEL_PORT_XMIT_DATA __constant_htons(0x1000) +#define IB_PMA_SEL_PORT_RCV_DATA __constant_htons(0x2000) +#define IB_PMA_SEL_PORT_XMIT_PACKETS __constant_htons(0x4000) +#define IB_PMA_SEL_PORT_RCV_PACKETS __constant_htons(0x8000) + +struct ib_pma_portcounters_ext { + u8 reserved; + u8 port_select; + __be16 counter_select; + __be32 reserved1; + __be64 port_xmit_data; + __be64 port_rcv_data; + __be64 port_xmit_packets; + __be64 port_rcv_packets; + __be64 port_unicast_xmit_packets; + __be64 port_unicast_rcv_packets; + __be64 port_multicast_xmit_packets; + __be64 port_multicast_rcv_packets; +} __attribute__ ((packed)); + +#define IB_PMA_SELX_PORT_XMIT_DATA __constant_htons(0x0001) +#define IB_PMA_SELX_PORT_RCV_DATA __constant_htons(0x0002) +#define IB_PMA_SELX_PORT_XMIT_PACKETS __constant_htons(0x0004) +#define IB_PMA_SELX_PORT_RCV_PACKETS __constant_htons(0x0008) +#define IB_PMA_SELX_PORT_UNI_XMIT_PACKETS __constant_htons(0x0010) +#define IB_PMA_SELX_PORT_UNI_RCV_PACKETS __constant_htons(0x0020) +#define IB_PMA_SELX_PORT_MULTI_XMIT_PACKETS __constant_htons(0x0040) +#define IB_PMA_SELX_PORT_MULTI_RCV_PACKETS __constant_htons(0x0080) + +static int recv_pma_get_classportinfo(struct ib_perf *pmp) +{ + struct ib_pma_classportinfo *p = + (struct ib_pma_classportinfo *)pmp->data; + + memset(pmp->data, 0, sizeof(pmp->data)); + + if (pmp->attr_mod != 0) + pmp->status |= IB_SMP_INVALID_FIELD; + + /* Indicate AllPortSelect is valid (only one port anyway) */ + p->cap_mask = __constant_cpu_to_be16(1 << 8); + p->base_version = 1; + p->class_version = 1; + /* + * Expected response time is 4.096 usec. * 2^18 == 1.073741824 + * sec. + */ + p->resp_time_value = 18; + + return reply((struct ib_smp *) pmp); +} + +/* + * The PortSamplesControl.CounterMasks field is an array of 3 bit fields + * which specify the N'th counter's capabilities. See ch. 16.1.3.2. + * We support 5 counters which only count the mandatory quantities. + */ +#define COUNTER_MASK(q, n) (q << ((9 - n) * 3)) +#define COUNTER_MASK0_9 \ + __constant_cpu_to_be32(COUNTER_MASK(1, 0) | \ + COUNTER_MASK(1, 1) | \ + COUNTER_MASK(1, 2) | \ + COUNTER_MASK(1, 3) | \ + COUNTER_MASK(1, 4)) + +static int recv_pma_get_portsamplescontrol(struct ib_perf *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portsamplescontrol *p = + (struct ib_pma_portsamplescontrol *)pmp->data; + struct ipath_ibdev *dev = to_idev(ibdev); + unsigned long flags; + u8 port_select = p->port_select; + + memset(pmp->data, 0, sizeof(pmp->data)); + + p->port_select = port_select; + if (pmp->attr_mod != 0 || + (port_select != port && port_select != 0xFF)) + pmp->status |= IB_SMP_INVALID_FIELD; + /* + * Ticks are 10x the link transfer period which for 2.5Gbs is 4 + * nsec. 0 == 4 nsec., 1 == 8 nsec., ..., 255 == 1020 nsec. Sample + * intervals are counted in ticks. Since we use Linux timers, that + * count in jiffies, we can't sample for less than 1000 ticks if HZ + * == 1000 (4000 ticks if HZ is 250). + */ + /* XXX This is WRONG. */ + p->tick = 250; /* 1 usec. */ + p->counter_width = 4; /* 32 bit counters */ + p->counter_mask0_9 = COUNTER_MASK0_9; + spin_lock_irqsave(&dev->pending_lock, flags); + p->sample_status = dev->pma_sample_status; + p->sample_start = cpu_to_be32(dev->pma_sample_start); + p->sample_interval = cpu_to_be32(dev->pma_sample_interval); + p->tag = cpu_to_be16(dev->pma_tag); + p->counter_select[0] = dev->pma_counter_select[0]; + p->counter_select[1] = dev->pma_counter_select[1]; + p->counter_select[2] = dev->pma_counter_select[2]; + p->counter_select[3] = dev->pma_counter_select[3]; + p->counter_select[4] = dev->pma_counter_select[4]; + spin_unlock_irqrestore(&dev->pending_lock, flags); + + return reply((struct ib_smp *) pmp); +} + +static int recv_pma_set_portsamplescontrol(struct ib_perf *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portsamplescontrol *p = + (struct ib_pma_portsamplescontrol *)pmp->data; + struct ipath_ibdev *dev = to_idev(ibdev); + unsigned long flags; + u32 start; + int ret; + + if (pmp->attr_mod != 0 || + (p->port_select != port && p->port_select != 0xFF)) { + pmp->status |= IB_SMP_INVALID_FIELD; + ret = reply((struct ib_smp *) pmp); + goto bail; + } + + start = be32_to_cpu(p->sample_start); + if (start != 0) { + spin_lock_irqsave(&dev->pending_lock, flags); + if (dev->pma_sample_status == IB_PMA_SAMPLE_STATUS_DONE) { + dev->pma_sample_status = + IB_PMA_SAMPLE_STATUS_STARTED; + dev->pma_sample_start = start; + dev->pma_sample_interval = + be32_to_cpu(p->sample_interval); + dev->pma_tag = be16_to_cpu(p->tag); + if (p->counter_select[0]) + dev->pma_counter_select[0] = + p->counter_select[0]; + if (p->counter_select[1]) + dev->pma_counter_select[1] = + p->counter_select[1]; + if (p->counter_select[2]) + dev->pma_counter_select[2] = + p->counter_select[2]; + if (p->counter_select[3]) + dev->pma_counter_select[3] = + p->counter_select[3]; + if (p->counter_select[4]) + dev->pma_counter_select[4] = + p->counter_select[4]; + } + spin_unlock_irqrestore(&dev->pending_lock, flags); + } + ret = recv_pma_get_portsamplescontrol(pmp, ibdev, port); + +bail: + return ret; +} + +static u64 get_counter(struct ipath_ibdev *dev, __be16 sel) +{ + u64 ret; + + switch (sel) { + case IB_PMA_PORT_XMIT_DATA: + ret = dev->ipath_sword; + break; + case IB_PMA_PORT_RCV_DATA: + ret = dev->ipath_rword; + break; + case IB_PMA_PORT_XMIT_PKTS: + ret = dev->ipath_spkts; + break; + case IB_PMA_PORT_RCV_PKTS: + ret = dev->ipath_rpkts; + break; + case IB_PMA_PORT_XMIT_WAIT: + ret = dev->ipath_xmit_wait; + break; + default: + ret = 0; + } + + return ret; +} + +static int recv_pma_get_portsamplesresult(struct ib_perf *pmp, + struct ib_device *ibdev) +{ + struct ib_pma_portsamplesresult *p = + (struct ib_pma_portsamplesresult *)pmp->data; + struct ipath_ibdev *dev = to_idev(ibdev); + int i; + + memset(pmp->data, 0, sizeof(pmp->data)); + p->tag = cpu_to_be16(dev->pma_tag); + p->sample_status = cpu_to_be16(dev->pma_sample_status); + for (i = 0; i < ARRAY_SIZE(dev->pma_counter_select); i++) + p->counter[i] = cpu_to_be32( + get_counter(dev, dev->pma_counter_select[i])); + + return reply((struct ib_smp *) pmp); +} + +static int recv_pma_get_portsamplesresult_ext(struct ib_perf *pmp, + struct ib_device *ibdev) +{ + struct ib_pma_portsamplesresult_ext *p = + (struct ib_pma_portsamplesresult_ext *)pmp->data; + struct ipath_ibdev *dev = to_idev(ibdev); + int i; + + memset(pmp->data, 0, sizeof(pmp->data)); + p->tag = cpu_to_be16(dev->pma_tag); + p->sample_status = cpu_to_be16(dev->pma_sample_status); + /* 64 bits */ + p->extended_width = __constant_cpu_to_be32(0x80000000); + for (i = 0; i < ARRAY_SIZE(dev->pma_counter_select); i++) + p->counter[i] = cpu_to_be64( + get_counter(dev, dev->pma_counter_select[i])); + + return reply((struct ib_smp *) pmp); +} + +static int recv_pma_get_portcounters(struct ib_perf *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portcounters *p = (struct ib_pma_portcounters *) + pmp->data; + struct ipath_ibdev *dev = to_idev(ibdev); + struct ipath_layer_counters cntrs; + u8 port_select = p->port_select; + + ipath_layer_get_counters(dev->dd, &cntrs); + + /* Adjust counters for any resets done. */ + cntrs.symbol_error_counter -= dev->n_symbol_error_counter; + cntrs.link_error_recovery_counter -= + dev->n_link_error_recovery_counter; + cntrs.link_downed_counter -= dev->n_link_downed_counter; + cntrs.port_rcv_errors += dev->rcv_errors; + cntrs.port_rcv_errors -= dev->n_port_rcv_errors; + cntrs.port_rcv_remphys_errors -= dev->n_port_rcv_remphys_errors; + cntrs.port_xmit_discards -= dev->n_port_xmit_discards; + cntrs.port_xmit_data -= dev->n_port_xmit_data; + cntrs.port_rcv_data -= dev->n_port_rcv_data; + cntrs.port_xmit_packets -= dev->n_port_xmit_packets; + cntrs.port_rcv_packets -= dev->n_port_rcv_packets; + + memset(pmp->data, 0, sizeof(pmp->data)); + + p->port_select = port_select; + if (pmp->attr_mod != 0 || + (port_select != port && port_select != 0xFF)) + pmp->status |= IB_SMP_INVALID_FIELD; + + if (cntrs.symbol_error_counter > 0xFFFFUL) + p->symbol_error_counter = __constant_cpu_to_be16(0xFFFF); + else + p->symbol_error_counter = + cpu_to_be16((u16)cntrs.symbol_error_counter); + if (cntrs.link_error_recovery_counter > 0xFFUL) + p->link_error_recovery_counter = 0xFF; + else + p->link_error_recovery_counter = + (u8)cntrs.link_error_recovery_counter; + if (cntrs.link_downed_counter > 0xFFUL) + p->link_downed_counter = 0xFF; + else + p->link_downed_counter = (u8)cntrs.link_downed_counter; + if (cntrs.port_rcv_errors > 0xFFFFUL) + p->port_rcv_errors = __constant_cpu_to_be16(0xFFFF); + else + p->port_rcv_errors = + cpu_to_be16((u16) cntrs.port_rcv_errors); + if (cntrs.port_rcv_remphys_errors > 0xFFFFUL) + p->port_rcv_remphys_errors = __constant_cpu_to_be16(0xFFFF); + else + p->port_rcv_remphys_errors = + cpu_to_be16((u16)cntrs.port_rcv_remphys_errors); + if (cntrs.port_xmit_discards > 0xFFFFUL) + p->port_xmit_discards = __constant_cpu_to_be16(0xFFFF); + else + p->port_xmit_discards = + cpu_to_be16((u16)cntrs.port_xmit_discards); + if (cntrs.port_xmit_data > 0xFFFFFFFFUL) + p->port_xmit_data = __constant_cpu_to_be32(0xFFFFFFFF); + else + p->port_xmit_data = cpu_to_be32((u32)cntrs.port_xmit_data); + if (cntrs.port_rcv_data > 0xFFFFFFFFUL) + p->port_rcv_data = __constant_cpu_to_be32(0xFFFFFFFF); + else + p->port_rcv_data = cpu_to_be32((u32)cntrs.port_rcv_data); + if (cntrs.port_xmit_packets > 0xFFFFFFFFUL) + p->port_xmit_packets = __constant_cpu_to_be32(0xFFFFFFFF); + else + p->port_xmit_packets = + cpu_to_be32((u32)cntrs.port_xmit_packets); + if (cntrs.port_rcv_packets > 0xFFFFFFFFUL) + p->port_rcv_packets = __constant_cpu_to_be32(0xFFFFFFFF); + else + p->port_rcv_packets = + cpu_to_be32((u32) cntrs.port_rcv_packets); + + return reply((struct ib_smp *) pmp); +} + +static int recv_pma_get_portcounters_ext(struct ib_perf *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portcounters_ext *p = + (struct ib_pma_portcounters_ext *)pmp->data; + struct ipath_ibdev *dev = to_idev(ibdev); + u64 swords, rwords, spkts, rpkts, xwait; + u8 port_select = p->port_select; + + ipath_layer_snapshot_counters(dev->dd, &swords, &rwords, &spkts, + &rpkts, &xwait); + + /* Adjust counters for any resets done. */ + swords -= dev->n_port_xmit_data; + rwords -= dev->n_port_rcv_data; + spkts -= dev->n_port_xmit_packets; + rpkts -= dev->n_port_rcv_packets; + + memset(pmp->data, 0, sizeof(pmp->data)); + + p->port_select = port_select; + if (pmp->attr_mod != 0 || + (port_select != port && port_select != 0xFF)) + pmp->status |= IB_SMP_INVALID_FIELD; + + p->port_xmit_data = cpu_to_be64(swords); + p->port_rcv_data = cpu_to_be64(rwords); + p->port_xmit_packets = cpu_to_be64(spkts); + p->port_rcv_packets = cpu_to_be64(rpkts); + p->port_unicast_xmit_packets = cpu_to_be64(dev->n_unicast_xmit); + p->port_unicast_rcv_packets = cpu_to_be64(dev->n_unicast_rcv); + p->port_multicast_xmit_packets = cpu_to_be64(dev->n_multicast_xmit); + p->port_multicast_rcv_packets = cpu_to_be64(dev->n_multicast_rcv); + + return reply((struct ib_smp *) pmp); +} + +static int recv_pma_set_portcounters(struct ib_perf *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portcounters *p = (struct ib_pma_portcounters *) + pmp->data; + struct ipath_ibdev *dev = to_idev(ibdev); + struct ipath_layer_counters cntrs; + + /* + * Since the HW doesn't support clearing counters, we save the + * current count and subtract it from future responses. + */ + ipath_layer_get_counters(dev->dd, &cntrs); + + if (p->counter_select & IB_PMA_SEL_SYMBOL_ERROR) + dev->n_symbol_error_counter = cntrs.symbol_error_counter; + + if (p->counter_select & IB_PMA_SEL_LINK_ERROR_RECOVERY) + dev->n_link_error_recovery_counter = + cntrs.link_error_recovery_counter; + + if (p->counter_select & IB_PMA_SEL_LINK_DOWNED) + dev->n_link_downed_counter = cntrs.link_downed_counter; + + if (p->counter_select & IB_PMA_SEL_PORT_RCV_ERRORS) + dev->n_port_rcv_errors = + cntrs.port_rcv_errors + dev->rcv_errors; + + if (p->counter_select & IB_PMA_SEL_PORT_RCV_REMPHYS_ERRORS) + dev->n_port_rcv_remphys_errors = + cntrs.port_rcv_remphys_errors; + + if (p->counter_select & IB_PMA_SEL_PORT_XMIT_DISCARDS) + dev->n_port_xmit_discards = cntrs.port_xmit_discards; + + if (p->counter_select & IB_PMA_SEL_PORT_XMIT_DATA) + dev->n_port_xmit_data = cntrs.port_xmit_data; + + if (p->counter_select & IB_PMA_SEL_PORT_RCV_DATA) + dev->n_port_rcv_data = cntrs.port_rcv_data; + + if (p->counter_select & IB_PMA_SEL_PORT_XMIT_PACKETS) + dev->n_port_xmit_packets = cntrs.port_xmit_packets; + + if (p->counter_select & IB_PMA_SEL_PORT_RCV_PACKETS) + dev->n_port_rcv_packets = cntrs.port_rcv_packets; + + return recv_pma_get_portcounters(pmp, ibdev, port); +} + +static int recv_pma_set_portcounters_ext(struct ib_perf *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portcounters *p = (struct ib_pma_portcounters *) + pmp->data; + struct ipath_ibdev *dev = to_idev(ibdev); + u64 swords, rwords, spkts, rpkts, xwait; + + ipath_layer_snapshot_counters(dev->dd, &swords, &rwords, &spkts, + &rpkts, &xwait); + + if (p->counter_select & IB_PMA_SELX_PORT_XMIT_DATA) + dev->n_port_xmit_data = swords; + + if (p->counter_select & IB_PMA_SELX_PORT_RCV_DATA) + dev->n_port_rcv_data = rwords; + + if (p->counter_select & IB_PMA_SELX_PORT_XMIT_PACKETS) + dev->n_port_xmit_packets = spkts; + + if (p->counter_select & IB_PMA_SELX_PORT_RCV_PACKETS) + dev->n_port_rcv_packets = rpkts; + + if (p->counter_select & IB_PMA_SELX_PORT_UNI_XMIT_PACKETS) + dev->n_unicast_xmit = 0; + + if (p->counter_select & IB_PMA_SELX_PORT_UNI_RCV_PACKETS) + dev->n_unicast_rcv = 0; + + if (p->counter_select & IB_PMA_SELX_PORT_MULTI_XMIT_PACKETS) + dev->n_multicast_xmit = 0; + + if (p->counter_select & IB_PMA_SELX_PORT_MULTI_RCV_PACKETS) + dev->n_multicast_rcv = 0; + + return recv_pma_get_portcounters_ext(pmp, ibdev, port); +} + +static int process_subn(struct ib_device *ibdev, int mad_flags, + u8 port_num, struct ib_mad *in_mad, + struct ib_mad *out_mad) +{ + struct ib_smp *smp = (struct ib_smp *)out_mad; + struct ipath_ibdev *dev = to_idev(ibdev); + int ret; + + *out_mad = *in_mad; + if (smp->class_version != 1) { + smp->status |= IB_SMP_UNSUP_VERSION; + ret = reply(smp); + goto bail; + } + + /* Is the mkey in the process of expiring? */ + if (dev->mkey_lease_timeout && jiffies >= dev->mkey_lease_timeout) { + /* Clear timeout and mkey protection field. */ + dev->mkey_lease_timeout = 0; + dev->mkeyprot_resv_lmc &= 0x3F; + } + + /* + * M_Key checking depends on + * Portinfo:M_Key_protect_bits + */ + if ((mad_flags & IB_MAD_IGNORE_MKEY) == 0 && dev->mkey != 0 && + dev->mkey != smp->mkey && + (smp->method == IB_MGMT_METHOD_SET || + (smp->method == IB_MGMT_METHOD_GET && + (dev->mkeyprot_resv_lmc >> 7) != 0))) { + if (dev->mkey_violations != 0xFFFF) + ++dev->mkey_violations; + if (dev->mkey_lease_timeout || + dev->mkey_lease_period == 0) { + ret = IB_MAD_RESULT_SUCCESS | + IB_MAD_RESULT_CONSUMED; + goto bail; + } + dev->mkey_lease_timeout = jiffies + + dev->mkey_lease_period * HZ; + /* Future: Generate a trap notice. */ + ret = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + goto bail; + } else if (dev->mkey_lease_timeout) + dev->mkey_lease_timeout = 0; + + switch (smp->method) { + case IB_MGMT_METHOD_GET: + switch (smp->attr_id) { + case IB_SMP_ATTR_NODE_DESC: + ret = recv_subn_get_nodedescription(smp, ibdev); + goto bail; + case IB_SMP_ATTR_NODE_INFO: + ret = recv_subn_get_nodeinfo(smp, ibdev, port_num); + goto bail; + case IB_SMP_ATTR_GUID_INFO: + ret = recv_subn_get_guidinfo(smp, ibdev); + goto bail; + case IB_SMP_ATTR_PORT_INFO: + ret = recv_subn_get_portinfo(smp, ibdev, port_num); + goto bail; + case IB_SMP_ATTR_PKEY_TABLE: + ret = recv_subn_get_pkeytable(smp, ibdev); + goto bail; + case IB_SMP_ATTR_SM_INFO: + if (dev->port_cap_flags & IB_PORT_SM_DISABLED) { + ret = IB_MAD_RESULT_SUCCESS | + IB_MAD_RESULT_CONSUMED; + goto bail; + } + if (dev->port_cap_flags & IB_PORT_SM) { + ret = IB_MAD_RESULT_SUCCESS; + goto bail; + } + /* FALLTHROUGH */ + default: + smp->status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply(smp); + goto bail; + } + + case IB_MGMT_METHOD_SET: + switch (smp->attr_id) { + case IB_SMP_ATTR_GUID_INFO: + ret = recv_subn_set_guidinfo(smp, ibdev); + goto bail; + case IB_SMP_ATTR_PORT_INFO: + ret = recv_subn_set_portinfo(smp, ibdev, port_num); + goto bail; + case IB_SMP_ATTR_PKEY_TABLE: + ret = recv_subn_set_pkeytable(smp, ibdev); + goto bail; + case IB_SMP_ATTR_SM_INFO: + if (dev->port_cap_flags & IB_PORT_SM_DISABLED) { + ret = IB_MAD_RESULT_SUCCESS | + IB_MAD_RESULT_CONSUMED; + goto bail; + } + if (dev->port_cap_flags & IB_PORT_SM) { + ret = IB_MAD_RESULT_SUCCESS; + goto bail; + } + /* FALLTHROUGH */ + default: + smp->status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply(smp); + goto bail; + } + + case IB_MGMT_METHOD_GET_RESP: + /* + * The ib_mad module will call us to process responses + * before checking for other consumers. + * Just tell the caller to process it normally. + */ + ret = IB_MAD_RESULT_FAILURE; + goto bail; + default: + smp->status |= IB_SMP_UNSUP_METHOD; + ret = reply(smp); + } + +bail: + return ret; +} + +static int process_perf(struct ib_device *ibdev, u8 port_num, + struct ib_mad *in_mad, + struct ib_mad *out_mad) +{ + struct ib_perf *pmp = (struct ib_perf *)out_mad; + int ret; + + *out_mad = *in_mad; + if (pmp->class_version != 1) { + pmp->status |= IB_SMP_UNSUP_VERSION; + ret = reply((struct ib_smp *) pmp); + goto bail; + } + + switch (pmp->method) { + case IB_MGMT_METHOD_GET: + switch (pmp->attr_id) { + case IB_PMA_CLASS_PORT_INFO: + ret = recv_pma_get_classportinfo(pmp); + goto bail; + case IB_PMA_PORT_SAMPLES_CONTROL: + ret = recv_pma_get_portsamplescontrol(pmp, ibdev, + port_num); + goto bail; + case IB_PMA_PORT_SAMPLES_RESULT: + ret = recv_pma_get_portsamplesresult(pmp, ibdev); + goto bail; + case IB_PMA_PORT_SAMPLES_RESULT_EXT: + ret = recv_pma_get_portsamplesresult_ext(pmp, + ibdev); + goto bail; + case IB_PMA_PORT_COUNTERS: + ret = recv_pma_get_portcounters(pmp, ibdev, + port_num); + goto bail; + case IB_PMA_PORT_COUNTERS_EXT: + ret = recv_pma_get_portcounters_ext(pmp, ibdev, + port_num); + goto bail; + default: + pmp->status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply((struct ib_smp *) pmp); + goto bail; + } + + case IB_MGMT_METHOD_SET: + switch (pmp->attr_id) { + case IB_PMA_PORT_SAMPLES_CONTROL: + ret = recv_pma_set_portsamplescontrol(pmp, ibdev, + port_num); + goto bail; + case IB_PMA_PORT_COUNTERS: + ret = recv_pma_set_portcounters(pmp, ibdev, + port_num); + goto bail; + case IB_PMA_PORT_COUNTERS_EXT: + ret = recv_pma_set_portcounters_ext(pmp, ibdev, + port_num); + goto bail; + default: + pmp->status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply((struct ib_smp *) pmp); + goto bail; + } + + case IB_MGMT_METHOD_GET_RESP: + /* + * The ib_mad module will call us to process responses + * before checking for other consumers. + * Just tell the caller to process it normally. + */ + ret = IB_MAD_RESULT_FAILURE; + goto bail; + default: + pmp->status |= IB_SMP_UNSUP_METHOD; + ret = reply((struct ib_smp *) pmp); + } + +bail: + return ret; +} + +/** + * ipath_process_mad - process an incoming MAD packet + * @ibdev: the infiniband device this packet came in on + * @mad_flags: MAD flags + * @port_num: the port number this packet came in on + * @in_wc: the work completion entry for this packet + * @in_grh: the global route header for this packet + * @in_mad: the incoming MAD + * @out_mad: any outgoing MAD reply + * + * Returns IB_MAD_RESULT_SUCCESS if this is a MAD that we are not + * interested in processing. + * + * Note that the verbs framework has already done the MAD sanity checks, + * and hop count/pointer updating for IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE + * MADs. + * + * This is called by the ib_mad module. + */ +int ipath_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + struct ib_wc *in_wc, struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + struct ipath_ibdev *dev = to_idev(ibdev); + int ret; + + /* + * Snapshot current HW counters to "clear" them. + * This should be done when the driver is loaded except that for + * some reason we get a zillion errors when brining up the link. + */ + if (dev->rcv_errors == 0) { + struct ipath_layer_counters cntrs; + + ipath_layer_get_counters(to_idev(ibdev)->dd, &cntrs); + dev->rcv_errors++; + dev->n_symbol_error_counter = cntrs.symbol_error_counter; + dev->n_link_error_recovery_counter = + cntrs.link_error_recovery_counter; + dev->n_link_downed_counter = cntrs.link_downed_counter; + dev->n_port_rcv_errors = cntrs.port_rcv_errors + 1; + dev->n_port_rcv_remphys_errors = + cntrs.port_rcv_remphys_errors; + dev->n_port_xmit_discards = cntrs.port_xmit_discards; + dev->n_port_xmit_data = cntrs.port_xmit_data; + dev->n_port_rcv_data = cntrs.port_rcv_data; + dev->n_port_xmit_packets = cntrs.port_xmit_packets; + dev->n_port_rcv_packets = cntrs.port_rcv_packets; + } + switch (in_mad->mad_hdr.mgmt_class) { + case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE: + case IB_MGMT_CLASS_SUBN_LID_ROUTED: + ret = process_subn(ibdev, mad_flags, port_num, + in_mad, out_mad); + goto bail; + case IB_MGMT_CLASS_PERF_MGMT: + ret = process_perf(ibdev, port_num, in_mad, out_mad); + goto bail; + default: + ret = IB_MAD_RESULT_SUCCESS; + } + +bail: + return ret; +} diff --git a/drivers/infiniband/hw/ipath/ipath_mr.c b/drivers/infiniband/hw/ipath/ipath_mr.c new file mode 100644 index 0000000..69ffec6 --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_mr.c @@ -0,0 +1,383 @@ +/* + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <rdma/ib_pack.h> +#include <rdma/ib_smi.h> + +#include "ipath_verbs.h" + +/** + * ipath_get_dma_mr - get a DMA memory region + * @pd: protection domain for this memory region + * @acc: access flags + * + * Returns the memory region on success, otherwise returns an errno. + */ +struct ib_mr *ipath_get_dma_mr(struct ib_pd *pd, int acc) +{ + struct ipath_mr *mr; + struct ib_mr *ret; + + mr = kzalloc(sizeof *mr, GFP_KERNEL); + if (!mr) { + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + mr->mr.access_flags = acc; + ret = &mr->ibmr; + +bail: + return ret; +} + +static struct ipath_mr *alloc_mr(int count, + struct ipath_lkey_table *lk_table) +{ + struct ipath_mr *mr; + int m, i = 0; + + /* Allocate struct plus pointers to first level page tables. */ + m = (count + IPATH_SEGSZ - 1) / IPATH_SEGSZ; + mr = kmalloc(sizeof *mr + m * sizeof mr->mr.map[0], GFP_KERNEL); + if (!mr) + goto done; + + /* Allocate first level page tables. */ + for (; i < m; i++) { + mr->mr.map[i] = kmalloc(sizeof *mr->mr.map[0], GFP_KERNEL); + if (!mr->mr.map[i]) + goto bail; + } + mr->mr.mapsz = m; + + /* + * ib_reg_phys_mr() will initialize mr->ibmr except for + * lkey and rkey. + */ + if (!ipath_alloc_lkey(lk_table, &mr->mr)) + goto bail; + mr->ibmr.rkey = mr->ibmr.lkey = mr->mr.lkey; + + goto done; + +bail: + while (i) { + i--; + kfree(mr->mr.map[i]); + } + kfree(mr); + mr = NULL; + +done: + return mr; +} + +/** + * ipath_reg_phys_mr - register a physical memory region + * @pd: protection domain for this memory region + * @buffer_list: pointer to the list of physical buffers to register + * @num_phys_buf: the number of physical buffers to register + * @iova_start: the starting address passed over IB which maps to this MR + * + * Returns the memory region on success, otherwise returns an errno. + */ +struct ib_mr *ipath_reg_phys_mr(struct ib_pd *pd, + struct ib_phys_buf *buffer_list, + int num_phys_buf, int acc, u64 *iova_start) +{ + struct ipath_mr *mr; + int n, m, i; + struct ib_mr *ret; + + mr = alloc_mr(num_phys_buf, &to_idev(pd->device)->lk_table); + if (mr == NULL) { + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + mr->mr.user_base = *iova_start; + mr->mr.iova = *iova_start; + mr->mr.length = 0; + mr->mr.offset = 0; + mr->mr.access_flags = acc; + mr->mr.max_segs = num_phys_buf; + + m = 0; + n = 0; + for (i = 0; i < num_phys_buf; i++) { + mr->mr.map[m]->segs[n].vaddr = + phys_to_virt(buffer_list[i].addr); + mr->mr.map[m]->segs[n].length = buffer_list[i].size; + mr->mr.length += buffer_list[i].size; + n++; + if (n == IPATH_SEGSZ) { + m++; + n = 0; + } + } + + ret = &mr->ibmr; + +bail: + return ret; +} + +/** + * ipath_reg_user_mr - register a userspace memory region + * @pd: protection domain for this memory region + * @region: the user memory region + * @mr_access_flags: access flags for this memory region + * @udata: unused by the InfiniPath driver + * + * Returns the memory region on success, otherwise returns an errno. + */ +struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, struct ib_umem *region, + int mr_access_flags, struct ib_udata *udata) +{ + struct ipath_mr *mr; + struct ib_umem_chunk *chunk; + int n, m, i; + struct ib_mr *ret; + + n = 0; + list_for_each_entry(chunk, ®ion->chunk_list, list) + n += chunk->nents; + + mr = alloc_mr(n, &to_idev(pd->device)->lk_table); + if (!mr) { + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + mr->mr.user_base = region->user_base; + mr->mr.iova = region->virt_base; + mr->mr.length = region->length; + mr->mr.offset = region->offset; + mr->mr.access_flags = mr_access_flags; + mr->mr.max_segs = n; + + m = 0; + n = 0; + list_for_each_entry(chunk, ®ion->chunk_list, list) { + for (i = 0; i < chunk->nmap; i++) { + mr->mr.map[m]->segs[n].vaddr = + page_address(chunk->page_list[i].page); + mr->mr.map[m]->segs[n].length = region->page_size; + n++; + if (n == IPATH_SEGSZ) { + m++; + n = 0; + } + } + } + ret = &mr->ibmr; + +bail: + return ret; +} + +/** + * ipath_dereg_mr - unregister and free a memory region + * @ibmr: the memory region to free + * + * Returns 0 on success. + * + * Note that this is called to free MRs created by ipath_get_dma_mr() + * or ipath_reg_user_mr(). + */ +int ipath_dereg_mr(struct ib_mr *ibmr) +{ + struct ipath_mr *mr = to_imr(ibmr); + int i; + + ipath_free_lkey(&to_idev(ibmr->device)->lk_table, ibmr->lkey); + i = mr->mr.mapsz; + while (i) { + i--; + kfree(mr->mr.map[i]); + } + kfree(mr); + return 0; +} + +/** + * ipath_alloc_fmr - allocate a fast memory region + * @pd: the protection domain for this memory region + * @mr_access_flags: access flags for this memory region + * @fmr_attr: fast memory region attributes + * + * Returns the memory region on success, otherwise returns an errno. + */ +struct ib_fmr *ipath_alloc_fmr(struct ib_pd *pd, int mr_access_flags, + struct ib_fmr_attr *fmr_attr) +{ + struct ipath_fmr *fmr; + int m, i = 0; + struct ib_fmr *ret; + + /* Allocate struct plus pointers to first level page tables. */ + m = (fmr_attr->max_pages + IPATH_SEGSZ - 1) / IPATH_SEGSZ; + fmr = kmalloc(sizeof *fmr + m * sizeof fmr->mr.map[0], GFP_KERNEL); + if (!fmr) + goto bail; + + /* Allocate first level page tables. */ + for (; i < m; i++) { + fmr->mr.map[i] = kmalloc(sizeof *fmr->mr.map[0], + GFP_KERNEL); + if (!fmr->mr.map[i]) + goto bail; + } + fmr->mr.mapsz = m; + + /* + * ib_alloc_fmr() will initialize fmr->ibfmr except for lkey & + * rkey. + */ + if (!ipath_alloc_lkey(&to_idev(pd->device)->lk_table, &fmr->mr)) + goto bail; + fmr->ibfmr.rkey = fmr->ibfmr.lkey = fmr->mr.lkey; + /* + * Resources are allocated but no valid mapping (RKEY can't be + * used). + */ + fmr->mr.user_base = 0; + fmr->mr.iova = 0; + fmr->mr.length = 0; + fmr->mr.offset = 0; + fmr->mr.access_flags = mr_access_flags; + fmr->mr.max_segs = fmr_attr->max_pages; + fmr->page_shift = fmr_attr->page_shift; + + ret = &fmr->ibfmr; + goto done; + +bail: + while (i) + kfree(fmr->mr.map[--i]); + kfree(fmr); + ret = ERR_PTR(-ENOMEM); + +done: + return ret; +} + +/** + * ipath_map_phys_fmr - set up a fast memory region + * @ibmfr: the fast memory region to set up + * @page_list: the list of pages to associate with the fast memory region + * @list_len: the number of pages to associate with the fast memory region + * @iova: the virtual address of the start of the fast memory region + * + * This may be called from interrupt context. + */ + +int ipath_map_phys_fmr(struct ib_fmr *ibfmr, u64 * page_list, + int list_len, u64 iova) +{ + struct ipath_fmr *fmr = to_ifmr(ibfmr); + struct ipath_lkey_table *rkt; + unsigned long flags; + int m, n, i; + u32 ps; + int ret; + + if (list_len > fmr->mr.max_segs) { + ret = -EINVAL; + goto bail; + } + rkt = &to_idev(ibfmr->device)->lk_table; + spin_lock_irqsave(&rkt->lock, flags); + fmr->mr.user_base = iova; + fmr->mr.iova = iova; + ps = 1 << fmr->page_shift; + fmr->mr.length = list_len * ps; + m = 0; + n = 0; + ps = 1 << fmr->page_shift; + for (i = 0; i < list_len; i++) { + fmr->mr.map[m]->segs[n].vaddr = phys_to_virt(page_list[i]); + fmr->mr.map[m]->segs[n].length = ps; + if (++n == IPATH_SEGSZ) { + m++; + n = 0; + } + } + spin_unlock_irqrestore(&rkt->lock, flags); + ret = 0; + +bail: + return ret; +} + +/** + * ipath_unmap_fmr - unmap fast memory regions + * @fmr_list: the list of fast memory regions to unmap + * + * Returns 0 on success. + */ +int ipath_unmap_fmr(struct list_head *fmr_list) +{ + struct ipath_fmr *fmr; + struct ipath_lkey_table *rkt; + unsigned long flags; + + list_for_each_entry(fmr, fmr_list, ibfmr.list) { + rkt = &to_idev(fmr->ibfmr.device)->lk_table; + spin_lock_irqsave(&rkt->lock, flags); + fmr->mr.user_base = 0; + fmr->mr.iova = 0; + fmr->mr.length = 0; + spin_unlock_irqrestore(&rkt->lock, flags); + } + return 0; +} + +/** + * ipath_dealloc_fmr - deallocate a fast memory region + * @ibfmr: the fast memory region to deallocate + * + * Returns 0 on success. + */ +int ipath_dealloc_fmr(struct ib_fmr *ibfmr) +{ + struct ipath_fmr *fmr = to_ifmr(ibfmr); + int i; + + ipath_free_lkey(&to_idev(ibfmr->device)->lk_table, ibfmr->lkey); + i = fmr->mr.mapsz; + while (i) + kfree(fmr->mr.map[--i]); + kfree(fmr); + return 0; +} diff --git a/drivers/infiniband/hw/ipath/ipath_pe800.c b/drivers/infiniband/hw/ipath/ipath_pe800.c new file mode 100644 index 0000000..e1dc4f7 --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_pe800.c @@ -0,0 +1,1247 @@ +/* + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +/* + * This file contains all of the code that is specific to the + * InfiniPath PE-800 chip. + */ + +#include <linux/interrupt.h> +#include <linux/pci.h> +#include <linux/delay.h> + + +#include "ipath_kernel.h" +#include "ipath_registers.h" + +/* + * This file contains all the chip-specific register information and + * access functions for the PathScale PE800, the PCI-Express chip. + * + * This lists the InfiniPath PE800 registers, in the actual chip layout. + * This structure should never be directly accessed. + */ +struct _infinipath_do_not_use_kernel_regs { + unsigned long long Revision; + unsigned long long Control; + unsigned long long PageAlign; + unsigned long long PortCnt; + unsigned long long DebugPortSelect; + unsigned long long Reserved0; + unsigned long long SendRegBase; + unsigned long long UserRegBase; + unsigned long long CounterRegBase; + unsigned long long Scratch; + unsigned long long Reserved1; + unsigned long long Reserved2; + unsigned long long IntBlocked; + unsigned long long IntMask; + unsigned long long IntStatus; + unsigned long long IntClear; + unsigned long long ErrorMask; + unsigned long long ErrorStatus; + unsigned long long ErrorClear; + unsigned long long HwErrMask; + unsigned long long HwErrStatus; + unsigned long long HwErrClear; + unsigned long long HwDiagCtrl; + unsigned long long MDIO; + unsigned long long IBCStatus; + unsigned long long IBCCtrl; + unsigned long long ExtStatus; + unsigned long long ExtCtrl; + unsigned long long GPIOOut; + unsigned long long GPIOMask; + unsigned long long GPIOStatus; + unsigned long long GPIOClear; + unsigned long long RcvCtrl; + unsigned long long RcvBTHQP; + unsigned long long RcvHdrSize; + unsigned long long RcvHdrCnt; + unsigned long long RcvHdrEntSize; + unsigned long long RcvTIDBase; + unsigned long long RcvTIDCnt; + unsigned long long RcvEgrBase; + unsigned long long RcvEgrCnt; + unsigned long long RcvBufBase; + unsigned long long RcvBufSize; + unsigned long long RxIntMemBase; + unsigned long long RxIntMemSize; + unsigned long long RcvPartitionKey; + unsigned long long Reserved3; + unsigned long long RcvPktLEDCnt; + unsigned long long Reserved4[8]; + unsigned long long SendCtrl; + unsigned long long SendPIOBufBase; + unsigned long long SendPIOSize; + unsigned long long SendPIOBufCnt; + unsigned long long SendPIOAvailAddr; + unsigned long long TxIntMemBase; + unsigned long long TxIntMemSize; + unsigned long long Reserved5; + unsigned long long PCIeRBufTestReg0; + unsigned long long PCIeRBufTestReg1; + unsigned long long Reserved51[6]; + unsigned long long SendBufferError; + unsigned long long SendBufferErrorCONT1; + unsigned long long Reserved6SBE[6]; + unsigned long long RcvHdrAddr0; + unsigned long long RcvHdrAddr1; + unsigned long long RcvHdrAddr2; + unsigned long long RcvHdrAddr3; + unsigned long long RcvHdrAddr4; + unsigned long long Reserved7RHA[11]; + unsigned long long RcvHdrTailAddr0; + unsigned long long RcvHdrTailAddr1; + unsigned long long RcvHdrTailAddr2; + unsigned long long RcvHdrTailAddr3; + unsigned long long RcvHdrTailAddr4; + unsigned long long Reserved8RHTA[11]; + unsigned long long Reserved9SW[8]; + unsigned long long SerdesConfig0; + unsigned long long SerdesConfig1; + unsigned long long SerdesStatus; + unsigned long long XGXSConfig; + unsigned long long IBPLLCfg; + unsigned long long Reserved10SW2[3]; + unsigned long long PCIEQ0SerdesConfig0; + unsigned long long PCIEQ0SerdesConfig1; + unsigned long long PCIEQ0SerdesStatus; + unsigned long long Reserved11; + unsigned long long PCIEQ1SerdesConfig0; + unsigned long long PCIEQ1SerdesConfig1; + unsigned long long PCIEQ1SerdesStatus; + unsigned long long Reserved12; +}; + +#define IPATH_KREG_OFFSET(field) (offsetof(struct \ + _infinipath_do_not_use_kernel_regs, field) / sizeof(u64)) +#define IPATH_CREG_OFFSET(field) (offsetof( \ + struct infinipath_counters, field) / sizeof(u64)) + +static const struct ipath_kregs ipath_pe_kregs = { + .kr_control = IPATH_KREG_OFFSET(Control), + .kr_counterregbase = IPATH_KREG_OFFSET(CounterRegBase), + .kr_debugportselect = IPATH_KREG_OFFSET(DebugPortSelect), + .kr_errorclear = IPATH_KREG_OFFSET(ErrorClear), + .kr_errormask = IPATH_KREG_OFFSET(ErrorMask), + .kr_errorstatus = IPATH_KREG_OFFSET(ErrorStatus), + .kr_extctrl = IPATH_KREG_OFFSET(ExtCtrl), + .kr_extstatus = IPATH_KREG_OFFSET(ExtStatus), + .kr_gpio_clear = IPATH_KREG_OFFSET(GPIOClear), + .kr_gpio_mask = IPATH_KREG_OFFSET(GPIOMask), + .kr_gpio_out = IPATH_KREG_OFFSET(GPIOOut), + .kr_gpio_status = IPATH_KREG_OFFSET(GPIOStatus), + .kr_hwdiagctrl = IPATH_KREG_OFFSET(HwDiagCtrl), + .kr_hwerrclear = IPATH_KREG_OFFSET(HwErrClear), + .kr_hwerrmask = IPATH_KREG_OFFSET(HwErrMask), + .kr_hwerrstatus = IPATH_KREG_OFFSET(HwErrStatus), + .kr_ibcctrl = IPATH_KREG_OFFSET(IBCCtrl), + .kr_ibcstatus = IPATH_KREG_OFFSET(IBCStatus), + .kr_intblocked = IPATH_KREG_OFFSET(IntBlocked), + .kr_intclear = IPATH_KREG_OFFSET(IntClear), + .kr_intmask = IPATH_KREG_OFFSET(IntMask), + .kr_intstatus = IPATH_KREG_OFFSET(IntStatus), + .kr_mdio = IPATH_KREG_OFFSET(MDIO), + .kr_pagealign = IPATH_KREG_OFFSET(PageAlign), + .kr_partitionkey = IPATH_KREG_OFFSET(RcvPartitionKey), + .kr_portcnt = IPATH_KREG_OFFSET(PortCnt), + .kr_rcvbthqp = IPATH_KREG_OFFSET(RcvBTHQP), + .kr_rcvbufbase = IPATH_KREG_OFFSET(RcvBufBase), + .kr_rcvbufsize = IPATH_KREG_OFFSET(RcvBufSize), + .kr_rcvctrl = IPATH_KREG_OFFSET(RcvCtrl), + .kr_rcvegrbase = IPATH_KREG_OFFSET(RcvEgrBase), + .kr_rcvegrcnt = IPATH_KREG_OFFSET(RcvEgrCnt), + .kr_rcvhdrcnt = IPATH_KREG_OFFSET(RcvHdrCnt), + .kr_rcvhdrentsize = IPATH_KREG_OFFSET(RcvHdrEntSize), + .kr_rcvhdrsize = IPATH_KREG_OFFSET(RcvHdrSize), + .kr_rcvintmembase = IPATH_KREG_OFFSET(RxIntMemBase), + .kr_rcvintmemsize = IPATH_KREG_OFFSET(RxIntMemSize), + .kr_rcvtidbase = IPATH_KREG_OFFSET(RcvTIDBase), + .kr_rcvtidcnt = IPATH_KREG_OFFSET(RcvTIDCnt), + .kr_revision = IPATH_KREG_OFFSET(Revision), + .kr_scratch = IPATH_KREG_OFFSET(Scratch), + .kr_sendbuffererror = IPATH_KREG_OFFSET(SendBufferError), + .kr_sendctrl = IPATH_KREG_OFFSET(SendCtrl), + .kr_sendpioavailaddr = IPATH_KREG_OFFSET(SendPIOAvailAddr), + .kr_sendpiobufbase = IPATH_KREG_OFFSET(SendPIOBufBase), + .kr_sendpiobufcnt = IPATH_KREG_OFFSET(SendPIOBufCnt), + .kr_sendpiosize = IPATH_KREG_OFFSET(SendPIOSize), + .kr_sendregbase = IPATH_KREG_OFFSET(SendRegBase), + .kr_txintmembase = IPATH_KREG_OFFSET(TxIntMemBase), + .kr_txintmemsize = IPATH_KREG_OFFSET(TxIntMemSize), + .kr_userregbase = IPATH_KREG_OFFSET(UserRegBase), + .kr_serdesconfig0 = IPATH_KREG_OFFSET(SerdesConfig0), + .kr_serdesconfig1 = IPATH_KREG_OFFSET(SerdesConfig1), + .kr_serdesstatus = IPATH_KREG_OFFSET(SerdesStatus), + .kr_xgxsconfig = IPATH_KREG_OFFSET(XGXSConfig), + .kr_ibpllcfg = IPATH_KREG_OFFSET(IBPLLCfg), + + /* + * These should not be used directly via ipath_read_kreg64(), + * use them with ipath_read_kreg64_port() + */ + .kr_rcvhdraddr = IPATH_KREG_OFFSET(RcvHdrAddr0), + .kr_rcvhdrtailaddr = IPATH_KREG_OFFSET(RcvHdrTailAddr0), + + /* This group is pe-800-specific; and used only in this file */ + /* The rcvpktled register controls one of the debug port signals, so + * a packet activity LED can be connected to it. */ + .kr_rcvpktledcnt = IPATH_KREG_OFFSET(RcvPktLEDCnt), + .kr_pcierbuftestreg0 = IPATH_KREG_OFFSET(PCIeRBufTestReg0), + .kr_pcierbuftestreg1 = IPATH_KREG_OFFSET(PCIeRBufTestReg1), + .kr_pcieq0serdesconfig0 = IPATH_KREG_OFFSET(PCIEQ0SerdesConfig0), + .kr_pcieq0serdesconfig1 = IPATH_KREG_OFFSET(PCIEQ0SerdesConfig1), + .kr_pcieq0serdesstatus = IPATH_KREG_OFFSET(PCIEQ0SerdesStatus), + .kr_pcieq1serdesconfig0 = IPATH_KREG_OFFSET(PCIEQ1SerdesConfig0), + .kr_pcieq1serdesconfig1 = IPATH_KREG_OFFSET(PCIEQ1SerdesConfig1), + .kr_pcieq1serdesstatus = IPATH_KREG_OFFSET(PCIEQ1SerdesStatus) +}; + +static const struct ipath_cregs ipath_pe_cregs = { + .cr_badformatcnt = IPATH_CREG_OFFSET(RxBadFormatCnt), + .cr_erricrccnt = IPATH_CREG_OFFSET(RxICRCErrCnt), + .cr_errlinkcnt = IPATH_CREG_OFFSET(RxLinkProblemCnt), + .cr_errlpcrccnt = IPATH_CREG_OFFSET(RxLPCRCErrCnt), + .cr_errpkey = IPATH_CREG_OFFSET(RxPKeyMismatchCnt), + .cr_errrcvflowctrlcnt = IPATH_CREG_OFFSET(RxFlowCtrlErrCnt), + .cr_err_rlencnt = IPATH_CREG_OFFSET(RxLenErrCnt), + .cr_errslencnt = IPATH_CREG_OFFSET(TxLenErrCnt), + .cr_errtidfull = IPATH_CREG_OFFSET(RxTIDFullErrCnt), + .cr_errtidvalid = IPATH_CREG_OFFSET(RxTIDValidErrCnt), + .cr_errvcrccnt = IPATH_CREG_OFFSET(RxVCRCErrCnt), + .cr_ibstatuschange = IPATH_CREG_OFFSET(IBStatusChangeCnt), + .cr_intcnt = IPATH_CREG_OFFSET(LBIntCnt), + .cr_invalidrlencnt = IPATH_CREG_OFFSET(RxMaxMinLenErrCnt), + .cr_invalidslencnt = IPATH_CREG_OFFSET(TxMaxMinLenErrCnt), + .cr_lbflowstallcnt = IPATH_CREG_OFFSET(LBFlowStallCnt), + .cr_pktrcvcnt = IPATH_CREG_OFFSET(RxDataPktCnt), + .cr_pktrcvflowctrlcnt = IPATH_CREG_OFFSET(RxFlowPktCnt), + .cr_pktsendcnt = IPATH_CREG_OFFSET(TxDataPktCnt), + .cr_pktsendflowcnt = IPATH_CREG_OFFSET(TxFlowPktCnt), + .cr_portovflcnt = IPATH_CREG_OFFSET(RxP0HdrEgrOvflCnt), + .cr_rcvebpcnt = IPATH_CREG_OFFSET(RxEBPCnt), + .cr_rcvovflcnt = IPATH_CREG_OFFSET(RxBufOvflCnt), + .cr_senddropped = IPATH_CREG_OFFSET(TxDroppedPktCnt), + .cr_sendstallcnt = IPATH_CREG_OFFSET(TxFlowStallCnt), + .cr_sendunderruncnt = IPATH_CREG_OFFSET(TxUnderrunCnt), + .cr_wordrcvcnt = IPATH_CREG_OFFSET(RxDwordCnt), + .cr_wordsendcnt = IPATH_CREG_OFFSET(TxDwordCnt), + .cr_unsupvlcnt = IPATH_CREG_OFFSET(TxUnsupVLErrCnt), + .cr_rxdroppktcnt = IPATH_CREG_OFFSET(RxDroppedPktCnt), + .cr_iblinkerrrecovcnt = IPATH_CREG_OFFSET(IBLinkErrRecoveryCnt), + .cr_iblinkdowncnt = IPATH_CREG_OFFSET(IBLinkDownedCnt), + .cr_ibsymbolerrcnt = IPATH_CREG_OFFSET(IBSymbolErrCnt) +}; + +/* kr_intstatus, kr_intclear, kr_intmask bits */ +#define INFINIPATH_I_RCVURG_MASK 0x1F +#define INFINIPATH_I_RCVAVAIL_MASK 0x1F + +/* kr_hwerrclear, kr_hwerrmask, kr_hwerrstatus, bits */ +#define INFINIPATH_HWE_PCIEMEMPARITYERR_MASK 0x000000000000003fULL +#define INFINIPATH_HWE_PCIEMEMPARITYERR_SHIFT 0 +#define INFINIPATH_HWE_PCIEPOISONEDTLP 0x0000000010000000ULL +#define INFINIPATH_HWE_PCIECPLTIMEOUT 0x0000000020000000ULL +#define INFINIPATH_HWE_PCIEBUSPARITYXTLH 0x0000000040000000ULL +#define INFINIPATH_HWE_PCIEBUSPARITYXADM 0x0000000080000000ULL +#define INFINIPATH_HWE_PCIEBUSPARITYRADM 0x0000000100000000ULL +#define INFINIPATH_HWE_COREPLL_FBSLIP 0x0080000000000000ULL +#define INFINIPATH_HWE_COREPLL_RFSLIP 0x0100000000000000ULL +#define INFINIPATH_HWE_PCIE1PLLFAILED 0x0400000000000000ULL +#define INFINIPATH_HWE_PCIE0PLLFAILED 0x0800000000000000ULL +#define INFINIPATH_HWE_SERDESPLLFAILED 0x1000000000000000ULL + +/* kr_extstatus bits */ +#define INFINIPATH_EXTS_FREQSEL 0x2 +#define INFINIPATH_EXTS_SERDESSEL 0x4 +#define INFINIPATH_EXTS_MEMBIST_ENDTEST 0x0000000000004000 +#define INFINIPATH_EXTS_MEMBIST_FOUND 0x0000000000008000 + +#define _IPATH_GPIO_SDA_NUM 1 +#define _IPATH_GPIO_SCL_NUM 0 + +#define IPATH_GPIO_SDA (1ULL << \ + (_IPATH_GPIO_SDA_NUM+INFINIPATH_EXTC_GPIOOE_SHIFT)) +#define IPATH_GPIO_SCL (1ULL << \ + (_IPATH_GPIO_SCL_NUM+INFINIPATH_EXTC_GPIOOE_SHIFT)) + +/** + * ipath_pe_handle_hwerrors - display hardware errors. + * @dd: the infinipath device + * @msg: the output buffer + * @msgl: the size of the output buffer + * + * Use same msg buffer as regular errors to avoid excessive stack + * use. Most hardware errors are catastrophic, but for right now, + * we'll print them and continue. We reuse the same message buffer as + * ipath_handle_errors() to avoid excessive stack usage. + */ +static void ipath_pe_handle_hwerrors(struct ipath_devdata *dd, char *msg, + size_t msgl) +{ + ipath_err_t hwerrs; + u32 bits, ctrl; + int isfatal = 0; + char bitsmsg[64]; + + hwerrs = ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwerrstatus); + if (!hwerrs) { + /* + * better than printing cofusing messages + * This seems to be related to clearing the crc error, or + * the pll error during init. + */ + ipath_cdbg(VERBOSE, "Called but no hardware errors set\n"); + return; + } else if (hwerrs == ~0ULL) { + ipath_dev_err(dd, "Read of hardware error status failed " + "(all bits set); ignoring\n"); + return; + } + ipath_stats.sps_hwerrs++; + + /* Always clear the error status register, except MEMBISTFAIL, + * regardless of whether we continue or stop using the chip. + * We want that set so we know it failed, even across driver reload. + * We'll still ignore it in the hwerrmask. We do this partly for + * diagnostics, but also for support */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear, + hwerrs&~INFINIPATH_HWE_MEMBISTFAILED); + + hwerrs &= dd->ipath_hwerrmask; + + /* + * make sure we get this much out, unless told to be quiet, + * or it's occurred within the last 5 seconds + */ + if ((hwerrs & ~dd->ipath_lasthwerror) || + (ipath_debug & __IPATH_VERBDBG)) + dev_info(&dd->pcidev->dev, "Hardware error: hwerr=0x%llx " + "(cleared)\n", (unsigned long long) hwerrs); + dd->ipath_lasthwerror |= hwerrs; + + if (hwerrs & ~infinipath_hwe_bitsextant) + ipath_dev_err(dd, "hwerror interrupt with unknown errors " + "%llx set\n", (unsigned long long) + (hwerrs & ~infinipath_hwe_bitsextant)); + + ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control); + if (ctrl & INFINIPATH_C_FREEZEMODE) { + if (hwerrs) { + /* + * if any set that we aren't ignoring only make the + * complaint once, in case it's stuck or recurring, + * and we get here multiple times + */ + if (dd->ipath_flags & IPATH_INITTED) { + ipath_dev_err(dd, "Fatal Error (freeze " + "mode), no longer usable\n"); + isfatal = 1; + } + /* + * Mark as having had an error for driver, and also + * for /sys and status word mapped to user programs. + * This marks unit as not usable, until reset + */ + *dd->ipath_statusp &= ~IPATH_STATUS_IB_READY; + *dd->ipath_statusp |= IPATH_STATUS_HWERROR; + dd->ipath_flags &= ~IPATH_INITTED; + } else { + ipath_dbg("Clearing freezemode on ignored hardware " + "error\n"); + ctrl &= ~INFINIPATH_C_FREEZEMODE; + ipath_write_kreg(dd, dd->ipath_kregs->kr_control, + ctrl); + } + } + + *msg = '\0'; + + if (hwerrs & INFINIPATH_HWE_MEMBISTFAILED) { + strlcat(msg, "[Memory BIST test failed, PE-800 unusable]", + msgl); + /* ignore from now on, so disable until driver reloaded */ + *dd->ipath_statusp |= IPATH_STATUS_HWERROR; + dd->ipath_hwerrmask &= ~INFINIPATH_HWE_MEMBISTFAILED; + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask, + dd->ipath_hwerrmask); + } + if (hwerrs & (INFINIPATH_HWE_RXEMEMPARITYERR_MASK + << INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT)) { + bits = (u32) ((hwerrs >> + INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) & + INFINIPATH_HWE_RXEMEMPARITYERR_MASK); + snprintf(bitsmsg, sizeof bitsmsg, "[RXE Parity Errs %x] ", + bits); + strlcat(msg, bitsmsg, msgl); + } + if (hwerrs & (INFINIPATH_HWE_TXEMEMPARITYERR_MASK + << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT)) { + bits = (u32) ((hwerrs >> + INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT) & + INFINIPATH_HWE_TXEMEMPARITYERR_MASK); + snprintf(bitsmsg, sizeof bitsmsg, "[TXE Parity Errs %x] ", + bits); + strlcat(msg, bitsmsg, msgl); + } + if (hwerrs & (INFINIPATH_HWE_PCIEMEMPARITYERR_MASK + << INFINIPATH_HWE_PCIEMEMPARITYERR_SHIFT)) { + bits = (u32) ((hwerrs >> + INFINIPATH_HWE_PCIEMEMPARITYERR_SHIFT) & + INFINIPATH_HWE_PCIEMEMPARITYERR_MASK); + snprintf(bitsmsg, sizeof bitsmsg, + "[PCIe Mem Parity Errs %x] ", bits); + strlcat(msg, bitsmsg, msgl); + } + if (hwerrs & INFINIPATH_HWE_IBCBUSTOSPCPARITYERR) + strlcat(msg, "[IB2IPATH Parity]", msgl); + if (hwerrs & INFINIPATH_HWE_IBCBUSFRSPCPARITYERR) + strlcat(msg, "[IPATH2IB Parity]", msgl); + +#define _IPATH_PLL_FAIL (INFINIPATH_HWE_COREPLL_FBSLIP | \ + INFINIPATH_HWE_COREPLL_RFSLIP ) + + if (hwerrs & _IPATH_PLL_FAIL) { + snprintf(bitsmsg, sizeof bitsmsg, + "[PLL failed (%llx), PE-800 unusable]", + (unsigned long long) hwerrs & _IPATH_PLL_FAIL); + strlcat(msg, bitsmsg, msgl); + /* ignore from now on, so disable until driver reloaded */ + dd->ipath_hwerrmask &= ~(hwerrs & _IPATH_PLL_FAIL); + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask, + dd->ipath_hwerrmask); + } + + if (hwerrs & INFINIPATH_HWE_SERDESPLLFAILED) { + /* + * If it occurs, it is left masked since the eternal + * interface is unused + */ + dd->ipath_hwerrmask &= ~INFINIPATH_HWE_SERDESPLLFAILED; + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask, + dd->ipath_hwerrmask); + } + + if (hwerrs & INFINIPATH_HWE_PCIEPOISONEDTLP) + strlcat(msg, "[PCIe Poisoned TLP]", msgl); + if (hwerrs & INFINIPATH_HWE_PCIECPLTIMEOUT) + strlcat(msg, "[PCIe completion timeout]", msgl); + + /* + * In practice, it's unlikely wthat we'll see PCIe PLL, or bus + * parity or memory parity error failures, because most likely we + * won't be able to talk to the core of the chip. Nonetheless, we + * might see them, if they are in parts of the PCIe core that aren't + * essential. + */ + if (hwerrs & INFINIPATH_HWE_PCIE1PLLFAILED) + strlcat(msg, "[PCIePLL1]", msgl); + if (hwerrs & INFINIPATH_HWE_PCIE0PLLFAILED) + strlcat(msg, "[PCIePLL0]", msgl); + if (hwerrs & INFINIPATH_HWE_PCIEBUSPARITYXTLH) + strlcat(msg, "[PCIe XTLH core parity]", msgl); + if (hwerrs & INFINIPATH_HWE_PCIEBUSPARITYXADM) + strlcat(msg, "[PCIe ADM TX core parity]", msgl); + if (hwerrs & INFINIPATH_HWE_PCIEBUSPARITYRADM) + strlcat(msg, "[PCIe ADM RX core parity]", msgl); + + if (hwerrs & INFINIPATH_HWE_RXDSYNCMEMPARITYERR) + strlcat(msg, "[Rx Dsync]", msgl); + if (hwerrs & INFINIPATH_HWE_SERDESPLLFAILED) + strlcat(msg, "[SerDes PLL]", msgl); + + ipath_dev_err(dd, "%s hardware error\n", msg); + if (isfatal && !ipath_diag_inuse && dd->ipath_freezemsg) { + /* + * for /sys status file ; if no trailing } is copied, we'll + * know it was truncated. + */ + snprintf(dd->ipath_freezemsg, dd->ipath_freezelen, + "{%s}", msg); + } +} + +/** + * ipath_pe_boardname - fill in the board name + * @dd: the infinipath device + * @name: the output buffer + * @namelen: the size of the output buffer + * + * info is based on the board revision register + */ +static int ipath_pe_boardname(struct ipath_devdata *dd, char *name, + size_t namelen) +{ + char *n = NULL; + u8 boardrev = dd->ipath_boardrev; + int ret; + + switch (boardrev) { + case 0: + n = "InfiniPath_Emulation"; + break; + case 1: + n = "InfiniPath_PE-800-Bringup"; + break; + case 2: + n = "InfiniPath_PE-880"; + break; + case 3: + n = "InfiniPath_PE-850"; + break; + case 4: + n = "InfiniPath_PE-860"; + break; + default: + ipath_dev_err(dd, + "Don't yet know about board with ID %u\n", + boardrev); + snprintf(name, namelen, "Unknown_InfiniPath_PE-8xx_%u", + boardrev); + break; + } + if (n) + snprintf(name, namelen, "%s", n); + + if (dd->ipath_majrev != 4 || dd->ipath_minrev != 1) { + ipath_dev_err(dd, "Unsupported PE-800 revision %u.%u!\n", + dd->ipath_majrev, dd->ipath_minrev); + ret = 1; + } else + ret = 0; + + return ret; +} + +/** + * ipath_pe_init_hwerrors - enable hardware errors + * @dd: the infinipath device + * + * now that we have finished initializing everything that might reasonably + * cause a hardware error, and cleared those errors bits as they occur, + * we can enable hardware errors in the mask (potentially enabling + * freeze mode), and enable hardware errors as errors (along with + * everything else) in errormask + */ +static void ipath_pe_init_hwerrors(struct ipath_devdata *dd) +{ + ipath_err_t val; + u64 extsval; + + extsval = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extstatus); + + if (!(extsval & INFINIPATH_EXTS_MEMBIST_ENDTEST)) + ipath_dev_err(dd, "MemBIST did not complete!\n"); + + val = ~0ULL; /* barring bugs, all hwerrors become interrupts, */ + + if (!dd->ipath_boardrev) // no PLL for Emulator + val &= ~INFINIPATH_HWE_SERDESPLLFAILED; + + /* workaround bug 9460 in internal interface bus parity checking */ + val &= ~INFINIPATH_HWE_PCIEBUSPARITYRADM; + + dd->ipath_hwerrmask = val; +} + +/** + * ipath_pe_bringup_serdes - bring up the serdes + * @dd: the infinipath device + */ +static int ipath_pe_bringup_serdes(struct ipath_devdata *dd) +{ + u64 val, tmp, config1; + int ret = 0, change = 0; + + ipath_dbg("Trying to bringup serdes\n"); + + if (ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwerrstatus) & + INFINIPATH_HWE_SERDESPLLFAILED) { + ipath_dbg("At start, serdes PLL failed bit set " + "in hwerrstatus, clearing and continuing\n"); + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear, + INFINIPATH_HWE_SERDESPLLFAILED); + } + + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0); + config1 = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig1); + + ipath_cdbg(VERBOSE, "SerDes status config0=%llx config1=%llx, " + "xgxsconfig %llx\n", (unsigned long long) val, + (unsigned long long) config1, (unsigned long long) + ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig)); + + /* + * Force reset on, also set rxdetect enable. Must do before reading + * serdesstatus at least for simulation, or some of the bits in + * serdes status will come back as undefined and cause simulation + * failures + */ + val |= INFINIPATH_SERDC0_RESET_PLL | INFINIPATH_SERDC0_RXDETECT_EN + | INFINIPATH_SERDC0_L1PWR_DN; + ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0, val); + /* be sure chip saw it */ + tmp = ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + udelay(5); /* need pll reset set at least for a bit */ + /* + * after PLL is reset, set the per-lane Resets and TxIdle and + * clear the PLL reset and rxdetect (to get falling edge). + * Leave L1PWR bits set (permanently) + */ + val &= ~(INFINIPATH_SERDC0_RXDETECT_EN | INFINIPATH_SERDC0_RESET_PLL + | INFINIPATH_SERDC0_L1PWR_DN); + val |= INFINIPATH_SERDC0_RESET_MASK | INFINIPATH_SERDC0_TXIDLE; + ipath_cdbg(VERBOSE, "Clearing pll reset and setting lane resets " + "and txidle (%llx)\n", (unsigned long long) val); + ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0, val); + /* be sure chip saw it */ + tmp = ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + /* need PLL reset clear for at least 11 usec before lane + * resets cleared; give it a few more to be sure */ + udelay(15); + val &= ~(INFINIPATH_SERDC0_RESET_MASK | INFINIPATH_SERDC0_TXIDLE); + + ipath_cdbg(VERBOSE, "Clearing lane resets and txidle " + "(writing %llx)\n", (unsigned long long) val); + ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0, val); + /* be sure chip saw it */ + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig); + if (((val >> INFINIPATH_XGXS_MDIOADDR_SHIFT) & + INFINIPATH_XGXS_MDIOADDR_MASK) != 3) { + val &= + ~(INFINIPATH_XGXS_MDIOADDR_MASK << + INFINIPATH_XGXS_MDIOADDR_SHIFT); + /* MDIO address 3 */ + val |= 3ULL << INFINIPATH_XGXS_MDIOADDR_SHIFT; + change = 1; + } + if (val & INFINIPATH_XGXS_RESET) { + val &= ~INFINIPATH_XGXS_RESET; + change = 1; + } + if (change) + ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, val); + + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0); + + /* clear current and de-emphasis bits */ + config1 &= ~0x0ffffffff00ULL; + /* set current to 20ma */ + config1 |= 0x00000000000ULL; + /* set de-emphasis to -5.68dB */ + config1 |= 0x0cccc000000ULL; + ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig1, config1); + + ipath_cdbg(VERBOSE, "done: SerDes status config0=%llx " + "config1=%llx, sstatus=%llx xgxs=%llx\n", + (unsigned long long) val, (unsigned long long) config1, + (unsigned long long) + ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesstatus), + (unsigned long long) + ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig)); + + if (!ipath_waitfor_mdio_cmdready(dd)) { + ipath_write_kreg( + dd, dd->ipath_kregs->kr_mdio, + ipath_mdio_req(IPATH_MDIO_CMD_READ, 31, + IPATH_MDIO_CTRL_XGXS_REG_8, 0)); + if (ipath_waitfor_complete(dd, dd->ipath_kregs->kr_mdio, + IPATH_MDIO_DATAVALID, &val)) + ipath_dbg("Never got MDIO data for XGXS " + "status read\n"); + else + ipath_cdbg(VERBOSE, "MDIO Read reg8, " + "'bank' 31 %x\n", (u32) val); + } else + ipath_dbg("Never got MDIO cmdready for XGXS status read\n"); + + return ret; +} + +/** + * ipath_pe_quiet_serdes - set serdes to txidle + * @dd: the infinipath device + * Called when driver is being unloaded + */ +static void ipath_pe_quiet_serdes(struct ipath_devdata *dd) +{ + u64 val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0); + + val |= INFINIPATH_SERDC0_TXIDLE; + ipath_dbg("Setting TxIdleEn on serdes (config0 = %llx)\n", + (unsigned long long) val); + ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0, val); +} + +/* this is not yet needed on the PE800, so just return 0. */ +static int ipath_pe_intconfig(struct ipath_devdata *dd) +{ + return 0; +} + +/** + * ipath_setup_pe_setextled - set the state of the two external LEDs + * @dd: the infinipath device + * @lst: the L state + * @ltst: the LT state + + * These LEDs indicate the physical and logical state of IB link. + * For this chip (at least with recommended board pinouts), LED1 + * is Yellow (logical state) and LED2 is Green (physical state), + * + * Note: We try to match the Mellanox HCA LED behavior as best + * we can. Green indicates physical link state is OK (something is + * plugged in, and we can train). + * Amber indicates the link is logically up (ACTIVE). + * Mellanox further blinks the amber LED to indicate data packet + * activity, but we have no hardware support for that, so it would + * require waking up every 10-20 msecs and checking the counters + * on the chip, and then turning the LED off if appropriate. That's + * visible overhead, so not something we will do. + * + */ +static void ipath_setup_pe_setextled(struct ipath_devdata *dd, u64 lst, + u64 ltst) +{ + u64 extctl; + + /* the diags use the LED to indicate diag info, so we leave + * the external LED alone when the diags are running */ + if (ipath_diag_inuse) + return; + + extctl = dd->ipath_extctrl & ~(INFINIPATH_EXTC_LED1PRIPORT_ON | + INFINIPATH_EXTC_LED2PRIPORT_ON); + + if (ltst & INFINIPATH_IBCS_LT_STATE_LINKUP) + extctl |= INFINIPATH_EXTC_LED2PRIPORT_ON; + if (lst == INFINIPATH_IBCS_L_STATE_ACTIVE) + extctl |= INFINIPATH_EXTC_LED1PRIPORT_ON; + dd->ipath_extctrl = extctl; + ipath_write_kreg(dd, dd->ipath_kregs->kr_extctrl, extctl); +} + +/** + * ipath_setup_pe_cleanup - clean up any per-chip chip-specific stuff + * @dd: the infinipath device + * + * This is called during driver unload. + * We do the pci_disable_msi here, not in generic code, because it + * isn't used for the HT-400. If we do end up needing pci_enable_msi + * at some point in the future for HT-400, we'll move the call back + * into the main init_one code. + */ +static void ipath_setup_pe_cleanup(struct ipath_devdata *dd) +{ + dd->ipath_msi_lo = 0; /* just in case unload fails */ + pci_disable_msi(dd->pcidev); +} + +/** + * ipath_setup_pe_config - setup PCIe config related stuff + * @dd: the infinipath device + * @pdev: the PCI device + * + * The pci_enable_msi() call will fail on systems with MSI quirks + * such as those with AMD8131, even if the device of interest is not + * attached to that device, (in the 2.6.13 - 2.6.15 kernels, at least, fixed + * late in 2.6.16). + * All that can be done is to edit the kernel source to remove the quirk + * check until that is fixed. + * We do not need to call enable_msi() for our HyperTransport chip (HT-400), + * even those it uses MSI, and we want to avoid the quirk warning, so + * So we call enable_msi only for the PE-800. If we do end up needing + * pci_enable_msi at some point in the future for HT-400, we'll move the + * call back into the main init_one code. + * We save the msi lo and hi values, so we can restore them after + * chip reset (the kernel PCI infrastructure doesn't yet handle that + * correctly). + */ +static int ipath_setup_pe_config(struct ipath_devdata *dd, + struct pci_dev *pdev) +{ + int pos, ret; + + dd->ipath_msi_lo = 0; /* used as a flag during reset processing */ + ret = pci_enable_msi(dd->pcidev); + if (ret) + ipath_dev_err(dd, "pci_enable_msi failed: %d, " + "interrupts may not work\n", ret); + /* continue even if it fails, we may still be OK... */ + + if ((pos = pci_find_capability(dd->pcidev, PCI_CAP_ID_MSI))) { + u16 control; + pci_read_config_dword(dd->pcidev, pos + PCI_MSI_ADDRESS_LO, + &dd->ipath_msi_lo); + pci_read_config_dword(dd->pcidev, pos + PCI_MSI_ADDRESS_HI, + &dd->ipath_msi_hi); + pci_read_config_word(dd->pcidev, pos + PCI_MSI_FLAGS, + &control); + /* now save the data (vector) info */ + pci_read_config_word(dd->pcidev, + pos + ((control & PCI_MSI_FLAGS_64BIT) + ? 12 : 8), + &dd->ipath_msi_data); + ipath_cdbg(VERBOSE, "Read msi data 0x%x from config offset " + "0x%x, control=0x%x\n", dd->ipath_msi_data, + pos + ((control & PCI_MSI_FLAGS_64BIT) ? 12 : 8), + control); + /* we save the cachelinesize also, although it doesn't + * really matter */ + pci_read_config_byte(dd->pcidev, PCI_CACHE_LINE_SIZE, + &dd->ipath_pci_cacheline); + } else + ipath_dev_err(dd, "Can't find MSI capability, " + "can't save MSI settings for reset\n"); + if ((pos = pci_find_capability(dd->pcidev, PCI_CAP_ID_EXP))) { + u16 linkstat; + pci_read_config_word(dd->pcidev, pos + PCI_EXP_LNKSTA, + &linkstat); + linkstat >>= 4; + linkstat &= 0x1f; + if (linkstat != 8) + ipath_dev_err(dd, "PCIe width %u, " + "performance reduced\n", linkstat); + } + else + ipath_dev_err(dd, "Can't find PCI Express " + "capability!\n"); + return 0; +} + +static void ipath_init_pe_variables(void) +{ + /* + * bits for selecting i2c direction and values, + * used for I2C serial flash + */ + ipath_gpio_sda_num = _IPATH_GPIO_SDA_NUM; + ipath_gpio_scl_num = _IPATH_GPIO_SCL_NUM; + ipath_gpio_sda = IPATH_GPIO_SDA; + ipath_gpio_scl = IPATH_GPIO_SCL; + + /* variables for sanity checking interrupt and errors */ + infinipath_hwe_bitsextant = + (INFINIPATH_HWE_RXEMEMPARITYERR_MASK << + INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) | + (INFINIPATH_HWE_PCIEMEMPARITYERR_MASK << + INFINIPATH_HWE_PCIEMEMPARITYERR_SHIFT) | + INFINIPATH_HWE_PCIE1PLLFAILED | + INFINIPATH_HWE_PCIE0PLLFAILED | + INFINIPATH_HWE_PCIEPOISONEDTLP | + INFINIPATH_HWE_PCIECPLTIMEOUT | + INFINIPATH_HWE_PCIEBUSPARITYXTLH | + INFINIPATH_HWE_PCIEBUSPARITYXADM | + INFINIPATH_HWE_PCIEBUSPARITYRADM | + INFINIPATH_HWE_MEMBISTFAILED | + INFINIPATH_HWE_COREPLL_FBSLIP | + INFINIPATH_HWE_COREPLL_RFSLIP | + INFINIPATH_HWE_SERDESPLLFAILED | + INFINIPATH_HWE_IBCBUSTOSPCPARITYERR | + INFINIPATH_HWE_IBCBUSFRSPCPARITYERR; + infinipath_i_bitsextant = + (INFINIPATH_I_RCVURG_MASK << INFINIPATH_I_RCVURG_SHIFT) | + (INFINIPATH_I_RCVAVAIL_MASK << + INFINIPATH_I_RCVAVAIL_SHIFT) | + INFINIPATH_I_ERROR | INFINIPATH_I_SPIOSENT | + INFINIPATH_I_SPIOBUFAVAIL | INFINIPATH_I_GPIO; + infinipath_e_bitsextant = + INFINIPATH_E_RFORMATERR | INFINIPATH_E_RVCRC | + INFINIPATH_E_RICRC | INFINIPATH_E_RMINPKTLEN | + INFINIPATH_E_RMAXPKTLEN | INFINIPATH_E_RLONGPKTLEN | + INFINIPATH_E_RSHORTPKTLEN | INFINIPATH_E_RUNEXPCHAR | + INFINIPATH_E_RUNSUPVL | INFINIPATH_E_REBP | + INFINIPATH_E_RIBFLOW | INFINIPATH_E_RBADVERSION | + INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL | + INFINIPATH_E_RBADTID | INFINIPATH_E_RHDRLEN | + INFINIPATH_E_RHDR | INFINIPATH_E_RIBLOSTLINK | + INFINIPATH_E_SMINPKTLEN | INFINIPATH_E_SMAXPKTLEN | + INFINIPATH_E_SUNDERRUN | INFINIPATH_E_SPKTLEN | + INFINIPATH_E_SDROPPEDSMPPKT | INFINIPATH_E_SDROPPEDDATAPKT | + INFINIPATH_E_SPIOARMLAUNCH | INFINIPATH_E_SUNEXPERRPKTNUM | + INFINIPATH_E_SUNSUPVL | INFINIPATH_E_IBSTATUSCHANGED | + INFINIPATH_E_INVALIDADDR | INFINIPATH_E_RESET | + INFINIPATH_E_HARDWARE; + + infinipath_i_rcvavail_mask = INFINIPATH_I_RCVAVAIL_MASK; + infinipath_i_rcvurg_mask = INFINIPATH_I_RCVURG_MASK; +} + +/* setup the MSI stuff again after a reset. I'd like to just call + * pci_enable_msi() and request_irq() again, but when I do that, + * the MSI enable bit doesn't get set in the command word, and + * we switch to to a different interrupt vector, which is confusing, + * so I instead just do it all inline. Perhaps somehow can tie this + * into the PCIe hotplug support at some point + * Note, because I'm doing it all here, I don't call pci_disable_msi() + * or free_irq() at the start of ipath_setup_pe_reset(). + */ +static int ipath_reinit_msi(struct ipath_devdata *dd) +{ + int pos; + u16 control; + int ret; + + if (!dd->ipath_msi_lo) { + dev_info(&dd->pcidev->dev, "Can't restore MSI config, " + "initial setup failed?\n"); + ret = 0; + goto bail; + } + + if (!(pos = pci_find_capability(dd->pcidev, PCI_CAP_ID_MSI))) { + ipath_dev_err(dd, "Can't find MSI capability, " + "can't restore MSI settings\n"); + ret = 0; + goto bail; + } + ipath_cdbg(VERBOSE, "Writing msi_lo 0x%x to config offset 0x%x\n", + dd->ipath_msi_lo, pos + PCI_MSI_ADDRESS_LO); + pci_write_config_dword(dd->pcidev, pos + PCI_MSI_ADDRESS_LO, + dd->ipath_msi_lo); + ipath_cdbg(VERBOSE, "Writing msi_lo 0x%x to config offset 0x%x\n", + dd->ipath_msi_hi, pos + PCI_MSI_ADDRESS_HI); + pci_write_config_dword(dd->pcidev, pos + PCI_MSI_ADDRESS_HI, + dd->ipath_msi_hi); + pci_read_config_word(dd->pcidev, pos + PCI_MSI_FLAGS, &control); + if (!(control & PCI_MSI_FLAGS_ENABLE)) { + ipath_cdbg(VERBOSE, "MSI control at off %x was %x, " + "setting MSI enable (%x)\n", pos + PCI_MSI_FLAGS, + control, control | PCI_MSI_FLAGS_ENABLE); + control |= PCI_MSI_FLAGS_ENABLE; + pci_write_config_word(dd->pcidev, pos + PCI_MSI_FLAGS, + control); + } + /* now rewrite the data (vector) info */ + pci_write_config_word(dd->pcidev, pos + + ((control & PCI_MSI_FLAGS_64BIT) ? 12 : 8), + dd->ipath_msi_data); + /* we restore the cachelinesize also, although it doesn't really + * matter */ + pci_write_config_byte(dd->pcidev, PCI_CACHE_LINE_SIZE, + dd->ipath_pci_cacheline); + /* and now set the pci master bit again */ + pci_set_master(dd->pcidev); + ret = 1; + +bail: + return ret; +} + +/* This routine sleeps, so it can only be called from user context, not + * from interrupt context. If we need interrupt context, we can split + * it into two routines. +*/ +static int ipath_setup_pe_reset(struct ipath_devdata *dd) +{ + u64 val; + int i; + int ret; + + /* Use ERROR so it shows up in logs, etc. */ + ipath_dev_err(dd, "Resetting PE-800 unit %u\n", + dd->ipath_unit); + val = dd->ipath_control | INFINIPATH_C_RESET; + ipath_write_kreg(dd, dd->ipath_kregs->kr_control, val); + mb(); + + for (i = 1; i <= 5; i++) { + int r; + /* allow MBIST, etc. to complete; longer on each retry. + * We sometimes get machine checks from bus timeout if no + * response, so for now, make it *really* long. + */ + msleep(1000 + (1 + i) * 2000); + if ((r = + pci_write_config_dword(dd->pcidev, PCI_BASE_ADDRESS_0, + dd->ipath_pcibar0))) + ipath_dev_err(dd, "rewrite of BAR0 failed: %d\n", + r); + if ((r = + pci_write_config_dword(dd->pcidev, PCI_BASE_ADDRESS_1, + dd->ipath_pcibar1))) + ipath_dev_err(dd, "rewrite of BAR1 failed: %d\n", + r); + /* now re-enable memory access */ + if ((r = pci_enable_device(dd->pcidev))) + ipath_dev_err(dd, "pci_enable_device failed after " + "reset: %d\n", r); + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_revision); + if (val == dd->ipath_revision) { + ipath_cdbg(VERBOSE, "Got matching revision " + "register %llx on try %d\n", + (unsigned long long) val, i); + ret = ipath_reinit_msi(dd); + goto bail; + } + /* Probably getting -1 back */ + ipath_dbg("Didn't get expected revision register, " + "got %llx, try %d\n", (unsigned long long) val, + i + 1); + } + ret = 0; /* failed */ + +bail: + return ret; +} + +/** + * ipath_pe_put_tid - write a TID in chip + * @dd: the infinipath device + * @tidptr: pointer to the expected TID (in chip) to udpate + * @tidtype: 0 for eager, 1 for expected + * @pa: physical address of in memory buffer; ipath_tidinvalid if freeing + * + * This exists as a separate routine to allow for special locking etc. + * It's used for both the full cleanup on exit, as well as the normal + * setup and teardown. + */ +static void ipath_pe_put_tid(struct ipath_devdata *dd, u64 __iomem *tidptr, + u32 type, unsigned long pa) +{ + u32 __iomem *tidp32 = (u32 __iomem *)tidptr; + unsigned long flags = 0; /* keep gcc quiet */ + + if (pa != dd->ipath_tidinvalid) { + if (pa & ((1U << 11) - 1)) { + dev_info(&dd->pcidev->dev, "BUG: physaddr %lx " + "not 4KB aligned!\n", pa); + return; + } + pa >>= 11; + /* paranoia check */ + if (pa & (7<<29)) + ipath_dev_err(dd, + "BUG: Physical page address 0x%lx " + "has bits set in 31-29\n", pa); + + if (type == 0) + pa |= dd->ipath_tidtemplate; + else /* for now, always full 4KB page */ + pa |= 2 << 29; + } + + /* workaround chip bug 9437 by writing each TID twice + * and holding a spinlock around the writes, so they don't + * intermix with other TID (eager or expected) writes + * Unfortunately, this call can be done from interrupt level + * for the port 0 eager TIDs, so we have to use irqsave + */ + spin_lock_irqsave(&dd->ipath_tid_lock, flags); + ipath_write_kreg(dd, dd->ipath_kregs->kr_scratch, 0xfeeddeaf); + if (dd->ipath_kregbase) + writel(pa, tidp32); + ipath_write_kreg(dd, dd->ipath_kregs->kr_scratch, 0xdeadbeef); + mmiowb(); + spin_unlock_irqrestore(&dd->ipath_tid_lock, flags); +} + +/** + * ipath_pe_clear_tid - clear all TID entries for a port, expected and eager + * @dd: the infinipath device + * @port: the port + * + * clear all TID entries for a port, expected and eager. + * Used from ipath_close(). On PE800, TIDs are only 32 bits, + * not 64, but they are still on 64 bit boundaries, so tidbase + * is declared as u64 * for the pointer math, even though we write 32 bits + */ +static void ipath_pe_clear_tids(struct ipath_devdata *dd, unsigned port) +{ + u64 __iomem *tidbase; + unsigned long tidinv; + int i; + + if (!dd->ipath_kregbase) + return; + + ipath_cdbg(VERBOSE, "Invalidate TIDs for port %u\n", port); + + tidinv = dd->ipath_tidinvalid; + tidbase = (u64 __iomem *) + ((char __iomem *)(dd->ipath_kregbase) + + dd->ipath_rcvtidbase + + port * dd->ipath_rcvtidcnt * sizeof(*tidbase)); + + for (i = 0; i < dd->ipath_rcvtidcnt; i++) + ipath_pe_put_tid(dd, &tidbase[i], 0, tidinv); + + tidbase = (u64 __iomem *) + ((char __iomem *)(dd->ipath_kregbase) + + dd->ipath_rcvegrbase + + port * dd->ipath_rcvegrcnt * sizeof(*tidbase)); + + for (i = 0; i < dd->ipath_rcvegrcnt; i++) + ipath_pe_put_tid(dd, &tidbase[i], 1, tidinv); +} + +/** + * ipath_pe_tidtemplate - setup constants for TID updates + * @dd: the infinipath device + * + * We setup stuff that we use a lot, to avoid calculating each time + */ +static void ipath_pe_tidtemplate(struct ipath_devdata *dd) +{ + u32 egrsize = dd->ipath_rcvegrbufsize; + + /* For now, we always allocate 4KB buffers (at init) so we can + * receive max size packets. We may want a module parameter to + * specify 2KB or 4KB and/or make be per port instead of per device + * for those who want to reduce memory footprint. Note that the + * ipath_rcvhdrentsize size must be large enough to hold the largest + * IB header (currently 96 bytes) that we expect to handle (plus of + * course the 2 dwords of RHF). + */ + if (egrsize == 2048) + dd->ipath_tidtemplate = 1U << 29; + else if (egrsize == 4096) + dd->ipath_tidtemplate = 2U << 29; + else { + egrsize = 4096; + dev_info(&dd->pcidev->dev, "BUG: unsupported egrbufsize " + "%u, using %u\n", dd->ipath_rcvegrbufsize, + egrsize); + dd->ipath_tidtemplate = 2U << 29; + } + dd->ipath_tidinvalid = 0; +} + +static int ipath_pe_early_init(struct ipath_devdata *dd) +{ + dd->ipath_flags |= IPATH_4BYTE_TID; + + /* + * For openib, we need to be able to handle an IB header of 96 bytes + * or 24 dwords. HT-400 has arbitrary sized receive buffers, so we + * made them the same size as the PIO buffers. The PE-800 does not + * handle arbitrary size buffers, so we need the header large enough + * to handle largest IB header, but still have room for a 2KB MTU + * standard IB packet. + */ + dd->ipath_rcvhdrentsize = 24; + dd->ipath_rcvhdrsize = IPATH_DFLT_RCVHDRSIZE; + + /* For HT-400, we allocate a somewhat overly large eager buffer, + * such that we can guarantee that we can receive the largest packet + * that we can send out. To truly support a 4KB MTU, we need to + * bump this to a larger value. We'll do this when I get around to + * testing 4KB sends on the PE-800, which I have not yet done. + */ + dd->ipath_rcvegrbufsize = 2048; + /* + * the min() check here is currently a nop, but it may not always + * be, depending on just how we do ipath_rcvegrbufsize + */ + dd->ipath_ibmaxlen = min(dd->ipath_piosize2k, + dd->ipath_rcvegrbufsize + + (dd->ipath_rcvhdrentsize << 2)); + dd->ipath_init_ibmaxlen = dd->ipath_ibmaxlen; + + /* + * For PE-800, we can request a receive interrupt for 1 or + * more packets from current offset. For now, we set this + * up for a single packet, to match the HT-400 behavior. + */ + dd->ipath_rhdrhead_intr_off = 1ULL<<32; + + return 0; +} + +int __attribute__((weak)) ipath_unordered_wc(void) +{ + return 0; +} + +/** + * ipath_init_pe_get_base_info - set chip-specific flags for user code + * @dd: the infinipath device + * @kbase: ipath_base_info pointer + * + * We set the PCIE flag because the lower bandwidth on PCIe vs + * HyperTransport can affect some user packet algorithims. + */ +static int ipath_pe_get_base_info(struct ipath_portdata *pd, void *kbase) +{ + struct ipath_base_info *kinfo = kbase; + + if (ipath_unordered_wc()) { + kinfo->spi_runtime_flags |= IPATH_RUNTIME_FORCE_WC_ORDER; + ipath_cdbg(PROC, "Intel processor, forcing WC order\n"); + } + else + ipath_cdbg(PROC, "Not Intel processor, WC ordered\n"); + + kinfo->spi_runtime_flags |= IPATH_RUNTIME_PCIE; + + return 0; +} + +/** + * ipath_init_pe800_funcs - set up the chip-specific function pointers + * @dd: the infinipath device + * + * This is global, and is called directly at init to set up the + * chip-specific function pointers for later use. + */ +void ipath_init_pe800_funcs(struct ipath_devdata *dd) +{ + dd->ipath_f_intrsetup = ipath_pe_intconfig; + dd->ipath_f_bus = ipath_setup_pe_config; + dd->ipath_f_reset = ipath_setup_pe_reset; + dd->ipath_f_get_boardname = ipath_pe_boardname; + dd->ipath_f_init_hwerrors = ipath_pe_init_hwerrors; + dd->ipath_f_early_init = ipath_pe_early_init; + dd->ipath_f_handle_hwerrors = ipath_pe_handle_hwerrors; + dd->ipath_f_quiet_serdes = ipath_pe_quiet_serdes; + dd->ipath_f_bringup_serdes = ipath_pe_bringup_serdes; + dd->ipath_f_clear_tids = ipath_pe_clear_tids; + dd->ipath_f_put_tid = ipath_pe_put_tid; + dd->ipath_f_cleanup = ipath_setup_pe_cleanup; + dd->ipath_f_setextled = ipath_setup_pe_setextled; + dd->ipath_f_get_base_info = ipath_pe_get_base_info; + + /* initialize chip-specific variables */ + dd->ipath_f_tidtemplate = ipath_pe_tidtemplate; + + /* + * setup the register offsets, since they are different for each + * chip + */ + dd->ipath_kregs = &ipath_pe_kregs; + dd->ipath_cregs = &ipath_pe_cregs; + + ipath_init_pe_variables(); +} + diff --git a/drivers/infiniband/hw/ipath/ipath_qp.c b/drivers/infiniband/hw/ipath/ipath_qp.c new file mode 100644 index 0000000..1889071 --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_qp.c @@ -0,0 +1,913 @@ +/* + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/err.h> +#include <linux/vmalloc.h> + +#include "ipath_verbs.h" +#include "ips_common.h" + +#define BITS_PER_PAGE (PAGE_SIZE*BITS_PER_BYTE) +#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) +#define mk_qpn(qpt, map, off) (((map) - (qpt)->map) * BITS_PER_PAGE + \ + (off)) +#define find_next_offset(map, off) find_next_zero_bit((map)->page, \ + BITS_PER_PAGE, off) + +#define TRANS_INVALID 0 +#define TRANS_ANY2RST 1 +#define TRANS_RST2INIT 2 +#define TRANS_INIT2INIT 3 +#define TRANS_INIT2RTR 4 +#define TRANS_RTR2RTS 5 +#define TRANS_RTS2RTS 6 +#define TRANS_SQERR2RTS 7 +#define TRANS_ANY2ERR 8 +#define TRANS_RTS2SQD 9 /* XXX Wait for expected ACKs & signal event */ +#define TRANS_SQD2SQD 10 /* error if not drained & parameter change */ +#define TRANS_SQD2RTS 11 /* error if not drained */ + +/* + * Convert the AETH credit code into the number of credits. + */ +static u32 credit_table[31] = { + 0, /* 0 */ + 1, /* 1 */ + 2, /* 2 */ + 3, /* 3 */ + 4, /* 4 */ + 6, /* 5 */ + 8, /* 6 */ + 12, /* 7 */ + 16, /* 8 */ + 24, /* 9 */ + 32, /* A */ + 48, /* B */ + 64, /* C */ + 96, /* D */ + 128, /* E */ + 192, /* F */ + 256, /* 10 */ + 384, /* 11 */ + 512, /* 12 */ + 768, /* 13 */ + 1024, /* 14 */ + 1536, /* 15 */ + 2048, /* 16 */ + 3072, /* 17 */ + 4096, /* 18 */ + 6144, /* 19 */ + 8192, /* 1A */ + 12288, /* 1B */ + 16384, /* 1C */ + 24576, /* 1D */ + 32768 /* 1E */ +}; + +static u32 alloc_qpn(struct ipath_qp_table *qpt) +{ + u32 i, offset, max_scan, qpn; + struct qpn_map *map; + u32 ret; + + qpn = qpt->last + 1; + if (qpn >= QPN_MAX) + qpn = 2; + offset = qpn & BITS_PER_PAGE_MASK; + map = &qpt->map[qpn / BITS_PER_PAGE]; + max_scan = qpt->nmaps - !offset; + for (i = 0;;) { + if (unlikely(!map->page)) { + unsigned long page = get_zeroed_page(GFP_KERNEL); + unsigned long flags; + + /* + * Free the page if someone raced with us + * installing it: + */ + spin_lock_irqsave(&qpt->lock, flags); + if (map->page) + free_page(page); + else + map->page = (void *)page; + spin_unlock_irqrestore(&qpt->lock, flags); + if (unlikely(!map->page)) + break; + } + if (likely(atomic_read(&map->n_free))) { + do { + if (!test_and_set_bit(offset, map->page)) { + atomic_dec(&map->n_free); + qpt->last = qpn; + ret = qpn; + goto bail; + } + offset = find_next_offset(map, offset); + qpn = mk_qpn(qpt, map, offset); + /* + * This test differs from alloc_pidmap(). + * If find_next_offset() does find a zero + * bit, we don't need to check for QPN + * wrapping around past our starting QPN. + * We just need to be sure we don't loop + * forever. + */ + } while (offset < BITS_PER_PAGE && qpn < QPN_MAX); + } + /* + * In order to keep the number of pages allocated to a + * minimum, we scan the all existing pages before increasing + * the size of the bitmap table. + */ + if (++i > max_scan) { + if (qpt->nmaps == QPNMAP_ENTRIES) + break; + map = &qpt->map[qpt->nmaps++]; + offset = 0; + } else if (map < &qpt->map[qpt->nmaps]) { + ++map; + offset = 0; + } else { + map = &qpt->map[0]; + offset = 2; + } + qpn = mk_qpn(qpt, map, offset); + } + + ret = 0; + +bail: + return ret; +} + +static void free_qpn(struct ipath_qp_table *qpt, u32 qpn) +{ + struct qpn_map *map; + + map = qpt->map + qpn / BITS_PER_PAGE; + if (map->page) + clear_bit(qpn & BITS_PER_PAGE_MASK, map->page); + atomic_inc(&map->n_free); +} + +/** + * ipath_alloc_qpn - allocate a QP number + * @qpt: the QP table + * @qp: the QP + * @type: the QP type (IB_QPT_SMI and IB_QPT_GSI are special) + * + * Allocate the next available QPN and put the QP into the hash table. + * The hash table holds a reference to the QP. + */ +static int ipath_alloc_qpn(struct ipath_qp_table *qpt, struct ipath_qp *qp, + enum ib_qp_type type) +{ + unsigned long flags; + u32 qpn; + int ret; + + if (type == IB_QPT_SMI) + qpn = 0; + else if (type == IB_QPT_GSI) + qpn = 1; + else { + /* Allocate the next available QPN */ + qpn = alloc_qpn(qpt); + if (qpn == 0) { + ret = -ENOMEM; + goto bail; + } + } + qp->ibqp.qp_num = qpn; + + /* Add the QP to the hash table. */ + spin_lock_irqsave(&qpt->lock, flags); + + qpn %= qpt->max; + qp->next = qpt->table[qpn]; + qpt->table[qpn] = qp; + atomic_inc(&qp->refcount); + + spin_unlock_irqrestore(&qpt->lock, flags); + ret = 0; + +bail: + return ret; +} + +/** + * ipath_free_qp - remove a QP from the QP table + * @qpt: the QP table + * @qp: the QP to remove + * + * Remove the QP from the table so it can't be found asynchronously by + * the receive interrupt routine. + */ +static void ipath_free_qp(struct ipath_qp_table *qpt, struct ipath_qp *qp) +{ + struct ipath_qp *q, **qpp; + unsigned long flags; + int fnd = 0; + + spin_lock_irqsave(&qpt->lock, flags); + + /* Remove QP from the hash table. */ + qpp = &qpt->table[qp->ibqp.qp_num % qpt->max]; + for (; (q = *qpp) != NULL; qpp = &q->next) { + if (q == qp) { + *qpp = qp->next; + qp->next = NULL; + atomic_dec(&qp->refcount); + fnd = 1; + break; + } + } + + spin_unlock_irqrestore(&qpt->lock, flags); + + if (!fnd) + return; + + /* If QPN is not reserved, mark QPN free in the bitmap. */ + if (qp->ibqp.qp_num > 1) + free_qpn(qpt, qp->ibqp.qp_num); + + wait_event(qp->wait, !atomic_read(&qp->refcount)); +} + +/** + * ipath_free_all_qps - remove all QPs from the table + * @qpt: the QP table to empty + */ +void ipath_free_all_qps(struct ipath_qp_table *qpt) +{ + unsigned long flags; + struct ipath_qp *qp, *nqp; + u32 n; + + for (n = 0; n < qpt->max; n++) { + spin_lock_irqsave(&qpt->lock, flags); + qp = qpt->table[n]; + qpt->table[n] = NULL; + spin_unlock_irqrestore(&qpt->lock, flags); + + while (qp) { + nqp = qp->next; + if (qp->ibqp.qp_num > 1) + free_qpn(qpt, qp->ibqp.qp_num); + if (!atomic_dec_and_test(&qp->refcount) || + !ipath_destroy_qp(&qp->ibqp)) + _VERBS_INFO("QP memory leak!\n"); + qp = nqp; + } + } + + for (n = 0; n < ARRAY_SIZE(qpt->map); n++) { + if (qpt->map[n].page) + free_page((unsigned long)qpt->map[n].page); + } +} + +/** + * ipath_lookup_qpn - return the QP with the given QPN + * @qpt: the QP table + * @qpn: the QP number to look up + * + * The caller is responsible for decrementing the QP reference count + * when done. + */ +struct ipath_qp *ipath_lookup_qpn(struct ipath_qp_table *qpt, u32 qpn) +{ + unsigned long flags; + struct ipath_qp *qp; + + spin_lock_irqsave(&qpt->lock, flags); + + for (qp = qpt->table[qpn % qpt->max]; qp; qp = qp->next) { + if (qp->ibqp.qp_num == qpn) { + atomic_inc(&qp->refcount); + break; + } + } + + spin_unlock_irqrestore(&qpt->lock, flags); + return qp; +} + +/** + * ipath_reset_qp - initialize the QP state to the reset state + * @qp: the QP to reset + */ +static void ipath_reset_qp(struct ipath_qp *qp) +{ + qp->remote_qpn = 0; + qp->qkey = 0; + qp->qp_access_flags = 0; + qp->s_hdrwords = 0; + qp->s_psn = 0; + qp->r_psn = 0; + atomic_set(&qp->msn, 0); + if (qp->ibqp.qp_type == IB_QPT_RC) { + qp->s_state = IB_OPCODE_RC_SEND_LAST; + qp->r_state = IB_OPCODE_RC_SEND_LAST; + } else { + qp->s_state = IB_OPCODE_UC_SEND_LAST; + qp->r_state = IB_OPCODE_UC_SEND_LAST; + } + qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE; + qp->s_nak_state = 0; + qp->s_rnr_timeout = 0; + qp->s_head = 0; + qp->s_tail = 0; + qp->s_cur = 0; + qp->s_last = 0; + qp->s_ssn = 1; + qp->s_lsn = 0; + qp->r_rq.head = 0; + qp->r_rq.tail = 0; + qp->r_reuse_sge = 0; +} + +/** + * ipath_error_qp - put a QP into an error state + * @qp: the QP to put into an error state + * + * Flushes both send and receive work queues. + * QP r_rq.lock and s_lock should be held. + */ + +static void ipath_error_qp(struct ipath_qp *qp) +{ + struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + struct ib_wc wc; + + _VERBS_INFO("QP%d/%d in error state\n", + qp->ibqp.qp_num, qp->remote_qpn); + + spin_lock(&dev->pending_lock); + /* XXX What if its already removed by the timeout code? */ + if (qp->timerwait.next != LIST_POISON1) + list_del(&qp->timerwait); + if (qp->piowait.next != LIST_POISON1) + list_del(&qp->piowait); + spin_unlock(&dev->pending_lock); + + wc.status = IB_WC_WR_FLUSH_ERR; + wc.vendor_err = 0; + wc.byte_len = 0; + wc.imm_data = 0; + wc.qp_num = qp->ibqp.qp_num; + wc.src_qp = 0; + wc.wc_flags = 0; + wc.pkey_index = 0; + wc.slid = 0; + wc.sl = 0; + wc.dlid_path_bits = 0; + wc.port_num = 0; + + while (qp->s_last != qp->s_head) { + struct ipath_swqe *wqe = get_swqe_ptr(qp, qp->s_last); + + wc.wr_id = wqe->wr.wr_id; + wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode]; + if (++qp->s_last >= qp->s_size) + qp->s_last = 0; + ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 1); + } + qp->s_cur = qp->s_tail = qp->s_head; + qp->s_hdrwords = 0; + qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE; + + wc.opcode = IB_WC_RECV; + while (qp->r_rq.tail != qp->r_rq.head) { + wc.wr_id = get_rwqe_ptr(&qp->r_rq, qp->r_rq.tail)->wr_id; + if (++qp->r_rq.tail >= qp->r_rq.size) + qp->r_rq.tail = 0; + ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1); + } +} + +/** + * ipath_modify_qp - modify the attributes of a queue pair + * @ibqp: the queue pair who's attributes we're modifying + * @attr: the new attributes + * @attr_mask: the mask of attributes to modify + * + * Returns 0 on success, otherwise returns an errno. + */ +int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask) +{ + struct ipath_qp *qp = to_iqp(ibqp); + enum ib_qp_state cur_state, new_state; + unsigned long flags; + int ret; + + spin_lock_irqsave(&qp->r_rq.lock, flags); + spin_lock(&qp->s_lock); + + cur_state = attr_mask & IB_QP_CUR_STATE ? + attr->cur_qp_state : qp->state; + new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; + + if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, + attr_mask)) + goto inval; + + switch (new_state) { + case IB_QPS_RESET: + ipath_reset_qp(qp); + break; + + case IB_QPS_ERR: + ipath_error_qp(qp); + break; + + default: + break; + + } + + if (attr_mask & IB_QP_PKEY_INDEX) { + struct ipath_ibdev *dev = to_idev(ibqp->device); + + if (attr->pkey_index >= ipath_layer_get_npkeys(dev->dd)) + goto inval; + qp->s_pkey_index = attr->pkey_index; + } + + if (attr_mask & IB_QP_DEST_QPN) + qp->remote_qpn = attr->dest_qp_num; + + if (attr_mask & IB_QP_SQ_PSN) { + qp->s_next_psn = attr->sq_psn; + qp->s_last_psn = qp->s_next_psn - 1; + } + + if (attr_mask & IB_QP_RQ_PSN) + qp->r_psn = attr->rq_psn; + + if (attr_mask & IB_QP_ACCESS_FLAGS) + qp->qp_access_flags = attr->qp_access_flags; + + if (attr_mask & IB_QP_AV) { + if (attr->ah_attr.dlid == 0 || + attr->ah_attr.dlid >= IPS_MULTICAST_LID_BASE) + goto inval; + qp->remote_ah_attr = attr->ah_attr; + } + + if (attr_mask & IB_QP_PATH_MTU) + qp->path_mtu = attr->path_mtu; + + if (attr_mask & IB_QP_RETRY_CNT) + qp->s_retry = qp->s_retry_cnt = attr->retry_cnt; + + if (attr_mask & IB_QP_RNR_RETRY) { + qp->s_rnr_retry = attr->rnr_retry; + if (qp->s_rnr_retry > 7) + qp->s_rnr_retry = 7; + qp->s_rnr_retry_cnt = qp->s_rnr_retry; + } + + if (attr_mask & IB_QP_MIN_RNR_TIMER) { + if (attr->min_rnr_timer > 31) + goto inval; + qp->s_min_rnr_timer = attr->min_rnr_timer; + } + + if (attr_mask & IB_QP_QKEY) + qp->qkey = attr->qkey; + + if (attr_mask & IB_QP_PKEY_INDEX) + qp->s_pkey_index = attr->pkey_index; + + qp->state = new_state; + spin_unlock(&qp->s_lock); + spin_unlock_irqrestore(&qp->r_rq.lock, flags); + + /* + * If QP1 changed to the RTS state, try to move to the link to INIT + * even if it was ACTIVE so the SM will reinitialize the SMA's + * state. + */ + if (qp->ibqp.qp_num == 1 && new_state == IB_QPS_RTS) { + struct ipath_ibdev *dev = to_idev(ibqp->device); + + ipath_layer_set_linkstate(dev->dd, IPATH_IB_LINKDOWN); + } + ret = 0; + goto bail; + +inval: + spin_unlock(&qp->s_lock); + spin_unlock_irqrestore(&qp->r_rq.lock, flags); + ret = -EINVAL; + +bail: + return ret; +} + +int ipath_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_qp_init_attr *init_attr) +{ + struct ipath_qp *qp = to_iqp(ibqp); + + attr->qp_state = qp->state; + attr->cur_qp_state = attr->qp_state; + attr->path_mtu = qp->path_mtu; + attr->path_mig_state = 0; + attr->qkey = qp->qkey; + attr->rq_psn = qp->r_psn; + attr->sq_psn = qp->s_next_psn; + attr->dest_qp_num = qp->remote_qpn; + attr->qp_access_flags = qp->qp_access_flags; + attr->cap.max_send_wr = qp->s_size - 1; + attr->cap.max_recv_wr = qp->r_rq.size - 1; + attr->cap.max_send_sge = qp->s_max_sge; + attr->cap.max_recv_sge = qp->r_rq.max_sge; + attr->cap.max_inline_data = 0; + attr->ah_attr = qp->remote_ah_attr; + memset(&attr->alt_ah_attr, 0, sizeof(attr->alt_ah_attr)); + attr->pkey_index = qp->s_pkey_index; + attr->alt_pkey_index = 0; + attr->en_sqd_async_notify = 0; + attr->sq_draining = 0; + attr->max_rd_atomic = 1; + attr->max_dest_rd_atomic = 1; + attr->min_rnr_timer = qp->s_min_rnr_timer; + attr->port_num = 1; + attr->timeout = 0; + attr->retry_cnt = qp->s_retry_cnt; + attr->rnr_retry = qp->s_rnr_retry; + attr->alt_port_num = 0; + attr->alt_timeout = 0; + + init_attr->event_handler = qp->ibqp.event_handler; + init_attr->qp_context = qp->ibqp.qp_context; + init_attr->send_cq = qp->ibqp.send_cq; + init_attr->recv_cq = qp->ibqp.recv_cq; + init_attr->srq = qp->ibqp.srq; + init_attr->cap = attr->cap; + init_attr->sq_sig_type = + (qp->s_flags & (1 << IPATH_S_SIGNAL_REQ_WR)) + ? IB_SIGNAL_REQ_WR : 0; + init_attr->qp_type = qp->ibqp.qp_type; + init_attr->port_num = 1; + return 0; +} + +/** + * ipath_compute_aeth - compute the AETH (syndrome + MSN) + * @qp: the queue pair to compute the AETH for + * + * Returns the AETH. + * + * The QP s_lock should be held. + */ +__be32 ipath_compute_aeth(struct ipath_qp *qp) +{ + u32 aeth = atomic_read(&qp->msn) & IPS_MSN_MASK; + + if (qp->s_nak_state) { + aeth |= qp->s_nak_state << IPS_AETH_CREDIT_SHIFT; + } else if (qp->ibqp.srq) { + /* + * Shared receive queues don't generate credits. + * Set the credit field to the invalid value. + */ + aeth |= IPS_AETH_CREDIT_INVAL << IPS_AETH_CREDIT_SHIFT; + } else { + u32 min, max, x; + u32 credits; + + /* + * Compute the number of credits available (RWQEs). + * XXX Not holding the r_rq.lock here so there is a small + * chance that the pair of reads are not atomic. + */ + credits = qp->r_rq.head - qp->r_rq.tail; + if ((int)credits < 0) + credits += qp->r_rq.size; + /* + * Binary search the credit table to find the code to + * use. + */ + min = 0; + max = 31; + for (;;) { + x = (min + max) / 2; + if (credit_table[x] == credits) + break; + if (credit_table[x] > credits) + max = x; + else if (min == x) + break; + else + min = x; + } + aeth |= x << IPS_AETH_CREDIT_SHIFT; + } + return cpu_to_be32(aeth); +} + +/** + * ipath_create_qp - create a queue pair for a device + * @ibpd: the protection domain who's device we create the queue pair for + * @init_attr: the attributes of the queue pair + * @udata: unused by InfiniPath + * + * Returns the queue pair on success, otherwise returns an errno. + * + * Called by the ib_create_qp() core verbs function. + */ +struct ib_qp *ipath_create_qp(struct ib_pd *ibpd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + struct ipath_qp *qp; + int err; + struct ipath_swqe *swq = NULL; + struct ipath_ibdev *dev; + size_t sz; + struct ib_qp *ret; + + if (init_attr->cap.max_send_sge > 255 || + init_attr->cap.max_recv_sge > 255) { + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + switch (init_attr->qp_type) { + case IB_QPT_UC: + case IB_QPT_RC: + sz = sizeof(struct ipath_sge) * + init_attr->cap.max_send_sge + + sizeof(struct ipath_swqe); + swq = vmalloc((init_attr->cap.max_send_wr + 1) * sz); + if (swq == NULL) { + ret = ERR_PTR(-ENOMEM); + goto bail; + } + /* FALLTHROUGH */ + case IB_QPT_UD: + case IB_QPT_SMI: + case IB_QPT_GSI: + qp = kmalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) { + ret = ERR_PTR(-ENOMEM); + goto bail; + } + qp->r_rq.size = init_attr->cap.max_recv_wr + 1; + sz = sizeof(struct ipath_sge) * + init_attr->cap.max_recv_sge + + sizeof(struct ipath_rwqe); + qp->r_rq.wq = vmalloc(qp->r_rq.size * sz); + if (!qp->r_rq.wq) { + kfree(qp); + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + /* + * ib_create_qp() will initialize qp->ibqp + * except for qp->ibqp.qp_num. + */ + spin_lock_init(&qp->s_lock); + spin_lock_init(&qp->r_rq.lock); + atomic_set(&qp->refcount, 0); + init_waitqueue_head(&qp->wait); + tasklet_init(&qp->s_task, + init_attr->qp_type == IB_QPT_RC ? + ipath_do_rc_send : ipath_do_uc_send, + (unsigned long)qp); + qp->piowait.next = LIST_POISON1; + qp->piowait.prev = LIST_POISON2; + qp->timerwait.next = LIST_POISON1; + qp->timerwait.prev = LIST_POISON2; + qp->state = IB_QPS_RESET; + qp->s_wq = swq; + qp->s_size = init_attr->cap.max_send_wr + 1; + qp->s_max_sge = init_attr->cap.max_send_sge; + qp->r_rq.max_sge = init_attr->cap.max_recv_sge; + qp->s_flags = init_attr->sq_sig_type == IB_SIGNAL_REQ_WR ? + 1 << IPATH_S_SIGNAL_REQ_WR : 0; + dev = to_idev(ibpd->device); + err = ipath_alloc_qpn(&dev->qp_table, qp, + init_attr->qp_type); + if (err) { + vfree(swq); + vfree(qp->r_rq.wq); + kfree(qp); + ret = ERR_PTR(err); + goto bail; + } + ipath_reset_qp(qp); + + /* Tell the core driver that the kernel SMA is present. */ + if (qp->ibqp.qp_type == IB_QPT_SMI) + ipath_layer_set_verbs_flags(dev->dd, + IPATH_VERBS_KERNEL_SMA); + break; + + default: + /* Don't support raw QPs */ + ret = ERR_PTR(-ENOSYS); + goto bail; + } + + init_attr->cap.max_inline_data = 0; + + ret = &qp->ibqp; + +bail: + return ret; +} + +/** + * ipath_destroy_qp - destroy a queue pair + * @ibqp: the queue pair to destroy + * + * Returns 0 on success. + * + * Note that this can be called while the QP is actively sending or + * receiving! + */ +int ipath_destroy_qp(struct ib_qp *ibqp) +{ + struct ipath_qp *qp = to_iqp(ibqp); + struct ipath_ibdev *dev = to_idev(ibqp->device); + unsigned long flags; + + /* Tell the core driver that the kernel SMA is gone. */ + if (qp->ibqp.qp_type == IB_QPT_SMI) + ipath_layer_set_verbs_flags(dev->dd, 0); + + spin_lock_irqsave(&qp->r_rq.lock, flags); + spin_lock(&qp->s_lock); + qp->state = IB_QPS_ERR; + spin_unlock(&qp->s_lock); + spin_unlock_irqrestore(&qp->r_rq.lock, flags); + + /* Stop the sending tasklet. */ + tasklet_kill(&qp->s_task); + + /* Make sure the QP isn't on the timeout list. */ + spin_lock_irqsave(&dev->pending_lock, flags); + if (qp->timerwait.next != LIST_POISON1) + list_del(&qp->timerwait); + if (qp->piowait.next != LIST_POISON1) + list_del(&qp->piowait); + spin_unlock_irqrestore(&dev->pending_lock, flags); + + /* + * Make sure that the QP is not in the QPN table so receive + * interrupts will discard packets for this QP. XXX Also remove QP + * from multicast table. + */ + if (atomic_read(&qp->refcount) != 0) + ipath_free_qp(&dev->qp_table, qp); + + vfree(qp->s_wq); + vfree(qp->r_rq.wq); + kfree(qp); + return 0; +} + +/** + * ipath_init_qp_table - initialize the QP table for a device + * @idev: the device who's QP table we're initializing + * @size: the size of the QP table + * + * Returns 0 on success, otherwise returns an errno. + */ +int ipath_init_qp_table(struct ipath_ibdev *idev, int size) +{ + int i; + int ret; + + idev->qp_table.last = 1; /* QPN 0 and 1 are special. */ + idev->qp_table.max = size; + idev->qp_table.nmaps = 1; + idev->qp_table.table = kzalloc(size * sizeof(*idev->qp_table.table), + GFP_KERNEL); + if (idev->qp_table.table == NULL) { + ret = -ENOMEM; + goto bail; + } + + for (i = 0; i < ARRAY_SIZE(idev->qp_table.map); i++) { + atomic_set(&idev->qp_table.map[i].n_free, BITS_PER_PAGE); + idev->qp_table.map[i].page = NULL; + } + + ret = 0; + +bail: + return ret; +} + +/** + * ipath_sqerror_qp - put a QP's send queue into an error state + * @qp: QP who's send queue will be put into an error state + * @wc: the WC responsible for putting the QP in this state + * + * Flushes the send work queue. + * The QP s_lock should be held. + */ + +void ipath_sqerror_qp(struct ipath_qp *qp, struct ib_wc *wc) +{ + struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + struct ipath_swqe *wqe = get_swqe_ptr(qp, qp->s_last); + + _VERBS_INFO("Send queue error on QP%d/%d: err: %d\n", + qp->ibqp.qp_num, qp->remote_qpn, wc->status); + + spin_lock(&dev->pending_lock); + /* XXX What if its already removed by the timeout code? */ + if (qp->timerwait.next != LIST_POISON1) + list_del(&qp->timerwait); + if (qp->piowait.next != LIST_POISON1) + list_del(&qp->piowait); + spin_unlock(&dev->pending_lock); + + ipath_cq_enter(to_icq(qp->ibqp.send_cq), wc, 1); + if (++qp->s_last >= qp->s_size) + qp->s_last = 0; + + wc->status = IB_WC_WR_FLUSH_ERR; + + while (qp->s_last != qp->s_head) { + wc->wr_id = wqe->wr.wr_id; + wc->opcode = ib_ipath_wc_opcode[wqe->wr.opcode]; + ipath_cq_enter(to_icq(qp->ibqp.send_cq), wc, 1); + if (++qp->s_last >= qp->s_size) + qp->s_last = 0; + wqe = get_swqe_ptr(qp, qp->s_last); + } + qp->s_cur = qp->s_tail = qp->s_head; + qp->state = IB_QPS_SQE; +} + +/** + * ipath_get_credit - flush the send work queue of a QP + * @qp: the qp who's send work queue to flush + * @aeth: the Acknowledge Extended Transport Header + * + * The QP s_lock should be held. + */ +void ipath_get_credit(struct ipath_qp *qp, u32 aeth) +{ + u32 credit = (aeth >> IPS_AETH_CREDIT_SHIFT) & IPS_AETH_CREDIT_MASK; + + /* + * If the credit is invalid, we can send + * as many packets as we like. Otherwise, we have to + * honor the credit field. + */ + if (credit == IPS_AETH_CREDIT_INVAL) { + qp->s_lsn = (u32) -1; + } else if (qp->s_lsn != (u32) -1) { + /* Compute new LSN (i.e., MSN + credit) */ + credit = (aeth + credit_table[credit]) & IPS_MSN_MASK; + if (ipath_cmp24(credit, qp->s_lsn) > 0) + qp->s_lsn = credit; + } + + /* Restart sending if it was blocked due to lack of credits. */ + if (qp->s_cur != qp->s_head && + (qp->s_lsn == (u32) -1 || + ipath_cmp24(get_swqe_ptr(qp, qp->s_cur)->ssn, + qp->s_lsn + 1) <= 0)) + tasklet_hi_schedule(&qp->s_task); +} diff --git a/drivers/infiniband/hw/ipath/ipath_rc.c b/drivers/infiniband/hw/ipath/ipath_rc.c new file mode 100644 index 0000000..a4055ca --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_rc.c @@ -0,0 +1,1857 @@ +/* + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ipath_verbs.h" +#include "ips_common.h" + +/* cut down ridiculously long IB macro names */ +#define OP(x) IB_OPCODE_RC_##x + +/** + * ipath_init_restart- initialize the qp->s_sge after a restart + * @qp: the QP who's SGE we're restarting + * @wqe: the work queue to initialize the QP's SGE from + * + * The QP s_lock should be held. + */ +static void ipath_init_restart(struct ipath_qp *qp, struct ipath_swqe *wqe) +{ + struct ipath_ibdev *dev; + u32 len; + + len = ((qp->s_psn - wqe->psn) & IPS_PSN_MASK) * + ib_mtu_enum_to_int(qp->path_mtu); + qp->s_sge.sge = wqe->sg_list[0]; + qp->s_sge.sg_list = wqe->sg_list + 1; + qp->s_sge.num_sge = wqe->wr.num_sge; + ipath_skip_sge(&qp->s_sge, len); + qp->s_len = wqe->length - len; + dev = to_idev(qp->ibqp.device); + spin_lock(&dev->pending_lock); + if (qp->timerwait.next == LIST_POISON1) + list_add_tail(&qp->timerwait, + &dev->pending[dev->pending_index]); + spin_unlock(&dev->pending_lock); +} + +/** + * ipath_make_rc_ack - construct a response packet (ACK, NAK, or RDMA read) + * @qp: a pointer to the QP + * @ohdr: a pointer to the IB header being constructed + * @pmtu: the path MTU + * + * Return bth0 if constructed; otherwise, return 0. + * Note the QP s_lock must be held. + */ +static inline u32 ipath_make_rc_ack(struct ipath_qp *qp, + struct ipath_other_headers *ohdr, + u32 pmtu) +{ + struct ipath_sge_state *ss; + u32 hwords; + u32 len; + u32 bth0; + + /* header size in 32-bit words LRH+BTH = (8+12)/4. */ + hwords = 5; + + /* + * Send a response. Note that we are in the responder's + * side of the QP context. + */ + switch (qp->s_ack_state) { + case OP(RDMA_READ_REQUEST): + ss = &qp->s_rdma_sge; + len = qp->s_rdma_len; + if (len > pmtu) { + len = pmtu; + qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST); + } + else + qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY); + qp->s_rdma_len -= len; + bth0 = qp->s_ack_state << 24; + ohdr->u.aeth = ipath_compute_aeth(qp); + hwords++; + break; + + case OP(RDMA_READ_RESPONSE_FIRST): + qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE); + /* FALLTHROUGH */ + case OP(RDMA_READ_RESPONSE_MIDDLE): + ss = &qp->s_rdma_sge; + len = qp->s_rdma_len; + if (len > pmtu) + len = pmtu; + else { + ohdr->u.aeth = ipath_compute_aeth(qp); + hwords++; + qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST); + } + qp->s_rdma_len -= len; + bth0 = qp->s_ack_state << 24; + break; + + case OP(RDMA_READ_RESPONSE_LAST): + case OP(RDMA_READ_RESPONSE_ONLY): + /* + * We have to prevent new requests from changing + * the r_sge state while a ipath_verbs_send() + * is in progress. + * Changing r_state allows the receiver + * to continue processing new packets. + * We do it here now instead of above so + * that we are sure the packet was sent before + * changing the state. + */ + qp->r_state = OP(RDMA_READ_RESPONSE_LAST); + qp->s_ack_state = OP(ACKNOWLEDGE); + return 0; + + case OP(COMPARE_SWAP): + case OP(FETCH_ADD): + ss = NULL; + len = 0; + qp->r_state = OP(SEND_LAST); + qp->s_ack_state = OP(ACKNOWLEDGE); + bth0 = IB_OPCODE_ATOMIC_ACKNOWLEDGE << 24; + ohdr->u.at.aeth = ipath_compute_aeth(qp); + ohdr->u.at.atomic_ack_eth = cpu_to_be64(qp->s_ack_atomic); + hwords += sizeof(ohdr->u.at) / 4; + break; + + default: + /* Send a regular ACK. */ + ss = NULL; + len = 0; + qp->s_ack_state = OP(ACKNOWLEDGE); + bth0 = qp->s_ack_state << 24; + ohdr->u.aeth = ipath_compute_aeth(qp); + hwords++; + } + qp->s_hdrwords = hwords; + qp->s_cur_sge = ss; + qp->s_cur_size = len; + + return bth0; +} + +/** + * ipath_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC) + * @qp: a pointer to the QP + * @ohdr: a pointer to the IB header being constructed + * @pmtu: the path MTU + * @bth0p: pointer to the BTH opcode word + * @bth2p: pointer to the BTH PSN word + * + * Return 1 if constructed; otherwise, return 0. + * Note the QP s_lock must be held. + */ +static inline int ipath_make_rc_req(struct ipath_qp *qp, + struct ipath_other_headers *ohdr, + u32 pmtu, u32 *bth0p, u32 *bth2p) +{ + struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + struct ipath_sge_state *ss; + struct ipath_swqe *wqe; + u32 hwords; + u32 len; + u32 bth0; + u32 bth2; + char newreq; + + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) || + qp->s_rnr_timeout) + goto done; + + /* header size in 32-bit words LRH+BTH = (8+12)/4. */ + hwords = 5; + bth0 = 0; + + /* Send a request. */ + wqe = get_swqe_ptr(qp, qp->s_cur); + switch (qp->s_state) { + default: + /* + * Resend an old request or start a new one. + * + * We keep track of the current SWQE so that + * we don't reset the "furthest progress" state + * if we need to back up. + */ + newreq = 0; + if (qp->s_cur == qp->s_tail) { + /* Check if send work queue is empty. */ + if (qp->s_tail == qp->s_head) + goto done; + qp->s_psn = wqe->psn = qp->s_next_psn; + newreq = 1; + } + /* + * Note that we have to be careful not to modify the + * original work request since we may need to resend + * it. + */ + qp->s_sge.sge = wqe->sg_list[0]; + qp->s_sge.sg_list = wqe->sg_list + 1; + qp->s_sge.num_sge = wqe->wr.num_sge; + qp->s_len = len = wqe->length; + ss = &qp->s_sge; + bth2 = 0; + switch (wqe->wr.opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + /* If no credit, return. */ + if (qp->s_lsn != (u32) -1 && + ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) + goto done; + wqe->lpsn = wqe->psn; + if (len > pmtu) { + wqe->lpsn += (len - 1) / pmtu; + qp->s_state = OP(SEND_FIRST); + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_SEND) + qp->s_state = OP(SEND_ONLY); + else { + qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.imm_data; + hwords += 1; + } + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= 1 << 23; + bth2 = 1 << 31; /* Request ACK. */ + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + + case IB_WR_RDMA_WRITE: + if (newreq) + qp->s_lsn++; + /* FALLTHROUGH */ + case IB_WR_RDMA_WRITE_WITH_IMM: + /* If no credit, return. */ + if (qp->s_lsn != (u32) -1 && + ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) + goto done; + ohdr->u.rc.reth.vaddr = + cpu_to_be64(wqe->wr.wr.rdma.remote_addr); + ohdr->u.rc.reth.rkey = + cpu_to_be32(wqe->wr.wr.rdma.rkey); + ohdr->u.rc.reth.length = cpu_to_be32(len); + hwords += sizeof(struct ib_reth) / 4; + wqe->lpsn = wqe->psn; + if (len > pmtu) { + wqe->lpsn += (len - 1) / pmtu; + qp->s_state = OP(RDMA_WRITE_FIRST); + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_RDMA_WRITE) + qp->s_state = OP(RDMA_WRITE_ONLY); + else { + qp->s_state = + OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE); + /* Immediate data comes + * after RETH */ + ohdr->u.rc.imm_data = wqe->wr.imm_data; + hwords += 1; + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= 1 << 23; + } + bth2 = 1 << 31; /* Request ACK. */ + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + + case IB_WR_RDMA_READ: + ohdr->u.rc.reth.vaddr = + cpu_to_be64(wqe->wr.wr.rdma.remote_addr); + ohdr->u.rc.reth.rkey = + cpu_to_be32(wqe->wr.wr.rdma.rkey); + ohdr->u.rc.reth.length = cpu_to_be32(len); + qp->s_state = OP(RDMA_READ_REQUEST); + hwords += sizeof(ohdr->u.rc.reth) / 4; + if (newreq) { + qp->s_lsn++; + /* + * Adjust s_next_psn to count the + * expected number of responses. + */ + if (len > pmtu) + qp->s_next_psn += (len - 1) / pmtu; + wqe->lpsn = qp->s_next_psn++; + } + ss = NULL; + len = 0; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) + qp->s_state = OP(COMPARE_SWAP); + else + qp->s_state = OP(FETCH_ADD); + ohdr->u.atomic_eth.vaddr = cpu_to_be64( + wqe->wr.wr.atomic.remote_addr); + ohdr->u.atomic_eth.rkey = cpu_to_be32( + wqe->wr.wr.atomic.rkey); + ohdr->u.atomic_eth.swap_data = cpu_to_be64( + wqe->wr.wr.atomic.swap); + ohdr->u.atomic_eth.compare_data = cpu_to_be64( + wqe->wr.wr.atomic.compare_add); + hwords += sizeof(struct ib_atomic_eth) / 4; + if (newreq) { + qp->s_lsn++; + wqe->lpsn = wqe->psn; + } + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + ss = NULL; + len = 0; + break; + + default: + goto done; + } + if (newreq) { + qp->s_tail++; + if (qp->s_tail >= qp->s_size) + qp->s_tail = 0; + } + bth2 |= qp->s_psn++ & IPS_PSN_MASK; + if ((int)(qp->s_psn - qp->s_next_psn) > 0) + qp->s_next_psn = qp->s_psn; + spin_lock(&dev->pending_lock); + if (qp->timerwait.next == LIST_POISON1) + list_add_tail(&qp->timerwait, + &dev->pending[dev->pending_index]); + spin_unlock(&dev->pending_lock); + break; + + case OP(RDMA_READ_RESPONSE_FIRST): + /* + * This case can only happen if a send is restarted. See + * ipath_restart_rc(). + */ + ipath_init_restart(qp, wqe); + /* FALLTHROUGH */ + case OP(SEND_FIRST): + qp->s_state = OP(SEND_MIDDLE); + /* FALLTHROUGH */ + case OP(SEND_MIDDLE): + bth2 = qp->s_psn++ & IPS_PSN_MASK; + if ((int)(qp->s_psn - qp->s_next_psn) > 0) + qp->s_next_psn = qp->s_psn; + ss = &qp->s_sge; + len = qp->s_len; + if (len > pmtu) { + /* + * Request an ACK every 1/2 MB to avoid retransmit + * timeouts. + */ + if (((wqe->length - len) % (512 * 1024)) == 0) + bth2 |= 1 << 31; + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_SEND) + qp->s_state = OP(SEND_LAST); + else { + qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.imm_data; + hwords += 1; + } + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= 1 << 23; + bth2 |= 1 << 31; /* Request ACK. */ + qp->s_cur++; + if (qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + case OP(RDMA_READ_RESPONSE_LAST): + /* + * This case can only happen if a RDMA write is restarted. + * See ipath_restart_rc(). + */ + ipath_init_restart(qp, wqe); + /* FALLTHROUGH */ + case OP(RDMA_WRITE_FIRST): + qp->s_state = OP(RDMA_WRITE_MIDDLE); + /* FALLTHROUGH */ + case OP(RDMA_WRITE_MIDDLE): + bth2 = qp->s_psn++ & IPS_PSN_MASK; + if ((int)(qp->s_psn - qp->s_next_psn) > 0) + qp->s_next_psn = qp->s_psn; + ss = &qp->s_sge; + len = qp->s_len; + if (len > pmtu) { + /* + * Request an ACK every 1/2 MB to avoid retransmit + * timeouts. + */ + if (((wqe->length - len) % (512 * 1024)) == 0) + bth2 |= 1 << 31; + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_RDMA_WRITE) + qp->s_state = OP(RDMA_WRITE_LAST); + else { + qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.imm_data; + hwords += 1; + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= 1 << 23; + } + bth2 |= 1 << 31; /* Request ACK. */ + qp->s_cur++; + if (qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + case OP(RDMA_READ_RESPONSE_MIDDLE): + /* + * This case can only happen if a RDMA read is restarted. + * See ipath_restart_rc(). + */ + ipath_init_restart(qp, wqe); + len = ((qp->s_psn - wqe->psn) & IPS_PSN_MASK) * pmtu; + ohdr->u.rc.reth.vaddr = + cpu_to_be64(wqe->wr.wr.rdma.remote_addr + len); + ohdr->u.rc.reth.rkey = + cpu_to_be32(wqe->wr.wr.rdma.rkey); + ohdr->u.rc.reth.length = cpu_to_be32(qp->s_len); + qp->s_state = OP(RDMA_READ_REQUEST); + hwords += sizeof(ohdr->u.rc.reth) / 4; + bth2 = qp->s_psn++ & IPS_PSN_MASK; + if ((int)(qp->s_psn - qp->s_next_psn) > 0) + qp->s_next_psn = qp->s_psn; + ss = NULL; + len = 0; + qp->s_cur++; + if (qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + + case OP(RDMA_READ_REQUEST): + case OP(COMPARE_SWAP): + case OP(FETCH_ADD): + /* + * We shouldn't start anything new until this request is + * finished. The ACK will handle rescheduling us. XXX The + * number of outstanding ones is negotiated at connection + * setup time (see pg. 258,289)? XXX Also, if we support + * multiple outstanding requests, we need to check the WQE + * IB_SEND_FENCE flag and not send a new request if a RDMA + * read or atomic is pending. + */ + goto done; + } + qp->s_len -= len; + qp->s_hdrwords = hwords; + qp->s_cur_sge = ss; + qp->s_cur_size = len; + *bth0p = bth0 | (qp->s_state << 24); + *bth2p = bth2; + return 1; + +done: + return 0; +} + +static inline void ipath_make_rc_grh(struct ipath_qp *qp, + struct ib_global_route *grh, + u32 nwords) +{ + struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + + /* GRH header size in 32-bit words. */ + qp->s_hdrwords += 10; + qp->s_hdr.u.l.grh.version_tclass_flow = + cpu_to_be32((6 << 28) | + (grh->traffic_class << 20) | + grh->flow_label); + qp->s_hdr.u.l.grh.paylen = + cpu_to_be16(((qp->s_hdrwords - 12) + nwords + + SIZE_OF_CRC) << 2); + /* next_hdr is defined by C8-7 in ch. 8.4.1 */ + qp->s_hdr.u.l.grh.next_hdr = 0x1B; + qp->s_hdr.u.l.grh.hop_limit = grh->hop_limit; + /* The SGID is 32-bit aligned. */ + qp->s_hdr.u.l.grh.sgid.global.subnet_prefix = dev->gid_prefix; + qp->s_hdr.u.l.grh.sgid.global.interface_id = + ipath_layer_get_guid(dev->dd); + qp->s_hdr.u.l.grh.dgid = grh->dgid; +} + +/** + * ipath_do_rc_send - perform a send on an RC QP + * @data: contains a pointer to the QP + * + * Process entries in the send work queue until credit or queue is + * exhausted. Only allow one CPU to send a packet per QP (tasklet). + * Otherwise, after we drop the QP s_lock, two threads could send + * packets out of order. + */ +void ipath_do_rc_send(unsigned long data) +{ + struct ipath_qp *qp = (struct ipath_qp *)data; + struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + unsigned long flags; + u16 lrh0; + u32 nwords; + u32 extra_bytes; + u32 bth0; + u32 bth2; + u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu); + struct ipath_other_headers *ohdr; + + if (test_and_set_bit(IPATH_S_BUSY, &qp->s_flags)) + goto bail; + + if (unlikely(qp->remote_ah_attr.dlid == + ipath_layer_get_lid(dev->dd))) { + struct ib_wc wc; + + /* + * Pass in an uninitialized ib_wc to be consistent with + * other places where ipath_ruc_loopback() is called. + */ + ipath_ruc_loopback(qp, &wc); + goto clear; + } + + ohdr = &qp->s_hdr.u.oth; + if (qp->remote_ah_attr.ah_flags & IB_AH_GRH) + ohdr = &qp->s_hdr.u.l.oth; + +again: + /* Check for a constructed packet to be sent. */ + if (qp->s_hdrwords != 0) { + /* + * If no PIO bufs are available, return. An interrupt will + * call ipath_ib_piobufavail() when one is available. + */ + _VERBS_INFO("h %u %p\n", qp->s_hdrwords, &qp->s_hdr); + _VERBS_INFO("d %u %p %u %p %u %u %u %u\n", qp->s_cur_size, + qp->s_cur_sge->sg_list, + qp->s_cur_sge->num_sge, + qp->s_cur_sge->sge.vaddr, + qp->s_cur_sge->sge.sge_length, + qp->s_cur_sge->sge.length, + qp->s_cur_sge->sge.m, + qp->s_cur_sge->sge.n); + if (ipath_verbs_send(dev->dd, qp->s_hdrwords, + (u32 *) &qp->s_hdr, qp->s_cur_size, + qp->s_cur_sge)) { + ipath_no_bufs_available(qp, dev); + goto bail; + } + dev->n_unicast_xmit++; + /* Record that we sent the packet and s_hdr is empty. */ + qp->s_hdrwords = 0; + } + + /* + * The lock is needed to synchronize between setting + * qp->s_ack_state, resend timer, and post_send(). + */ + spin_lock_irqsave(&qp->s_lock, flags); + + /* Sending responses has higher priority over sending requests. */ + if (qp->s_ack_state != OP(ACKNOWLEDGE) && + (bth0 = ipath_make_rc_ack(qp, ohdr, pmtu)) != 0) + bth2 = qp->s_ack_psn++ & IPS_PSN_MASK; + else if (!ipath_make_rc_req(qp, ohdr, pmtu, &bth0, &bth2)) + goto done; + + spin_unlock_irqrestore(&qp->s_lock, flags); + + /* Construct the header. */ + extra_bytes = (4 - qp->s_cur_size) & 3; + nwords = (qp->s_cur_size + extra_bytes) >> 2; + lrh0 = IPS_LRH_BTH; + if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) { + ipath_make_rc_grh(qp, &qp->remote_ah_attr.grh, nwords); + lrh0 = IPS_LRH_GRH; + } + lrh0 |= qp->remote_ah_attr.sl << 4; + qp->s_hdr.lrh[0] = cpu_to_be16(lrh0); + qp->s_hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid); + qp->s_hdr.lrh[2] = cpu_to_be16(qp->s_hdrwords + nwords + + SIZE_OF_CRC); + qp->s_hdr.lrh[3] = cpu_to_be16(ipath_layer_get_lid(dev->dd)); + bth0 |= ipath_layer_get_pkey(dev->dd, qp->s_pkey_index); + bth0 |= extra_bytes << 20; + ohdr->bth[0] = cpu_to_be32(bth0); + ohdr->bth[1] = cpu_to_be32(qp->remote_qpn); + ohdr->bth[2] = cpu_to_be32(bth2); + + /* Check for more work to do. */ + goto again; + +done: + spin_unlock_irqrestore(&qp->s_lock, flags); +clear: + clear_bit(IPATH_S_BUSY, &qp->s_flags); +bail: + return; +} + +static void send_rc_ack(struct ipath_qp *qp) +{ + struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + u16 lrh0; + u32 bth0; + struct ipath_other_headers *ohdr; + + /* Construct the header. */ + ohdr = &qp->s_hdr.u.oth; + lrh0 = IPS_LRH_BTH; + /* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4. */ + qp->s_hdrwords = 6; + if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) { + ipath_make_rc_grh(qp, &qp->remote_ah_attr.grh, 0); + ohdr = &qp->s_hdr.u.l.oth; + lrh0 = IPS_LRH_GRH; + } + bth0 = ipath_layer_get_pkey(dev->dd, qp->s_pkey_index); + ohdr->u.aeth = ipath_compute_aeth(qp); + if (qp->s_ack_state >= OP(COMPARE_SWAP)) { + bth0 |= IB_OPCODE_ATOMIC_ACKNOWLEDGE << 24; + ohdr->u.at.atomic_ack_eth = cpu_to_be64(qp->s_ack_atomic); + qp->s_hdrwords += sizeof(ohdr->u.at.atomic_ack_eth) / 4; + } + else + bth0 |= OP(ACKNOWLEDGE) << 24; + lrh0 |= qp->remote_ah_attr.sl << 4; + qp->s_hdr.lrh[0] = cpu_to_be16(lrh0); + qp->s_hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid); + qp->s_hdr.lrh[2] = cpu_to_be16(qp->s_hdrwords + SIZE_OF_CRC); + qp->s_hdr.lrh[3] = cpu_to_be16(ipath_layer_get_lid(dev->dd)); + ohdr->bth[0] = cpu_to_be32(bth0); + ohdr->bth[1] = cpu_to_be32(qp->remote_qpn); + ohdr->bth[2] = cpu_to_be32(qp->s_ack_psn & IPS_PSN_MASK); + + /* + * If we can send the ACK, clear the ACK state. + */ + if (ipath_verbs_send(dev->dd, qp->s_hdrwords, (u32 *) &qp->s_hdr, + 0, NULL) == 0) { + qp->s_ack_state = OP(ACKNOWLEDGE); + dev->n_rc_qacks++; + dev->n_unicast_xmit++; + } +} + +/** + * ipath_restart_rc - back up requester to resend the last un-ACKed request + * @qp: the QP to restart + * @psn: packet sequence number for the request + * @wc: the work completion request + * + * The QP s_lock should be held. + */ +void ipath_restart_rc(struct ipath_qp *qp, u32 psn, struct ib_wc *wc) +{ + struct ipath_swqe *wqe = get_swqe_ptr(qp, qp->s_last); + struct ipath_ibdev *dev; + u32 n; + + /* + * If there are no requests pending, we are done. + */ + if (ipath_cmp24(psn, qp->s_next_psn) >= 0 || + qp->s_last == qp->s_tail) + goto done; + + if (qp->s_retry == 0) { + wc->wr_id = wqe->wr.wr_id; + wc->status = IB_WC_RETRY_EXC_ERR; + wc->opcode = ib_ipath_wc_opcode[wqe->wr.opcode]; + wc->vendor_err = 0; + wc->byte_len = 0; + wc->qp_num = qp->ibqp.qp_num; + wc->src_qp = qp->remote_qpn; + wc->pkey_index = 0; + wc->slid = qp->remote_ah_attr.dlid; + wc->sl = qp->remote_ah_attr.sl; + wc->dlid_path_bits = 0; + wc->port_num = 0; + ipath_sqerror_qp(qp, wc); + goto bail; + } + qp->s_retry--; + + /* + * Remove the QP from the timeout queue. + * Note: it may already have been removed by ipath_ib_timer(). + */ + dev = to_idev(qp->ibqp.device); + spin_lock(&dev->pending_lock); + if (qp->timerwait.next != LIST_POISON1) + list_del(&qp->timerwait); + spin_unlock(&dev->pending_lock); + + if (wqe->wr.opcode == IB_WR_RDMA_READ) + dev->n_rc_resends++; + else + dev->n_rc_resends += (int)qp->s_psn - (int)psn; + + /* + * If we are starting the request from the beginning, let the normal + * send code handle initialization. + */ + qp->s_cur = qp->s_last; + if (ipath_cmp24(psn, wqe->psn) <= 0) { + qp->s_state = OP(SEND_LAST); + qp->s_psn = wqe->psn; + } else { + n = qp->s_cur; + for (;;) { + if (++n == qp->s_size) + n = 0; + if (n == qp->s_tail) { + if (ipath_cmp24(psn, qp->s_next_psn) >= 0) { + qp->s_cur = n; + wqe = get_swqe_ptr(qp, n); + } + break; + } + wqe = get_swqe_ptr(qp, n); + if (ipath_cmp24(psn, wqe->psn) < 0) + break; + qp->s_cur = n; + } + qp->s_psn = psn; + + /* + * Reset the state to restart in the middle of a request. + * Don't change the s_sge, s_cur_sge, or s_cur_size. + * See ipath_do_rc_send(). + */ + switch (wqe->wr.opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + qp->s_state = OP(RDMA_READ_RESPONSE_FIRST); + break; + + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + qp->s_state = OP(RDMA_READ_RESPONSE_LAST); + break; + + case IB_WR_RDMA_READ: + qp->s_state = + OP(RDMA_READ_RESPONSE_MIDDLE); + break; + + default: + /* + * This case shouldn't happen since its only + * one PSN per req. + */ + qp->s_state = OP(SEND_LAST); + } + } + +done: + tasklet_hi_schedule(&qp->s_task); + +bail: + return; +} + +/** + * reset_psn - reset the QP state to send starting from PSN + * @qp: the QP + * @psn: the packet sequence number to restart at + * + * This is called from ipath_rc_rcv() to process an incoming RC ACK + * for the given QP. + * Called at interrupt level with the QP s_lock held. + */ +static void reset_psn(struct ipath_qp *qp, u32 psn) +{ + struct ipath_swqe *wqe; + u32 n; + + n = qp->s_cur; + wqe = get_swqe_ptr(qp, n); + for (;;) { + if (++n == qp->s_size) + n = 0; + if (n == qp->s_tail) { + if (ipath_cmp24(psn, qp->s_next_psn) >= 0) { + qp->s_cur = n; + wqe = get_swqe_ptr(qp, n); + } + break; + } + wqe = get_swqe_ptr(qp, n); + if (ipath_cmp24(psn, wqe->psn) < 0) + break; + qp->s_cur = n; + } + qp->s_psn = psn; + + /* + * Set the state to restart in the middle of a + * request. Don't change the s_sge, s_cur_sge, or + * s_cur_size. See ipath_do_rc_send(). + */ + switch (wqe->wr.opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + qp->s_state = OP(RDMA_READ_RESPONSE_FIRST); + break; + + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + qp->s_state = OP(RDMA_READ_RESPONSE_LAST); + break; + + case IB_WR_RDMA_READ: + qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE); + break; + + default: + /* + * This case shouldn't happen since its only + * one PSN per req. + */ + qp->s_state = OP(SEND_LAST); + } +} + +/** + * do_rc_ack - process an incoming RC ACK + * @qp: the QP the ACK came in on + * @psn: the packet sequence number of the ACK + * @opcode: the opcode of the request that resulted in the ACK + * + * This is called from ipath_rc_rcv() to process an incoming RC ACK + * for the given QP. + * Called at interrupt level with the QP s_lock held. + * Returns 1 if OK, 0 if current operation should be aborted (NAK). + */ +static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode) +{ + struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + struct ib_wc wc; + struct ipath_swqe *wqe; + int ret = 0; + + /* + * Remove the QP from the timeout queue (or RNR timeout queue). + * If ipath_ib_timer() has already removed it, + * it's OK since we hold the QP s_lock and ipath_restart_rc() + * just won't find anything to restart if we ACK everything. + */ + spin_lock(&dev->pending_lock); + if (qp->timerwait.next != LIST_POISON1) + list_del(&qp->timerwait); + spin_unlock(&dev->pending_lock); + + /* + * Note that NAKs implicitly ACK outstanding SEND and RDMA write + * requests and implicitly NAK RDMA read and atomic requests issued + * before the NAK'ed request. The MSN won't include the NAK'ed + * request but will include an ACK'ed request(s). + */ + wqe = get_swqe_ptr(qp, qp->s_last); + + /* Nothing is pending to ACK/NAK. */ + if (qp->s_last == qp->s_tail) + goto bail; + + /* + * The MSN might be for a later WQE than the PSN indicates so + * only complete WQEs that the PSN finishes. + */ + while (ipath_cmp24(psn, wqe->lpsn) >= 0) { + /* If we are ACKing a WQE, the MSN should be >= the SSN. */ + if (ipath_cmp24(aeth, wqe->ssn) < 0) + break; + /* + * If this request is a RDMA read or atomic, and the ACK is + * for a later operation, this ACK NAKs the RDMA read or + * atomic. In other words, only a RDMA_READ_LAST or ONLY + * can ACK a RDMA read and likewise for atomic ops. Note + * that the NAK case can only happen if relaxed ordering is + * used and requests are sent after an RDMA read or atomic + * is sent but before the response is received. + */ + if ((wqe->wr.opcode == IB_WR_RDMA_READ && + opcode != OP(RDMA_READ_RESPONSE_LAST)) || + ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) && + (opcode != OP(ATOMIC_ACKNOWLEDGE) || + ipath_cmp24(wqe->psn, psn) != 0))) { + /* + * The last valid PSN seen is the previous + * request's. + */ + qp->s_last_psn = wqe->psn - 1; + /* Retry this request. */ + ipath_restart_rc(qp, wqe->psn, &wc); + /* + * No need to process the ACK/NAK since we are + * restarting an earlier request. + */ + goto bail; + } + /* Post a send completion queue entry if requested. */ + if (!test_bit(IPATH_S_SIGNAL_REQ_WR, &qp->s_flags) || + (wqe->wr.send_flags & IB_SEND_SIGNALED)) { + wc.wr_id = wqe->wr.wr_id; + wc.status = IB_WC_SUCCESS; + wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode]; + wc.vendor_err = 0; + wc.byte_len = wqe->length; + wc.qp_num = qp->ibqp.qp_num; + wc.src_qp = qp->remote_qpn; + wc.pkey_index = 0; + wc.slid = qp->remote_ah_attr.dlid; + wc.sl = qp->remote_ah_attr.sl; + wc.dlid_path_bits = 0; + wc.port_num = 0; + ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0); + } + qp->s_retry = qp->s_retry_cnt; + /* + * If we are completing a request which is in the process of + * being resent, we can stop resending it since we know the + * responder has already seen it. + */ + if (qp->s_last == qp->s_cur) { + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + wqe = get_swqe_ptr(qp, qp->s_cur); + qp->s_state = OP(SEND_LAST); + qp->s_psn = wqe->psn; + } + if (++qp->s_last >= qp->s_size) + qp->s_last = 0; + wqe = get_swqe_ptr(qp, qp->s_last); + if (qp->s_last == qp->s_tail) + break; + } + + switch (aeth >> 29) { + case 0: /* ACK */ + dev->n_rc_acks++; + /* If this is a partial ACK, reset the retransmit timer. */ + if (qp->s_last != qp->s_tail) { + spin_lock(&dev->pending_lock); + list_add_tail(&qp->timerwait, + &dev->pending[dev->pending_index]); + spin_unlock(&dev->pending_lock); + } + ipath_get_credit(qp, aeth); + qp->s_rnr_retry = qp->s_rnr_retry_cnt; + qp->s_retry = qp->s_retry_cnt; + qp->s_last_psn = psn; + ret = 1; + goto bail; + + case 1: /* RNR NAK */ + dev->n_rnr_naks++; + if (qp->s_rnr_retry == 0) { + if (qp->s_last == qp->s_tail) + goto bail; + + wc.status = IB_WC_RNR_RETRY_EXC_ERR; + goto class_b; + } + if (qp->s_rnr_retry_cnt < 7) + qp->s_rnr_retry--; + if (qp->s_last == qp->s_tail) + goto bail; + + /* The last valid PSN seen is the previous request's. */ + qp->s_last_psn = wqe->psn - 1; + + dev->n_rc_resends += (int)qp->s_psn - (int)psn; + + /* + * If we are starting the request from the beginning, let + * the normal send code handle initialization. + */ + qp->s_cur = qp->s_last; + wqe = get_swqe_ptr(qp, qp->s_cur); + if (ipath_cmp24(psn, wqe->psn) <= 0) { + qp->s_state = OP(SEND_LAST); + qp->s_psn = wqe->psn; + } else + reset_psn(qp, psn); + + qp->s_rnr_timeout = + ib_ipath_rnr_table[(aeth >> IPS_AETH_CREDIT_SHIFT) & + IPS_AETH_CREDIT_MASK]; + ipath_insert_rnr_queue(qp); + goto bail; + + case 3: /* NAK */ + /* The last valid PSN seen is the previous request's. */ + if (qp->s_last != qp->s_tail) + qp->s_last_psn = wqe->psn - 1; + switch ((aeth >> IPS_AETH_CREDIT_SHIFT) & + IPS_AETH_CREDIT_MASK) { + case 0: /* PSN sequence error */ + dev->n_seq_naks++; + /* + * Back up to the responder's expected PSN. XXX + * Note that we might get a NAK in the middle of an + * RDMA READ response which terminates the RDMA + * READ. + */ + if (qp->s_last == qp->s_tail) + break; + + if (ipath_cmp24(psn, wqe->psn) < 0) + break; + + /* Retry the request. */ + ipath_restart_rc(qp, psn, &wc); + break; + + case 1: /* Invalid Request */ + wc.status = IB_WC_REM_INV_REQ_ERR; + dev->n_other_naks++; + goto class_b; + + case 2: /* Remote Access Error */ + wc.status = IB_WC_REM_ACCESS_ERR; + dev->n_other_naks++; + goto class_b; + + case 3: /* Remote Operation Error */ + wc.status = IB_WC_REM_OP_ERR; + dev->n_other_naks++; + class_b: + wc.wr_id = wqe->wr.wr_id; + wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode]; + wc.vendor_err = 0; + wc.byte_len = 0; + wc.qp_num = qp->ibqp.qp_num; + wc.src_qp = qp->remote_qpn; + wc.pkey_index = 0; + wc.slid = qp->remote_ah_attr.dlid; + wc.sl = qp->remote_ah_attr.sl; + wc.dlid_path_bits = 0; + wc.port_num = 0; + ipath_sqerror_qp(qp, &wc); + break; + + default: + /* Ignore other reserved NAK error codes */ + goto reserved; + } + qp->s_rnr_retry = qp->s_rnr_retry_cnt; + goto bail; + + default: /* 2: reserved */ + reserved: + /* Ignore reserved NAK codes. */ + goto bail; + } + +bail: + return ret; +} + +/** + * ipath_rc_rcv_resp - process an incoming RC response packet + * @dev: the device this packet came in on + * @ohdr: the other headers for this packet + * @data: the packet data + * @tlen: the packet length + * @qp: the QP for this packet + * @opcode: the opcode for this packet + * @psn: the packet sequence number for this packet + * @hdrsize: the header length + * @pmtu: the path MTU + * @header_in_data: true if part of the header data is in the data buffer + * + * This is called from ipath_rc_rcv() to process an incoming RC response + * packet for the given QP. + * Called at interrupt level. + */ +static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev, + struct ipath_other_headers *ohdr, + void *data, u32 tlen, + struct ipath_qp *qp, + u32 opcode, + u32 psn, u32 hdrsize, u32 pmtu, + int header_in_data) +{ + unsigned long flags; + struct ib_wc wc; + int diff; + u32 pad; + u32 aeth; + + spin_lock_irqsave(&qp->s_lock, flags); + + /* Ignore invalid responses. */ + if (ipath_cmp24(psn, qp->s_next_psn) >= 0) + goto ack_done; + + /* Ignore duplicate responses. */ + diff = ipath_cmp24(psn, qp->s_last_psn); + if (unlikely(diff <= 0)) { + /* Update credits for "ghost" ACKs */ + if (diff == 0 && opcode == OP(ACKNOWLEDGE)) { + if (!header_in_data) + aeth = be32_to_cpu(ohdr->u.aeth); + else { + aeth = be32_to_cpu(((__be32 *) data)[0]); + data += sizeof(__be32); + } + if ((aeth >> 29) == 0) + ipath_get_credit(qp, aeth); + } + goto ack_done; + } + + switch (opcode) { + case OP(ACKNOWLEDGE): + case OP(ATOMIC_ACKNOWLEDGE): + case OP(RDMA_READ_RESPONSE_FIRST): + if (!header_in_data) + aeth = be32_to_cpu(ohdr->u.aeth); + else { + aeth = be32_to_cpu(((__be32 *) data)[0]); + data += sizeof(__be32); + } + if (opcode == OP(ATOMIC_ACKNOWLEDGE)) + *(u64 *) qp->s_sge.sge.vaddr = *(u64 *) data; + if (!do_rc_ack(qp, aeth, psn, opcode) || + opcode != OP(RDMA_READ_RESPONSE_FIRST)) + goto ack_done; + hdrsize += 4; + /* + * do_rc_ack() has already checked the PSN so skip + * the sequence check. + */ + goto rdma_read; + + case OP(RDMA_READ_RESPONSE_MIDDLE): + /* no AETH, no ACK */ + if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) { + dev->n_rdma_seq++; + ipath_restart_rc(qp, qp->s_last_psn + 1, &wc); + goto ack_done; + } + rdma_read: + if (unlikely(qp->s_state != OP(RDMA_READ_REQUEST))) + goto ack_done; + if (unlikely(tlen != (hdrsize + pmtu + 4))) + goto ack_done; + if (unlikely(pmtu >= qp->s_len)) + goto ack_done; + /* We got a response so update the timeout. */ + if (unlikely(qp->s_last == qp->s_tail || + get_swqe_ptr(qp, qp->s_last)->wr.opcode != + IB_WR_RDMA_READ)) + goto ack_done; + spin_lock(&dev->pending_lock); + if (qp->s_rnr_timeout == 0 && + qp->timerwait.next != LIST_POISON1) + list_move_tail(&qp->timerwait, + &dev->pending[dev->pending_index]); + spin_unlock(&dev->pending_lock); + /* + * Update the RDMA receive state but do the copy w/o holding the + * locks and blocking interrupts. XXX Yet another place that + * affects relaxed RDMA order since we don't want s_sge modified. + */ + qp->s_len -= pmtu; + qp->s_last_psn = psn; + spin_unlock_irqrestore(&qp->s_lock, flags); + ipath_copy_sge(&qp->s_sge, data, pmtu); + goto bail; + + case OP(RDMA_READ_RESPONSE_LAST): + /* ACKs READ req. */ + if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) { + dev->n_rdma_seq++; + ipath_restart_rc(qp, qp->s_last_psn + 1, &wc); + goto ack_done; + } + /* FALLTHROUGH */ + case OP(RDMA_READ_RESPONSE_ONLY): + if (unlikely(qp->s_state != OP(RDMA_READ_REQUEST))) + goto ack_done; + /* + * Get the number of bytes the message was padded by. + */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* + * Check that the data size is >= 1 && <= pmtu. + * Remember to account for the AETH header (4) and + * ICRC (4). + */ + if (unlikely(tlen <= (hdrsize + pad + 8))) { + /* + * XXX Need to generate an error CQ + * entry. + */ + goto ack_done; + } + tlen -= hdrsize + pad + 8; + if (unlikely(tlen != qp->s_len)) { + /* + * XXX Need to generate an error CQ + * entry. + */ + goto ack_done; + } + if (!header_in_data) + aeth = be32_to_cpu(ohdr->u.aeth); + else { + aeth = be32_to_cpu(((__be32 *) data)[0]); + data += sizeof(__be32); + } + ipath_copy_sge(&qp->s_sge, data, tlen); + if (do_rc_ack(qp, aeth, psn, OP(RDMA_READ_RESPONSE_LAST))) { + /* + * Change the state so we contimue + * processing new requests. + */ + qp->s_state = OP(SEND_LAST); + } + goto ack_done; + } + +ack_done: + spin_unlock_irqrestore(&qp->s_lock, flags); +bail: + return; +} + +/** + * ipath_rc_rcv_error - process an incoming duplicate or error RC packet + * @dev: the device this packet came in on + * @ohdr: the other headers for this packet + * @data: the packet data + * @qp: the QP for this packet + * @opcode: the opcode for this packet + * @psn: the packet sequence number for this packet + * @diff: the difference between the PSN and the expected PSN + * @header_in_data: true if part of the header data is in the data buffer + * + * This is called from ipath_rc_rcv() to process an unexpected + * incoming RC packet for the given QP. + * Called at interrupt level. + * Return 1 if no more processing is needed; otherwise return 0 to + * schedule a response to be sent and the s_lock unlocked. + */ +static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev, + struct ipath_other_headers *ohdr, + void *data, + struct ipath_qp *qp, + u32 opcode, + u32 psn, + int diff, + int header_in_data) +{ + struct ib_reth *reth; + + if (diff > 0) { + /* + * Packet sequence error. + * A NAK will ACK earlier sends and RDMA writes. + * Don't queue the NAK if a RDMA read, atomic, or + * NAK is pending though. + */ + spin_lock(&qp->s_lock); + if ((qp->s_ack_state >= OP(RDMA_READ_REQUEST) && + qp->s_ack_state != IB_OPCODE_ACKNOWLEDGE) || + qp->s_nak_state != 0) { + spin_unlock(&qp->s_lock); + goto done; + } + qp->s_ack_state = OP(SEND_ONLY); + qp->s_nak_state = IB_NAK_PSN_ERROR; + /* Use the expected PSN. */ + qp->s_ack_psn = qp->r_psn; + goto resched; + } + + /* + * Handle a duplicate request. Don't re-execute SEND, RDMA + * write or atomic op. Don't NAK errors, just silently drop + * the duplicate request. Note that r_sge, r_len, and + * r_rcv_len may be in use so don't modify them. + * + * We are supposed to ACK the earliest duplicate PSN but we + * can coalesce an outstanding duplicate ACK. We have to + * send the earliest so that RDMA reads can be restarted at + * the requester's expected PSN. + */ + spin_lock(&qp->s_lock); + if (qp->s_ack_state != IB_OPCODE_ACKNOWLEDGE && + ipath_cmp24(psn, qp->s_ack_psn) >= 0) { + if (qp->s_ack_state < IB_OPCODE_RDMA_READ_REQUEST) + qp->s_ack_psn = psn; + spin_unlock(&qp->s_lock); + goto done; + } + switch (opcode) { + case OP(RDMA_READ_REQUEST): + /* + * We have to be careful to not change s_rdma_sge + * while ipath_do_rc_send() is using it and not + * holding the s_lock. + */ + if (qp->s_ack_state != OP(ACKNOWLEDGE) && + qp->s_ack_state >= IB_OPCODE_RDMA_READ_REQUEST) { + spin_unlock(&qp->s_lock); + dev->n_rdma_dup_busy++; + goto done; + } + /* RETH comes after BTH */ + if (!header_in_data) + reth = &ohdr->u.rc.reth; + else { + reth = (struct ib_reth *)data; + data += sizeof(*reth); + } + qp->s_rdma_len = be32_to_cpu(reth->length); + if (qp->s_rdma_len != 0) { + u32 rkey = be32_to_cpu(reth->rkey); + u64 vaddr = be64_to_cpu(reth->vaddr); + int ok; + + /* + * Address range must be a subset of the original + * request and start on pmtu boundaries. + */ + ok = ipath_rkey_ok(dev, &qp->s_rdma_sge, + qp->s_rdma_len, vaddr, rkey, + IB_ACCESS_REMOTE_READ); + if (unlikely(!ok)) + goto done; + } else { + qp->s_rdma_sge.sg_list = NULL; + qp->s_rdma_sge.num_sge = 0; + qp->s_rdma_sge.sge.mr = NULL; + qp->s_rdma_sge.sge.vaddr = NULL; + qp->s_rdma_sge.sge.length = 0; + qp->s_rdma_sge.sge.sge_length = 0; + } + break; + + case OP(COMPARE_SWAP): + case OP(FETCH_ADD): + /* + * Check for the PSN of the last atomic operations + * performed and resend the result if found. + */ + if ((psn & IPS_PSN_MASK) != qp->r_atomic_psn) { + spin_unlock(&qp->s_lock); + goto done; + } + qp->s_ack_atomic = qp->r_atomic_data; + break; + } + qp->s_ack_state = opcode; + qp->s_nak_state = 0; + qp->s_ack_psn = psn; +resched: + return 0; + +done: + return 1; +} + +/** + * ipath_rc_rcv - process an incoming RC packet + * @dev: the device this packet came in on + * @hdr: the header of this packet + * @has_grh: true if the header has a GRH + * @data: the packet data + * @tlen: the packet length + * @qp: the QP for this packet + * + * This is called from ipath_qp_rcv() to process an incoming RC packet + * for the given QP. + * Called at interrupt level. + */ +void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, + int has_grh, void *data, u32 tlen, struct ipath_qp *qp) +{ + struct ipath_other_headers *ohdr; + u32 opcode; + u32 hdrsize; + u32 psn; + u32 pad; + unsigned long flags; + struct ib_wc wc; + u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu); + int diff; + struct ib_reth *reth; + int header_in_data; + + /* Check for GRH */ + if (!has_grh) { + ohdr = &hdr->u.oth; + hdrsize = 8 + 12; /* LRH + BTH */ + psn = be32_to_cpu(ohdr->bth[2]); + header_in_data = 0; + } else { + ohdr = &hdr->u.l.oth; + hdrsize = 8 + 40 + 12; /* LRH + GRH + BTH */ + /* + * The header with GRH is 60 bytes and the core driver sets + * the eager header buffer size to 56 bytes so the last 4 + * bytes of the BTH header (PSN) is in the data buffer. + */ + header_in_data = + ipath_layer_get_rcvhdrentsize(dev->dd) == 16; + if (header_in_data) { + psn = be32_to_cpu(((__be32 *) data)[0]); + data += sizeof(__be32); + } else + psn = be32_to_cpu(ohdr->bth[2]); + } + /* + * The opcode is in the low byte when its in network order + * (top byte when in host order). + */ + opcode = be32_to_cpu(ohdr->bth[0]) >> 24; + + /* + * Process responses (ACKs) before anything else. Note that the + * packet sequence number will be for something in the send work + * queue rather than the expected receive packet sequence number. + * In other words, this QP is the requester. + */ + if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) && + opcode <= OP(ATOMIC_ACKNOWLEDGE)) { + ipath_rc_rcv_resp(dev, ohdr, data, tlen, qp, opcode, psn, + hdrsize, pmtu, header_in_data); + goto bail; + } + + spin_lock_irqsave(&qp->r_rq.lock, flags); + + /* Compute 24 bits worth of difference. */ + diff = ipath_cmp24(psn, qp->r_psn); + if (unlikely(diff)) { + if (ipath_rc_rcv_error(dev, ohdr, data, qp, opcode, + psn, diff, header_in_data)) + goto done; + goto resched; + } + + /* Check for opcode sequence errors. */ + switch (qp->r_state) { + case OP(SEND_FIRST): + case OP(SEND_MIDDLE): + if (opcode == OP(SEND_MIDDLE) || + opcode == OP(SEND_LAST) || + opcode == OP(SEND_LAST_WITH_IMMEDIATE)) + break; + nack_inv: + /* + * A NAK will ACK earlier sends and RDMA writes. Don't queue the + * NAK if a RDMA read, atomic, or NAK is pending though. + */ + spin_lock(&qp->s_lock); + if (qp->s_ack_state >= OP(RDMA_READ_REQUEST) && + qp->s_ack_state != IB_OPCODE_ACKNOWLEDGE) { + spin_unlock(&qp->s_lock); + goto done; + } + /* XXX Flush WQEs */ + qp->state = IB_QPS_ERR; + qp->s_ack_state = OP(SEND_ONLY); + qp->s_nak_state = IB_NAK_INVALID_REQUEST; + qp->s_ack_psn = qp->r_psn; + goto resched; + + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_MIDDLE): + if (opcode == OP(RDMA_WRITE_MIDDLE) || + opcode == OP(RDMA_WRITE_LAST) || + opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE)) + break; + goto nack_inv; + + case OP(RDMA_READ_REQUEST): + case OP(COMPARE_SWAP): + case OP(FETCH_ADD): + /* + * Drop all new requests until a response has been sent. A + * new request then ACKs the RDMA response we sent. Relaxed + * ordering would allow new requests to be processed but we + * would need to keep a queue of rwqe's for all that are in + * progress. Note that we can't RNR NAK this request since + * the RDMA READ or atomic response is already queued to be + * sent (unless we implement a response send queue). + */ + goto done; + + default: + if (opcode == OP(SEND_MIDDLE) || + opcode == OP(SEND_LAST) || + opcode == OP(SEND_LAST_WITH_IMMEDIATE) || + opcode == OP(RDMA_WRITE_MIDDLE) || + opcode == OP(RDMA_WRITE_LAST) || + opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE)) + goto nack_inv; + break; + } + + wc.imm_data = 0; + wc.wc_flags = 0; + + /* OK, process the packet. */ + switch (opcode) { + case OP(SEND_FIRST): + if (!ipath_get_rwqe(qp, 0)) { + rnr_nak: + /* + * A RNR NAK will ACK earlier sends and RDMA writes. + * Don't queue the NAK if a RDMA read or atomic + * is pending though. + */ + spin_lock(&qp->s_lock); + if (qp->s_ack_state >= + OP(RDMA_READ_REQUEST) && + qp->s_ack_state != IB_OPCODE_ACKNOWLEDGE) { + spin_unlock(&qp->s_lock); + goto done; + } + qp->s_ack_state = OP(SEND_ONLY); + qp->s_nak_state = IB_RNR_NAK | qp->s_min_rnr_timer; + qp->s_ack_psn = qp->r_psn; + goto resched; + } + qp->r_rcv_len = 0; + /* FALLTHROUGH */ + case OP(SEND_MIDDLE): + case OP(RDMA_WRITE_MIDDLE): + send_middle: + /* Check for invalid length PMTU or posted rwqe len. */ + if (unlikely(tlen != (hdrsize + pmtu + 4))) + goto nack_inv; + qp->r_rcv_len += pmtu; + if (unlikely(qp->r_rcv_len > qp->r_len)) + goto nack_inv; + ipath_copy_sge(&qp->r_sge, data, pmtu); + break; + + case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): + /* consume RWQE */ + if (!ipath_get_rwqe(qp, 1)) + goto rnr_nak; + goto send_last_imm; + + case OP(SEND_ONLY): + case OP(SEND_ONLY_WITH_IMMEDIATE): + if (!ipath_get_rwqe(qp, 0)) + goto rnr_nak; + qp->r_rcv_len = 0; + if (opcode == OP(SEND_ONLY)) + goto send_last; + /* FALLTHROUGH */ + case OP(SEND_LAST_WITH_IMMEDIATE): + send_last_imm: + if (header_in_data) { + wc.imm_data = *(__be32 *) data; + data += sizeof(__be32); + } else { + /* Immediate data comes after BTH */ + wc.imm_data = ohdr->u.imm_data; + } + hdrsize += 4; + wc.wc_flags = IB_WC_WITH_IMM; + /* FALLTHROUGH */ + case OP(SEND_LAST): + case OP(RDMA_WRITE_LAST): + send_last: + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* Check for invalid length. */ + /* XXX LAST len should be >= 1 */ + if (unlikely(tlen < (hdrsize + pad + 4))) + goto nack_inv; + /* Don't count the CRC. */ + tlen -= (hdrsize + pad + 4); + wc.byte_len = tlen + qp->r_rcv_len; + if (unlikely(wc.byte_len > qp->r_len)) + goto nack_inv; + ipath_copy_sge(&qp->r_sge, data, tlen); + atomic_inc(&qp->msn); + if (opcode == OP(RDMA_WRITE_LAST) || + opcode == OP(RDMA_WRITE_ONLY)) + break; + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + wc.opcode = IB_WC_RECV; + wc.vendor_err = 0; + wc.qp_num = qp->ibqp.qp_num; + wc.src_qp = qp->remote_qpn; + wc.pkey_index = 0; + wc.slid = qp->remote_ah_attr.dlid; + wc.sl = qp->remote_ah_attr.sl; + wc.dlid_path_bits = 0; + wc.port_num = 0; + /* Signal completion event if the solicited bit is set. */ + ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, + (ohdr->bth[0] & + __constant_cpu_to_be32(1 << 23)) != 0); + break; + + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_ONLY): + case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): + /* consume RWQE */ + /* RETH comes after BTH */ + if (!header_in_data) + reth = &ohdr->u.rc.reth; + else { + reth = (struct ib_reth *)data; + data += sizeof(*reth); + } + hdrsize += sizeof(*reth); + qp->r_len = be32_to_cpu(reth->length); + qp->r_rcv_len = 0; + if (qp->r_len != 0) { + u32 rkey = be32_to_cpu(reth->rkey); + u64 vaddr = be64_to_cpu(reth->vaddr); + int ok; + + /* Check rkey & NAK */ + ok = ipath_rkey_ok(dev, &qp->r_sge, + qp->r_len, vaddr, rkey, + IB_ACCESS_REMOTE_WRITE); + if (unlikely(!ok)) { + nack_acc: + /* + * A NAK will ACK earlier sends and RDMA + * writes. Don't queue the NAK if a RDMA + * read, atomic, or NAK is pending though. + */ + spin_lock(&qp->s_lock); + if (qp->s_ack_state >= + OP(RDMA_READ_REQUEST) && + qp->s_ack_state != + IB_OPCODE_ACKNOWLEDGE) { + spin_unlock(&qp->s_lock); + goto done; + } + /* XXX Flush WQEs */ + qp->state = IB_QPS_ERR; + qp->s_ack_state = OP(RDMA_WRITE_ONLY); + qp->s_nak_state = + IB_NAK_REMOTE_ACCESS_ERROR; + qp->s_ack_psn = qp->r_psn; + goto resched; + } + } else { + qp->r_sge.sg_list = NULL; + qp->r_sge.sge.mr = NULL; + qp->r_sge.sge.vaddr = NULL; + qp->r_sge.sge.length = 0; + qp->r_sge.sge.sge_length = 0; + } + if (unlikely(!(qp->qp_access_flags & + IB_ACCESS_REMOTE_WRITE))) + goto nack_acc; + if (opcode == OP(RDMA_WRITE_FIRST)) + goto send_middle; + else if (opcode == OP(RDMA_WRITE_ONLY)) + goto send_last; + if (!ipath_get_rwqe(qp, 1)) + goto rnr_nak; + goto send_last_imm; + + case OP(RDMA_READ_REQUEST): + /* RETH comes after BTH */ + if (!header_in_data) + reth = &ohdr->u.rc.reth; + else { + reth = (struct ib_reth *)data; + data += sizeof(*reth); + } + spin_lock(&qp->s_lock); + if (qp->s_ack_state != OP(ACKNOWLEDGE) && + qp->s_ack_state >= IB_OPCODE_RDMA_READ_REQUEST) { + spin_unlock(&qp->s_lock); + goto done; + } + qp->s_rdma_len = be32_to_cpu(reth->length); + if (qp->s_rdma_len != 0) { + u32 rkey = be32_to_cpu(reth->rkey); + u64 vaddr = be64_to_cpu(reth->vaddr); + int ok; + + /* Check rkey & NAK */ + ok = ipath_rkey_ok(dev, &qp->s_rdma_sge, + qp->s_rdma_len, vaddr, rkey, + IB_ACCESS_REMOTE_READ); + if (unlikely(!ok)) { + spin_unlock(&qp->s_lock); + goto nack_acc; + } + /* + * Update the next expected PSN. We add 1 later + * below, so only add the remainder here. + */ + if (qp->s_rdma_len > pmtu) + qp->r_psn += (qp->s_rdma_len - 1) / pmtu; + } else { + qp->s_rdma_sge.sg_list = NULL; + qp->s_rdma_sge.num_sge = 0; + qp->s_rdma_sge.sge.mr = NULL; + qp->s_rdma_sge.sge.vaddr = NULL; + qp->s_rdma_sge.sge.length = 0; + qp->s_rdma_sge.sge.sge_length = 0; + } + if (unlikely(!(qp->qp_access_flags & + IB_ACCESS_REMOTE_READ))) + goto nack_acc; + /* + * We need to increment the MSN here instead of when we + * finish sending the result since a duplicate request would + * increment it more than once. + */ + atomic_inc(&qp->msn); + qp->s_ack_state = opcode; + qp->s_nak_state = 0; + qp->s_ack_psn = psn; + qp->r_psn++; + qp->r_state = opcode; + goto rdmadone; + + case OP(COMPARE_SWAP): + case OP(FETCH_ADD): { + struct ib_atomic_eth *ateth; + u64 vaddr; + u64 sdata; + u32 rkey; + + if (!header_in_data) + ateth = &ohdr->u.atomic_eth; + else { + ateth = (struct ib_atomic_eth *)data; + data += sizeof(*ateth); + } + vaddr = be64_to_cpu(ateth->vaddr); + if (unlikely(vaddr & (sizeof(u64) - 1))) + goto nack_inv; + rkey = be32_to_cpu(ateth->rkey); + /* Check rkey & NAK */ + if (unlikely(!ipath_rkey_ok(dev, &qp->r_sge, + sizeof(u64), vaddr, rkey, + IB_ACCESS_REMOTE_ATOMIC))) + goto nack_acc; + if (unlikely(!(qp->qp_access_flags & + IB_ACCESS_REMOTE_ATOMIC))) + goto nack_acc; + /* Perform atomic OP and save result. */ + sdata = be64_to_cpu(ateth->swap_data); + spin_lock(&dev->pending_lock); + qp->r_atomic_data = *(u64 *) qp->r_sge.sge.vaddr; + if (opcode == OP(FETCH_ADD)) + *(u64 *) qp->r_sge.sge.vaddr = + qp->r_atomic_data + sdata; + else if (qp->r_atomic_data == + be64_to_cpu(ateth->compare_data)) + *(u64 *) qp->r_sge.sge.vaddr = sdata; + spin_unlock(&dev->pending_lock); + atomic_inc(&qp->msn); + qp->r_atomic_psn = psn & IPS_PSN_MASK; + psn |= 1 << 31; + break; + } + + default: + /* Drop packet for unknown opcodes. */ + goto done; + } + qp->r_psn++; + qp->r_state = opcode; + /* Send an ACK if requested or required. */ + if (psn & (1 << 31)) { + /* + * Coalesce ACKs unless there is a RDMA READ or + * ATOMIC pending. + */ + spin_lock(&qp->s_lock); + if (qp->s_ack_state == OP(ACKNOWLEDGE) || + qp->s_ack_state < IB_OPCODE_RDMA_READ_REQUEST) { + qp->s_ack_state = opcode; + qp->s_nak_state = 0; + qp->s_ack_psn = psn; + qp->s_ack_atomic = qp->r_atomic_data; + goto resched; + } + spin_unlock(&qp->s_lock); + } +done: + spin_unlock_irqrestore(&qp->r_rq.lock, flags); + goto bail; + +resched: + /* + * Try to send ACK right away but not if ipath_do_rc_send() is + * active. + */ + if (qp->s_hdrwords == 0 && + (qp->s_ack_state < IB_OPCODE_RDMA_READ_REQUEST || + qp->s_ack_state >= IB_OPCODE_COMPARE_SWAP)) + send_rc_ack(qp); + +rdmadone: + spin_unlock(&qp->s_lock); + spin_unlock_irqrestore(&qp->r_rq.lock, flags); + + /* Call ipath_do_rc_send() in another thread. */ + tasklet_hi_schedule(&qp->s_task); + +bail: + return; +} diff --git a/drivers/infiniband/hw/ipath/ipath_registers.h b/drivers/infiniband/hw/ipath/ipath_registers.h new file mode 100644 index 0000000..1e59750 --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_registers.h @@ -0,0 +1,446 @@ +/* + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _IPATH_REGISTERS_H +#define _IPATH_REGISTERS_H + +/* + * This file should only be included by kernel source, and by the diags. + * It defines the registers, and their contents, for the InfiniPath HT-400 chip + */ + +/* + * These are the InfiniPath register and buffer bit definitions, + * that are visible to software, and needed only by the kernel + * and diag code. A few, that are visible to protocol and user + * code are in ipath_common.h. Some bits are specific + * to a given chip implementation, and have been moved to the + * chip-specific source file + */ + +/* kr_revision bits */ +#define INFINIPATH_R_CHIPREVMINOR_MASK 0xFF +#define INFINIPATH_R_CHIPREVMINOR_SHIFT 0 +#define INFINIPATH_R_CHIPREVMAJOR_MASK 0xFF +#define INFINIPATH_R_CHIPREVMAJOR_SHIFT 8 +#define INFINIPATH_R_ARCH_MASK 0xFF +#define INFINIPATH_R_ARCH_SHIFT 16 +#define INFINIPATH_R_SOFTWARE_MASK 0xFF +#define INFINIPATH_R_SOFTWARE_SHIFT 24 +#define INFINIPATH_R_BOARDID_MASK 0xFF +#define INFINIPATH_R_BOARDID_SHIFT 32 + +/* kr_control bits */ +#define INFINIPATH_C_FREEZEMODE 0x00000002 +#define INFINIPATH_C_LINKENABLE 0x00000004 +#define INFINIPATH_C_RESET 0x00000001 + +/* kr_sendctrl bits */ +#define INFINIPATH_S_DISARMPIOBUF_SHIFT 16 + +#define IPATH_S_ABORT 0 +#define IPATH_S_PIOINTBUFAVAIL 1 +#define IPATH_S_PIOBUFAVAILUPD 2 +#define IPATH_S_PIOENABLE 3 +#define IPATH_S_DISARM 31 + +#define INFINIPATH_S_ABORT (1U << IPATH_S_ABORT) +#define INFINIPATH_S_PIOINTBUFAVAIL (1U << IPATH_S_PIOINTBUFAVAIL) +#define INFINIPATH_S_PIOBUFAVAILUPD (1U << IPATH_S_PIOBUFAVAILUPD) +#define INFINIPATH_S_PIOENABLE (1U << IPATH_S_PIOENABLE) +#define INFINIPATH_S_DISARM (1U << IPATH_S_DISARM) + +/* kr_rcvctrl bits */ +#define INFINIPATH_R_PORTENABLE_SHIFT 0 +#define INFINIPATH_R_INTRAVAIL_SHIFT 16 +#define INFINIPATH_R_TAILUPD 0x80000000 + +/* kr_intstatus, kr_intclear, kr_intmask bits */ +#define INFINIPATH_I_RCVURG_SHIFT 0 +#define INFINIPATH_I_RCVAVAIL_SHIFT 12 +#define INFINIPATH_I_ERROR 0x80000000 +#define INFINIPATH_I_SPIOSENT 0x40000000 +#define INFINIPATH_I_SPIOBUFAVAIL 0x20000000 +#define INFINIPATH_I_GPIO 0x10000000 + +/* kr_errorstatus, kr_errorclear, kr_errormask bits */ +#define INFINIPATH_E_RFORMATERR 0x0000000000000001ULL +#define INFINIPATH_E_RVCRC 0x0000000000000002ULL +#define INFINIPATH_E_RICRC 0x0000000000000004ULL +#define INFINIPATH_E_RMINPKTLEN 0x0000000000000008ULL +#define INFINIPATH_E_RMAXPKTLEN 0x0000000000000010ULL +#define INFINIPATH_E_RLONGPKTLEN 0x0000000000000020ULL +#define INFINIPATH_E_RSHORTPKTLEN 0x0000000000000040ULL +#define INFINIPATH_E_RUNEXPCHAR 0x0000000000000080ULL +#define INFINIPATH_E_RUNSUPVL 0x0000000000000100ULL +#define INFINIPATH_E_REBP 0x0000000000000200ULL +#define INFINIPATH_E_RIBFLOW 0x0000000000000400ULL +#define INFINIPATH_E_RBADVERSION 0x0000000000000800ULL +#define INFINIPATH_E_RRCVEGRFULL 0x0000000000001000ULL +#define INFINIPATH_E_RRCVHDRFULL 0x0000000000002000ULL +#define INFINIPATH_E_RBADTID 0x0000000000004000ULL +#define INFINIPATH_E_RHDRLEN 0x0000000000008000ULL +#define INFINIPATH_E_RHDR 0x0000000000010000ULL +#define INFINIPATH_E_RIBLOSTLINK 0x0000000000020000ULL +#define INFINIPATH_E_SMINPKTLEN 0x0000000020000000ULL +#define INFINIPATH_E_SMAXPKTLEN 0x0000000040000000ULL +#define INFINIPATH_E_SUNDERRUN 0x0000000080000000ULL +#define INFINIPATH_E_SPKTLEN 0x0000000100000000ULL +#define INFINIPATH_E_SDROPPEDSMPPKT 0x0000000200000000ULL +#define INFINIPATH_E_SDROPPEDDATAPKT 0x0000000400000000ULL +#define INFINIPATH_E_SPIOARMLAUNCH 0x0000000800000000ULL +#define INFINIPATH_E_SUNEXPERRPKTNUM 0x0000001000000000ULL +#define INFINIPATH_E_SUNSUPVL 0x0000002000000000ULL +#define INFINIPATH_E_IBSTATUSCHANGED 0x0001000000000000ULL +#define INFINIPATH_E_INVALIDADDR 0x0002000000000000ULL +#define INFINIPATH_E_RESET 0x0004000000000000ULL +#define INFINIPATH_E_HARDWARE 0x0008000000000000ULL + +/* kr_hwerrclear, kr_hwerrmask, kr_hwerrstatus, bits */ +/* TXEMEMPARITYERR bit 0: PIObuf, 1: PIOpbc, 2: launchfifo + * RXEMEMPARITYERR bit 0: rcvbuf, 1: lookupq, 2: eagerTID, 3: expTID + * bit 4: flag buffer, 5: datainfo, 6: header info */ +#define INFINIPATH_HWE_TXEMEMPARITYERR_MASK 0xFULL +#define INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT 40 +#define INFINIPATH_HWE_RXEMEMPARITYERR_MASK 0x7FULL +#define INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT 44 +#define INFINIPATH_HWE_RXDSYNCMEMPARITYERR 0x0000000400000000ULL +#define INFINIPATH_HWE_MEMBISTFAILED 0x0040000000000000ULL +#define INFINIPATH_HWE_IBCBUSTOSPCPARITYERR 0x4000000000000000ULL +#define INFINIPATH_HWE_IBCBUSFRSPCPARITYERR 0x8000000000000000ULL + +/* kr_hwdiagctrl bits */ +#define INFINIPATH_DC_FORCETXEMEMPARITYERR_MASK 0xFULL +#define INFINIPATH_DC_FORCETXEMEMPARITYERR_SHIFT 40 +#define INFINIPATH_DC_FORCERXEMEMPARITYERR_MASK 0x7FULL +#define INFINIPATH_DC_FORCERXEMEMPARITYERR_SHIFT 44 +#define INFINIPATH_DC_FORCERXDSYNCMEMPARITYERR 0x0000000400000000ULL +#define INFINIPATH_DC_COUNTERDISABLE 0x1000000000000000ULL +#define INFINIPATH_DC_COUNTERWREN 0x2000000000000000ULL +#define INFINIPATH_DC_FORCEIBCBUSTOSPCPARITYERR 0x4000000000000000ULL +#define INFINIPATH_DC_FORCEIBCBUSFRSPCPARITYERR 0x8000000000000000ULL + +/* kr_ibcctrl bits */ +#define INFINIPATH_IBCC_FLOWCTRLPERIOD_MASK 0xFFULL +#define INFINIPATH_IBCC_FLOWCTRLPERIOD_SHIFT 0 +#define INFINIPATH_IBCC_FLOWCTRLWATERMARK_MASK 0xFFULL +#define INFINIPATH_IBCC_FLOWCTRLWATERMARK_SHIFT 8 +#define INFINIPATH_IBCC_LINKINITCMD_MASK 0x3ULL +#define INFINIPATH_IBCC_LINKINITCMD_DISABLE 1 +#define INFINIPATH_IBCC_LINKINITCMD_POLL 2 /* cycle through TS1/TS2 till OK */ +#define INFINIPATH_IBCC_LINKINITCMD_SLEEP 3 /* wait for TS1, then go on */ +#define INFINIPATH_IBCC_LINKINITCMD_SHIFT 16 +#define INFINIPATH_IBCC_LINKCMD_MASK 0x3ULL +#define INFINIPATH_IBCC_LINKCMD_INIT 1 /* move to 0x11 */ +#define INFINIPATH_IBCC_LINKCMD_ARMED 2 /* move to 0x21 */ +#define INFINIPATH_IBCC_LINKCMD_ACTIVE 3 /* move to 0x31 */ +#define INFINIPATH_IBCC_LINKCMD_SHIFT 18 +#define INFINIPATH_IBCC_MAXPKTLEN_MASK 0x7FFULL +#define INFINIPATH_IBCC_MAXPKTLEN_SHIFT 20 +#define INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK 0xFULL +#define INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT 32 +#define INFINIPATH_IBCC_OVERRUNTHRESHOLD_MASK 0xFULL +#define INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT 36 +#define INFINIPATH_IBCC_CREDITSCALE_MASK 0x7ULL +#define INFINIPATH_IBCC_CREDITSCALE_SHIFT 40 +#define INFINIPATH_IBCC_LOOPBACK 0x8000000000000000ULL +#define INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE 0x4000000000000000ULL + +/* kr_ibcstatus bits */ +#define INFINIPATH_IBCS_LINKTRAININGSTATE_MASK 0xF +#define INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT 0 +#define INFINIPATH_IBCS_LINKSTATE_MASK 0x7 +#define INFINIPATH_IBCS_LINKSTATE_SHIFT 4 +#define INFINIPATH_IBCS_TXREADY 0x40000000 +#define INFINIPATH_IBCS_TXCREDITOK 0x80000000 +/* link training states (shift by INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) */ +#define INFINIPATH_IBCS_LT_STATE_DISABLED 0x00 +#define INFINIPATH_IBCS_LT_STATE_LINKUP 0x01 +#define INFINIPATH_IBCS_LT_STATE_POLLACTIVE 0x02 +#define INFINIPATH_IBCS_LT_STATE_POLLQUIET 0x03 +#define INFINIPATH_IBCS_LT_STATE_SLEEPDELAY 0x04 +#define INFINIPATH_IBCS_LT_STATE_SLEEPQUIET 0x05 +#define INFINIPATH_IBCS_LT_STATE_CFGDEBOUNCE 0x08 +#define INFINIPATH_IBCS_LT_STATE_CFGRCVFCFG 0x09 +#define INFINIPATH_IBCS_LT_STATE_CFGWAITRMT 0x0a +#define INFINIPATH_IBCS_LT_STATE_CFGIDLE 0x0b +#define INFINIPATH_IBCS_LT_STATE_RECOVERRETRAIN 0x0c +#define INFINIPATH_IBCS_LT_STATE_RECOVERWAITRMT 0x0e +#define INFINIPATH_IBCS_LT_STATE_RECOVERIDLE 0x0f +/* link state machine states (shift by INFINIPATH_IBCS_LINKSTATE_SHIFT) */ +#define INFINIPATH_IBCS_L_STATE_DOWN 0x0 +#define INFINIPATH_IBCS_L_STATE_INIT 0x1 +#define INFINIPATH_IBCS_L_STATE_ARM 0x2 +#define INFINIPATH_IBCS_L_STATE_ACTIVE 0x3 +#define INFINIPATH_IBCS_L_STATE_ACT_DEFER 0x4 + +/* combination link status states that we use with some frequency */ +#define IPATH_IBSTATE_MASK ((INFINIPATH_IBCS_LINKTRAININGSTATE_MASK \ + << INFINIPATH_IBCS_LINKSTATE_SHIFT) | \ + (INFINIPATH_IBCS_LINKSTATE_MASK \ + <<INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT)) +#define IPATH_IBSTATE_INIT ((INFINIPATH_IBCS_L_STATE_INIT \ + << INFINIPATH_IBCS_LINKSTATE_SHIFT) | \ + (INFINIPATH_IBCS_LT_STATE_LINKUP \ + <<INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT)) +#define IPATH_IBSTATE_ARM ((INFINIPATH_IBCS_L_STATE_ARM \ + << INFINIPATH_IBCS_LINKSTATE_SHIFT) | \ + (INFINIPATH_IBCS_LT_STATE_LINKUP \ + <<INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT)) +#define IPATH_IBSTATE_ACTIVE ((INFINIPATH_IBCS_L_STATE_ACTIVE \ + << INFINIPATH_IBCS_LINKSTATE_SHIFT) | \ + (INFINIPATH_IBCS_LT_STATE_LINKUP \ + <<INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT)) + +/* kr_extstatus bits */ +#define INFINIPATH_EXTS_SERDESPLLLOCK 0x1 +#define INFINIPATH_EXTS_GPIOIN_MASK 0xFFFFULL +#define INFINIPATH_EXTS_GPIOIN_SHIFT 48 + +/* kr_extctrl bits */ +#define INFINIPATH_EXTC_GPIOINVERT_MASK 0xFFFFULL +#define INFINIPATH_EXTC_GPIOINVERT_SHIFT 32 +#define INFINIPATH_EXTC_GPIOOE_MASK 0xFFFFULL +#define INFINIPATH_EXTC_GPIOOE_SHIFT 48 +#define INFINIPATH_EXTC_SERDESENABLE 0x80000000ULL +#define INFINIPATH_EXTC_SERDESCONNECT 0x40000000ULL +#define INFINIPATH_EXTC_SERDESENTRUNKING 0x20000000ULL +#define INFINIPATH_EXTC_SERDESDISRXFIFO 0x10000000ULL +#define INFINIPATH_EXTC_SERDESENPLPBK1 0x08000000ULL +#define INFINIPATH_EXTC_SERDESENPLPBK2 0x04000000ULL +#define INFINIPATH_EXTC_SERDESENENCDEC 0x02000000ULL +#define INFINIPATH_EXTC_LED1SECPORT_ON 0x00000020ULL +#define INFINIPATH_EXTC_LED2SECPORT_ON 0x00000010ULL +#define INFINIPATH_EXTC_LED1PRIPORT_ON 0x00000008ULL +#define INFINIPATH_EXTC_LED2PRIPORT_ON 0x00000004ULL +#define INFINIPATH_EXTC_LEDGBLOK_ON 0x00000002ULL +#define INFINIPATH_EXTC_LEDGBLERR_OFF 0x00000001ULL + +/* kr_mdio bits */ +#define INFINIPATH_MDIO_CLKDIV_MASK 0x7FULL +#define INFINIPATH_MDIO_CLKDIV_SHIFT 32 +#define INFINIPATH_MDIO_COMMAND_MASK 0x7ULL +#define INFINIPATH_MDIO_COMMAND_SHIFT 26 +#define INFINIPATH_MDIO_DEVADDR_MASK 0x1FULL +#define INFINIPATH_MDIO_DEVADDR_SHIFT 21 +#define INFINIPATH_MDIO_REGADDR_MASK 0x1FULL +#define INFINIPATH_MDIO_REGADDR_SHIFT 16 +#define INFINIPATH_MDIO_DATA_MASK 0xFFFFULL +#define INFINIPATH_MDIO_DATA_SHIFT 0 +#define INFINIPATH_MDIO_CMDVALID 0x0000000040000000ULL +#define INFINIPATH_MDIO_RDDATAVALID 0x0000000080000000ULL + +/* kr_partitionkey bits */ +#define INFINIPATH_PKEY_SIZE 16 +#define INFINIPATH_PKEY_MASK 0xFFFF +#define INFINIPATH_PKEY_DEFAULT_PKEY 0xFFFF + +/* kr_serdesconfig0 bits */ +#define INFINIPATH_SERDC0_RESET_MASK 0xfULL /* overal reset bits */ +#define INFINIPATH_SERDC0_RESET_PLL 0x10000000ULL /* pll reset */ +#define INFINIPATH_SERDC0_TXIDLE 0xF000ULL /* tx idle enables (per lane) */ +#define INFINIPATH_SERDC0_RXDETECT_EN 0xF0000ULL /* rx detect enables (per lane) */ +#define INFINIPATH_SERDC0_L1PWR_DN 0xF0ULL /* L1 Power down; use with RXDETECT, + Otherwise not used on IB side */ + +/* kr_xgxsconfig bits */ +#define INFINIPATH_XGXS_RESET 0x7ULL +#define INFINIPATH_XGXS_MDIOADDR_MASK 0xfULL +#define INFINIPATH_XGXS_MDIOADDR_SHIFT 4 + +#define INFINIPATH_RT_ADDR_MASK 0xFFFFFFFFFFULL /* 40 bits valid */ + +/* TID entries (memory), HT400-only */ +#define INFINIPATH_RT_VALID 0x8000000000000000ULL +#define INFINIPATH_RT_ADDR_SHIFT 0 +#define INFINIPATH_RT_BUFSIZE_MASK 0x3FFF +#define INFINIPATH_RT_BUFSIZE_SHIFT 48 + +/* + * IPATH_PIO_MAXIBHDR is the max IB header size allowed for in our + * PIO send buffers. This is well beyond anything currently + * defined in the InfiniBand spec. + */ +#define IPATH_PIO_MAXIBHDR 128 + +typedef u64 ipath_err_t; + +/* mask of defined bits for various registers */ +extern u64 infinipath_i_bitsextant; +extern ipath_err_t infinipath_e_bitsextant, infinipath_hwe_bitsextant; + +/* masks that are different in various chips, or only exist in some chips */ +extern u32 infinipath_i_rcvavail_mask, infinipath_i_rcvurg_mask; + +/* + * register bits for selecting i2c direction and values, used for I2C serial + * flash + */ +extern u16 ipath_gpio_sda_num, ipath_gpio_scl_num; +extern u64 ipath_gpio_sda, ipath_gpio_scl; + +/* + * These are the infinipath general register numbers (not offsets). + * The kernel registers are used directly, those beyond the kernel + * registers are calculated from one of the base registers. The use of + * an integer type doesn't allow type-checking as thorough as, say, + * an enum but allows for better hiding of chip differences. + */ +typedef const u16 ipath_kreg, /* infinipath general registers */ + ipath_creg, /* infinipath counter registers */ + ipath_sreg; /* kernel-only, infinipath send registers */ + +/* + * These are the chip registers common to all infinipath chips, and + * used both by the kernel and the diagnostics or other user code. + * They are all implemented such that 64 bit accesses work. + * Some implement no more than 32 bits. Because 64 bit reads + * require 2 HT cmds on opteron, we access those with 32 bit + * reads for efficiency (they are written as 64 bits, since + * the extra 32 bits are nearly free on writes, and it slightly reduces + * complexity). The rest are all accessed as 64 bits. + */ +struct ipath_kregs { + /* These are the 32 bit group */ + ipath_kreg kr_control; + ipath_kreg kr_counterregbase; + ipath_kreg kr_intmask; + ipath_kreg kr_intstatus; + ipath_kreg kr_pagealign; + ipath_kreg kr_portcnt; + ipath_kreg kr_rcvtidbase; + ipath_kreg kr_rcvtidcnt; + ipath_kreg kr_rcvegrbase; + ipath_kreg kr_rcvegrcnt; + ipath_kreg kr_scratch; + ipath_kreg kr_sendctrl; + ipath_kreg kr_sendpiobufbase; + ipath_kreg kr_sendpiobufcnt; + ipath_kreg kr_sendpiosize; + ipath_kreg kr_sendregbase; + ipath_kreg kr_userregbase; + /* These are the 64 bit group */ + ipath_kreg kr_debugport; + ipath_kreg kr_debugportselect; + ipath_kreg kr_errorclear; + ipath_kreg kr_errormask; + ipath_kreg kr_errorstatus; + ipath_kreg kr_extctrl; + ipath_kreg kr_extstatus; + ipath_kreg kr_gpio_clear; + ipath_kreg kr_gpio_mask; + ipath_kreg kr_gpio_out; + ipath_kreg kr_gpio_status; + ipath_kreg kr_hwdiagctrl; + ipath_kreg kr_hwerrclear; + ipath_kreg kr_hwerrmask; + ipath_kreg kr_hwerrstatus; + ipath_kreg kr_ibcctrl; + ipath_kreg kr_ibcstatus; + ipath_kreg kr_intblocked; + ipath_kreg kr_intclear; + ipath_kreg kr_interruptconfig; + ipath_kreg kr_mdio; + ipath_kreg kr_partitionkey; + ipath_kreg kr_rcvbthqp; + ipath_kreg kr_rcvbufbase; + ipath_kreg kr_rcvbufsize; + ipath_kreg kr_rcvctrl; + ipath_kreg kr_rcvhdrcnt; + ipath_kreg kr_rcvhdrentsize; + ipath_kreg kr_rcvhdrsize; + ipath_kreg kr_rcvintmembase; + ipath_kreg kr_rcvintmemsize; + ipath_kreg kr_revision; + ipath_kreg kr_sendbuffererror; + ipath_kreg kr_sendpioavailaddr; + ipath_kreg kr_serdesconfig0; + ipath_kreg kr_serdesconfig1; + ipath_kreg kr_serdesstatus; + ipath_kreg kr_txintmembase; + ipath_kreg kr_txintmemsize; + ipath_kreg kr_xgxsconfig; + ipath_kreg kr_ibpllcfg; + /* use these two (and the following N ports) only with ipath_k*_kreg64_port(); + * not *kreg64() */ + ipath_kreg kr_rcvhdraddr; + ipath_kreg kr_rcvhdrtailaddr; + + /* remaining registers are not present on all types of infinipath chips */ + ipath_kreg kr_rcvpktledcnt; + ipath_kreg kr_pcierbuftestreg0; + ipath_kreg kr_pcierbuftestreg1; + ipath_kreg kr_pcieq0serdesconfig0; + ipath_kreg kr_pcieq0serdesconfig1; + ipath_kreg kr_pcieq0serdesstatus; + ipath_kreg kr_pcieq1serdesconfig0; + ipath_kreg kr_pcieq1serdesconfig1; + ipath_kreg kr_pcieq1serdesstatus; +}; + +struct ipath_cregs { + ipath_creg cr_badformatcnt; + ipath_creg cr_erricrccnt; + ipath_creg cr_errlinkcnt; + ipath_creg cr_errlpcrccnt; + ipath_creg cr_errpkey; + ipath_creg cr_errrcvflowctrlcnt; + ipath_creg cr_err_rlencnt; + ipath_creg cr_errslencnt; + ipath_creg cr_errtidfull; + ipath_creg cr_errtidvalid; + ipath_creg cr_errvcrccnt; + ipath_creg cr_ibstatuschange; + ipath_creg cr_intcnt; + ipath_creg cr_invalidrlencnt; + ipath_creg cr_invalidslencnt; + ipath_creg cr_lbflowstallcnt; + ipath_creg cr_iblinkdowncnt; + ipath_creg cr_iblinkerrrecovcnt; + ipath_creg cr_ibsymbolerrcnt; + ipath_creg cr_pktrcvcnt; + ipath_creg cr_pktrcvflowctrlcnt; + ipath_creg cr_pktsendcnt; + ipath_creg cr_pktsendflowcnt; + ipath_creg cr_portovflcnt; + ipath_creg cr_rcvebpcnt; + ipath_creg cr_rcvovflcnt; + ipath_creg cr_rxdroppktcnt; + ipath_creg cr_senddropped; + ipath_creg cr_sendstallcnt; + ipath_creg cr_sendunderruncnt; + ipath_creg cr_unsupvlcnt; + ipath_creg cr_wordrcvcnt; + ipath_creg cr_wordsendcnt; +}; + +#endif /* _IPATH_REGISTERS_H */ diff --git a/drivers/infiniband/hw/ipath/ipath_ruc.c b/drivers/infiniband/hw/ipath/ipath_ruc.c new file mode 100644 index 0000000..f232e77 --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_ruc.c @@ -0,0 +1,552 @@ +/* + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ipath_verbs.h" + +/* + * Convert the AETH RNR timeout code into the number of milliseconds. + */ +const u32 ib_ipath_rnr_table[32] = { + 656, /* 0 */ + 1, /* 1 */ + 1, /* 2 */ + 1, /* 3 */ + 1, /* 4 */ + 1, /* 5 */ + 1, /* 6 */ + 1, /* 7 */ + 1, /* 8 */ + 1, /* 9 */ + 1, /* A */ + 1, /* B */ + 1, /* C */ + 1, /* D */ + 2, /* E */ + 2, /* F */ + 3, /* 10 */ + 4, /* 11 */ + 6, /* 12 */ + 8, /* 13 */ + 11, /* 14 */ + 16, /* 15 */ + 21, /* 16 */ + 31, /* 17 */ + 41, /* 18 */ + 62, /* 19 */ + 82, /* 1A */ + 123, /* 1B */ + 164, /* 1C */ + 246, /* 1D */ + 328, /* 1E */ + 492 /* 1F */ +}; + +/** + * ipath_insert_rnr_queue - put QP on the RNR timeout list for the device + * @qp: the QP + * + * XXX Use a simple list for now. We might need a priority + * queue if we have lots of QPs waiting for RNR timeouts + * but that should be rare. + */ +void ipath_insert_rnr_queue(struct ipath_qp *qp) +{ + struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + unsigned long flags; + + spin_lock_irqsave(&dev->pending_lock, flags); + if (list_empty(&dev->rnrwait)) + list_add(&qp->timerwait, &dev->rnrwait); + else { + struct list_head *l = &dev->rnrwait; + struct ipath_qp *nqp = list_entry(l->next, struct ipath_qp, + timerwait); + + while (qp->s_rnr_timeout >= nqp->s_rnr_timeout) { + qp->s_rnr_timeout -= nqp->s_rnr_timeout; + l = l->next; + if (l->next == &dev->rnrwait) + break; + nqp = list_entry(l->next, struct ipath_qp, + timerwait); + } + list_add(&qp->timerwait, l); + } + spin_unlock_irqrestore(&dev->pending_lock, flags); +} + +/** + * ipath_get_rwqe - copy the next RWQE into the QP's RWQE + * @qp: the QP + * @wr_id_only: update wr_id only, not SGEs + * + * Return 0 if no RWQE is available, otherwise return 1. + * + * Called at interrupt level with the QP r_rq.lock held. + */ +int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only) +{ + struct ipath_rq *rq; + struct ipath_srq *srq; + struct ipath_rwqe *wqe; + int ret; + + if (!qp->ibqp.srq) { + rq = &qp->r_rq; + if (unlikely(rq->tail == rq->head)) { + ret = 0; + goto bail; + } + wqe = get_rwqe_ptr(rq, rq->tail); + qp->r_wr_id = wqe->wr_id; + if (!wr_id_only) { + qp->r_sge.sge = wqe->sg_list[0]; + qp->r_sge.sg_list = wqe->sg_list + 1; + qp->r_sge.num_sge = wqe->num_sge; + qp->r_len = wqe->length; + } + if (++rq->tail >= rq->size) + rq->tail = 0; + ret = 1; + goto bail; + } + + srq = to_isrq(qp->ibqp.srq); + rq = &srq->rq; + spin_lock(&rq->lock); + if (unlikely(rq->tail == rq->head)) { + spin_unlock(&rq->lock); + ret = 0; + goto bail; + } + wqe = get_rwqe_ptr(rq, rq->tail); + qp->r_wr_id = wqe->wr_id; + if (!wr_id_only) { + qp->r_sge.sge = wqe->sg_list[0]; + qp->r_sge.sg_list = wqe->sg_list + 1; + qp->r_sge.num_sge = wqe->num_sge; + qp->r_len = wqe->length; + } + if (++rq->tail >= rq->size) + rq->tail = 0; + if (srq->ibsrq.event_handler) { + struct ib_event ev; + u32 n; + + if (rq->head < rq->tail) + n = rq->size + rq->head - rq->tail; + else + n = rq->head - rq->tail; + if (n < srq->limit) { + srq->limit = 0; + spin_unlock(&rq->lock); + ev.device = qp->ibqp.device; + ev.element.srq = qp->ibqp.srq; + ev.event = IB_EVENT_SRQ_LIMIT_REACHED; + srq->ibsrq.event_handler(&ev, + srq->ibsrq.srq_context); + } else + spin_unlock(&rq->lock); + } else + spin_unlock(&rq->lock); + ret = 1; + +bail: + return ret; +} + +/** + * ipath_ruc_loopback - handle UC and RC lookback requests + * @sqp: the loopback QP + * @wc: the work completion entry + * + * This is called from ipath_do_uc_send() or ipath_do_rc_send() to + * forward a WQE addressed to the same HCA. + * Note that although we are single threaded due to the tasklet, we still + * have to protect against post_send(). We don't have to worry about + * receive interrupts since this is a connected protocol and all packets + * will pass through here. + */ +void ipath_ruc_loopback(struct ipath_qp *sqp, struct ib_wc *wc) +{ + struct ipath_ibdev *dev = to_idev(sqp->ibqp.device); + struct ipath_qp *qp; + struct ipath_swqe *wqe; + struct ipath_sge *sge; + unsigned long flags; + u64 sdata; + + qp = ipath_lookup_qpn(&dev->qp_table, sqp->remote_qpn); + if (!qp) { + dev->n_pkt_drops++; + return; + } + +again: + spin_lock_irqsave(&sqp->s_lock, flags); + + if (!(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_SEND_OK)) { + spin_unlock_irqrestore(&sqp->s_lock, flags); + goto done; + } + + /* Get the next send request. */ + if (sqp->s_last == sqp->s_head) { + /* Send work queue is empty. */ + spin_unlock_irqrestore(&sqp->s_lock, flags); + goto done; + } + + /* + * We can rely on the entry not changing without the s_lock + * being held until we update s_last. + */ + wqe = get_swqe_ptr(sqp, sqp->s_last); + spin_unlock_irqrestore(&sqp->s_lock, flags); + + wc->wc_flags = 0; + wc->imm_data = 0; + + sqp->s_sge.sge = wqe->sg_list[0]; + sqp->s_sge.sg_list = wqe->sg_list + 1; + sqp->s_sge.num_sge = wqe->wr.num_sge; + sqp->s_len = wqe->length; + switch (wqe->wr.opcode) { + case IB_WR_SEND_WITH_IMM: + wc->wc_flags = IB_WC_WITH_IMM; + wc->imm_data = wqe->wr.imm_data; + /* FALLTHROUGH */ + case IB_WR_SEND: + spin_lock_irqsave(&qp->r_rq.lock, flags); + if (!ipath_get_rwqe(qp, 0)) { + rnr_nak: + spin_unlock_irqrestore(&qp->r_rq.lock, flags); + /* Handle RNR NAK */ + if (qp->ibqp.qp_type == IB_QPT_UC) + goto send_comp; + if (sqp->s_rnr_retry == 0) { + wc->status = IB_WC_RNR_RETRY_EXC_ERR; + goto err; + } + if (sqp->s_rnr_retry_cnt < 7) + sqp->s_rnr_retry--; + dev->n_rnr_naks++; + sqp->s_rnr_timeout = + ib_ipath_rnr_table[sqp->s_min_rnr_timer]; + ipath_insert_rnr_queue(sqp); + goto done; + } + spin_unlock_irqrestore(&qp->r_rq.lock, flags); + break; + + case IB_WR_RDMA_WRITE_WITH_IMM: + wc->wc_flags = IB_WC_WITH_IMM; + wc->imm_data = wqe->wr.imm_data; + spin_lock_irqsave(&qp->r_rq.lock, flags); + if (!ipath_get_rwqe(qp, 1)) + goto rnr_nak; + spin_unlock_irqrestore(&qp->r_rq.lock, flags); + /* FALLTHROUGH */ + case IB_WR_RDMA_WRITE: + if (wqe->length == 0) + break; + if (unlikely(!ipath_rkey_ok(dev, &qp->r_sge, wqe->length, + wqe->wr.wr.rdma.remote_addr, + wqe->wr.wr.rdma.rkey, + IB_ACCESS_REMOTE_WRITE))) { + acc_err: + wc->status = IB_WC_REM_ACCESS_ERR; + err: + wc->wr_id = wqe->wr.wr_id; + wc->opcode = ib_ipath_wc_opcode[wqe->wr.opcode]; + wc->vendor_err = 0; + wc->byte_len = 0; + wc->qp_num = sqp->ibqp.qp_num; + wc->src_qp = sqp->remote_qpn; + wc->pkey_index = 0; + wc->slid = sqp->remote_ah_attr.dlid; + wc->sl = sqp->remote_ah_attr.sl; + wc->dlid_path_bits = 0; + wc->port_num = 0; + ipath_sqerror_qp(sqp, wc); + goto done; + } + break; + + case IB_WR_RDMA_READ: + if (unlikely(!ipath_rkey_ok(dev, &sqp->s_sge, wqe->length, + wqe->wr.wr.rdma.remote_addr, + wqe->wr.wr.rdma.rkey, + IB_ACCESS_REMOTE_READ))) + goto acc_err; + if (unlikely(!(qp->qp_access_flags & + IB_ACCESS_REMOTE_READ))) + goto acc_err; + qp->r_sge.sge = wqe->sg_list[0]; + qp->r_sge.sg_list = wqe->sg_list + 1; + qp->r_sge.num_sge = wqe->wr.num_sge; + break; + + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + if (unlikely(!ipath_rkey_ok(dev, &qp->r_sge, sizeof(u64), + wqe->wr.wr.rdma.remote_addr, + wqe->wr.wr.rdma.rkey, + IB_ACCESS_REMOTE_ATOMIC))) + goto acc_err; + /* Perform atomic OP and save result. */ + sdata = wqe->wr.wr.atomic.swap; + spin_lock_irqsave(&dev->pending_lock, flags); + qp->r_atomic_data = *(u64 *) qp->r_sge.sge.vaddr; + if (wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) + *(u64 *) qp->r_sge.sge.vaddr = + qp->r_atomic_data + sdata; + else if (qp->r_atomic_data == wqe->wr.wr.atomic.compare_add) + *(u64 *) qp->r_sge.sge.vaddr = sdata; + spin_unlock_irqrestore(&dev->pending_lock, flags); + *(u64 *) sqp->s_sge.sge.vaddr = qp->r_atomic_data; + goto send_comp; + + default: + goto done; + } + + sge = &sqp->s_sge.sge; + while (sqp->s_len) { + u32 len = sqp->s_len; + + if (len > sge->length) + len = sge->length; + BUG_ON(len == 0); + ipath_copy_sge(&qp->r_sge, sge->vaddr, len); + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (--sqp->s_sge.num_sge) + *sge = *sqp->s_sge.sg_list++; + } else if (sge->length == 0 && sge->mr != NULL) { + if (++sge->n >= IPATH_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + sqp->s_len -= len; + } + + if (wqe->wr.opcode == IB_WR_RDMA_WRITE || + wqe->wr.opcode == IB_WR_RDMA_READ) + goto send_comp; + + if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM) + wc->opcode = IB_WC_RECV_RDMA_WITH_IMM; + else + wc->opcode = IB_WC_RECV; + wc->wr_id = qp->r_wr_id; + wc->status = IB_WC_SUCCESS; + wc->vendor_err = 0; + wc->byte_len = wqe->length; + wc->qp_num = qp->ibqp.qp_num; + wc->src_qp = qp->remote_qpn; + /* XXX do we know which pkey matched? Only needed for GSI. */ + wc->pkey_index = 0; + wc->slid = qp->remote_ah_attr.dlid; + wc->sl = qp->remote_ah_attr.sl; + wc->dlid_path_bits = 0; + /* Signal completion event if the solicited bit is set. */ + ipath_cq_enter(to_icq(qp->ibqp.recv_cq), wc, + wqe->wr.send_flags & IB_SEND_SOLICITED); + +send_comp: + sqp->s_rnr_retry = sqp->s_rnr_retry_cnt; + + if (!test_bit(IPATH_S_SIGNAL_REQ_WR, &sqp->s_flags) || + (wqe->wr.send_flags & IB_SEND_SIGNALED)) { + wc->wr_id = wqe->wr.wr_id; + wc->status = IB_WC_SUCCESS; + wc->opcode = ib_ipath_wc_opcode[wqe->wr.opcode]; + wc->vendor_err = 0; + wc->byte_len = wqe->length; + wc->qp_num = sqp->ibqp.qp_num; + wc->src_qp = 0; + wc->pkey_index = 0; + wc->slid = 0; + wc->sl = 0; + wc->dlid_path_bits = 0; + wc->port_num = 0; + ipath_cq_enter(to_icq(sqp->ibqp.send_cq), wc, 0); + } + + /* Update s_last now that we are finished with the SWQE */ + spin_lock_irqsave(&sqp->s_lock, flags); + if (++sqp->s_last >= sqp->s_size) + sqp->s_last = 0; + spin_unlock_irqrestore(&sqp->s_lock, flags); + goto again; + +done: + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); +} + +/** + * ipath_no_bufs_available - tell the layer driver we need buffers + * @qp: the QP that caused the problem + * @dev: the device we ran out of buffers on + * + * Called when we run out of PIO buffers. + */ +void ipath_no_bufs_available(struct ipath_qp *qp, struct ipath_ibdev *dev) +{ + unsigned long flags; + + spin_lock_irqsave(&dev->pending_lock, flags); + if (qp->piowait.next == LIST_POISON1) + list_add_tail(&qp->piowait, &dev->piowait); + spin_unlock_irqrestore(&dev->pending_lock, flags); + /* + * Note that as soon as ipath_layer_want_buffer() is called and + * possibly before it returns, ipath_ib_piobufavail() + * could be called. If we are still in the tasklet function, + * tasklet_hi_schedule() will not call us until the next time + * tasklet_hi_schedule() is called. + * We clear the tasklet flag now since we are committing to return + * from the tasklet function. + */ + clear_bit(IPATH_S_BUSY, &qp->s_flags); + tasklet_unlock(&qp->s_task); + ipath_layer_want_buffer(dev->dd); + dev->n_piowait++; +} + +/** + * ipath_post_rc_send - post RC and UC sends + * @qp: the QP to post on + * @wr: the work request to send + */ +int ipath_post_rc_send(struct ipath_qp *qp, struct ib_send_wr *wr) +{ + struct ipath_swqe *wqe; + unsigned long flags; + u32 next; + int i, j; + int acc; + int ret; + + /* + * Don't allow RDMA reads or atomic operations on UC or + * undefined operations. + * Make sure buffer is large enough to hold the result for atomics. + */ + if (qp->ibqp.qp_type == IB_QPT_UC) { + if ((unsigned) wr->opcode >= IB_WR_RDMA_READ) { + ret = -EINVAL; + goto bail; + } + } else if ((unsigned) wr->opcode > IB_WR_ATOMIC_FETCH_AND_ADD) { + ret = -EINVAL; + goto bail; + } else if (wr->opcode >= IB_WR_ATOMIC_CMP_AND_SWP && + (wr->num_sge == 0 || + wr->sg_list[0].length < sizeof(u64) || + wr->sg_list[0].addr & (sizeof(u64) - 1))) { + ret = -EINVAL; + goto bail; + } + /* IB spec says that num_sge == 0 is OK. */ + if (wr->num_sge > qp->s_max_sge) { + ret = -ENOMEM; + goto bail; + } + spin_lock_irqsave(&qp->s_lock, flags); + next = qp->s_head + 1; + if (next >= qp->s_size) + next = 0; + if (next == qp->s_last) { + spin_unlock_irqrestore(&qp->s_lock, flags); + ret = -EINVAL; + goto bail; + } + + wqe = get_swqe_ptr(qp, qp->s_head); + wqe->wr = *wr; + wqe->ssn = qp->s_ssn++; + wqe->sg_list[0].mr = NULL; + wqe->sg_list[0].vaddr = NULL; + wqe->sg_list[0].length = 0; + wqe->sg_list[0].sge_length = 0; + wqe->length = 0; + acc = wr->opcode >= IB_WR_RDMA_READ ? IB_ACCESS_LOCAL_WRITE : 0; + for (i = 0, j = 0; i < wr->num_sge; i++) { + if (to_ipd(qp->ibqp.pd)->user && wr->sg_list[i].lkey == 0) { + spin_unlock_irqrestore(&qp->s_lock, flags); + ret = -EINVAL; + goto bail; + } + if (wr->sg_list[i].length == 0) + continue; + if (!ipath_lkey_ok(&to_idev(qp->ibqp.device)->lk_table, + &wqe->sg_list[j], &wr->sg_list[i], + acc)) { + spin_unlock_irqrestore(&qp->s_lock, flags); + ret = -EINVAL; + goto bail; + } + wqe->length += wr->sg_list[i].length; + j++; + } + wqe->wr.num_sge = j; + qp->s_head = next; + /* + * Wake up the send tasklet if the QP is not waiting + * for an RNR timeout. + */ + next = qp->s_rnr_timeout; + spin_unlock_irqrestore(&qp->s_lock, flags); + + if (next == 0) { + if (qp->ibqp.qp_type == IB_QPT_UC) + ipath_do_uc_send((unsigned long) qp); + else + ipath_do_rc_send((unsigned long) qp); + } + + ret = 0; + +bail: + return ret; +} diff --git a/drivers/infiniband/hw/ipath/ipath_srq.c b/drivers/infiniband/hw/ipath/ipath_srq.c new file mode 100644 index 0000000..01c4c6c --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_srq.c @@ -0,0 +1,273 @@ +/* + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/err.h> +#include <linux/vmalloc.h> + +#include "ipath_verbs.h" + +/** + * ipath_post_srq_receive - post a receive on a shared receive queue + * @ibsrq: the SRQ to post the receive on + * @wr: the list of work requests to post + * @bad_wr: the first WR to cause a problem is put here + * + * This may be called from interrupt context. + */ +int ipath_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct ipath_srq *srq = to_isrq(ibsrq); + struct ipath_ibdev *dev = to_idev(ibsrq->device); + unsigned long flags; + int ret; + + for (; wr; wr = wr->next) { + struct ipath_rwqe *wqe; + u32 next; + int i, j; + + if (wr->num_sge > srq->rq.max_sge) { + *bad_wr = wr; + ret = -ENOMEM; + goto bail; + } + + spin_lock_irqsave(&srq->rq.lock, flags); + next = srq->rq.head + 1; + if (next >= srq->rq.size) + next = 0; + if (next == srq->rq.tail) { + spin_unlock_irqrestore(&srq->rq.lock, flags); + *bad_wr = wr; + ret = -ENOMEM; + goto bail; + } + + wqe = get_rwqe_ptr(&srq->rq, srq->rq.head); + wqe->wr_id = wr->wr_id; + wqe->sg_list[0].mr = NULL; + wqe->sg_list[0].vaddr = NULL; + wqe->sg_list[0].length = 0; + wqe->sg_list[0].sge_length = 0; + wqe->length = 0; + for (i = 0, j = 0; i < wr->num_sge; i++) { + /* Check LKEY */ + if (to_ipd(srq->ibsrq.pd)->user && + wr->sg_list[i].lkey == 0) { + spin_unlock_irqrestore(&srq->rq.lock, + flags); + *bad_wr = wr; + ret = -EINVAL; + goto bail; + } + if (wr->sg_list[i].length == 0) + continue; + if (!ipath_lkey_ok(&dev->lk_table, + &wqe->sg_list[j], + &wr->sg_list[i], + IB_ACCESS_LOCAL_WRITE)) { + spin_unlock_irqrestore(&srq->rq.lock, + flags); + *bad_wr = wr; + ret = -EINVAL; + goto bail; + } + wqe->length += wr->sg_list[i].length; + j++; + } + wqe->num_sge = j; + srq->rq.head = next; + spin_unlock_irqrestore(&srq->rq.lock, flags); + } + ret = 0; + +bail: + return ret; +} + +/** + * ipath_create_srq - create a shared receive queue + * @ibpd: the protection domain of the SRQ to create + * @attr: the attributes of the SRQ + * @udata: not used by the InfiniPath verbs driver + */ +struct ib_srq *ipath_create_srq(struct ib_pd *ibpd, + struct ib_srq_init_attr *srq_init_attr, + struct ib_udata *udata) +{ + struct ipath_srq *srq; + u32 sz; + struct ib_srq *ret; + + if (srq_init_attr->attr.max_sge < 1) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + + srq = kmalloc(sizeof(*srq), GFP_KERNEL); + if (!srq) { + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + /* + * Need to use vmalloc() if we want to support large #s of entries. + */ + srq->rq.size = srq_init_attr->attr.max_wr + 1; + sz = sizeof(struct ipath_sge) * srq_init_attr->attr.max_sge + + sizeof(struct ipath_rwqe); + srq->rq.wq = vmalloc(srq->rq.size * sz); + if (!srq->rq.wq) { + kfree(srq); + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + /* + * ib_create_srq() will initialize srq->ibsrq. + */ + spin_lock_init(&srq->rq.lock); + srq->rq.head = 0; + srq->rq.tail = 0; + srq->rq.max_sge = srq_init_attr->attr.max_sge; + srq->limit = srq_init_attr->attr.srq_limit; + + ret = &srq->ibsrq; + +bail: + return ret; +} + +/** + * ipath_modify_srq - modify a shared receive queue + * @ibsrq: the SRQ to modify + * @attr: the new attributes of the SRQ + * @attr_mask: indicates which attributes to modify + */ +int ipath_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask) +{ + struct ipath_srq *srq = to_isrq(ibsrq); + unsigned long flags; + int ret; + + if (attr_mask & IB_SRQ_LIMIT) { + spin_lock_irqsave(&srq->rq.lock, flags); + srq->limit = attr->srq_limit; + spin_unlock_irqrestore(&srq->rq.lock, flags); + } + if (attr_mask & IB_SRQ_MAX_WR) { + u32 size = attr->max_wr + 1; + struct ipath_rwqe *wq, *p; + u32 n; + u32 sz; + + if (attr->max_sge < srq->rq.max_sge) { + ret = -EINVAL; + goto bail; + } + + sz = sizeof(struct ipath_rwqe) + + attr->max_sge * sizeof(struct ipath_sge); + wq = vmalloc(size * sz); + if (!wq) { + ret = -ENOMEM; + goto bail; + } + + spin_lock_irqsave(&srq->rq.lock, flags); + if (srq->rq.head < srq->rq.tail) + n = srq->rq.size + srq->rq.head - srq->rq.tail; + else + n = srq->rq.head - srq->rq.tail; + if (size <= n || size <= srq->limit) { + spin_unlock_irqrestore(&srq->rq.lock, flags); + vfree(wq); + ret = -EINVAL; + goto bail; + } + n = 0; + p = wq; + while (srq->rq.tail != srq->rq.head) { + struct ipath_rwqe *wqe; + int i; + + wqe = get_rwqe_ptr(&srq->rq, srq->rq.tail); + p->wr_id = wqe->wr_id; + p->length = wqe->length; + p->num_sge = wqe->num_sge; + for (i = 0; i < wqe->num_sge; i++) + p->sg_list[i] = wqe->sg_list[i]; + n++; + p = (struct ipath_rwqe *)((char *) p + sz); + if (++srq->rq.tail >= srq->rq.size) + srq->rq.tail = 0; + } + vfree(srq->rq.wq); + srq->rq.wq = wq; + srq->rq.size = size; + srq->rq.head = n; + srq->rq.tail = 0; + srq->rq.max_sge = attr->max_sge; + spin_unlock_irqrestore(&srq->rq.lock, flags); + } + + ret = 0; + +bail: + return ret; +} + +int ipath_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr) +{ + struct ipath_srq *srq = to_isrq(ibsrq); + + attr->max_wr = srq->rq.size - 1; + attr->max_sge = srq->rq.max_sge; + attr->srq_limit = srq->limit; + return 0; +} + +/** + * ipath_destroy_srq - destroy a shared receive queue + * @ibsrq: the SRQ to destroy + */ +int ipath_destroy_srq(struct ib_srq *ibsrq) +{ + struct ipath_srq *srq = to_isrq(ibsrq); + + vfree(srq->rq.wq); + kfree(srq); + + return 0; +} diff --git a/drivers/infiniband/hw/ipath/ipath_stats.c b/drivers/infiniband/hw/ipath/ipath_stats.c new file mode 100644 index 0000000..fe20913 --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_stats.c @@ -0,0 +1,303 @@ +/* + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/pci.h> + +#include "ipath_kernel.h" + +struct infinipath_stats ipath_stats; + +/** + * ipath_snap_cntr - snapshot a chip counter + * @dd: the infinipath device + * @creg: the counter to snapshot + * + * called from add_timer and user counter read calls, to deal with + * counters that wrap in "human time". The words sent and received, and + * the packets sent and received are all that we worry about. For now, + * at least, we don't worry about error counters, because if they wrap + * that quickly, we probably don't care. We may eventually just make this + * handle all the counters. word counters can wrap in about 20 seconds + * of full bandwidth traffic, packet counters in a few hours. + */ + +u64 ipath_snap_cntr(struct ipath_devdata *dd, ipath_creg creg) +{ + u32 val, reg64 = 0; + u64 val64; + unsigned long t0, t1; + u64 ret; + + t0 = jiffies; + /* If fast increment counters are only 32 bits, snapshot them, + * and maintain them as 64bit values in the driver */ + if (!(dd->ipath_flags & IPATH_32BITCOUNTERS) && + (creg == dd->ipath_cregs->cr_wordsendcnt || + creg == dd->ipath_cregs->cr_wordrcvcnt || + creg == dd->ipath_cregs->cr_pktsendcnt || + creg == dd->ipath_cregs->cr_pktrcvcnt)) { + val64 = ipath_read_creg(dd, creg); + val = val64 == ~0ULL ? ~0U : 0; + reg64 = 1; + } else /* val64 just to keep gcc quiet... */ + val64 = val = ipath_read_creg32(dd, creg); + /* + * See if a second has passed. This is just a way to detect things + * that are quite broken. Normally this should take just a few + * cycles (the check is for long enough that we don't care if we get + * pre-empted.) An Opteron HT O read timeout is 4 seconds with + * normal NB values + */ + t1 = jiffies; + if (time_before(t0 + HZ, t1) && val == -1) { + ipath_dev_err(dd, "Error! Read counter 0x%x timed out\n", + creg); + ret = 0ULL; + goto bail; + } + if (reg64) { + ret = val64; + goto bail; + } + + if (creg == dd->ipath_cregs->cr_wordsendcnt) { + if (val != dd->ipath_lastsword) { + dd->ipath_sword += val - dd->ipath_lastsword; + dd->ipath_lastsword = val; + } + val64 = dd->ipath_sword; + } else if (creg == dd->ipath_cregs->cr_wordrcvcnt) { + if (val != dd->ipath_lastrword) { + dd->ipath_rword += val - dd->ipath_lastrword; + dd->ipath_lastrword = val; + } + val64 = dd->ipath_rword; + } else if (creg == dd->ipath_cregs->cr_pktsendcnt) { + if (val != dd->ipath_lastspkts) { + dd->ipath_spkts += val - dd->ipath_lastspkts; + dd->ipath_lastspkts = val; + } + val64 = dd->ipath_spkts; + } else if (creg == dd->ipath_cregs->cr_pktrcvcnt) { + if (val != dd->ipath_lastrpkts) { + dd->ipath_rpkts += val - dd->ipath_lastrpkts; + dd->ipath_lastrpkts = val; + } + val64 = dd->ipath_rpkts; + } else + val64 = (u64) val; + + ret = val64; + +bail: + return ret; +} + +/** + * ipath_qcheck - print delta of egrfull/hdrqfull errors for kernel ports + * @dd: the infinipath device + * + * print the delta of egrfull/hdrqfull errors for kernel ports no more than + * every 5 seconds. User processes are printed at close, but kernel doesn't + * close, so... Separate routine so may call from other places someday, and + * so function name when printed by _IPATH_INFO is meaningfull + */ +static void ipath_qcheck(struct ipath_devdata *dd) +{ + static u64 last_tot_hdrqfull; + size_t blen = 0; + char buf[128]; + + *buf = 0; + if (dd->ipath_pd[0]->port_hdrqfull != dd->ipath_p0_hdrqfull) { + blen = snprintf(buf, sizeof buf, "port 0 hdrqfull %u", + dd->ipath_pd[0]->port_hdrqfull - + dd->ipath_p0_hdrqfull); + dd->ipath_p0_hdrqfull = dd->ipath_pd[0]->port_hdrqfull; + } + if (ipath_stats.sps_etidfull != dd->ipath_last_tidfull) { + blen += snprintf(buf + blen, sizeof buf - blen, + "%srcvegrfull %llu", + blen ? ", " : "", + (unsigned long long) + (ipath_stats.sps_etidfull - + dd->ipath_last_tidfull)); + dd->ipath_last_tidfull = ipath_stats.sps_etidfull; + } + + /* + * this is actually the number of hdrq full interrupts, not actual + * events, but at the moment that's mostly what I'm interested in. + * Actual count, etc. is in the counters, if needed. For production + * users this won't ordinarily be printed. + */ + + if ((ipath_debug & (__IPATH_PKTDBG | __IPATH_DBG)) && + ipath_stats.sps_hdrqfull != last_tot_hdrqfull) { + blen += snprintf(buf + blen, sizeof buf - blen, + "%shdrqfull %llu (all ports)", + blen ? ", " : "", + (unsigned long long) + (ipath_stats.sps_hdrqfull - + last_tot_hdrqfull)); + last_tot_hdrqfull = ipath_stats.sps_hdrqfull; + } + if (blen) + ipath_dbg("%s\n", buf); + + if (dd->ipath_port0head != (u32) + le64_to_cpu(*dd->ipath_hdrqtailptr)) { + if (dd->ipath_lastport0rcv_cnt == + ipath_stats.sps_port0pkts) { + ipath_cdbg(PKT, "missing rcv interrupts? " + "port0 hd=%llx tl=%x; port0pkts %llx\n", + (unsigned long long) + le64_to_cpu(*dd->ipath_hdrqtailptr), + dd->ipath_port0head, + (unsigned long long) + ipath_stats.sps_port0pkts); + ipath_kreceive(dd); + } + dd->ipath_lastport0rcv_cnt = ipath_stats.sps_port0pkts; + } +} + +/** + * ipath_get_faststats - get word counters from chip before they overflow + * @opaque - contains a pointer to the infinipath device ipath_devdata + * + * called from add_timer + */ +void ipath_get_faststats(unsigned long opaque) +{ + struct ipath_devdata *dd = (struct ipath_devdata *) opaque; + u32 val; + static unsigned cnt; + + /* + * don't access the chip while running diags, or memory diags can + * fail + */ + if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT) || + ipath_diag_inuse) + /* but re-arm the timer, for diags case; won't hurt other */ + goto done; + + if (dd->ipath_flags & IPATH_32BITCOUNTERS) { + ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordsendcnt); + ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordrcvcnt); + ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktsendcnt); + ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktrcvcnt); + } + + ipath_qcheck(dd); + + /* + * deal with repeat error suppression. Doesn't really matter if + * last error was almost a full interval ago, or just a few usecs + * ago; still won't get more than 2 per interval. We may want + * longer intervals for this eventually, could do with mod, counter + * or separate timer. Also see code in ipath_handle_errors() and + * ipath_handle_hwerrors(). + */ + + if (dd->ipath_lasterror) + dd->ipath_lasterror = 0; + if (dd->ipath_lasthwerror) + dd->ipath_lasthwerror = 0; + if ((dd->ipath_maskederrs & ~dd->ipath_ignorederrs) + && time_after(jiffies, dd->ipath_unmasktime)) { + char ebuf[256]; + ipath_decode_err(ebuf, sizeof ebuf, + (dd->ipath_maskederrs & ~dd-> + ipath_ignorederrs)); + if ((dd->ipath_maskederrs & ~dd->ipath_ignorederrs) & + ~(INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL)) + ipath_dev_err(dd, "Re-enabling masked errors " + "(%s)\n", ebuf); + else { + /* + * rcvegrfull and rcvhdrqfull are "normal", for some + * types of processes (mostly benchmarks) that send + * huge numbers of messages, while not processing + * them. So only complain about these at debug + * level. + */ + ipath_dbg("Disabling frequent queue full errors " + "(%s)\n", ebuf); + } + dd->ipath_maskederrs = dd->ipath_ignorederrs; + ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, + ~dd->ipath_maskederrs); + } + + /* limit qfull messages to ~one per minute per port */ + if ((++cnt & 0x10)) { + for (val = dd->ipath_cfgports - 1; ((int)val) >= 0; + val--) { + if (dd->ipath_lastegrheads[val] != -1) + dd->ipath_lastegrheads[val] = -1; + if (dd->ipath_lastrcvhdrqtails[val] != -1) + dd->ipath_lastrcvhdrqtails[val] = -1; + } + } + + if (dd->ipath_nosma_bufs) { + dd->ipath_nosma_secs += 5; + if (dd->ipath_nosma_secs >= 30) { + ipath_cdbg(SMA, "No SMA bufs avail %u seconds; " + "cancelling pending sends\n", + dd->ipath_nosma_secs); + /* + * issue an abort as well, in case we have a packet + * stuck in launch fifo. This could corrupt an + * outgoing user packet in the worst case, + * but this is a pretty catastrophic, anyway. + */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + INFINIPATH_S_ABORT); + ipath_disarm_piobufs(dd, dd->ipath_lastport_piobuf, + dd->ipath_piobcnt2k + + dd->ipath_piobcnt4k - + dd->ipath_lastport_piobuf); + /* start again, if necessary */ + dd->ipath_nosma_secs = 0; + } else + ipath_cdbg(SMA, "No SMA bufs avail %u tries, " + "after %u seconds\n", + dd->ipath_nosma_bufs, + dd->ipath_nosma_secs); + } + +done: + mod_timer(&dd->ipath_stats_timer, jiffies + HZ * 5); +} diff --git a/drivers/infiniband/hw/ipath/ipath_sysfs.c b/drivers/infiniband/hw/ipath/ipath_sysfs.c new file mode 100644 index 0000000..32acd80 --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_sysfs.c @@ -0,0 +1,778 @@ +/* + * Copyright (c) 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/ctype.h> +#include <linux/pci.h> + +#include "ipath_kernel.h" +#include "ips_common.h" +#include "ipath_layer.h" + +/** + * ipath_parse_ushort - parse an unsigned short value in an arbitrary base + * @str: the string containing the number + * @valp: where to put the result + * + * returns the number of bytes consumed, or negative value on error + */ +int ipath_parse_ushort(const char *str, unsigned short *valp) +{ + unsigned long val; + char *end; + int ret; + + if (!isdigit(str[0])) { + ret = -EINVAL; + goto bail; + } + + val = simple_strtoul(str, &end, 0); + + if (val > 0xffff) { + ret = -EINVAL; + goto bail; + } + + *valp = val; + + ret = end + 1 - str; + if (ret == 0) + ret = -EINVAL; + +bail: + return ret; +} + +static ssize_t show_version(struct device_driver *dev, char *buf) +{ + /* The string printed here is already newline-terminated. */ + return scnprintf(buf, PAGE_SIZE, "%s", ipath_core_version); +} + +static ssize_t show_num_units(struct device_driver *dev, char *buf) +{ + return scnprintf(buf, PAGE_SIZE, "%d\n", + ipath_count_units(NULL, NULL, NULL)); +} + +#define DRIVER_STAT(name, attr) \ + static ssize_t show_stat_##name(struct device_driver *dev, \ + char *buf) \ + { \ + return scnprintf( \ + buf, PAGE_SIZE, "%llu\n", \ + (unsigned long long) ipath_stats.sps_ ##attr); \ + } \ + static DRIVER_ATTR(name, S_IRUGO, show_stat_##name, NULL) + +DRIVER_STAT(intrs, ints); +DRIVER_STAT(err_intrs, errints); +DRIVER_STAT(errs, errs); +DRIVER_STAT(pkt_errs, pkterrs); +DRIVER_STAT(crc_errs, crcerrs); +DRIVER_STAT(hw_errs, hwerrs); +DRIVER_STAT(ib_link, iblink); +DRIVER_STAT(port0_pkts, port0pkts); +DRIVER_STAT(ether_spkts, ether_spkts); +DRIVER_STAT(ether_rpkts, ether_rpkts); +DRIVER_STAT(sma_spkts, sma_spkts); +DRIVER_STAT(sma_rpkts, sma_rpkts); +DRIVER_STAT(hdrq_full, hdrqfull); +DRIVER_STAT(etid_full, etidfull); +DRIVER_STAT(no_piobufs, nopiobufs); +DRIVER_STAT(ports, ports); +DRIVER_STAT(pkey0, pkeys[0]); +DRIVER_STAT(pkey1, pkeys[1]); +DRIVER_STAT(pkey2, pkeys[2]); +DRIVER_STAT(pkey3, pkeys[3]); +/* XXX fix the following when dynamic table of devices used */ +DRIVER_STAT(lid0, lid[0]); +DRIVER_STAT(lid1, lid[1]); +DRIVER_STAT(lid2, lid[2]); +DRIVER_STAT(lid3, lid[3]); + +DRIVER_STAT(nports, nports); +DRIVER_STAT(null_intr, nullintr); +DRIVER_STAT(max_pkts_call, maxpkts_call); +DRIVER_STAT(avg_pkts_call, avgpkts_call); +DRIVER_STAT(page_locks, pagelocks); +DRIVER_STAT(page_unlocks, pageunlocks); +DRIVER_STAT(krdrops, krdrops); +/* XXX fix the following when dynamic table of devices used */ +DRIVER_STAT(mlid0, mlid[0]); +DRIVER_STAT(mlid1, mlid[1]); +DRIVER_STAT(mlid2, mlid[2]); +DRIVER_STAT(mlid3, mlid[3]); + +static struct attribute *driver_stat_attributes[] = { + &driver_attr_intrs.attr, + &driver_attr_err_intrs.attr, + &driver_attr_errs.attr, + &driver_attr_pkt_errs.attr, + &driver_attr_crc_errs.attr, + &driver_attr_hw_errs.attr, + &driver_attr_ib_link.attr, + &driver_attr_port0_pkts.attr, + &driver_attr_ether_spkts.attr, + &driver_attr_ether_rpkts.attr, + &driver_attr_sma_spkts.attr, + &driver_attr_sma_rpkts.attr, + &driver_attr_hdrq_full.attr, + &driver_attr_etid_full.attr, + &driver_attr_no_piobufs.attr, + &driver_attr_ports.attr, + &driver_attr_pkey0.attr, + &driver_attr_pkey1.attr, + &driver_attr_pkey2.attr, + &driver_attr_pkey3.attr, + &driver_attr_lid0.attr, + &driver_attr_lid1.attr, + &driver_attr_lid2.attr, + &driver_attr_lid3.attr, + &driver_attr_nports.attr, + &driver_attr_null_intr.attr, + &driver_attr_max_pkts_call.attr, + &driver_attr_avg_pkts_call.attr, + &driver_attr_page_locks.attr, + &driver_attr_page_unlocks.attr, + &driver_attr_krdrops.attr, + &driver_attr_mlid0.attr, + &driver_attr_mlid1.attr, + &driver_attr_mlid2.attr, + &driver_attr_mlid3.attr, + NULL +}; + +static struct attribute_group driver_stat_attr_group = { + .name = "stats", + .attrs = driver_stat_attributes +}; + +static ssize_t show_status(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + ssize_t ret; + + if (!dd->ipath_statusp) { + ret = -EINVAL; + goto bail; + } + + ret = scnprintf(buf, PAGE_SIZE, "0x%llx\n", + (unsigned long long) *(dd->ipath_statusp)); + +bail: + return ret; +} + +static const char *ipath_status_str[] = { + "Initted", + "Disabled", + "Admin_Disabled", + "OIB_SMA", + "SMA", + "Present", + "IB_link_up", + "IB_configured", + "NoIBcable", + "Fatal_Hardware_Error", + NULL, +}; + +static ssize_t show_status_str(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int i, any; + u64 s; + ssize_t ret; + + if (!dd->ipath_statusp) { + ret = -EINVAL; + goto bail; + } + + s = *(dd->ipath_statusp); + *buf = '\0'; + for (any = i = 0; s && ipath_status_str[i]; i++) { + if (s & 1) { + if (any && strlcat(buf, " ", PAGE_SIZE) >= + PAGE_SIZE) + /* overflow */ + break; + if (strlcat(buf, ipath_status_str[i], + PAGE_SIZE) >= PAGE_SIZE) + break; + any = 1; + } + s >>= 1; + } + if (any) + strlcat(buf, "\n", PAGE_SIZE); + + ret = strlen(buf); + +bail: + return ret; +} + +static ssize_t show_boardversion(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + /* The string printed here is already newline-terminated. */ + return scnprintf(buf, PAGE_SIZE, "%s", dd->ipath_boardversion); +} + +static ssize_t show_lid(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + + return scnprintf(buf, PAGE_SIZE, "0x%x\n", dd->ipath_lid); +} + +static ssize_t store_lid(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + u16 lid; + int ret; + + ret = ipath_parse_ushort(buf, &lid); + if (ret < 0) + goto invalid; + + if (lid == 0 || lid >= 0xc000) { + ret = -EINVAL; + goto invalid; + } + + ipath_set_sps_lid(dd, lid, 0); + + goto bail; +invalid: + ipath_dev_err(dd, "attempt to set invalid LID\n"); +bail: + return ret; +} + +static ssize_t show_mlid(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + + return scnprintf(buf, PAGE_SIZE, "0x%x\n", dd->ipath_mlid); +} + +static ssize_t store_mlid(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int unit; + u16 mlid; + int ret; + + ret = ipath_parse_ushort(buf, &mlid); + if (ret < 0) + goto invalid; + + unit = dd->ipath_unit; + + dd->ipath_mlid = mlid; + ipath_stats.sps_mlid[unit] = mlid; + ipath_layer_intr(dd, IPATH_LAYER_INT_BCAST); + + goto bail; +invalid: + ipath_dev_err(dd, "attempt to set invalid MLID\n"); +bail: + return ret; +} + +static ssize_t show_guid(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + u8 *guid; + + guid = (u8 *) & (dd->ipath_guid); + + return scnprintf(buf, PAGE_SIZE, + "%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n", + guid[0], guid[1], guid[2], guid[3], + guid[4], guid[5], guid[6], guid[7]); +} + +static ssize_t store_guid(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + ssize_t ret; + unsigned short guid[8]; + __be64 nguid; + u8 *ng; + int i; + + if (sscanf(buf, "%hx:%hx:%hx:%hx:%hx:%hx:%hx:%hx", + &guid[0], &guid[1], &guid[2], &guid[3], + &guid[4], &guid[5], &guid[6], &guid[7]) != 8) + goto invalid; + + ng = (u8 *) &nguid; + + for (i = 0; i < 8; i++) { + if (guid[i] > 0xff) + goto invalid; + ng[i] = guid[i]; + } + + dd->ipath_guid = nguid; + dd->ipath_nguid = 1; + + ret = strlen(buf); + goto bail; + +invalid: + ipath_dev_err(dd, "attempt to set invalid GUID\n"); + ret = -EINVAL; + +bail: + return ret; +} + +static ssize_t show_nguid(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + + return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_nguid); +} + +static ssize_t show_serial(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + + buf[sizeof dd->ipath_serial] = '\0'; + memcpy(buf, dd->ipath_serial, sizeof dd->ipath_serial); + strcat(buf, "\n"); + return strlen(buf); +} + +static ssize_t show_unit(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + + return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_unit); +} + +#define DEVICE_COUNTER(name, attr) \ + static ssize_t show_counter_##name(struct device *dev, \ + struct device_attribute *attr, \ + char *buf) \ + { \ + struct ipath_devdata *dd = dev_get_drvdata(dev); \ + return scnprintf(\ + buf, PAGE_SIZE, "%llu\n", (unsigned long long) \ + ipath_snap_cntr( \ + dd, offsetof(struct infinipath_counters, \ + attr) / sizeof(u64))); \ + } \ + static DEVICE_ATTR(name, S_IRUGO, show_counter_##name, NULL); + +DEVICE_COUNTER(ib_link_downeds, IBLinkDownedCnt); +DEVICE_COUNTER(ib_link_err_recoveries, IBLinkErrRecoveryCnt); +DEVICE_COUNTER(ib_status_changes, IBStatusChangeCnt); +DEVICE_COUNTER(ib_symbol_errs, IBSymbolErrCnt); +DEVICE_COUNTER(lb_flow_stalls, LBFlowStallCnt); +DEVICE_COUNTER(lb_ints, LBIntCnt); +DEVICE_COUNTER(rx_bad_formats, RxBadFormatCnt); +DEVICE_COUNTER(rx_buf_ovfls, RxBufOvflCnt); +DEVICE_COUNTER(rx_data_pkts, RxDataPktCnt); +DEVICE_COUNTER(rx_dropped_pkts, RxDroppedPktCnt); +DEVICE_COUNTER(rx_dwords, RxDwordCnt); +DEVICE_COUNTER(rx_ebps, RxEBPCnt); +DEVICE_COUNTER(rx_flow_ctrl_errs, RxFlowCtrlErrCnt); +DEVICE_COUNTER(rx_flow_pkts, RxFlowPktCnt); +DEVICE_COUNTER(rx_icrc_errs, RxICRCErrCnt); +DEVICE_COUNTER(rx_len_errs, RxLenErrCnt); +DEVICE_COUNTER(rx_link_problems, RxLinkProblemCnt); +DEVICE_COUNTER(rx_lpcrc_errs, RxLPCRCErrCnt); +DEVICE_COUNTER(rx_max_min_len_errs, RxMaxMinLenErrCnt); +DEVICE_COUNTER(rx_p0_hdr_egr_ovfls, RxP0HdrEgrOvflCnt); +DEVICE_COUNTER(rx_p1_hdr_egr_ovfls, RxP1HdrEgrOvflCnt); +DEVICE_COUNTER(rx_p2_hdr_egr_ovfls, RxP2HdrEgrOvflCnt); +DEVICE_COUNTER(rx_p3_hdr_egr_ovfls, RxP3HdrEgrOvflCnt); +DEVICE_COUNTER(rx_p4_hdr_egr_ovfls, RxP4HdrEgrOvflCnt); +DEVICE_COUNTER(rx_p5_hdr_egr_ovfls, RxP5HdrEgrOvflCnt); +DEVICE_COUNTER(rx_p6_hdr_egr_ovfls, RxP6HdrEgrOvflCnt); +DEVICE_COUNTER(rx_p7_hdr_egr_ovfls, RxP7HdrEgrOvflCnt); +DEVICE_COUNTER(rx_p8_hdr_egr_ovfls, RxP8HdrEgrOvflCnt); +DEVICE_COUNTER(rx_pkey_mismatches, RxPKeyMismatchCnt); +DEVICE_COUNTER(rx_tid_full_errs, RxTIDFullErrCnt); +DEVICE_COUNTER(rx_tid_valid_errs, RxTIDValidErrCnt); +DEVICE_COUNTER(rx_vcrc_errs, RxVCRCErrCnt); +DEVICE_COUNTER(tx_data_pkts, TxDataPktCnt); +DEVICE_COUNTER(tx_dropped_pkts, TxDroppedPktCnt); +DEVICE_COUNTER(tx_dwords, TxDwordCnt); +DEVICE_COUNTER(tx_flow_pkts, TxFlowPktCnt); +DEVICE_COUNTER(tx_flow_stalls, TxFlowStallCnt); +DEVICE_COUNTER(tx_len_errs, TxLenErrCnt); +DEVICE_COUNTER(tx_max_min_len_errs, TxMaxMinLenErrCnt); +DEVICE_COUNTER(tx_underruns, TxUnderrunCnt); +DEVICE_COUNTER(tx_unsup_vl_errs, TxUnsupVLErrCnt); + +static struct attribute *dev_counter_attributes[] = { + &dev_attr_ib_link_downeds.attr, + &dev_attr_ib_link_err_recoveries.attr, + &dev_attr_ib_status_changes.attr, + &dev_attr_ib_symbol_errs.attr, + &dev_attr_lb_flow_stalls.attr, + &dev_attr_lb_ints.attr, + &dev_attr_rx_bad_formats.attr, + &dev_attr_rx_buf_ovfls.attr, + &dev_attr_rx_data_pkts.attr, + &dev_attr_rx_dropped_pkts.attr, + &dev_attr_rx_dwords.attr, + &dev_attr_rx_ebps.attr, + &dev_attr_rx_flow_ctrl_errs.attr, + &dev_attr_rx_flow_pkts.attr, + &dev_attr_rx_icrc_errs.attr, + &dev_attr_rx_len_errs.attr, + &dev_attr_rx_link_problems.attr, + &dev_attr_rx_lpcrc_errs.attr, + &dev_attr_rx_max_min_len_errs.attr, + &dev_attr_rx_p0_hdr_egr_ovfls.attr, + &dev_attr_rx_p1_hdr_egr_ovfls.attr, + &dev_attr_rx_p2_hdr_egr_ovfls.attr, + &dev_attr_rx_p3_hdr_egr_ovfls.attr, + &dev_attr_rx_p4_hdr_egr_ovfls.attr, + &dev_attr_rx_p5_hdr_egr_ovfls.attr, + &dev_attr_rx_p6_hdr_egr_ovfls.attr, + &dev_attr_rx_p7_hdr_egr_ovfls.attr, + &dev_attr_rx_p8_hdr_egr_ovfls.attr, + &dev_attr_rx_pkey_mismatches.attr, + &dev_attr_rx_tid_full_errs.attr, + &dev_attr_rx_tid_valid_errs.attr, + &dev_attr_rx_vcrc_errs.attr, + &dev_attr_tx_data_pkts.attr, + &dev_attr_tx_dropped_pkts.attr, + &dev_attr_tx_dwords.attr, + &dev_attr_tx_flow_pkts.attr, + &dev_attr_tx_flow_stalls.attr, + &dev_attr_tx_len_errs.attr, + &dev_attr_tx_max_min_len_errs.attr, + &dev_attr_tx_underruns.attr, + &dev_attr_tx_unsup_vl_errs.attr, + NULL +}; + +static struct attribute_group dev_counter_attr_group = { + .name = "counters", + .attrs = dev_counter_attributes +}; + +static ssize_t store_reset(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret; + + if (count < 5 || memcmp(buf, "reset", 5)) { + ret = -EINVAL; + goto bail; + } + + if (dd->ipath_flags & IPATH_DISABLED) { + /* + * post-reset init would re-enable interrupts, etc. + * so don't allow reset on disabled devices. Not + * perfect error, but about the best choice. + */ + dev_info(dev,"Unit %d is disabled, can't reset\n", + dd->ipath_unit); + ret = -EINVAL; + } + ret = ipath_reset_device(dd->ipath_unit); +bail: + return ret<0 ? ret : count; +} + +static ssize_t store_link_state(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret, r; + u16 state; + + ret = ipath_parse_ushort(buf, &state); + if (ret < 0) + goto invalid; + + r = ipath_layer_set_linkstate(dd, state); + if (r < 0) { + ret = r; + goto bail; + } + + goto bail; +invalid: + ipath_dev_err(dd, "attempt to set invalid link state\n"); +bail: + return ret; +} + +static ssize_t show_mtu(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_ibmtu); +} + +static ssize_t store_mtu(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + ssize_t ret; + u16 mtu = 0; + int r; + + ret = ipath_parse_ushort(buf, &mtu); + if (ret < 0) + goto invalid; + + r = ipath_layer_set_mtu(dd, mtu); + if (r < 0) + ret = r; + + goto bail; +invalid: + ipath_dev_err(dd, "attempt to set invalid MTU\n"); +bail: + return ret; +} + +static ssize_t show_enabled(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + return scnprintf(buf, PAGE_SIZE, "%u\n", + (dd->ipath_flags & IPATH_DISABLED) ? 0 : 1); +} + +static ssize_t store_enabled(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + ssize_t ret; + u16 enable = 0; + + ret = ipath_parse_ushort(buf, &enable); + if (ret < 0) { + ipath_dev_err(dd, "attempt to use non-numeric on enable\n"); + goto bail; + } + + if (enable) { + if (!(dd->ipath_flags & IPATH_DISABLED)) + goto bail; + + dev_info(dev, "Enabling unit %d\n", dd->ipath_unit); + /* same as post-reset */ + ret = ipath_init_chip(dd, 1); + if (ret) + ipath_dev_err(dd, "Failed to enable unit %d\n", + dd->ipath_unit); + else { + dd->ipath_flags &= ~IPATH_DISABLED; + *dd->ipath_statusp &= ~IPATH_STATUS_ADMIN_DISABLED; + } + } + else if (!(dd->ipath_flags & IPATH_DISABLED)) { + dev_info(dev, "Disabling unit %d\n", dd->ipath_unit); + ipath_shutdown_device(dd); + dd->ipath_flags |= IPATH_DISABLED; + *dd->ipath_statusp |= IPATH_STATUS_ADMIN_DISABLED; + } + +bail: + return ret; +} + +static DRIVER_ATTR(num_units, S_IRUGO, show_num_units, NULL); +static DRIVER_ATTR(version, S_IRUGO, show_version, NULL); + +static struct attribute *driver_attributes[] = { + &driver_attr_num_units.attr, + &driver_attr_version.attr, + NULL +}; + +static struct attribute_group driver_attr_group = { + .attrs = driver_attributes +}; + +static DEVICE_ATTR(guid, S_IWUSR | S_IRUGO, show_guid, store_guid); +static DEVICE_ATTR(lid, S_IWUSR | S_IRUGO, show_lid, store_lid); +static DEVICE_ATTR(link_state, S_IWUSR, NULL, store_link_state); +static DEVICE_ATTR(mlid, S_IWUSR | S_IRUGO, show_mlid, store_mlid); +static DEVICE_ATTR(mtu, S_IWUSR | S_IRUGO, show_mtu, store_mtu); +static DEVICE_ATTR(enabled, S_IWUSR | S_IRUGO, show_enabled, store_enabled); +static DEVICE_ATTR(nguid, S_IRUGO, show_nguid, NULL); +static DEVICE_ATTR(reset, S_IWUSR, NULL, store_reset); +static DEVICE_ATTR(serial, S_IRUGO, show_serial, NULL); +static DEVICE_ATTR(status, S_IRUGO, show_status, NULL); +static DEVICE_ATTR(status_str, S_IRUGO, show_status_str, NULL); +static DEVICE_ATTR(boardversion, S_IRUGO, show_boardversion, NULL); +static DEVICE_ATTR(unit, S_IRUGO, show_unit, NULL); + +static struct attribute *dev_attributes[] = { + &dev_attr_guid.attr, + &dev_attr_lid.attr, + &dev_attr_link_state.attr, + &dev_attr_mlid.attr, + &dev_attr_mtu.attr, + &dev_attr_nguid.attr, + &dev_attr_serial.attr, + &dev_attr_status.attr, + &dev_attr_status_str.attr, + &dev_attr_boardversion.attr, + &dev_attr_unit.attr, + &dev_attr_enabled.attr, + NULL +}; + +static struct attribute_group dev_attr_group = { + .attrs = dev_attributes +}; + +/** + * ipath_expose_reset - create a device reset file + * @dev: the device structure + * + * Only expose a file that lets us reset the device after someone + * enters diag mode. A device reset is quite likely to crash the + * machine entirely, so we don't want to normally make it + * available. + */ +int ipath_expose_reset(struct device *dev) +{ + return device_create_file(dev, &dev_attr_reset); +} + +int ipath_driver_create_group(struct device_driver *drv) +{ + int ret; + + ret = sysfs_create_group(&drv->kobj, &driver_attr_group); + if (ret) + goto bail; + + ret = sysfs_create_group(&drv->kobj, &driver_stat_attr_group); + if (ret) + sysfs_remove_group(&drv->kobj, &driver_attr_group); + +bail: + return ret; +} + +void ipath_driver_remove_group(struct device_driver *drv) +{ + sysfs_remove_group(&drv->kobj, &driver_stat_attr_group); + sysfs_remove_group(&drv->kobj, &driver_attr_group); +} + +int ipath_device_create_group(struct device *dev, struct ipath_devdata *dd) +{ + int ret; + char unit[5]; + + ret = sysfs_create_group(&dev->kobj, &dev_attr_group); + if (ret) + goto bail; + + ret = sysfs_create_group(&dev->kobj, &dev_counter_attr_group); + if (ret) + goto bail_attrs; + + snprintf(unit, sizeof(unit), "%02d", dd->ipath_unit); + ret = sysfs_create_link(&dev->driver->kobj, &dev->kobj, unit); + if (ret == 0) + goto bail; + + sysfs_remove_group(&dev->kobj, &dev_counter_attr_group); +bail_attrs: + sysfs_remove_group(&dev->kobj, &dev_attr_group); +bail: + return ret; +} + +void ipath_device_remove_group(struct device *dev, struct ipath_devdata *dd) +{ + char unit[5]; + + snprintf(unit, sizeof(unit), "%02d", dd->ipath_unit); + sysfs_remove_link(&dev->driver->kobj, unit); + + sysfs_remove_group(&dev->kobj, &dev_counter_attr_group); + sysfs_remove_group(&dev->kobj, &dev_attr_group); + + device_remove_file(dev, &dev_attr_reset); +} diff --git a/drivers/infiniband/hw/ipath/ipath_uc.c b/drivers/infiniband/hw/ipath/ipath_uc.c new file mode 100644 index 0000000..0d6dbc0 --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_uc.c @@ -0,0 +1,645 @@ +/* + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ipath_verbs.h" +#include "ips_common.h" + +/* cut down ridiculously long IB macro names */ +#define OP(x) IB_OPCODE_UC_##x + +static void complete_last_send(struct ipath_qp *qp, struct ipath_swqe *wqe, + struct ib_wc *wc) +{ + if (++qp->s_last == qp->s_size) + qp->s_last = 0; + if (!test_bit(IPATH_S_SIGNAL_REQ_WR, &qp->s_flags) || + (wqe->wr.send_flags & IB_SEND_SIGNALED)) { + wc->wr_id = wqe->wr.wr_id; + wc->status = IB_WC_SUCCESS; + wc->opcode = ib_ipath_wc_opcode[wqe->wr.opcode]; + wc->vendor_err = 0; + wc->byte_len = wqe->length; + wc->qp_num = qp->ibqp.qp_num; + wc->src_qp = qp->remote_qpn; + wc->pkey_index = 0; + wc->slid = qp->remote_ah_attr.dlid; + wc->sl = qp->remote_ah_attr.sl; + wc->dlid_path_bits = 0; + wc->port_num = 0; + ipath_cq_enter(to_icq(qp->ibqp.send_cq), wc, 0); + } + wqe = get_swqe_ptr(qp, qp->s_last); +} + +/** + * ipath_do_uc_send - do a send on a UC queue + * @data: contains a pointer to the QP to send on + * + * Process entries in the send work queue until the queue is exhausted. + * Only allow one CPU to send a packet per QP (tasklet). + * Otherwise, after we drop the QP lock, two threads could send + * packets out of order. + * This is similar to ipath_do_rc_send() below except we don't have + * timeouts or resends. + */ +void ipath_do_uc_send(unsigned long data) +{ + struct ipath_qp *qp = (struct ipath_qp *)data; + struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + struct ipath_swqe *wqe; + unsigned long flags; + u16 lrh0; + u32 hwords; + u32 nwords; + u32 extra_bytes; + u32 bth0; + u32 bth2; + u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu); + u32 len; + struct ipath_other_headers *ohdr; + struct ib_wc wc; + + if (test_and_set_bit(IPATH_S_BUSY, &qp->s_flags)) + goto bail; + + if (unlikely(qp->remote_ah_attr.dlid == + ipath_layer_get_lid(dev->dd))) { + /* Pass in an uninitialized ib_wc to save stack space. */ + ipath_ruc_loopback(qp, &wc); + clear_bit(IPATH_S_BUSY, &qp->s_flags); + goto bail; + } + + ohdr = &qp->s_hdr.u.oth; + if (qp->remote_ah_attr.ah_flags & IB_AH_GRH) + ohdr = &qp->s_hdr.u.l.oth; + +again: + /* Check for a constructed packet to be sent. */ + if (qp->s_hdrwords != 0) { + /* + * If no PIO bufs are available, return. + * An interrupt will call ipath_ib_piobufavail() + * when one is available. + */ + if (ipath_verbs_send(dev->dd, qp->s_hdrwords, + (u32 *) &qp->s_hdr, + qp->s_cur_size, + qp->s_cur_sge)) { + ipath_no_bufs_available(qp, dev); + goto bail; + } + dev->n_unicast_xmit++; + /* Record that we sent the packet and s_hdr is empty. */ + qp->s_hdrwords = 0; + } + + lrh0 = IPS_LRH_BTH; + /* header size in 32-bit words LRH+BTH = (8+12)/4. */ + hwords = 5; + + /* + * The lock is needed to synchronize between + * setting qp->s_ack_state and post_send(). + */ + spin_lock_irqsave(&qp->s_lock, flags); + + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)) + goto done; + + bth0 = ipath_layer_get_pkey(dev->dd, qp->s_pkey_index); + + /* Send a request. */ + wqe = get_swqe_ptr(qp, qp->s_last); + switch (qp->s_state) { + default: + /* + * Signal the completion of the last send (if there is + * one). + */ + if (qp->s_last != qp->s_tail) + complete_last_send(qp, wqe, &wc); + + /* Check if send work queue is empty. */ + if (qp->s_tail == qp->s_head) + goto done; + /* + * Start a new request. + */ + qp->s_psn = wqe->psn = qp->s_next_psn; + qp->s_sge.sge = wqe->sg_list[0]; + qp->s_sge.sg_list = wqe->sg_list + 1; + qp->s_sge.num_sge = wqe->wr.num_sge; + qp->s_len = len = wqe->length; + switch (wqe->wr.opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + if (len > pmtu) { + qp->s_state = OP(SEND_FIRST); + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_SEND) + qp->s_state = OP(SEND_ONLY); + else { + qp->s_state = + OP(SEND_ONLY_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.imm_data; + hwords += 1; + } + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= 1 << 23; + break; + + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + ohdr->u.rc.reth.vaddr = + cpu_to_be64(wqe->wr.wr.rdma.remote_addr); + ohdr->u.rc.reth.rkey = + cpu_to_be32(wqe->wr.wr.rdma.rkey); + ohdr->u.rc.reth.length = cpu_to_be32(len); + hwords += sizeof(struct ib_reth) / 4; + if (len > pmtu) { + qp->s_state = OP(RDMA_WRITE_FIRST); + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_RDMA_WRITE) + qp->s_state = OP(RDMA_WRITE_ONLY); + else { + qp->s_state = + OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE); + /* Immediate data comes after the RETH */ + ohdr->u.rc.imm_data = wqe->wr.imm_data; + hwords += 1; + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= 1 << 23; + } + break; + + default: + goto done; + } + if (++qp->s_tail >= qp->s_size) + qp->s_tail = 0; + break; + + case OP(SEND_FIRST): + qp->s_state = OP(SEND_MIDDLE); + /* FALLTHROUGH */ + case OP(SEND_MIDDLE): + len = qp->s_len; + if (len > pmtu) { + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_SEND) + qp->s_state = OP(SEND_LAST); + else { + qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.imm_data; + hwords += 1; + } + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= 1 << 23; + break; + + case OP(RDMA_WRITE_FIRST): + qp->s_state = OP(RDMA_WRITE_MIDDLE); + /* FALLTHROUGH */ + case OP(RDMA_WRITE_MIDDLE): + len = qp->s_len; + if (len > pmtu) { + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_RDMA_WRITE) + qp->s_state = OP(RDMA_WRITE_LAST); + else { + qp->s_state = + OP(RDMA_WRITE_LAST_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.imm_data; + hwords += 1; + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= 1 << 23; + } + break; + } + bth2 = qp->s_next_psn++ & IPS_PSN_MASK; + qp->s_len -= len; + bth0 |= qp->s_state << 24; + + spin_unlock_irqrestore(&qp->s_lock, flags); + + /* Construct the header. */ + extra_bytes = (4 - len) & 3; + nwords = (len + extra_bytes) >> 2; + if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) { + /* Header size in 32-bit words. */ + hwords += 10; + lrh0 = IPS_LRH_GRH; + qp->s_hdr.u.l.grh.version_tclass_flow = + cpu_to_be32((6 << 28) | + (qp->remote_ah_attr.grh.traffic_class + << 20) | + qp->remote_ah_attr.grh.flow_label); + qp->s_hdr.u.l.grh.paylen = + cpu_to_be16(((hwords - 12) + nwords + + SIZE_OF_CRC) << 2); + /* next_hdr is defined by C8-7 in ch. 8.4.1 */ + qp->s_hdr.u.l.grh.next_hdr = 0x1B; + qp->s_hdr.u.l.grh.hop_limit = + qp->remote_ah_attr.grh.hop_limit; + /* The SGID is 32-bit aligned. */ + qp->s_hdr.u.l.grh.sgid.global.subnet_prefix = + dev->gid_prefix; + qp->s_hdr.u.l.grh.sgid.global.interface_id = + ipath_layer_get_guid(dev->dd); + qp->s_hdr.u.l.grh.dgid = qp->remote_ah_attr.grh.dgid; + } + qp->s_hdrwords = hwords; + qp->s_cur_sge = &qp->s_sge; + qp->s_cur_size = len; + lrh0 |= qp->remote_ah_attr.sl << 4; + qp->s_hdr.lrh[0] = cpu_to_be16(lrh0); + /* DEST LID */ + qp->s_hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid); + qp->s_hdr.lrh[2] = cpu_to_be16(hwords + nwords + SIZE_OF_CRC); + qp->s_hdr.lrh[3] = cpu_to_be16(ipath_layer_get_lid(dev->dd)); + bth0 |= extra_bytes << 20; + ohdr->bth[0] = cpu_to_be32(bth0); + ohdr->bth[1] = cpu_to_be32(qp->remote_qpn); + ohdr->bth[2] = cpu_to_be32(bth2); + + /* Check for more work to do. */ + goto again; + +done: + spin_unlock_irqrestore(&qp->s_lock, flags); + clear_bit(IPATH_S_BUSY, &qp->s_flags); + +bail: + return; +} + +/** + * ipath_uc_rcv - handle an incoming UC packet + * @dev: the device the packet came in on + * @hdr: the header of the packet + * @has_grh: true if the packet has a GRH + * @data: the packet data + * @tlen: the length of the packet + * @qp: the QP for this packet. + * + * This is called from ipath_qp_rcv() to process an incoming UC packet + * for the given QP. + * Called at interrupt level. + */ +void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, + int has_grh, void *data, u32 tlen, struct ipath_qp *qp) +{ + struct ipath_other_headers *ohdr; + int opcode; + u32 hdrsize; + u32 psn; + u32 pad; + unsigned long flags; + struct ib_wc wc; + u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu); + struct ib_reth *reth; + int header_in_data; + + /* Check for GRH */ + if (!has_grh) { + ohdr = &hdr->u.oth; + hdrsize = 8 + 12; /* LRH + BTH */ + psn = be32_to_cpu(ohdr->bth[2]); + header_in_data = 0; + } else { + ohdr = &hdr->u.l.oth; + hdrsize = 8 + 40 + 12; /* LRH + GRH + BTH */ + /* + * The header with GRH is 60 bytes and the + * core driver sets the eager header buffer + * size to 56 bytes so the last 4 bytes of + * the BTH header (PSN) is in the data buffer. + */ + header_in_data = + ipath_layer_get_rcvhdrentsize(dev->dd) == 16; + if (header_in_data) { + psn = be32_to_cpu(((__be32 *) data)[0]); + data += sizeof(__be32); + } else + psn = be32_to_cpu(ohdr->bth[2]); + } + /* + * The opcode is in the low byte when its in network order + * (top byte when in host order). + */ + opcode = be32_to_cpu(ohdr->bth[0]) >> 24; + + wc.imm_data = 0; + wc.wc_flags = 0; + + spin_lock_irqsave(&qp->r_rq.lock, flags); + + /* Compare the PSN verses the expected PSN. */ + if (unlikely(ipath_cmp24(psn, qp->r_psn) != 0)) { + /* + * Handle a sequence error. + * Silently drop any current message. + */ + qp->r_psn = psn; + inv: + qp->r_state = OP(SEND_LAST); + switch (opcode) { + case OP(SEND_FIRST): + case OP(SEND_ONLY): + case OP(SEND_ONLY_WITH_IMMEDIATE): + goto send_first; + + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_ONLY): + case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): + goto rdma_first; + + default: + dev->n_pkt_drops++; + goto done; + } + } + + /* Check for opcode sequence errors. */ + switch (qp->r_state) { + case OP(SEND_FIRST): + case OP(SEND_MIDDLE): + if (opcode == OP(SEND_MIDDLE) || + opcode == OP(SEND_LAST) || + opcode == OP(SEND_LAST_WITH_IMMEDIATE)) + break; + goto inv; + + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_MIDDLE): + if (opcode == OP(RDMA_WRITE_MIDDLE) || + opcode == OP(RDMA_WRITE_LAST) || + opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE)) + break; + goto inv; + + default: + if (opcode == OP(SEND_FIRST) || + opcode == OP(SEND_ONLY) || + opcode == OP(SEND_ONLY_WITH_IMMEDIATE) || + opcode == OP(RDMA_WRITE_FIRST) || + opcode == OP(RDMA_WRITE_ONLY) || + opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) + break; + goto inv; + } + + /* OK, process the packet. */ + switch (opcode) { + case OP(SEND_FIRST): + case OP(SEND_ONLY): + case OP(SEND_ONLY_WITH_IMMEDIATE): + send_first: + if (qp->r_reuse_sge) { + qp->r_reuse_sge = 0; + qp->r_sge = qp->s_rdma_sge; + } else if (!ipath_get_rwqe(qp, 0)) { + dev->n_pkt_drops++; + goto done; + } + /* Save the WQE so we can reuse it in case of an error. */ + qp->s_rdma_sge = qp->r_sge; + qp->r_rcv_len = 0; + if (opcode == OP(SEND_ONLY)) + goto send_last; + else if (opcode == OP(SEND_ONLY_WITH_IMMEDIATE)) + goto send_last_imm; + /* FALLTHROUGH */ + case OP(SEND_MIDDLE): + /* Check for invalid length PMTU or posted rwqe len. */ + if (unlikely(tlen != (hdrsize + pmtu + 4))) { + qp->r_reuse_sge = 1; + dev->n_pkt_drops++; + goto done; + } + qp->r_rcv_len += pmtu; + if (unlikely(qp->r_rcv_len > qp->r_len)) { + qp->r_reuse_sge = 1; + dev->n_pkt_drops++; + goto done; + } + ipath_copy_sge(&qp->r_sge, data, pmtu); + break; + + case OP(SEND_LAST_WITH_IMMEDIATE): + send_last_imm: + if (header_in_data) { + wc.imm_data = *(__be32 *) data; + data += sizeof(__be32); + } else { + /* Immediate data comes after BTH */ + wc.imm_data = ohdr->u.imm_data; + } + hdrsize += 4; + wc.wc_flags = IB_WC_WITH_IMM; + /* FALLTHROUGH */ + case OP(SEND_LAST): + send_last: + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* Check for invalid length. */ + /* XXX LAST len should be >= 1 */ + if (unlikely(tlen < (hdrsize + pad + 4))) { + qp->r_reuse_sge = 1; + dev->n_pkt_drops++; + goto done; + } + /* Don't count the CRC. */ + tlen -= (hdrsize + pad + 4); + wc.byte_len = tlen + qp->r_rcv_len; + if (unlikely(wc.byte_len > qp->r_len)) { + qp->r_reuse_sge = 1; + dev->n_pkt_drops++; + goto done; + } + /* XXX Need to free SGEs */ + last_imm: + ipath_copy_sge(&qp->r_sge, data, tlen); + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + wc.opcode = IB_WC_RECV; + wc.vendor_err = 0; + wc.qp_num = qp->ibqp.qp_num; + wc.src_qp = qp->remote_qpn; + wc.pkey_index = 0; + wc.slid = qp->remote_ah_attr.dlid; + wc.sl = qp->remote_ah_attr.sl; + wc.dlid_path_bits = 0; + wc.port_num = 0; + /* Signal completion event if the solicited bit is set. */ + ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, + (ohdr->bth[0] & + __constant_cpu_to_be32(1 << 23)) != 0); + break; + + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_ONLY): + case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): /* consume RWQE */ + rdma_first: + /* RETH comes after BTH */ + if (!header_in_data) + reth = &ohdr->u.rc.reth; + else { + reth = (struct ib_reth *)data; + data += sizeof(*reth); + } + hdrsize += sizeof(*reth); + qp->r_len = be32_to_cpu(reth->length); + qp->r_rcv_len = 0; + if (qp->r_len != 0) { + u32 rkey = be32_to_cpu(reth->rkey); + u64 vaddr = be64_to_cpu(reth->vaddr); + + /* Check rkey */ + if (unlikely(!ipath_rkey_ok( + dev, &qp->r_sge, qp->r_len, + vaddr, rkey, + IB_ACCESS_REMOTE_WRITE))) { + dev->n_pkt_drops++; + goto done; + } + } else { + qp->r_sge.sg_list = NULL; + qp->r_sge.sge.mr = NULL; + qp->r_sge.sge.vaddr = NULL; + qp->r_sge.sge.length = 0; + qp->r_sge.sge.sge_length = 0; + } + if (unlikely(!(qp->qp_access_flags & + IB_ACCESS_REMOTE_WRITE))) { + dev->n_pkt_drops++; + goto done; + } + if (opcode == OP(RDMA_WRITE_ONLY)) + goto rdma_last; + else if (opcode == + OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) + goto rdma_last_imm; + /* FALLTHROUGH */ + case OP(RDMA_WRITE_MIDDLE): + /* Check for invalid length PMTU or posted rwqe len. */ + if (unlikely(tlen != (hdrsize + pmtu + 4))) { + dev->n_pkt_drops++; + goto done; + } + qp->r_rcv_len += pmtu; + if (unlikely(qp->r_rcv_len > qp->r_len)) { + dev->n_pkt_drops++; + goto done; + } + ipath_copy_sge(&qp->r_sge, data, pmtu); + break; + + case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): + rdma_last_imm: + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* Check for invalid length. */ + /* XXX LAST len should be >= 1 */ + if (unlikely(tlen < (hdrsize + pad + 4))) { + dev->n_pkt_drops++; + goto done; + } + /* Don't count the CRC. */ + tlen -= (hdrsize + pad + 4); + if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) { + dev->n_pkt_drops++; + goto done; + } + if (qp->r_reuse_sge) { + qp->r_reuse_sge = 0; + } else if (!ipath_get_rwqe(qp, 1)) { + dev->n_pkt_drops++; + goto done; + } + if (header_in_data) { + wc.imm_data = *(__be32 *) data; + data += sizeof(__be32); + } else { + /* Immediate data comes after BTH */ + wc.imm_data = ohdr->u.imm_data; + } + hdrsize += 4; + wc.wc_flags = IB_WC_WITH_IMM; + wc.byte_len = 0; + goto last_imm; + + case OP(RDMA_WRITE_LAST): + rdma_last: + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* Check for invalid length. */ + /* XXX LAST len should be >= 1 */ + if (unlikely(tlen < (hdrsize + pad + 4))) { + dev->n_pkt_drops++; + goto done; + } + /* Don't count the CRC. */ + tlen -= (hdrsize + pad + 4); + if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) { + dev->n_pkt_drops++; + goto done; + } + ipath_copy_sge(&qp->r_sge, data, tlen); + break; + + default: + /* Drop packet for unknown opcodes. */ + spin_unlock_irqrestore(&qp->r_rq.lock, flags); + dev->n_pkt_drops++; + goto bail; + } + qp->r_psn++; + qp->r_state = opcode; +done: + spin_unlock_irqrestore(&qp->r_rq.lock, flags); + +bail: + return; +} diff --git a/drivers/infiniband/hw/ipath/ipath_ud.c b/drivers/infiniband/hw/ipath/ipath_ud.c new file mode 100644 index 0000000..01cfb30 --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_ud.c @@ -0,0 +1,621 @@ +/* + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <rdma/ib_smi.h> + +#include "ipath_verbs.h" +#include "ips_common.h" + +/** + * ipath_ud_loopback - handle send on loopback QPs + * @sqp: the QP + * @ss: the SGE state + * @length: the length of the data to send + * @wr: the work request + * @wc: the work completion entry + * + * This is called from ipath_post_ud_send() to forward a WQE addressed + * to the same HCA. + */ +static void ipath_ud_loopback(struct ipath_qp *sqp, struct ipath_sge_state *ss, + u32 length, struct ib_send_wr *wr, struct ib_wc *wc) +{ + struct ipath_ibdev *dev = to_idev(sqp->ibqp.device); + struct ipath_qp *qp; + struct ib_ah_attr *ah_attr; + unsigned long flags; + struct ipath_rq *rq; + struct ipath_srq *srq; + struct ipath_sge_state rsge; + struct ipath_sge *sge; + struct ipath_rwqe *wqe; + + qp = ipath_lookup_qpn(&dev->qp_table, wr->wr.ud.remote_qpn); + if (!qp) + return; + + /* + * Check that the qkey matches (except for QP0, see 9.6.1.4.1). + * Qkeys with the high order bit set mean use the + * qkey from the QP context instead of the WR (see 10.2.5). + */ + if (unlikely(qp->ibqp.qp_num && + ((int) wr->wr.ud.remote_qkey < 0 + ? qp->qkey : wr->wr.ud.remote_qkey) != qp->qkey)) { + /* XXX OK to lose a count once in a while. */ + dev->qkey_violations++; + dev->n_pkt_drops++; + goto done; + } + + /* + * A GRH is expected to preceed the data even if not + * present on the wire. + */ + wc->byte_len = length + sizeof(struct ib_grh); + + if (wr->opcode == IB_WR_SEND_WITH_IMM) { + wc->wc_flags = IB_WC_WITH_IMM; + wc->imm_data = wr->imm_data; + } else { + wc->wc_flags = 0; + wc->imm_data = 0; + } + + /* + * Get the next work request entry to find where to put the data. + * Note that it is safe to drop the lock after changing rq->tail + * since ipath_post_receive() won't fill the empty slot. + */ + if (qp->ibqp.srq) { + srq = to_isrq(qp->ibqp.srq); + rq = &srq->rq; + } else { + srq = NULL; + rq = &qp->r_rq; + } + spin_lock_irqsave(&rq->lock, flags); + if (rq->tail == rq->head) { + spin_unlock_irqrestore(&rq->lock, flags); + dev->n_pkt_drops++; + goto done; + } + /* Silently drop packets which are too big. */ + wqe = get_rwqe_ptr(rq, rq->tail); + if (wc->byte_len > wqe->length) { + spin_unlock_irqrestore(&rq->lock, flags); + dev->n_pkt_drops++; + goto done; + } + wc->wr_id = wqe->wr_id; + rsge.sge = wqe->sg_list[0]; + rsge.sg_list = wqe->sg_list + 1; + rsge.num_sge = wqe->num_sge; + if (++rq->tail >= rq->size) + rq->tail = 0; + if (srq && srq->ibsrq.event_handler) { + u32 n; + + if (rq->head < rq->tail) + n = rq->size + rq->head - rq->tail; + else + n = rq->head - rq->tail; + if (n < srq->limit) { + struct ib_event ev; + + srq->limit = 0; + spin_unlock_irqrestore(&rq->lock, flags); + ev.device = qp->ibqp.device; + ev.element.srq = qp->ibqp.srq; + ev.event = IB_EVENT_SRQ_LIMIT_REACHED; + srq->ibsrq.event_handler(&ev, + srq->ibsrq.srq_context); + } else + spin_unlock_irqrestore(&rq->lock, flags); + } else + spin_unlock_irqrestore(&rq->lock, flags); + ah_attr = &to_iah(wr->wr.ud.ah)->attr; + if (ah_attr->ah_flags & IB_AH_GRH) { + ipath_copy_sge(&rsge, &ah_attr->grh, sizeof(struct ib_grh)); + wc->wc_flags |= IB_WC_GRH; + } else + ipath_skip_sge(&rsge, sizeof(struct ib_grh)); + sge = &ss->sge; + while (length) { + u32 len = sge->length; + + if (len > length) + len = length; + BUG_ON(len == 0); + ipath_copy_sge(&rsge, sge->vaddr, len); + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (--ss->num_sge) + *sge = *ss->sg_list++; + } else if (sge->length == 0 && sge->mr != NULL) { + if (++sge->n >= IPATH_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + length -= len; + } + wc->status = IB_WC_SUCCESS; + wc->opcode = IB_WC_RECV; + wc->vendor_err = 0; + wc->qp_num = qp->ibqp.qp_num; + wc->src_qp = sqp->ibqp.qp_num; + /* XXX do we know which pkey matched? Only needed for GSI. */ + wc->pkey_index = 0; + wc->slid = ipath_layer_get_lid(dev->dd) | + (ah_attr->src_path_bits & + ((1 << (dev->mkeyprot_resv_lmc & 7)) - 1)); + wc->sl = ah_attr->sl; + wc->dlid_path_bits = + ah_attr->dlid & ((1 << (dev->mkeyprot_resv_lmc & 7)) - 1); + /* Signal completion event if the solicited bit is set. */ + ipath_cq_enter(to_icq(qp->ibqp.recv_cq), wc, + wr->send_flags & IB_SEND_SOLICITED); + +done: + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); +} + +/** + * ipath_post_ud_send - post a UD send on QP + * @qp: the QP + * @wr: the work request + * + * Note that we actually send the data as it is posted instead of putting + * the request into a ring buffer. If we wanted to use a ring buffer, + * we would need to save a reference to the destination address in the SWQE. + */ +int ipath_post_ud_send(struct ipath_qp *qp, struct ib_send_wr *wr) +{ + struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + struct ipath_other_headers *ohdr; + struct ib_ah_attr *ah_attr; + struct ipath_sge_state ss; + struct ipath_sge *sg_list; + struct ib_wc wc; + u32 hwords; + u32 nwords; + u32 len; + u32 extra_bytes; + u32 bth0; + u16 lrh0; + u16 lid; + int i; + int ret; + + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)) { + ret = 0; + goto bail; + } + + /* IB spec says that num_sge == 0 is OK. */ + if (wr->num_sge > qp->s_max_sge) { + ret = -EINVAL; + goto bail; + } + + if (wr->num_sge > 1) { + sg_list = kmalloc((qp->s_max_sge - 1) * sizeof(*sg_list), + GFP_ATOMIC); + if (!sg_list) { + ret = -ENOMEM; + goto bail; + } + } else + sg_list = NULL; + + /* Check the buffer to send. */ + ss.sg_list = sg_list; + ss.sge.mr = NULL; + ss.sge.vaddr = NULL; + ss.sge.length = 0; + ss.sge.sge_length = 0; + ss.num_sge = 0; + len = 0; + for (i = 0; i < wr->num_sge; i++) { + /* Check LKEY */ + if (to_ipd(qp->ibqp.pd)->user && wr->sg_list[i].lkey == 0) { + ret = -EINVAL; + goto bail; + } + + if (wr->sg_list[i].length == 0) + continue; + if (!ipath_lkey_ok(&dev->lk_table, ss.num_sge ? + sg_list + ss.num_sge - 1 : &ss.sge, + &wr->sg_list[i], 0)) { + ret = -EINVAL; + goto bail; + } + len += wr->sg_list[i].length; + ss.num_sge++; + } + extra_bytes = (4 - len) & 3; + nwords = (len + extra_bytes) >> 2; + + /* Construct the header. */ + ah_attr = &to_iah(wr->wr.ud.ah)->attr; + if (ah_attr->dlid == 0) { + ret = -EINVAL; + goto bail; + } + if (ah_attr->dlid >= IPS_MULTICAST_LID_BASE) { + if (ah_attr->dlid != IPS_PERMISSIVE_LID) + dev->n_multicast_xmit++; + else + dev->n_unicast_xmit++; + } else { + dev->n_unicast_xmit++; + lid = ah_attr->dlid & + ~((1 << (dev->mkeyprot_resv_lmc & 7)) - 1); + if (unlikely(lid == ipath_layer_get_lid(dev->dd))) { + /* + * Pass in an uninitialized ib_wc to save stack + * space. + */ + ipath_ud_loopback(qp, &ss, len, wr, &wc); + goto done; + } + } + if (ah_attr->ah_flags & IB_AH_GRH) { + /* Header size in 32-bit words. */ + hwords = 17; + lrh0 = IPS_LRH_GRH; + ohdr = &qp->s_hdr.u.l.oth; + qp->s_hdr.u.l.grh.version_tclass_flow = + cpu_to_be32((6 << 28) | + (ah_attr->grh.traffic_class << 20) | + ah_attr->grh.flow_label); + qp->s_hdr.u.l.grh.paylen = + cpu_to_be16(((wr->opcode == + IB_WR_SEND_WITH_IMM ? 6 : 5) + + nwords + SIZE_OF_CRC) << 2); + /* next_hdr is defined by C8-7 in ch. 8.4.1 */ + qp->s_hdr.u.l.grh.next_hdr = 0x1B; + qp->s_hdr.u.l.grh.hop_limit = ah_attr->grh.hop_limit; + /* The SGID is 32-bit aligned. */ + qp->s_hdr.u.l.grh.sgid.global.subnet_prefix = + dev->gid_prefix; + qp->s_hdr.u.l.grh.sgid.global.interface_id = + ipath_layer_get_guid(dev->dd); + qp->s_hdr.u.l.grh.dgid = ah_attr->grh.dgid; + /* + * Don't worry about sending to locally attached multicast + * QPs. It is unspecified by the spec. what happens. + */ + } else { + /* Header size in 32-bit words. */ + hwords = 7; + lrh0 = IPS_LRH_BTH; + ohdr = &qp->s_hdr.u.oth; + } + if (wr->opcode == IB_WR_SEND_WITH_IMM) { + ohdr->u.ud.imm_data = wr->imm_data; + wc.imm_data = wr->imm_data; + hwords += 1; + bth0 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE << 24; + } else if (wr->opcode == IB_WR_SEND) { + wc.imm_data = 0; + bth0 = IB_OPCODE_UD_SEND_ONLY << 24; + } else { + ret = -EINVAL; + goto bail; + } + lrh0 |= ah_attr->sl << 4; + if (qp->ibqp.qp_type == IB_QPT_SMI) + lrh0 |= 0xF000; /* Set VL (see ch. 13.5.3.1) */ + qp->s_hdr.lrh[0] = cpu_to_be16(lrh0); + qp->s_hdr.lrh[1] = cpu_to_be16(ah_attr->dlid); /* DEST LID */ + qp->s_hdr.lrh[2] = cpu_to_be16(hwords + nwords + SIZE_OF_CRC); + lid = ipath_layer_get_lid(dev->dd); + if (lid) { + lid |= ah_attr->src_path_bits & + ((1 << (dev->mkeyprot_resv_lmc & 7)) - 1); + qp->s_hdr.lrh[3] = cpu_to_be16(lid); + } else + qp->s_hdr.lrh[3] = IB_LID_PERMISSIVE; + if (wr->send_flags & IB_SEND_SOLICITED) + bth0 |= 1 << 23; + bth0 |= extra_bytes << 20; + bth0 |= qp->ibqp.qp_type == IB_QPT_SMI ? IPS_DEFAULT_P_KEY : + ipath_layer_get_pkey(dev->dd, qp->s_pkey_index); + ohdr->bth[0] = cpu_to_be32(bth0); + /* + * Use the multicast QP if the destination LID is a multicast LID. + */ + ohdr->bth[1] = ah_attr->dlid >= IPS_MULTICAST_LID_BASE && + ah_attr->dlid != IPS_PERMISSIVE_LID ? + __constant_cpu_to_be32(IPS_MULTICAST_QPN) : + cpu_to_be32(wr->wr.ud.remote_qpn); + /* XXX Could lose a PSN count but not worth locking */ + ohdr->bth[2] = cpu_to_be32(qp->s_next_psn++ & IPS_PSN_MASK); + /* + * Qkeys with the high order bit set mean use the + * qkey from the QP context instead of the WR (see 10.2.5). + */ + ohdr->u.ud.deth[0] = cpu_to_be32((int)wr->wr.ud.remote_qkey < 0 ? + qp->qkey : wr->wr.ud.remote_qkey); + ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num); + if (ipath_verbs_send(dev->dd, hwords, (u32 *) &qp->s_hdr, + len, &ss)) + dev->n_no_piobuf++; + +done: + /* Queue the completion status entry. */ + if (!test_bit(IPATH_S_SIGNAL_REQ_WR, &qp->s_flags) || + (wr->send_flags & IB_SEND_SIGNALED)) { + wc.wr_id = wr->wr_id; + wc.status = IB_WC_SUCCESS; + wc.vendor_err = 0; + wc.opcode = IB_WC_SEND; + wc.byte_len = len; + wc.qp_num = qp->ibqp.qp_num; + wc.src_qp = 0; + wc.wc_flags = 0; + /* XXX initialize other fields? */ + ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0); + } + kfree(sg_list); + + ret = 0; + +bail: + return ret; +} + +/** + * ipath_ud_rcv - receive an incoming UD packet + * @dev: the device the packet came in on + * @hdr: the packet header + * @has_grh: true if the packet has a GRH + * @data: the packet data + * @tlen: the packet length + * @qp: the QP the packet came on + * + * This is called from ipath_qp_rcv() to process an incoming UD packet + * for the given QP. + * Called at interrupt level. + */ +void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, + int has_grh, void *data, u32 tlen, struct ipath_qp *qp) +{ + struct ipath_other_headers *ohdr; + int opcode; + u32 hdrsize; + u32 pad; + unsigned long flags; + struct ib_wc wc; + u32 qkey; + u32 src_qp; + struct ipath_rq *rq; + struct ipath_srq *srq; + struct ipath_rwqe *wqe; + u16 dlid; + int header_in_data; + + /* Check for GRH */ + if (!has_grh) { + ohdr = &hdr->u.oth; + hdrsize = 8 + 12 + 8; /* LRH + BTH + DETH */ + qkey = be32_to_cpu(ohdr->u.ud.deth[0]); + src_qp = be32_to_cpu(ohdr->u.ud.deth[1]); + header_in_data = 0; + } else { + ohdr = &hdr->u.l.oth; + hdrsize = 8 + 40 + 12 + 8; /* LRH + GRH + BTH + DETH */ + /* + * The header with GRH is 68 bytes and the core driver sets + * the eager header buffer size to 56 bytes so the last 12 + * bytes of the IB header is in the data buffer. + */ + header_in_data = + ipath_layer_get_rcvhdrentsize(dev->dd) == 16; + if (header_in_data) { + qkey = be32_to_cpu(((__be32 *) data)[1]); + src_qp = be32_to_cpu(((__be32 *) data)[2]); + data += 12; + } else { + qkey = be32_to_cpu(ohdr->u.ud.deth[0]); + src_qp = be32_to_cpu(ohdr->u.ud.deth[1]); + } + } + src_qp &= IPS_QPN_MASK; + + /* + * Check that the permissive LID is only used on QP0 + * and the QKEY matches (see 9.6.1.4.1 and 9.6.1.5.1). + */ + if (qp->ibqp.qp_num) { + if (unlikely(hdr->lrh[1] == IB_LID_PERMISSIVE || + hdr->lrh[3] == IB_LID_PERMISSIVE)) { + dev->n_pkt_drops++; + goto bail; + } + if (unlikely(qkey != qp->qkey)) { + /* XXX OK to lose a count once in a while. */ + dev->qkey_violations++; + dev->n_pkt_drops++; + goto bail; + } + } else if (hdr->lrh[1] == IB_LID_PERMISSIVE || + hdr->lrh[3] == IB_LID_PERMISSIVE) { + struct ib_smp *smp = (struct ib_smp *) data; + + if (smp->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { + dev->n_pkt_drops++; + goto bail; + } + } + + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + if (unlikely(tlen < (hdrsize + pad + 4))) { + /* Drop incomplete packets. */ + dev->n_pkt_drops++; + goto bail; + } + tlen -= hdrsize + pad + 4; + + /* Drop invalid MAD packets (see 13.5.3.1). */ + if (unlikely((qp->ibqp.qp_num == 0 && + (tlen != 256 || + (be16_to_cpu(hdr->lrh[0]) >> 12) != 15)) || + (qp->ibqp.qp_num == 1 && + (tlen != 256 || + (be16_to_cpu(hdr->lrh[0]) >> 12) == 15)))) { + dev->n_pkt_drops++; + goto bail; + } + + /* + * A GRH is expected to preceed the data even if not + * present on the wire. + */ + wc.byte_len = tlen + sizeof(struct ib_grh); + + /* + * The opcode is in the low byte when its in network order + * (top byte when in host order). + */ + opcode = be32_to_cpu(ohdr->bth[0]) >> 24; + if (qp->ibqp.qp_num > 1 && + opcode == IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE) { + if (header_in_data) { + wc.imm_data = *(__be32 *) data; + data += sizeof(__be32); + } else + wc.imm_data = ohdr->u.ud.imm_data; + wc.wc_flags = IB_WC_WITH_IMM; + hdrsize += sizeof(u32); + } else if (opcode == IB_OPCODE_UD_SEND_ONLY) { + wc.imm_data = 0; + wc.wc_flags = 0; + } else { + dev->n_pkt_drops++; + goto bail; + } + + /* + * Get the next work request entry to find where to put the data. + * Note that it is safe to drop the lock after changing rq->tail + * since ipath_post_receive() won't fill the empty slot. + */ + if (qp->ibqp.srq) { + srq = to_isrq(qp->ibqp.srq); + rq = &srq->rq; + } else { + srq = NULL; + rq = &qp->r_rq; + } + spin_lock_irqsave(&rq->lock, flags); + if (rq->tail == rq->head) { + spin_unlock_irqrestore(&rq->lock, flags); + dev->n_pkt_drops++; + goto bail; + } + /* Silently drop packets which are too big. */ + wqe = get_rwqe_ptr(rq, rq->tail); + if (wc.byte_len > wqe->length) { + spin_unlock_irqrestore(&rq->lock, flags); + dev->n_pkt_drops++; + goto bail; + } + wc.wr_id = wqe->wr_id; + qp->r_sge.sge = wqe->sg_list[0]; + qp->r_sge.sg_list = wqe->sg_list + 1; + qp->r_sge.num_sge = wqe->num_sge; + if (++rq->tail >= rq->size) + rq->tail = 0; + if (srq && srq->ibsrq.event_handler) { + u32 n; + + if (rq->head < rq->tail) + n = rq->size + rq->head - rq->tail; + else + n = rq->head - rq->tail; + if (n < srq->limit) { + struct ib_event ev; + + srq->limit = 0; + spin_unlock_irqrestore(&rq->lock, flags); + ev.device = qp->ibqp.device; + ev.element.srq = qp->ibqp.srq; + ev.event = IB_EVENT_SRQ_LIMIT_REACHED; + srq->ibsrq.event_handler(&ev, + srq->ibsrq.srq_context); + } else + spin_unlock_irqrestore(&rq->lock, flags); + } else + spin_unlock_irqrestore(&rq->lock, flags); + if (has_grh) { + ipath_copy_sge(&qp->r_sge, &hdr->u.l.grh, + sizeof(struct ib_grh)); + wc.wc_flags |= IB_WC_GRH; + } else + ipath_skip_sge(&qp->r_sge, sizeof(struct ib_grh)); + ipath_copy_sge(&qp->r_sge, data, + wc.byte_len - sizeof(struct ib_grh)); + wc.status = IB_WC_SUCCESS; + wc.opcode = IB_WC_RECV; + wc.vendor_err = 0; + wc.qp_num = qp->ibqp.qp_num; + wc.src_qp = src_qp; + /* XXX do we know which pkey matched? Only needed for GSI. */ + wc.pkey_index = 0; + wc.slid = be16_to_cpu(hdr->lrh[3]); + wc.sl = (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF; + dlid = be16_to_cpu(hdr->lrh[1]); + /* + * Save the LMC lower bits if the destination LID is a unicast LID. + */ + wc.dlid_path_bits = dlid >= IPS_MULTICAST_LID_BASE ? 0 : + dlid & ((1 << (dev->mkeyprot_resv_lmc & 7)) - 1); + /* Signal completion event if the solicited bit is set. */ + ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, + (ohdr->bth[0] & + __constant_cpu_to_be32(1 << 23)) != 0); + +bail:; +} diff --git a/drivers/infiniband/hw/ipath/ipath_user_pages.c b/drivers/infiniband/hw/ipath/ipath_user_pages.c new file mode 100644 index 0000000..2bb08af --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_user_pages.c @@ -0,0 +1,207 @@ +/* + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/mm.h> +#include <linux/device.h> + +#include "ipath_kernel.h" + +static void __ipath_release_user_pages(struct page **p, size_t num_pages, + int dirty) +{ + size_t i; + + for (i = 0; i < num_pages; i++) { + ipath_cdbg(MM, "%lu/%lu put_page %p\n", (unsigned long) i, + (unsigned long) num_pages, p[i]); + if (dirty) + set_page_dirty_lock(p[i]); + put_page(p[i]); + } +} + +/* call with current->mm->mmap_sem held */ +static int __get_user_pages(unsigned long start_page, size_t num_pages, + struct page **p, struct vm_area_struct **vma) +{ + unsigned long lock_limit; + size_t got; + int ret; + +#if 0 + /* + * XXX - causes MPI programs to fail, haven't had time to check + * yet + */ + if (!capable(CAP_IPC_LOCK)) { + ret = -EPERM; + goto bail; + } +#endif + + lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> + PAGE_SHIFT; + + if (num_pages > lock_limit) { + ret = -ENOMEM; + goto bail; + } + + ipath_cdbg(VERBOSE, "pin %lx pages from vaddr %lx\n", + (unsigned long) num_pages, start_page); + + for (got = 0; got < num_pages; got += ret) { + ret = get_user_pages(current, current->mm, + start_page + got * PAGE_SIZE, + num_pages - got, 1, 1, + p + got, vma); + if (ret < 0) + goto bail_release; + } + + current->mm->locked_vm += num_pages; + + ret = 0; + goto bail; + +bail_release: + __ipath_release_user_pages(p, got, 0); +bail: + return ret; +} + +/** + * ipath_get_user_pages - lock user pages into memory + * @start_page: the start page + * @num_pages: the number of pages + * @p: the output page structures + * + * This function takes a given start page (page aligned user virtual + * address) and pins it and the following specified number of pages. For + * now, num_pages is always 1, but that will probably change at some point + * (because caller is doing expected sends on a single virtually contiguous + * buffer, so we can do all pages at once). + */ +int ipath_get_user_pages(unsigned long start_page, size_t num_pages, + struct page **p) +{ + int ret; + + down_write(¤t->mm->mmap_sem); + + ret = __get_user_pages(start_page, num_pages, p, NULL); + + up_write(¤t->mm->mmap_sem); + + return ret; +} + +/** + * ipath_get_user_pages_nocopy - lock a single page for I/O and mark shared + * @start_page: the page to lock + * @p: the output page structure + * + * This is similar to ipath_get_user_pages, but it's always one page, and we + * mark the page as locked for I/O, and shared. This is used for the user + * process page that contains the destination address for the rcvhdrq tail + * update, so we need to have the vma. If we don't do this, the page can be + * taken away from us on fork, even if the child never touches it, and then + * the user process never sees the tail register updates. + */ +int ipath_get_user_pages_nocopy(unsigned long page, struct page **p) +{ + struct vm_area_struct *vma; + int ret; + + down_write(¤t->mm->mmap_sem); + + ret = __get_user_pages(page, 1, p, &vma); + + up_write(¤t->mm->mmap_sem); + + return ret; +} + +void ipath_release_user_pages(struct page **p, size_t num_pages) +{ + down_write(¤t->mm->mmap_sem); + + __ipath_release_user_pages(p, num_pages, 1); + + current->mm->locked_vm -= num_pages; + + up_write(¤t->mm->mmap_sem); +} + +struct ipath_user_pages_work { + struct work_struct work; + struct mm_struct *mm; + unsigned long num_pages; +}; + +static void user_pages_account(void *ptr) +{ + struct ipath_user_pages_work *work = ptr; + + down_write(&work->mm->mmap_sem); + work->mm->locked_vm -= work->num_pages; + up_write(&work->mm->mmap_sem); + mmput(work->mm); + kfree(work); +} + +void ipath_release_user_pages_on_close(struct page **p, size_t num_pages) +{ + struct ipath_user_pages_work *work; + struct mm_struct *mm; + + __ipath_release_user_pages(p, num_pages, 1); + + mm = get_task_mm(current); + if (!mm) + goto bail; + + work = kmalloc(sizeof(*work), GFP_KERNEL); + if (!work) + goto bail_mm; + + goto bail; + + INIT_WORK(&work->work, user_pages_account, work); + work->mm = mm; + work->num_pages = num_pages; + +bail_mm: + mmput(mm); +bail: + return; +} diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.c b/drivers/infiniband/hw/ipath/ipath_verbs.c new file mode 100644 index 0000000..8d2558a --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_verbs.c @@ -0,0 +1,1222 @@ +/* + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <rdma/ib_mad.h> +#include <rdma/ib_user_verbs.h> +#include <linux/utsname.h> + +#include "ipath_kernel.h" +#include "ipath_verbs.h" +#include "ips_common.h" + +/* Not static, because we don't want the compiler removing it */ +const char ipath_verbs_version[] = "ipath_verbs " IPATH_IDSTR; + +static unsigned int ib_ipath_qp_table_size = 251; +module_param_named(qp_table_size, ib_ipath_qp_table_size, uint, S_IRUGO); +MODULE_PARM_DESC(qp_table_size, "QP table size"); + +unsigned int ib_ipath_lkey_table_size = 12; +module_param_named(lkey_table_size, ib_ipath_lkey_table_size, uint, + S_IRUGO); +MODULE_PARM_DESC(lkey_table_size, + "LKEY table size in bits (2^n, 1 <= n <= 23)"); + +unsigned int ib_ipath_debug; /* debug mask */ +module_param_named(debug, ib_ipath_debug, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(debug, "Verbs debug mask"); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("PathScale <support@pathscale.com>"); +MODULE_DESCRIPTION("Pathscale InfiniPath driver"); + +const int ib_ipath_state_ops[IB_QPS_ERR + 1] = { + [IB_QPS_RESET] = 0, + [IB_QPS_INIT] = IPATH_POST_RECV_OK, + [IB_QPS_RTR] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK, + [IB_QPS_RTS] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK | + IPATH_POST_SEND_OK | IPATH_PROCESS_SEND_OK, + [IB_QPS_SQD] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK | + IPATH_POST_SEND_OK, + [IB_QPS_SQE] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK, + [IB_QPS_ERR] = 0, +}; + +/* + * Translate ib_wr_opcode into ib_wc_opcode. + */ +const enum ib_wc_opcode ib_ipath_wc_opcode[] = { + [IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE, + [IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE, + [IB_WR_SEND] = IB_WC_SEND, + [IB_WR_SEND_WITH_IMM] = IB_WC_SEND, + [IB_WR_RDMA_READ] = IB_WC_RDMA_READ, + [IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP, + [IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD +}; + +/* + * System image GUID. + */ +static __be64 sys_image_guid; + +/** + * ipath_copy_sge - copy data to SGE memory + * @ss: the SGE state + * @data: the data to copy + * @length: the length of the data + */ +void ipath_copy_sge(struct ipath_sge_state *ss, void *data, u32 length) +{ + struct ipath_sge *sge = &ss->sge; + + while (length) { + u32 len = sge->length; + + BUG_ON(len == 0); + if (len > length) + len = length; + memcpy(sge->vaddr, data, len); + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (--ss->num_sge) + *sge = *ss->sg_list++; + } else if (sge->length == 0 && sge->mr != NULL) { + if (++sge->n >= IPATH_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + data += len; + length -= len; + } +} + +/** + * ipath_skip_sge - skip over SGE memory - XXX almost dup of prev func + * @ss: the SGE state + * @length: the number of bytes to skip + */ +void ipath_skip_sge(struct ipath_sge_state *ss, u32 length) +{ + struct ipath_sge *sge = &ss->sge; + + while (length > sge->sge_length) { + length -= sge->sge_length; + ss->sge = *ss->sg_list++; + } + while (length) { + u32 len = sge->length; + + BUG_ON(len == 0); + if (len > length) + len = length; + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (--ss->num_sge) + *sge = *ss->sg_list++; + } else if (sge->length == 0 && sge->mr != NULL) { + if (++sge->n >= IPATH_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + length -= len; + } +} + +/** + * ipath_post_send - post a send on a QP + * @ibqp: the QP to post the send on + * @wr: the list of work requests to post + * @bad_wr: the first bad WR is put here + * + * This may be called from interrupt context. + */ +static int ipath_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + struct ipath_qp *qp = to_iqp(ibqp); + int err = 0; + + /* Check that state is OK to post send. */ + if (!(ib_ipath_state_ops[qp->state] & IPATH_POST_SEND_OK)) { + *bad_wr = wr; + err = -EINVAL; + goto bail; + } + + for (; wr; wr = wr->next) { + switch (qp->ibqp.qp_type) { + case IB_QPT_UC: + case IB_QPT_RC: + err = ipath_post_rc_send(qp, wr); + break; + + case IB_QPT_SMI: + case IB_QPT_GSI: + case IB_QPT_UD: + err = ipath_post_ud_send(qp, wr); + break; + + default: + err = -EINVAL; + } + if (err) { + *bad_wr = wr; + break; + } + } + +bail: + return err; +} + +/** + * ipath_post_receive - post a receive on a QP + * @ibqp: the QP to post the receive on + * @wr: the WR to post + * @bad_wr: the first bad WR is put here + * + * This may be called from interrupt context. + */ +static int ipath_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct ipath_qp *qp = to_iqp(ibqp); + unsigned long flags; + int ret; + + /* Check that state is OK to post receive. */ + if (!(ib_ipath_state_ops[qp->state] & IPATH_POST_RECV_OK)) { + *bad_wr = wr; + ret = -EINVAL; + goto bail; + } + + for (; wr; wr = wr->next) { + struct ipath_rwqe *wqe; + u32 next; + int i, j; + + if (wr->num_sge > qp->r_rq.max_sge) { + *bad_wr = wr; + ret = -ENOMEM; + goto bail; + } + + spin_lock_irqsave(&qp->r_rq.lock, flags); + next = qp->r_rq.head + 1; + if (next >= qp->r_rq.size) + next = 0; + if (next == qp->r_rq.tail) { + spin_unlock_irqrestore(&qp->r_rq.lock, flags); + *bad_wr = wr; + ret = -ENOMEM; + goto bail; + } + + wqe = get_rwqe_ptr(&qp->r_rq, qp->r_rq.head); + wqe->wr_id = wr->wr_id; + wqe->sg_list[0].mr = NULL; + wqe->sg_list[0].vaddr = NULL; + wqe->sg_list[0].length = 0; + wqe->sg_list[0].sge_length = 0; + wqe->length = 0; + for (i = 0, j = 0; i < wr->num_sge; i++) { + /* Check LKEY */ + if (to_ipd(qp->ibqp.pd)->user && + wr->sg_list[i].lkey == 0) { + spin_unlock_irqrestore(&qp->r_rq.lock, + flags); + *bad_wr = wr; + ret = -EINVAL; + goto bail; + } + if (wr->sg_list[i].length == 0) + continue; + if (!ipath_lkey_ok( + &to_idev(qp->ibqp.device)->lk_table, + &wqe->sg_list[j], &wr->sg_list[i], + IB_ACCESS_LOCAL_WRITE)) { + spin_unlock_irqrestore(&qp->r_rq.lock, + flags); + *bad_wr = wr; + ret = -EINVAL; + goto bail; + } + wqe->length += wr->sg_list[i].length; + j++; + } + wqe->num_sge = j; + qp->r_rq.head = next; + spin_unlock_irqrestore(&qp->r_rq.lock, flags); + } + ret = 0; + +bail: + return ret; +} + +/** + * ipath_qp_rcv - processing an incoming packet on a QP + * @dev: the device the packet came on + * @hdr: the packet header + * @has_grh: true if the packet has a GRH + * @data: the packet data + * @tlen: the packet length + * @qp: the QP the packet came on + * + * This is called from ipath_ib_rcv() to process an incoming packet + * for the given QP. + * Called at interrupt level. + */ +static void ipath_qp_rcv(struct ipath_ibdev *dev, + struct ipath_ib_header *hdr, int has_grh, + void *data, u32 tlen, struct ipath_qp *qp) +{ + /* Check for valid receive state. */ + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) { + dev->n_pkt_drops++; + return; + } + + switch (qp->ibqp.qp_type) { + case IB_QPT_SMI: + case IB_QPT_GSI: + case IB_QPT_UD: + ipath_ud_rcv(dev, hdr, has_grh, data, tlen, qp); + break; + + case IB_QPT_RC: + ipath_rc_rcv(dev, hdr, has_grh, data, tlen, qp); + break; + + case IB_QPT_UC: + ipath_uc_rcv(dev, hdr, has_grh, data, tlen, qp); + break; + + default: + break; + } +} + +/** + * ipath_ib_rcv - process and incoming packet + * @arg: the device pointer + * @rhdr: the header of the packet + * @data: the packet data + * @tlen: the packet length + * + * This is called from ipath_kreceive() to process an incoming packet at + * interrupt level. Tlen is the length of the header + data + CRC in bytes. + */ +static void ipath_ib_rcv(void *arg, void *rhdr, void *data, u32 tlen) +{ + struct ipath_ibdev *dev = (struct ipath_ibdev *) arg; + struct ipath_ib_header *hdr = rhdr; + struct ipath_other_headers *ohdr; + struct ipath_qp *qp; + u32 qp_num; + int lnh; + u8 opcode; + u16 lid; + + if (unlikely(dev == NULL)) + goto bail; + + if (unlikely(tlen < 24)) { /* LRH+BTH+CRC */ + dev->rcv_errors++; + goto bail; + } + + /* Check for a valid destination LID (see ch. 7.11.1). */ + lid = be16_to_cpu(hdr->lrh[1]); + if (lid < IPS_MULTICAST_LID_BASE) { + lid &= ~((1 << (dev->mkeyprot_resv_lmc & 7)) - 1); + if (unlikely(lid != ipath_layer_get_lid(dev->dd))) { + dev->rcv_errors++; + goto bail; + } + } + + /* Check for GRH */ + lnh = be16_to_cpu(hdr->lrh[0]) & 3; + if (lnh == IPS_LRH_BTH) + ohdr = &hdr->u.oth; + else if (lnh == IPS_LRH_GRH) + ohdr = &hdr->u.l.oth; + else { + dev->rcv_errors++; + goto bail; + } + + opcode = be32_to_cpu(ohdr->bth[0]) >> 24; + dev->opstats[opcode].n_bytes += tlen; + dev->opstats[opcode].n_packets++; + + /* Get the destination QP number. */ + qp_num = be32_to_cpu(ohdr->bth[1]) & IPS_QPN_MASK; + if (qp_num == IPS_MULTICAST_QPN) { + struct ipath_mcast *mcast; + struct ipath_mcast_qp *p; + + mcast = ipath_mcast_find(&hdr->u.l.grh.dgid); + if (mcast == NULL) { + dev->n_pkt_drops++; + goto bail; + } + dev->n_multicast_rcv++; + list_for_each_entry_rcu(p, &mcast->qp_list, list) + ipath_qp_rcv(dev, hdr, lnh == IPS_LRH_GRH, data, + tlen, p->qp); + /* + * Notify ipath_multicast_detach() if it is waiting for us + * to finish. + */ + if (atomic_dec_return(&mcast->refcount) <= 1) + wake_up(&mcast->wait); + } else { + qp = ipath_lookup_qpn(&dev->qp_table, qp_num); + if (qp) { + dev->n_unicast_rcv++; + ipath_qp_rcv(dev, hdr, lnh == IPS_LRH_GRH, data, + tlen, qp); + /* + * Notify ipath_destroy_qp() if it is waiting + * for us to finish. + */ + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + } else + dev->n_pkt_drops++; + } + +bail:; +} + +/** + * ipath_ib_timer - verbs timer + * @arg: the device pointer + * + * This is called from ipath_do_rcv_timer() at interrupt level to check for + * QPs which need retransmits and to collect performance numbers. + */ +static void ipath_ib_timer(void *arg) +{ + struct ipath_ibdev *dev = (struct ipath_ibdev *) arg; + struct ipath_qp *resend = NULL; + struct ipath_qp *rnr = NULL; + struct list_head *last; + struct ipath_qp *qp; + unsigned long flags; + + if (dev == NULL) + return; + + spin_lock_irqsave(&dev->pending_lock, flags); + /* Start filling the next pending queue. */ + if (++dev->pending_index >= ARRAY_SIZE(dev->pending)) + dev->pending_index = 0; + /* Save any requests still in the new queue, they have timed out. */ + last = &dev->pending[dev->pending_index]; + while (!list_empty(last)) { + qp = list_entry(last->next, struct ipath_qp, timerwait); + if (last->next == LIST_POISON1 || + last->next != &qp->timerwait || + qp->timerwait.prev != last) { + INIT_LIST_HEAD(last); + } else { + list_del(&qp->timerwait); + qp->timerwait.prev = (struct list_head *) resend; + resend = qp; + atomic_inc(&qp->refcount); + } + } + last = &dev->rnrwait; + if (!list_empty(last)) { + qp = list_entry(last->next, struct ipath_qp, timerwait); + if (--qp->s_rnr_timeout == 0) { + do { + if (last->next == LIST_POISON1 || + last->next != &qp->timerwait || + qp->timerwait.prev != last) { + INIT_LIST_HEAD(last); + break; + } + list_del(&qp->timerwait); + qp->timerwait.prev = + (struct list_head *) rnr; + rnr = qp; + if (list_empty(last)) + break; + qp = list_entry(last->next, struct ipath_qp, + timerwait); + } while (qp->s_rnr_timeout == 0); + } + } + /* + * We should only be in the started state if pma_sample_start != 0 + */ + if (dev->pma_sample_status == IB_PMA_SAMPLE_STATUS_STARTED && + --dev->pma_sample_start == 0) { + dev->pma_sample_status = IB_PMA_SAMPLE_STATUS_RUNNING; + ipath_layer_snapshot_counters(dev->dd, &dev->ipath_sword, + &dev->ipath_rword, + &dev->ipath_spkts, + &dev->ipath_rpkts, + &dev->ipath_xmit_wait); + } + if (dev->pma_sample_status == IB_PMA_SAMPLE_STATUS_RUNNING) { + if (dev->pma_sample_interval == 0) { + u64 ta, tb, tc, td, te; + + dev->pma_sample_status = IB_PMA_SAMPLE_STATUS_DONE; + ipath_layer_snapshot_counters(dev->dd, &ta, &tb, + &tc, &td, &te); + + dev->ipath_sword = ta - dev->ipath_sword; + dev->ipath_rword = tb - dev->ipath_rword; + dev->ipath_spkts = tc - dev->ipath_spkts; + dev->ipath_rpkts = td - dev->ipath_rpkts; + dev->ipath_xmit_wait = te - dev->ipath_xmit_wait; + } + else + dev->pma_sample_interval--; + } + spin_unlock_irqrestore(&dev->pending_lock, flags); + + /* XXX What if timer fires again while this is running? */ + for (qp = resend; qp != NULL; + qp = (struct ipath_qp *) qp->timerwait.prev) { + struct ib_wc wc; + + spin_lock_irqsave(&qp->s_lock, flags); + if (qp->s_last != qp->s_tail && qp->state == IB_QPS_RTS) { + dev->n_timeouts++; + ipath_restart_rc(qp, qp->s_last_psn + 1, &wc); + } + spin_unlock_irqrestore(&qp->s_lock, flags); + + /* Notify ipath_destroy_qp() if it is waiting. */ + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + } + for (qp = rnr; qp != NULL; + qp = (struct ipath_qp *) qp->timerwait.prev) + tasklet_hi_schedule(&qp->s_task); +} + +/** + * ipath_ib_piobufavail - callback when a PIO buffer is available + * @arg: the device pointer + * + * This is called from ipath_intr() at interrupt level when a PIO buffer is + * available after ipath_verbs_send() returned an error that no buffers were + * available. Return 0 if we consumed all the PIO buffers and we still have + * QPs waiting for buffers (for now, just do a tasklet_hi_schedule and + * return one). + */ +static int ipath_ib_piobufavail(void *arg) +{ + struct ipath_ibdev *dev = (struct ipath_ibdev *) arg; + struct ipath_qp *qp; + unsigned long flags; + + if (dev == NULL) + goto bail; + + spin_lock_irqsave(&dev->pending_lock, flags); + while (!list_empty(&dev->piowait)) { + qp = list_entry(dev->piowait.next, struct ipath_qp, + piowait); + list_del(&qp->piowait); + tasklet_hi_schedule(&qp->s_task); + } + spin_unlock_irqrestore(&dev->pending_lock, flags); + +bail: + return 1; +} + +static int ipath_query_device(struct ib_device *ibdev, + struct ib_device_attr *props) +{ + struct ipath_ibdev *dev = to_idev(ibdev); + u32 vendor, boardrev, majrev, minrev; + + memset(props, 0, sizeof(*props)); + + props->device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR | + IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT | + IB_DEVICE_SYS_IMAGE_GUID; + ipath_layer_query_device(dev->dd, &vendor, &boardrev, + &majrev, &minrev); + props->vendor_id = vendor; + props->vendor_part_id = boardrev; + props->hw_ver = boardrev << 16 | majrev << 8 | minrev; + + props->sys_image_guid = dev->sys_image_guid; + + props->max_mr_size = ~0ull; + props->max_qp = 0xffff; + props->max_qp_wr = 0xffff; + props->max_sge = 255; + props->max_cq = 0xffff; + props->max_cqe = 0xffff; + props->max_mr = 0xffff; + props->max_pd = 0xffff; + props->max_qp_rd_atom = 1; + props->max_qp_init_rd_atom = 1; + /* props->max_res_rd_atom */ + props->max_srq = 0xffff; + props->max_srq_wr = 0xffff; + props->max_srq_sge = 255; + /* props->local_ca_ack_delay */ + props->atomic_cap = IB_ATOMIC_HCA; + props->max_pkeys = ipath_layer_get_npkeys(dev->dd); + props->max_mcast_grp = 0xffff; + props->max_mcast_qp_attach = 0xffff; + props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * + props->max_mcast_grp; + + return 0; +} + +const u8 ipath_cvt_physportstate[16] = { + [INFINIPATH_IBCS_LT_STATE_DISABLED] = 3, + [INFINIPATH_IBCS_LT_STATE_LINKUP] = 5, + [INFINIPATH_IBCS_LT_STATE_POLLACTIVE] = 2, + [INFINIPATH_IBCS_LT_STATE_POLLQUIET] = 2, + [INFINIPATH_IBCS_LT_STATE_SLEEPDELAY] = 1, + [INFINIPATH_IBCS_LT_STATE_SLEEPQUIET] = 1, + [INFINIPATH_IBCS_LT_STATE_CFGDEBOUNCE] = 4, + [INFINIPATH_IBCS_LT_STATE_CFGRCVFCFG] = 4, + [INFINIPATH_IBCS_LT_STATE_CFGWAITRMT] = 4, + [INFINIPATH_IBCS_LT_STATE_CFGIDLE] = 4, + [INFINIPATH_IBCS_LT_STATE_RECOVERRETRAIN] = 6, + [INFINIPATH_IBCS_LT_STATE_RECOVERWAITRMT] = 6, + [INFINIPATH_IBCS_LT_STATE_RECOVERIDLE] = 6, +}; + +static int ipath_query_port(struct ib_device *ibdev, + u8 port, struct ib_port_attr *props) +{ + struct ipath_ibdev *dev = to_idev(ibdev); + enum ib_mtu mtu; + u16 lid = ipath_layer_get_lid(dev->dd); + u64 ibcstat; + + memset(props, 0, sizeof(*props)); + props->lid = lid ? lid : __constant_be16_to_cpu(IB_LID_PERMISSIVE); + props->lmc = dev->mkeyprot_resv_lmc & 7; + props->sm_lid = dev->sm_lid; + props->sm_sl = dev->sm_sl; + ibcstat = ipath_layer_get_lastibcstat(dev->dd); + props->state = ((ibcstat >> 4) & 0x3) + 1; + /* See phys_state_show() */ + props->phys_state = ipath_cvt_physportstate[ + ipath_layer_get_lastibcstat(dev->dd) & 0xf]; + props->port_cap_flags = dev->port_cap_flags; + props->gid_tbl_len = 1; + props->max_msg_sz = 4096; + props->pkey_tbl_len = ipath_layer_get_npkeys(dev->dd); + props->bad_pkey_cntr = ipath_layer_get_cr_errpkey(dev->dd) - + dev->n_pkey_violations; + props->qkey_viol_cntr = dev->qkey_violations; + props->active_width = IB_WIDTH_4X; + /* See rate_show() */ + props->active_speed = 1; /* Regular 10Mbs speed. */ + props->max_vl_num = 1; /* VLCap = VL0 */ + props->init_type_reply = 0; + + props->max_mtu = IB_MTU_4096; + switch (ipath_layer_get_ibmtu(dev->dd)) { + case 4096: + mtu = IB_MTU_4096; + break; + case 2048: + mtu = IB_MTU_2048; + break; + case 1024: + mtu = IB_MTU_1024; + break; + case 512: + mtu = IB_MTU_512; + break; + case 256: + mtu = IB_MTU_256; + break; + default: + mtu = IB_MTU_2048; + } + props->active_mtu = mtu; + props->subnet_timeout = dev->subnet_timeout; + + return 0; +} + +static int ipath_modify_device(struct ib_device *device, + int device_modify_mask, + struct ib_device_modify *device_modify) +{ + int ret; + + if (device_modify_mask & ~(IB_DEVICE_MODIFY_SYS_IMAGE_GUID | + IB_DEVICE_MODIFY_NODE_DESC)) { + ret = -EOPNOTSUPP; + goto bail; + } + + if (device_modify_mask & IB_DEVICE_MODIFY_NODE_DESC) + memcpy(device->node_desc, device_modify->node_desc, 64); + + if (device_modify_mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID) + to_idev(device)->sys_image_guid = + cpu_to_be64(device_modify->sys_image_guid); + + ret = 0; + +bail: + return ret; +} + +static int ipath_modify_port(struct ib_device *ibdev, + u8 port, int port_modify_mask, + struct ib_port_modify *props) +{ + struct ipath_ibdev *dev = to_idev(ibdev); + + dev->port_cap_flags |= props->set_port_cap_mask; + dev->port_cap_flags &= ~props->clr_port_cap_mask; + if (port_modify_mask & IB_PORT_SHUTDOWN) + ipath_layer_set_linkstate(dev->dd, IPATH_IB_LINKDOWN); + if (port_modify_mask & IB_PORT_RESET_QKEY_CNTR) + dev->qkey_violations = 0; + return 0; +} + +static int ipath_query_gid(struct ib_device *ibdev, u8 port, + int index, union ib_gid *gid) +{ + struct ipath_ibdev *dev = to_idev(ibdev); + int ret; + + if (index >= 1) { + ret = -EINVAL; + goto bail; + } + gid->global.subnet_prefix = dev->gid_prefix; + gid->global.interface_id = ipath_layer_get_guid(dev->dd); + + ret = 0; + +bail: + return ret; +} + +static struct ib_pd *ipath_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct ipath_pd *pd; + struct ib_pd *ret; + + pd = kmalloc(sizeof *pd, GFP_KERNEL); + if (!pd) { + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + /* ib_alloc_pd() will initialize pd->ibpd. */ + pd->user = udata != NULL; + + ret = &pd->ibpd; + +bail: + return ret; +} + +static int ipath_dealloc_pd(struct ib_pd *ibpd) +{ + struct ipath_pd *pd = to_ipd(ibpd); + + kfree(pd); + + return 0; +} + +/** + * ipath_create_ah - create an address handle + * @pd: the protection domain + * @ah_attr: the attributes of the AH + * + * This may be called from interrupt context. + */ +static struct ib_ah *ipath_create_ah(struct ib_pd *pd, + struct ib_ah_attr *ah_attr) +{ + struct ipath_ah *ah; + struct ib_ah *ret; + + /* A multicast address requires a GRH (see ch. 8.4.1). */ + if (ah_attr->dlid >= IPS_MULTICAST_LID_BASE && + ah_attr->dlid != IPS_PERMISSIVE_LID && + !(ah_attr->ah_flags & IB_AH_GRH)) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + + ah = kmalloc(sizeof *ah, GFP_ATOMIC); + if (!ah) { + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + /* ib_create_ah() will initialize ah->ibah. */ + ah->attr = *ah_attr; + + ret = &ah->ibah; + +bail: + return ret; +} + +/** + * ipath_destroy_ah - destroy an address handle + * @ibah: the AH to destroy + * + * This may be called from interrupt context. + */ +static int ipath_destroy_ah(struct ib_ah *ibah) +{ + struct ipath_ah *ah = to_iah(ibah); + + kfree(ah); + + return 0; +} + +static int ipath_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr) +{ + struct ipath_ah *ah = to_iah(ibah); + + *ah_attr = ah->attr; + + return 0; +} + +static int ipath_query_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey) +{ + struct ipath_ibdev *dev = to_idev(ibdev); + int ret; + + if (index >= ipath_layer_get_npkeys(dev->dd)) { + ret = -EINVAL; + goto bail; + } + + *pkey = ipath_layer_get_pkey(dev->dd, index); + ret = 0; + +bail: + return ret; +} + + +/** + * ipath_alloc_ucontext - allocate a ucontest + * @ibdev: the infiniband device + * @udata: not used by the InfiniPath driver + */ + +static struct ib_ucontext *ipath_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + struct ipath_ucontext *context; + struct ib_ucontext *ret; + + context = kmalloc(sizeof *context, GFP_KERNEL); + if (!context) { + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + ret = &context->ibucontext; + +bail: + return ret; +} + +static int ipath_dealloc_ucontext(struct ib_ucontext *context) +{ + kfree(to_iucontext(context)); + return 0; +} + +static int ipath_verbs_register_sysfs(struct ib_device *dev); + +/** + * ipath_register_ib_device - register our device with the infiniband core + * @unit: the device number to register + * @dd: the device data structure + * Return the allocated ipath_ibdev pointer or NULL on error. + */ +static void *ipath_register_ib_device(int unit, struct ipath_devdata *dd) +{ + struct ipath_ibdev *idev; + struct ib_device *dev; + int ret; + + idev = (struct ipath_ibdev *)ib_alloc_device(sizeof *idev); + if (idev == NULL) + goto bail; + + dev = &idev->ibdev; + + /* Only need to initialize non-zero fields. */ + spin_lock_init(&idev->qp_table.lock); + spin_lock_init(&idev->lk_table.lock); + idev->sm_lid = __constant_be16_to_cpu(IB_LID_PERMISSIVE); + /* Set the prefix to the default value (see ch. 4.1.1) */ + idev->gid_prefix = __constant_cpu_to_be64(0xfe80000000000000ULL); + + ret = ipath_init_qp_table(idev, ib_ipath_qp_table_size); + if (ret) + goto err_qp; + + /* + * The top ib_ipath_lkey_table_size bits are used to index the + * table. The lower 8 bits can be owned by the user (copied from + * the LKEY). The remaining bits act as a generation number or tag. + */ + idev->lk_table.max = 1 << ib_ipath_lkey_table_size; + idev->lk_table.table = kzalloc(idev->lk_table.max * + sizeof(*idev->lk_table.table), + GFP_KERNEL); + if (idev->lk_table.table == NULL) { + ret = -ENOMEM; + goto err_lk; + } + spin_lock_init(&idev->pending_lock); + INIT_LIST_HEAD(&idev->pending[0]); + INIT_LIST_HEAD(&idev->pending[1]); + INIT_LIST_HEAD(&idev->pending[2]); + INIT_LIST_HEAD(&idev->piowait); + INIT_LIST_HEAD(&idev->rnrwait); + idev->pending_index = 0; + idev->port_cap_flags = + IB_PORT_SYS_IMAGE_GUID_SUP | IB_PORT_CLIENT_REG_SUP; + idev->pma_counter_select[0] = IB_PMA_PORT_XMIT_DATA; + idev->pma_counter_select[1] = IB_PMA_PORT_RCV_DATA; + idev->pma_counter_select[2] = IB_PMA_PORT_XMIT_PKTS; + idev->pma_counter_select[3] = IB_PMA_PORT_RCV_PKTS; + idev->pma_counter_select[5] = IB_PMA_PORT_XMIT_WAIT; + idev->link_width_enabled = 3; /* 1x or 4x */ + + /* + * The system image GUID is supposed to be the same for all + * IB HCAs in a single system but since there can be other + * device types in the system, we can't be sure this is unique. + */ + if (!sys_image_guid) + sys_image_guid = ipath_layer_get_guid(dd); + idev->sys_image_guid = sys_image_guid; + idev->ib_unit = unit; + idev->dd = dd; + + strlcpy(dev->name, "ipath%d", IB_DEVICE_NAME_MAX); + dev->node_guid = ipath_layer_get_guid(dd); + dev->uverbs_abi_ver = IPATH_UVERBS_ABI_VERSION; + dev->uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_CREATE_AH) | + (1ull << IB_USER_VERBS_CMD_DESTROY_AH) | + (1ull << IB_USER_VERBS_CMD_QUERY_AH) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_POLL_CQ) | + (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_QUERY_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_POST_SEND) | + (1ull << IB_USER_VERBS_CMD_POST_RECV) | + (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | + (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | + (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | + (1ull << IB_USER_VERBS_CMD_POST_SRQ_RECV); + dev->node_type = IB_NODE_CA; + dev->phys_port_cnt = 1; + dev->dma_device = ipath_layer_get_device(dd); + dev->class_dev.dev = dev->dma_device; + dev->query_device = ipath_query_device; + dev->modify_device = ipath_modify_device; + dev->query_port = ipath_query_port; + dev->modify_port = ipath_modify_port; + dev->query_pkey = ipath_query_pkey; + dev->query_gid = ipath_query_gid; + dev->alloc_ucontext = ipath_alloc_ucontext; + dev->dealloc_ucontext = ipath_dealloc_ucontext; + dev->alloc_pd = ipath_alloc_pd; + dev->dealloc_pd = ipath_dealloc_pd; + dev->create_ah = ipath_create_ah; + dev->destroy_ah = ipath_destroy_ah; + dev->query_ah = ipath_query_ah; + dev->create_srq = ipath_create_srq; + dev->modify_srq = ipath_modify_srq; + dev->query_srq = ipath_query_srq; + dev->destroy_srq = ipath_destroy_srq; + dev->create_qp = ipath_create_qp; + dev->modify_qp = ipath_modify_qp; + dev->query_qp = ipath_query_qp; + dev->destroy_qp = ipath_destroy_qp; + dev->post_send = ipath_post_send; + dev->post_recv = ipath_post_receive; + dev->post_srq_recv = ipath_post_srq_receive; + dev->create_cq = ipath_create_cq; + dev->destroy_cq = ipath_destroy_cq; + dev->resize_cq = ipath_resize_cq; + dev->poll_cq = ipath_poll_cq; + dev->req_notify_cq = ipath_req_notify_cq; + dev->get_dma_mr = ipath_get_dma_mr; + dev->reg_phys_mr = ipath_reg_phys_mr; + dev->reg_user_mr = ipath_reg_user_mr; + dev->dereg_mr = ipath_dereg_mr; + dev->alloc_fmr = ipath_alloc_fmr; + dev->map_phys_fmr = ipath_map_phys_fmr; + dev->unmap_fmr = ipath_unmap_fmr; + dev->dealloc_fmr = ipath_dealloc_fmr; + dev->attach_mcast = ipath_multicast_attach; + dev->detach_mcast = ipath_multicast_detach; + dev->process_mad = ipath_process_mad; + + snprintf(dev->node_desc, sizeof(dev->node_desc), + IPATH_IDSTR " %s kernel_SMA", system_utsname.nodename); + + ret = ib_register_device(dev); + if (ret) + goto err_reg; + + if (ipath_verbs_register_sysfs(dev)) + goto err_class; + + ipath_layer_enable_timer(dd); + + goto bail; + +err_class: + ib_unregister_device(dev); +err_reg: + kfree(idev->lk_table.table); +err_lk: + kfree(idev->qp_table.table); +err_qp: + ib_dealloc_device(dev); + _VERBS_ERROR("ib_ipath%d cannot register verbs (%d)!\n", + unit, -ret); + idev = NULL; + +bail: + return idev; +} + +static void ipath_unregister_ib_device(void *arg) +{ + struct ipath_ibdev *dev = (struct ipath_ibdev *) arg; + struct ib_device *ibdev = &dev->ibdev; + + ipath_layer_disable_timer(dev->dd); + + ib_unregister_device(ibdev); + + if (!list_empty(&dev->pending[0]) || + !list_empty(&dev->pending[1]) || + !list_empty(&dev->pending[2])) + _VERBS_ERROR("ipath%d pending list not empty!\n", + dev->ib_unit); + if (!list_empty(&dev->piowait)) + _VERBS_ERROR("ipath%d piowait list not empty!\n", + dev->ib_unit); + if (!list_empty(&dev->rnrwait)) + _VERBS_ERROR("ipath%d rnrwait list not empty!\n", + dev->ib_unit); + if (!ipath_mcast_tree_empty()) + _VERBS_ERROR("ipath%d multicast table memory leak!\n", + dev->ib_unit); + /* + * Note that ipath_unregister_ib_device() can be called before all + * the QPs are destroyed! + */ + ipath_free_all_qps(&dev->qp_table); + kfree(dev->qp_table.table); + kfree(dev->lk_table.table); + ib_dealloc_device(ibdev); +} + +static int __init ipath_verbs_init(void) +{ + return ipath_verbs_register(ipath_register_ib_device, + ipath_unregister_ib_device, + ipath_ib_piobufavail, ipath_ib_rcv, + ipath_ib_timer); +} + +static void __exit ipath_verbs_cleanup(void) +{ + ipath_verbs_unregister(); +} + +static ssize_t show_rev(struct class_device *cdev, char *buf) +{ + struct ipath_ibdev *dev = + container_of(cdev, struct ipath_ibdev, ibdev.class_dev); + int vendor, boardrev, majrev, minrev; + + ipath_layer_query_device(dev->dd, &vendor, &boardrev, + &majrev, &minrev); + return sprintf(buf, "%d.%d\n", majrev, minrev); +} + +static ssize_t show_hca(struct class_device *cdev, char *buf) +{ + struct ipath_ibdev *dev = + container_of(cdev, struct ipath_ibdev, ibdev.class_dev); + int ret; + + ret = ipath_layer_get_boardname(dev->dd, buf, 128); + if (ret < 0) + goto bail; + strcat(buf, "\n"); + ret = strlen(buf); + +bail: + return ret; +} + +static ssize_t show_stats(struct class_device *cdev, char *buf) +{ + struct ipath_ibdev *dev = + container_of(cdev, struct ipath_ibdev, ibdev.class_dev); + int i; + int len; + + len = sprintf(buf, + "RC resends %d\n" + "RC QACKs %d\n" + "RC ACKs %d\n" + "RC SEQ NAKs %d\n" + "RC RDMA seq %d\n" + "RC RNR NAKs %d\n" + "RC OTH NAKs %d\n" + "RC timeouts %d\n" + "RC RDMA dup %d\n" + "piobuf wait %d\n" + "no piobuf %d\n" + "PKT drops %d\n" + "WQE errs %d\n", + dev->n_rc_resends, dev->n_rc_qacks, dev->n_rc_acks, + dev->n_seq_naks, dev->n_rdma_seq, dev->n_rnr_naks, + dev->n_other_naks, dev->n_timeouts, + dev->n_rdma_dup_busy, dev->n_piowait, + dev->n_no_piobuf, dev->n_pkt_drops, dev->n_wqe_errs); + for (i = 0; i < ARRAY_SIZE(dev->opstats); i++) { + const struct ipath_opcode_stats *si = &dev->opstats[i]; + + if (!si->n_packets && !si->n_bytes) + continue; + len += sprintf(buf + len, "%02x %llu/%llu\n", i, + (unsigned long long) si->n_packets, + (unsigned long long) si->n_bytes); + } + return len; +} + +static CLASS_DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static CLASS_DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); +static CLASS_DEVICE_ATTR(board_id, S_IRUGO, show_hca, NULL); +static CLASS_DEVICE_ATTR(stats, S_IRUGO, show_stats, NULL); + +static struct class_device_attribute *ipath_class_attributes[] = { + &class_device_attr_hw_rev, + &class_device_attr_hca_type, + &class_device_attr_board_id, + &class_device_attr_stats +}; + +static int ipath_verbs_register_sysfs(struct ib_device *dev) +{ + int i; + int ret; + + for (i = 0; i < ARRAY_SIZE(ipath_class_attributes); ++i) + if (class_device_create_file(&dev->class_dev, + ipath_class_attributes[i])) { + ret = 1; + goto bail; + } + + ret = 0; + +bail: + return ret; +} + +module_init(ipath_verbs_init); +module_exit(ipath_verbs_cleanup); diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.h b/drivers/infiniband/hw/ipath/ipath_verbs.h new file mode 100644 index 0000000..fcafbc7 --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_verbs.h @@ -0,0 +1,692 @@ +/* + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef IPATH_VERBS_H +#define IPATH_VERBS_H + +#include <linux/types.h> +#include <linux/spinlock.h> +#include <linux/kernel.h> +#include <linux/interrupt.h> +#include <rdma/ib_pack.h> + +#include "ipath_layer.h" +#include "verbs_debug.h" + +#define QPN_MAX (1 << 24) +#define QPNMAP_ENTRIES (QPN_MAX / PAGE_SIZE / BITS_PER_BYTE) + +/* + * Increment this value if any changes that break userspace ABI + * compatibility are made. + */ +#define IPATH_UVERBS_ABI_VERSION 1 + +/* + * Define an ib_cq_notify value that is not valid so we know when CQ + * notifications are armed. + */ +#define IB_CQ_NONE (IB_CQ_NEXT_COMP + 1) + +#define IB_RNR_NAK 0x20 +#define IB_NAK_PSN_ERROR 0x60 +#define IB_NAK_INVALID_REQUEST 0x61 +#define IB_NAK_REMOTE_ACCESS_ERROR 0x62 +#define IB_NAK_REMOTE_OPERATIONAL_ERROR 0x63 +#define IB_NAK_INVALID_RD_REQUEST 0x64 + +#define IPATH_POST_SEND_OK 0x01 +#define IPATH_POST_RECV_OK 0x02 +#define IPATH_PROCESS_RECV_OK 0x04 +#define IPATH_PROCESS_SEND_OK 0x08 + +/* IB Performance Manager status values */ +#define IB_PMA_SAMPLE_STATUS_DONE 0x00 +#define IB_PMA_SAMPLE_STATUS_STARTED 0x01 +#define IB_PMA_SAMPLE_STATUS_RUNNING 0x02 + +/* Mandatory IB performance counter select values. */ +#define IB_PMA_PORT_XMIT_DATA __constant_htons(0x0001) +#define IB_PMA_PORT_RCV_DATA __constant_htons(0x0002) +#define IB_PMA_PORT_XMIT_PKTS __constant_htons(0x0003) +#define IB_PMA_PORT_RCV_PKTS __constant_htons(0x0004) +#define IB_PMA_PORT_XMIT_WAIT __constant_htons(0x0005) + +struct ib_reth { + __be64 vaddr; + __be32 rkey; + __be32 length; +} __attribute__ ((packed)); + +struct ib_atomic_eth { + __be64 vaddr; + __be32 rkey; + __be64 swap_data; + __be64 compare_data; +} __attribute__ ((packed)); + +struct ipath_other_headers { + __be32 bth[3]; + union { + struct { + __be32 deth[2]; + __be32 imm_data; + } ud; + struct { + struct ib_reth reth; + __be32 imm_data; + } rc; + struct { + __be32 aeth; + __be64 atomic_ack_eth; + } at; + __be32 imm_data; + __be32 aeth; + struct ib_atomic_eth atomic_eth; + } u; +} __attribute__ ((packed)); + +/* + * Note that UD packets with a GRH header are 8+40+12+8 = 68 bytes + * long (72 w/ imm_data). Only the first 56 bytes of the IB header + * will be in the eager header buffer. The remaining 12 or 16 bytes + * are in the data buffer. + */ +struct ipath_ib_header { + __be16 lrh[4]; + union { + struct { + struct ib_grh grh; + struct ipath_other_headers oth; + } l; + struct ipath_other_headers oth; + } u; +} __attribute__ ((packed)); + +/* + * There is one struct ipath_mcast for each multicast GID. + * All attached QPs are then stored as a list of + * struct ipath_mcast_qp. + */ +struct ipath_mcast_qp { + struct list_head list; + struct ipath_qp *qp; +}; + +struct ipath_mcast { + struct rb_node rb_node; + union ib_gid mgid; + struct list_head qp_list; + wait_queue_head_t wait; + atomic_t refcount; +}; + +/* Memory region */ +struct ipath_mr { + struct ib_mr ibmr; + struct ipath_mregion mr; /* must be last */ +}; + +/* Fast memory region */ +struct ipath_fmr { + struct ib_fmr ibfmr; + u8 page_shift; + struct ipath_mregion mr; /* must be last */ +}; + +/* Protection domain */ +struct ipath_pd { + struct ib_pd ibpd; + int user; /* non-zero if created from user space */ +}; + +/* Address Handle */ +struct ipath_ah { + struct ib_ah ibah; + struct ib_ah_attr attr; +}; + +/* + * Quick description of our CQ/QP locking scheme: + * + * We have one global lock that protects dev->cq/qp_table. Each + * struct ipath_cq/qp also has its own lock. An individual qp lock + * may be taken inside of an individual cq lock. Both cqs attached to + * a qp may be locked, with the send cq locked first. No other + * nesting should be done. + * + * Each struct ipath_cq/qp also has an atomic_t ref count. The + * pointer from the cq/qp_table to the struct counts as one reference. + * This reference also is good for access through the consumer API, so + * modifying the CQ/QP etc doesn't need to take another reference. + * Access because of a completion being polled does need a reference. + * + * Finally, each struct ipath_cq/qp has a wait_queue_head_t for the + * destroy function to sleep on. + * + * This means that access from the consumer API requires nothing but + * taking the struct's lock. + * + * Access because of a completion event should go as follows: + * - lock cq/qp_table and look up struct + * - increment ref count in struct + * - drop cq/qp_table lock + * - lock struct, do your thing, and unlock struct + * - decrement ref count; if zero, wake up waiters + * + * To destroy a CQ/QP, we can do the following: + * - lock cq/qp_table, remove pointer, unlock cq/qp_table lock + * - decrement ref count + * - wait_event until ref count is zero + * + * It is the consumer's responsibilty to make sure that no QP + * operations (WQE posting or state modification) are pending when the + * QP is destroyed. Also, the consumer must make sure that calls to + * qp_modify are serialized. + * + * Possible optimizations (wait for profile data to see if/where we + * have locks bouncing between CPUs): + * - split cq/qp table lock into n separate (cache-aligned) locks, + * indexed (say) by the page in the table + */ + +struct ipath_cq { + struct ib_cq ibcq; + struct tasklet_struct comptask; + spinlock_t lock; + u8 notify; + u8 triggered; + u32 head; /* new records added to the head */ + u32 tail; /* poll_cq() reads from here. */ + struct ib_wc *queue; /* this is actually ibcq.cqe + 1 */ +}; + +/* + * Send work request queue entry. + * The size of the sg_list is determined when the QP is created and stored + * in qp->s_max_sge. + */ +struct ipath_swqe { + struct ib_send_wr wr; /* don't use wr.sg_list */ + u32 psn; /* first packet sequence number */ + u32 lpsn; /* last packet sequence number */ + u32 ssn; /* send sequence number */ + u32 length; /* total length of data in sg_list */ + struct ipath_sge sg_list[0]; +}; + +/* + * Receive work request queue entry. + * The size of the sg_list is determined when the QP is created and stored + * in qp->r_max_sge. + */ +struct ipath_rwqe { + u64 wr_id; + u32 length; /* total length of data in sg_list */ + u8 num_sge; + struct ipath_sge sg_list[0]; +}; + +struct ipath_rq { + spinlock_t lock; + u32 head; /* new work requests posted to the head */ + u32 tail; /* receives pull requests from here. */ + u32 size; /* size of RWQE array */ + u8 max_sge; + struct ipath_rwqe *wq; /* RWQE array */ +}; + +struct ipath_srq { + struct ib_srq ibsrq; + struct ipath_rq rq; + /* send signal when number of RWQEs < limit */ + u32 limit; +}; + +/* + * Variables prefixed with s_ are for the requester (sender). + * Variables prefixed with r_ are for the responder (receiver). + * Variables prefixed with ack_ are for responder replies. + * + * Common variables are protected by both r_rq.lock and s_lock in that order + * which only happens in modify_qp() or changing the QP 'state'. + */ +struct ipath_qp { + struct ib_qp ibqp; + struct ipath_qp *next; /* link list for QPN hash table */ + struct list_head piowait; /* link for wait PIO buf */ + struct list_head timerwait; /* link for waiting for timeouts */ + struct ib_ah_attr remote_ah_attr; + struct ipath_ib_header s_hdr; /* next packet header to send */ + atomic_t refcount; + wait_queue_head_t wait; + struct tasklet_struct s_task; + struct ipath_sge_state *s_cur_sge; + struct ipath_sge_state s_sge; /* current send request data */ + /* current RDMA read send data */ + struct ipath_sge_state s_rdma_sge; + struct ipath_sge_state r_sge; /* current receive data */ + spinlock_t s_lock; + unsigned long s_flags; + u32 s_hdrwords; /* size of s_hdr in 32 bit words */ + u32 s_cur_size; /* size of send packet in bytes */ + u32 s_len; /* total length of s_sge */ + u32 s_rdma_len; /* total length of s_rdma_sge */ + u32 s_next_psn; /* PSN for next request */ + u32 s_last_psn; /* last response PSN processed */ + u32 s_psn; /* current packet sequence number */ + u32 s_rnr_timeout; /* number of milliseconds for RNR timeout */ + u32 s_ack_psn; /* PSN for next ACK or RDMA_READ */ + u64 s_ack_atomic; /* data for atomic ACK */ + u64 r_wr_id; /* ID for current receive WQE */ + u64 r_atomic_data; /* data for last atomic op */ + u32 r_atomic_psn; /* PSN of last atomic op */ + u32 r_len; /* total length of r_sge */ + u32 r_rcv_len; /* receive data len processed */ + u32 r_psn; /* expected rcv packet sequence number */ + u8 state; /* QP state */ + u8 s_state; /* opcode of last packet sent */ + u8 s_ack_state; /* opcode of packet to ACK */ + u8 s_nak_state; /* non-zero if NAK is pending */ + u8 r_state; /* opcode of last packet received */ + u8 r_reuse_sge; /* for UC receive errors */ + u8 r_sge_inx; /* current index into sg_list */ + u8 s_max_sge; /* size of s_wq->sg_list */ + u8 qp_access_flags; + u8 s_retry_cnt; /* number of times to retry */ + u8 s_rnr_retry_cnt; + u8 s_min_rnr_timer; + u8 s_retry; /* requester retry counter */ + u8 s_rnr_retry; /* requester RNR retry counter */ + u8 s_pkey_index; /* PKEY index to use */ + enum ib_mtu path_mtu; + atomic_t msn; /* message sequence number */ + u32 remote_qpn; + u32 qkey; /* QKEY for this QP (for UD or RD) */ + u32 s_size; /* send work queue size */ + u32 s_head; /* new entries added here */ + u32 s_tail; /* next entry to process */ + u32 s_cur; /* current work queue entry */ + u32 s_last; /* last un-ACK'ed entry */ + u32 s_ssn; /* SSN of tail entry */ + u32 s_lsn; /* limit sequence number (credit) */ + struct ipath_swqe *s_wq; /* send work queue */ + struct ipath_rq r_rq; /* receive work queue */ +}; + +/* + * Bit definitions for s_flags. + */ +#define IPATH_S_BUSY 0 +#define IPATH_S_SIGNAL_REQ_WR 1 + +/* + * Since struct ipath_swqe is not a fixed size, we can't simply index into + * struct ipath_qp.s_wq. This function does the array index computation. + */ +static inline struct ipath_swqe *get_swqe_ptr(struct ipath_qp *qp, + unsigned n) +{ + return (struct ipath_swqe *)((char *)qp->s_wq + + (sizeof(struct ipath_swqe) + + qp->s_max_sge * + sizeof(struct ipath_sge)) * n); +} + +/* + * Since struct ipath_rwqe is not a fixed size, we can't simply index into + * struct ipath_rq.wq. This function does the array index computation. + */ +static inline struct ipath_rwqe *get_rwqe_ptr(struct ipath_rq *rq, + unsigned n) +{ + return (struct ipath_rwqe *) + ((char *) rq->wq + + (sizeof(struct ipath_rwqe) + + rq->max_sge * sizeof(struct ipath_sge)) * n); +} + +/* + * QPN-map pages start out as NULL, they get allocated upon + * first use and are never deallocated. This way, + * large bitmaps are not allocated unless large numbers of QPs are used. + */ +struct qpn_map { + atomic_t n_free; + void *page; +}; + +struct ipath_qp_table { + spinlock_t lock; + u32 last; /* last QP number allocated */ + u32 max; /* size of the hash table */ + u32 nmaps; /* size of the map table */ + struct ipath_qp **table; + /* bit map of free numbers */ + struct qpn_map map[QPNMAP_ENTRIES]; +}; + +struct ipath_lkey_table { + spinlock_t lock; + u32 next; /* next unused index (speeds search) */ + u32 gen; /* generation count */ + u32 max; /* size of the table */ + struct ipath_mregion **table; +}; + +struct ipath_opcode_stats { + u64 n_packets; /* number of packets */ + u64 n_bytes; /* total number of bytes */ +}; + +struct ipath_ibdev { + struct ib_device ibdev; + struct list_head dev_list; + struct ipath_devdata *dd; + int ib_unit; /* This is the device number */ + u16 sm_lid; /* in host order */ + u8 sm_sl; + u8 mkeyprot_resv_lmc; + /* non-zero when timer is set */ + unsigned long mkey_lease_timeout; + + /* The following fields are really per port. */ + struct ipath_qp_table qp_table; + struct ipath_lkey_table lk_table; + struct list_head pending[3]; /* FIFO of QPs waiting for ACKs */ + struct list_head piowait; /* list for wait PIO buf */ + /* list of QPs waiting for RNR timer */ + struct list_head rnrwait; + spinlock_t pending_lock; + __be64 sys_image_guid; /* in network order */ + __be64 gid_prefix; /* in network order */ + __be64 mkey; + u64 ipath_sword; /* total dwords sent (sample result) */ + u64 ipath_rword; /* total dwords received (sample result) */ + u64 ipath_spkts; /* total packets sent (sample result) */ + u64 ipath_rpkts; /* total packets received (sample result) */ + /* # of ticks no data sent (sample result) */ + u64 ipath_xmit_wait; + u64 rcv_errors; /* # of packets with SW detected rcv errs */ + u64 n_unicast_xmit; /* total unicast packets sent */ + u64 n_unicast_rcv; /* total unicast packets received */ + u64 n_multicast_xmit; /* total multicast packets sent */ + u64 n_multicast_rcv; /* total multicast packets received */ + u64 n_symbol_error_counter; /* starting count for PMA */ + u64 n_link_error_recovery_counter; /* starting count for PMA */ + u64 n_link_downed_counter; /* starting count for PMA */ + u64 n_port_rcv_errors; /* starting count for PMA */ + u64 n_port_rcv_remphys_errors; /* starting count for PMA */ + u64 n_port_xmit_discards; /* starting count for PMA */ + u64 n_port_xmit_data; /* starting count for PMA */ + u64 n_port_rcv_data; /* starting count for PMA */ + u64 n_port_xmit_packets; /* starting count for PMA */ + u64 n_port_rcv_packets; /* starting count for PMA */ + u32 n_pkey_violations; /* starting count for PMA */ + u32 n_rc_resends; + u32 n_rc_acks; + u32 n_rc_qacks; + u32 n_seq_naks; + u32 n_rdma_seq; + u32 n_rnr_naks; + u32 n_other_naks; + u32 n_timeouts; + u32 n_pkt_drops; + u32 n_wqe_errs; + u32 n_rdma_dup_busy; + u32 n_piowait; + u32 n_no_piobuf; + u32 port_cap_flags; + u32 pma_sample_start; + u32 pma_sample_interval; + __be16 pma_counter_select[5]; + u16 pma_tag; + u16 qkey_violations; + u16 mkey_violations; + u16 mkey_lease_period; + u16 pending_index; /* which pending queue is active */ + u8 pma_sample_status; + u8 subnet_timeout; + u8 link_width_enabled; + u8 vl_high_limit; + struct ipath_opcode_stats opstats[128]; +}; + +struct ipath_ucontext { + struct ib_ucontext ibucontext; +}; + +static inline struct ipath_mr *to_imr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct ipath_mr, ibmr); +} + +static inline struct ipath_fmr *to_ifmr(struct ib_fmr *ibfmr) +{ + return container_of(ibfmr, struct ipath_fmr, ibfmr); +} + +static inline struct ipath_pd *to_ipd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct ipath_pd, ibpd); +} + +static inline struct ipath_ah *to_iah(struct ib_ah *ibah) +{ + return container_of(ibah, struct ipath_ah, ibah); +} + +static inline struct ipath_cq *to_icq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct ipath_cq, ibcq); +} + +static inline struct ipath_srq *to_isrq(struct ib_srq *ibsrq) +{ + return container_of(ibsrq, struct ipath_srq, ibsrq); +} + +static inline struct ipath_qp *to_iqp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct ipath_qp, ibqp); +} + +static inline struct ipath_ibdev *to_idev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct ipath_ibdev, ibdev); +} + +int ipath_process_mad(struct ib_device *ibdev, + int mad_flags, + u8 port_num, + struct ib_wc *in_wc, + struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad); + +static inline struct ipath_ucontext *to_iucontext(struct ib_ucontext + *ibucontext) +{ + return container_of(ibucontext, struct ipath_ucontext, ibucontext); +} + +/* + * Compare the lower 24 bits of the two values. + * Returns an integer <, ==, or > than zero. + */ +static inline int ipath_cmp24(u32 a, u32 b) +{ + return (((int) a) - ((int) b)) << 8; +} + +struct ipath_mcast *ipath_mcast_find(union ib_gid *mgid); + +int ipath_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid); + +int ipath_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid); + +int ipath_mcast_tree_empty(void); + +__be32 ipath_compute_aeth(struct ipath_qp *qp); + +struct ipath_qp *ipath_lookup_qpn(struct ipath_qp_table *qpt, u32 qpn); + +struct ib_qp *ipath_create_qp(struct ib_pd *ibpd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata); + +int ipath_destroy_qp(struct ib_qp *ibqp); + +int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask); + +int ipath_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_qp_init_attr *init_attr); + +void ipath_free_all_qps(struct ipath_qp_table *qpt); + +int ipath_init_qp_table(struct ipath_ibdev *idev, int size); + +void ipath_sqerror_qp(struct ipath_qp *qp, struct ib_wc *wc); + +void ipath_get_credit(struct ipath_qp *qp, u32 aeth); + +void ipath_do_rc_send(unsigned long data); + +void ipath_do_uc_send(unsigned long data); + +void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int sig); + +int ipath_rkey_ok(struct ipath_ibdev *dev, struct ipath_sge_state *ss, + u32 len, u64 vaddr, u32 rkey, int acc); + +int ipath_lkey_ok(struct ipath_lkey_table *rkt, struct ipath_sge *isge, + struct ib_sge *sge, int acc); + +void ipath_copy_sge(struct ipath_sge_state *ss, void *data, u32 length); + +void ipath_skip_sge(struct ipath_sge_state *ss, u32 length); + +int ipath_post_rc_send(struct ipath_qp *qp, struct ib_send_wr *wr); + +void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, + int has_grh, void *data, u32 tlen, struct ipath_qp *qp); + +void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, + int has_grh, void *data, u32 tlen, struct ipath_qp *qp); + +void ipath_restart_rc(struct ipath_qp *qp, u32 psn, struct ib_wc *wc); + +int ipath_post_ud_send(struct ipath_qp *qp, struct ib_send_wr *wr); + +void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, + int has_grh, void *data, u32 tlen, struct ipath_qp *qp); + +int ipath_alloc_lkey(struct ipath_lkey_table *rkt, + struct ipath_mregion *mr); + +void ipath_free_lkey(struct ipath_lkey_table *rkt, u32 lkey); + +int ipath_lkey_ok(struct ipath_lkey_table *rkt, struct ipath_sge *isge, + struct ib_sge *sge, int acc); + +int ipath_rkey_ok(struct ipath_ibdev *dev, struct ipath_sge_state *ss, + u32 len, u64 vaddr, u32 rkey, int acc); + +int ipath_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); + +struct ib_srq *ipath_create_srq(struct ib_pd *ibpd, + struct ib_srq_init_attr *srq_init_attr, + struct ib_udata *udata); + +int ipath_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask); + +int ipath_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr); + +int ipath_destroy_srq(struct ib_srq *ibsrq); + +void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int sig); + +int ipath_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry); + +struct ib_cq *ipath_create_cq(struct ib_device *ibdev, int entries, + struct ib_ucontext *context, + struct ib_udata *udata); + +int ipath_destroy_cq(struct ib_cq *ibcq); + +int ipath_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify notify); + +int ipath_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata); + +struct ib_mr *ipath_get_dma_mr(struct ib_pd *pd, int acc); + +struct ib_mr *ipath_reg_phys_mr(struct ib_pd *pd, + struct ib_phys_buf *buffer_list, + int num_phys_buf, int acc, u64 *iova_start); + +struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, struct ib_umem *region, + int mr_access_flags, + struct ib_udata *udata); + +int ipath_dereg_mr(struct ib_mr *ibmr); + +struct ib_fmr *ipath_alloc_fmr(struct ib_pd *pd, int mr_access_flags, + struct ib_fmr_attr *fmr_attr); + +int ipath_map_phys_fmr(struct ib_fmr *ibfmr, u64 * page_list, + int list_len, u64 iova); + +int ipath_unmap_fmr(struct list_head *fmr_list); + +int ipath_dealloc_fmr(struct ib_fmr *ibfmr); + +void ipath_no_bufs_available(struct ipath_qp *qp, struct ipath_ibdev *dev); + +void ipath_insert_rnr_queue(struct ipath_qp *qp); + +int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only); + +void ipath_ruc_loopback(struct ipath_qp *sqp, struct ib_wc *wc); + +extern const enum ib_wc_opcode ib_ipath_wc_opcode[]; + +extern const u8 ipath_cvt_physportstate[]; + +extern const int ib_ipath_state_ops[]; + +extern unsigned int ib_ipath_lkey_table_size; + +extern const u32 ib_ipath_rnr_table[]; + +#endif /* IPATH_VERBS_H */ diff --git a/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c b/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c new file mode 100644 index 0000000..10b31d2 --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c @@ -0,0 +1,333 @@ +/* + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/list.h> +#include <linux/rcupdate.h> + +#include "ipath_verbs.h" + +/* + * Global table of GID to attached QPs. + * The table is global to all ipath devices since a send from one QP/device + * needs to be locally routed to any locally attached QPs on the same + * or different device. + */ +static struct rb_root mcast_tree; +static DEFINE_SPINLOCK(mcast_lock); + +/** + * ipath_mcast_qp_alloc - alloc a struct to link a QP to mcast GID struct + * @qp: the QP to link + */ +static struct ipath_mcast_qp *ipath_mcast_qp_alloc(struct ipath_qp *qp) +{ + struct ipath_mcast_qp *mqp; + + mqp = kmalloc(sizeof *mqp, GFP_KERNEL); + if (!mqp) + goto bail; + + mqp->qp = qp; + atomic_inc(&qp->refcount); + +bail: + return mqp; +} + +static void ipath_mcast_qp_free(struct ipath_mcast_qp *mqp) +{ + struct ipath_qp *qp = mqp->qp; + + /* Notify ipath_destroy_qp() if it is waiting. */ + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + + kfree(mqp); +} + +/** + * ipath_mcast_alloc - allocate the multicast GID structure + * @mgid: the multicast GID + * + * A list of QPs will be attached to this structure. + */ +static struct ipath_mcast *ipath_mcast_alloc(union ib_gid *mgid) +{ + struct ipath_mcast *mcast; + + mcast = kmalloc(sizeof *mcast, GFP_KERNEL); + if (!mcast) + goto bail; + + mcast->mgid = *mgid; + INIT_LIST_HEAD(&mcast->qp_list); + init_waitqueue_head(&mcast->wait); + atomic_set(&mcast->refcount, 0); + +bail: + return mcast; +} + +static void ipath_mcast_free(struct ipath_mcast *mcast) +{ + struct ipath_mcast_qp *p, *tmp; + + list_for_each_entry_safe(p, tmp, &mcast->qp_list, list) + ipath_mcast_qp_free(p); + + kfree(mcast); +} + +/** + * ipath_mcast_find - search the global table for the given multicast GID + * @mgid: the multicast GID to search for + * + * Returns NULL if not found. + * + * The caller is responsible for decrementing the reference count if found. + */ +struct ipath_mcast *ipath_mcast_find(union ib_gid *mgid) +{ + struct rb_node *n; + unsigned long flags; + struct ipath_mcast *mcast; + + spin_lock_irqsave(&mcast_lock, flags); + n = mcast_tree.rb_node; + while (n) { + int ret; + + mcast = rb_entry(n, struct ipath_mcast, rb_node); + + ret = memcmp(mgid->raw, mcast->mgid.raw, + sizeof(union ib_gid)); + if (ret < 0) + n = n->rb_left; + else if (ret > 0) + n = n->rb_right; + else { + atomic_inc(&mcast->refcount); + spin_unlock_irqrestore(&mcast_lock, flags); + goto bail; + } + } + spin_unlock_irqrestore(&mcast_lock, flags); + + mcast = NULL; + +bail: + return mcast; +} + +/** + * ipath_mcast_add - insert mcast GID into table and attach QP struct + * @mcast: the mcast GID table + * @mqp: the QP to attach + * + * Return zero if both were added. Return EEXIST if the GID was already in + * the table but the QP was added. Return ESRCH if the QP was already + * attached and neither structure was added. + */ +static int ipath_mcast_add(struct ipath_mcast *mcast, + struct ipath_mcast_qp *mqp) +{ + struct rb_node **n = &mcast_tree.rb_node; + struct rb_node *pn = NULL; + unsigned long flags; + int ret; + + spin_lock_irqsave(&mcast_lock, flags); + + while (*n) { + struct ipath_mcast *tmcast; + struct ipath_mcast_qp *p; + + pn = *n; + tmcast = rb_entry(pn, struct ipath_mcast, rb_node); + + ret = memcmp(mcast->mgid.raw, tmcast->mgid.raw, + sizeof(union ib_gid)); + if (ret < 0) { + n = &pn->rb_left; + continue; + } + if (ret > 0) { + n = &pn->rb_right; + continue; + } + + /* Search the QP list to see if this is already there. */ + list_for_each_entry_rcu(p, &tmcast->qp_list, list) { + if (p->qp == mqp->qp) { + spin_unlock_irqrestore(&mcast_lock, flags); + ret = ESRCH; + goto bail; + } + } + list_add_tail_rcu(&mqp->list, &tmcast->qp_list); + spin_unlock_irqrestore(&mcast_lock, flags); + ret = EEXIST; + goto bail; + } + + list_add_tail_rcu(&mqp->list, &mcast->qp_list); + + atomic_inc(&mcast->refcount); + rb_link_node(&mcast->rb_node, pn, n); + rb_insert_color(&mcast->rb_node, &mcast_tree); + + spin_unlock_irqrestore(&mcast_lock, flags); + + ret = 0; + +bail: + return ret; +} + +int ipath_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + struct ipath_qp *qp = to_iqp(ibqp); + struct ipath_mcast *mcast; + struct ipath_mcast_qp *mqp; + int ret; + + /* + * Allocate data structures since its better to do this outside of + * spin locks and it will most likely be needed. + */ + mcast = ipath_mcast_alloc(gid); + if (mcast == NULL) { + ret = -ENOMEM; + goto bail; + } + mqp = ipath_mcast_qp_alloc(qp); + if (mqp == NULL) { + ipath_mcast_free(mcast); + ret = -ENOMEM; + goto bail; + } + switch (ipath_mcast_add(mcast, mqp)) { + case ESRCH: + /* Neither was used: can't attach the same QP twice. */ + ipath_mcast_qp_free(mqp); + ipath_mcast_free(mcast); + ret = -EINVAL; + goto bail; + case EEXIST: /* The mcast wasn't used */ + ipath_mcast_free(mcast); + break; + default: + break; + } + + ret = 0; + +bail: + return ret; +} + +int ipath_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + struct ipath_qp *qp = to_iqp(ibqp); + struct ipath_mcast *mcast = NULL; + struct ipath_mcast_qp *p, *tmp; + struct rb_node *n; + unsigned long flags; + int last = 0; + int ret; + + spin_lock_irqsave(&mcast_lock, flags); + + /* Find the GID in the mcast table. */ + n = mcast_tree.rb_node; + while (1) { + if (n == NULL) { + spin_unlock_irqrestore(&mcast_lock, flags); + ret = 0; + goto bail; + } + + mcast = rb_entry(n, struct ipath_mcast, rb_node); + ret = memcmp(gid->raw, mcast->mgid.raw, + sizeof(union ib_gid)); + if (ret < 0) + n = n->rb_left; + else if (ret > 0) + n = n->rb_right; + else + break; + } + + /* Search the QP list. */ + list_for_each_entry_safe(p, tmp, &mcast->qp_list, list) { + if (p->qp != qp) + continue; + /* + * We found it, so remove it, but don't poison the forward + * link until we are sure there are no list walkers. + */ + list_del_rcu(&p->list); + + /* If this was the last attached QP, remove the GID too. */ + if (list_empty(&mcast->qp_list)) { + rb_erase(&mcast->rb_node, &mcast_tree); + last = 1; + } + break; + } + + spin_unlock_irqrestore(&mcast_lock, flags); + + if (p) { + /* + * Wait for any list walkers to finish before freeing the + * list element. + */ + wait_event(mcast->wait, atomic_read(&mcast->refcount) <= 1); + ipath_mcast_qp_free(p); + } + if (last) { + atomic_dec(&mcast->refcount); + wait_event(mcast->wait, !atomic_read(&mcast->refcount)); + ipath_mcast_free(mcast); + } + + ret = 0; + +bail: + return ret; +} + +int ipath_mcast_tree_empty(void) +{ + return mcast_tree.rb_node == NULL; +} diff --git a/drivers/infiniband/hw/ipath/ipath_wc_x86_64.c b/drivers/infiniband/hw/ipath/ipath_wc_x86_64.c new file mode 100644 index 0000000..adc5322f --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_wc_x86_64.c @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * This file is conditionally built on x86_64 only. Otherwise weak symbol + * versions of the functions exported from here are used. + */ + +#include <linux/pci.h> +#include <asm/mtrr.h> +#include <asm/processor.h> + +#include "ipath_kernel.h" + +/** + * ipath_enable_wc - enable write combining for MMIO writes to the device + * @dd: infinipath device + * + * This routine is x86_64-specific; it twiddles the CPU's MTRRs to enable + * write combining. + */ +int ipath_enable_wc(struct ipath_devdata *dd) +{ + int ret = 0; + u64 pioaddr, piolen; + unsigned bits; + const unsigned long addr = pci_resource_start(dd->pcidev, 0); + const size_t len = pci_resource_len(dd->pcidev, 0); + + /* + * Set the PIO buffers to be WCCOMB, so we get HT bursts to the + * chip. Linux (possibly the hardware) requires it to be on a power + * of 2 address matching the length (which has to be a power of 2). + * For rev1, that means the base address, for rev2, it will be just + * the PIO buffers themselves. + */ + pioaddr = addr + dd->ipath_piobufbase; + piolen = (dd->ipath_piobcnt2k + + dd->ipath_piobcnt4k) * + ALIGN(dd->ipath_piobcnt2k + + dd->ipath_piobcnt4k, dd->ipath_palign); + + for (bits = 0; !(piolen & (1ULL << bits)); bits++) + /* do nothing */ ; + + if (piolen != (1ULL << bits)) { + piolen >>= bits; + while (piolen >>= 1) + bits++; + piolen = 1ULL << (bits + 1); + } + if (pioaddr & (piolen - 1)) { + u64 atmp; + ipath_dbg("pioaddr %llx not on right boundary for size " + "%llx, fixing\n", + (unsigned long long) pioaddr, + (unsigned long long) piolen); + atmp = pioaddr & ~(piolen - 1); + if (atmp < addr || (atmp + piolen) > (addr + len)) { + ipath_dev_err(dd, "No way to align address/size " + "(%llx/%llx), no WC mtrr\n", + (unsigned long long) atmp, + (unsigned long long) piolen << 1); + ret = -ENODEV; + } else { + ipath_dbg("changing WC base from %llx to %llx, " + "len from %llx to %llx\n", + (unsigned long long) pioaddr, + (unsigned long long) atmp, + (unsigned long long) piolen, + (unsigned long long) piolen << 1); + pioaddr = atmp; + piolen <<= 1; + } + } + + if (!ret) { + int cookie; + ipath_cdbg(VERBOSE, "Setting mtrr for chip to WC " + "(addr %llx, len=0x%llx)\n", + (unsigned long long) pioaddr, + (unsigned long long) piolen); + cookie = mtrr_add(pioaddr, piolen, MTRR_TYPE_WRCOMB, 0); + if (cookie < 0) { + { + dev_info(&dd->pcidev->dev, + "mtrr_add() WC for PIO bufs " + "failed (%d)\n", + cookie); + ret = -EINVAL; + } + } else { + ipath_cdbg(VERBOSE, "Set mtrr for chip to WC, " + "cookie is %d\n", cookie); + dd->ipath_wc_cookie = cookie; + } + } + + return ret; +} + +/** + * ipath_disable_wc - disable write combining for MMIO writes to the device + * @dd: infinipath device + */ +void ipath_disable_wc(struct ipath_devdata *dd) +{ + if (dd->ipath_wc_cookie) { + ipath_cdbg(VERBOSE, "undoing WCCOMB on pio buffers\n"); + mtrr_del(dd->ipath_wc_cookie, 0, 0); + dd->ipath_wc_cookie = 0; + } +} + +/** + * ipath_unordered_wc - indicate whether write combining is ordered + * + * Because our performance depends on our ability to do write combining mmio + * writes in the most efficient way, we need to know if we are on an Intel + * or AMD x86_64 processor. AMD x86_64 processors flush WC buffers out in + * the order completed, and so no special flushing is required to get + * correct ordering. Intel processors, however, will flush write buffers + * out in "random" orders, and so explicit ordering is needed at times. + */ +int ipath_unordered_wc(void) +{ + return boot_cpu_data.x86_vendor != X86_VENDOR_AMD; +} diff --git a/drivers/infiniband/hw/ipath/ips_common.h b/drivers/infiniband/hw/ipath/ips_common.h new file mode 100644 index 0000000..410a764 --- /dev/null +++ b/drivers/infiniband/hw/ipath/ips_common.h @@ -0,0 +1,263 @@ +#ifndef IPS_COMMON_H +#define IPS_COMMON_H +/* + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ipath_common.h" + +struct ipath_header { + /* + * Version - 4 bits, Port - 4 bits, TID - 10 bits and Offset - + * 14 bits before ECO change ~28 Dec 03. After that, Vers 4, + * Port 3, TID 11, offset 14. + */ + __le32 ver_port_tid_offset; + __le16 chksum; + __le16 pkt_flags; +}; + +struct ips_message_header { + __be16 lrh[4]; + __be32 bth[3]; + /* fields below this point are in host byte order */ + struct ipath_header iph; + __u8 sub_opcode; + __u8 flags; + __u16 src_rank; + /* 24 bits. The upper 8 bit is available for other use */ + union { + struct { + unsigned ack_seq_num:24; + unsigned port:4; + unsigned unused:4; + }; + __u32 ack_seq_num_org; + }; + __u8 expected_tid_session_id; + __u8 tinylen; /* to aid MPI */ + union { + __u16 tag; /* to aid MPI */ + __u16 mqhdr; /* for PSM MQ */ + }; + union { + __u32 mpi[4]; /* to aid MPI */ + __u32 data[4]; + __u64 mq[2]; /* for PSM MQ */ + struct { + __u16 mtu; + __u8 major_ver; + __u8 minor_ver; + __u32 not_used; //free + __u32 run_id; + __u32 client_ver; + }; + }; +}; + +struct ether_header { + __be16 lrh[4]; + __be32 bth[3]; + struct ipath_header iph; + __u8 sub_opcode; + __u8 cmd; + __be16 lid; + __u16 mac[3]; + __u8 frag_num; + __u8 seq_num; + __le32 len; + /* MUST be of word size due to PIO write requirements */ + __u32 csum; + __le16 csum_offset; + __le16 flags; + __u16 first_2_bytes; + __u8 unused[2]; /* currently unused */ +}; + +/* + * The PIO buffer used for sending infinipath messages must only be written + * in 32-bit words, all the data must be written, and no writes can occur + * after the last word is written (which transfers "ownership" of the buffer + * to the chip and triggers the message to be sent). + * Since the Linux sk_buff structure can be recursive, non-aligned, and + * any number of bytes in each segment, we use the following structure + * to keep information about the overall state of the copy operation. + * This is used to save the information needed to store the checksum + * in the right place before sending the last word to the hardware and + * to buffer the last 0-3 bytes of non-word sized segments. + */ +struct copy_data_s { + struct ether_header *hdr; + /* addr of PIO buf to write csum to */ + __u32 __iomem *csum_pio; + __u32 __iomem *to; /* addr of PIO buf to write data to */ + __u32 device; /* which device to allocate PIO bufs from */ + __s32 error; /* set if there is an error. */ + __s32 extra; /* amount of data saved in u.buf below */ + __u32 len; /* total length to send in bytes */ + __u32 flen; /* frament length in words */ + __u32 csum; /* partial IP checksum */ + __u32 pos; /* position for partial checksum */ + __u32 offset; /* offset to where data currently starts */ + __s32 checksum_calc; /* set to 1 when csum has been calculated */ + struct sk_buff *skb; + union { + __u32 w; + __u8 buf[4]; + } u; +}; + +/* IB - LRH header consts */ +#define IPS_LRH_GRH 0x0003 /* 1. word of IB LRH - next header: GRH */ +#define IPS_LRH_BTH 0x0002 /* 1. word of IB LRH - next header: BTH */ + +#define IPS_OFFSET 0 + +/* + * defines the cut-off point between the header queue and eager/expected + * TID queue + */ +#define NUM_OF_EXTRA_WORDS_IN_HEADER_QUEUE \ + ((sizeof(struct ips_message_header) - \ + offsetof(struct ips_message_header, iph)) >> 2) + +/* OpCodes */ +#define OPCODE_IPS 0xC0 +#define OPCODE_ITH4X 0xC1 + +/* OpCode 30 is use by stand-alone test programs */ +#define OPCODE_RAW_DATA 0xDE +/* last OpCode (31) is reserved for test */ +#define OPCODE_TEST 0xDF + +/* sub OpCodes - ips */ +#define OPCODE_SEQ_DATA 0x01 +#define OPCODE_SEQ_CTRL 0x02 + +#define OPCODE_SEQ_MQ_DATA 0x03 +#define OPCODE_SEQ_MQ_CTRL 0x04 + +#define OPCODE_ACK 0x10 +#define OPCODE_NAK 0x11 + +#define OPCODE_ERR_CHK 0x20 +#define OPCODE_ERR_CHK_PLS 0x21 + +#define OPCODE_STARTUP 0x30 +#define OPCODE_STARTUP_ACK 0x31 +#define OPCODE_STARTUP_NAK 0x32 + +#define OPCODE_STARTUP_EXT 0x34 +#define OPCODE_STARTUP_ACK_EXT 0x35 +#define OPCODE_STARTUP_NAK_EXT 0x36 + +#define OPCODE_TIDS_RELEASE 0x40 +#define OPCODE_TIDS_RELEASE_CONFIRM 0x41 + +#define OPCODE_CLOSE 0x50 +#define OPCODE_CLOSE_ACK 0x51 +/* + * like OPCODE_CLOSE, but no complaint if other side has already closed. + * Used when doing abort(), MPI_Abort(), etc. + */ +#define OPCODE_ABORT 0x52 + +/* sub OpCodes - ith4x */ +#define OPCODE_ENCAP 0x81 +#define OPCODE_LID_ARP 0x82 + +/* Receive Header Queue: receive type (from infinipath) */ +#define RCVHQ_RCV_TYPE_EXPECTED 0 +#define RCVHQ_RCV_TYPE_EAGER 1 +#define RCVHQ_RCV_TYPE_NON_KD 2 +#define RCVHQ_RCV_TYPE_ERROR 3 + +/* misc. */ +#define SIZE_OF_CRC 1 + +#define EAGER_TID_ID INFINIPATH_I_TID_MASK + +#define IPS_DEFAULT_P_KEY 0xFFFF + +#define IPS_PERMISSIVE_LID 0xFFFF +#define IPS_MULTICAST_LID_BASE 0xC000 + +#define IPS_AETH_CREDIT_SHIFT 24 +#define IPS_AETH_CREDIT_MASK 0x1F +#define IPS_AETH_CREDIT_INVAL 0x1F + +#define IPS_PSN_MASK 0xFFFFFF +#define IPS_MSN_MASK 0xFFFFFF +#define IPS_QPN_MASK 0xFFFFFF +#define IPS_MULTICAST_QPN 0xFFFFFF + +/* functions for extracting fields from rcvhdrq entries */ +static inline __u32 ips_get_hdr_err_flags(const __le32 * rbuf) +{ + return __le32_to_cpu(rbuf[1]); +} + +static inline __u32 ips_get_index(const __le32 * rbuf) +{ + return (__le32_to_cpu(rbuf[0]) >> INFINIPATH_RHF_EGRINDEX_SHIFT) + & INFINIPATH_RHF_EGRINDEX_MASK; +} + +static inline __u32 ips_get_rcv_type(const __le32 * rbuf) +{ + return (__le32_to_cpu(rbuf[0]) >> INFINIPATH_RHF_RCVTYPE_SHIFT) + & INFINIPATH_RHF_RCVTYPE_MASK; +} + +static inline __u32 ips_get_length_in_bytes(const __le32 * rbuf) +{ + return ((__le32_to_cpu(rbuf[0]) >> INFINIPATH_RHF_LENGTH_SHIFT) + & INFINIPATH_RHF_LENGTH_MASK) << 2; +} + +static inline void *ips_get_first_protocol_header(const __u32 * rbuf) +{ + return (void *)&rbuf[2]; +} + +static inline struct ips_message_header *ips_get_ips_header(const __u32 * + rbuf) +{ + return (struct ips_message_header *)&rbuf[2]; +} + +static inline __u32 ips_get_ipath_ver(__le32 hdrword) +{ + return (__le32_to_cpu(hdrword) >> INFINIPATH_I_VERS_SHIFT) + & INFINIPATH_I_VERS_MASK; +} + +#endif /* IPS_COMMON_H */ diff --git a/drivers/infiniband/hw/ipath/verbs_debug.h b/drivers/infiniband/hw/ipath/verbs_debug.h new file mode 100644 index 0000000..40d693c --- /dev/null +++ b/drivers/infiniband/hw/ipath/verbs_debug.h @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _VERBS_DEBUG_H +#define _VERBS_DEBUG_H + +/* + * This file contains tracing code for the ib_ipath kernel module. + */ +#ifndef _VERBS_DEBUGGING /* tracing enabled or not */ +#define _VERBS_DEBUGGING 1 +#endif + +extern unsigned ib_ipath_debug; + +#define _VERBS_ERROR(fmt,...) \ + do { \ + printk(KERN_ERR "%s: " fmt, "ib_ipath", ##__VA_ARGS__); \ + } while(0) + +#define _VERBS_UNIT_ERROR(unit,fmt,...) \ + do { \ + printk(KERN_ERR "%s: " fmt, "ib_ipath", ##__VA_ARGS__); \ + } while(0) + +#if _VERBS_DEBUGGING + +/* + * Mask values for debugging. The scheme allows us to compile out any + * of the debug tracing stuff, and if compiled in, to enable or + * disable dynamically. + * This can be set at modprobe time also: + * modprobe ib_path ib_ipath_debug=3 + */ + +#define __VERBS_INFO 0x1 /* generic low verbosity stuff */ +#define __VERBS_DBG 0x2 /* generic debug */ +#define __VERBS_VDBG 0x4 /* verbose debug */ +#define __VERBS_SMADBG 0x8000 /* sma packet debug */ + +#define _VERBS_INFO(fmt,...) \ + do { \ + if (unlikely(ib_ipath_debug&__VERBS_INFO)) \ + printk(KERN_INFO "%s: " fmt,"ib_ipath", \ + ##__VA_ARGS__); \ + } while(0) + +#define _VERBS_DBG(fmt,...) \ + do { \ + if (unlikely(ib_ipath_debug&__VERBS_DBG)) \ + printk(KERN_DEBUG "%s: " fmt, __func__, \ + ##__VA_ARGS__); \ + } while(0) + +#define _VERBS_VDBG(fmt,...) \ + do { \ + if (unlikely(ib_ipath_debug&__VERBS_VDBG)) \ + printk(KERN_DEBUG "%s: " fmt, __func__, \ + ##__VA_ARGS__); \ + } while(0) + +#define _VERBS_SMADBG(fmt,...) \ + do { \ + if (unlikely(ib_ipath_debug&__VERBS_SMADBG)) \ + printk(KERN_DEBUG "%s: " fmt, __func__, \ + ##__VA_ARGS__); \ + } while(0) + +#else /* ! _VERBS_DEBUGGING */ + +#define _VERBS_INFO(fmt,...) +#define _VERBS_DBG(fmt,...) +#define _VERBS_VDBG(fmt,...) +#define _VERBS_SMADBG(fmt,...) + +#endif /* _VERBS_DEBUGGING */ + +#endif /* _VERBS_DEBUG_H */ diff --git a/drivers/infiniband/hw/mthca/Kconfig b/drivers/infiniband/hw/mthca/Kconfig index e88be85..9aa5a44 100644 --- a/drivers/infiniband/hw/mthca/Kconfig +++ b/drivers/infiniband/hw/mthca/Kconfig @@ -7,10 +7,11 @@ config INFINIBAND_MTHCA ("Tavor") and the MT25208 PCI Express HCA ("Arbel"). config INFINIBAND_MTHCA_DEBUG - bool "Verbose debugging output" + bool "Verbose debugging output" if EMBEDDED depends on INFINIBAND_MTHCA - default n + default y ---help--- - This option causes the mthca driver produce a bunch of debug - messages. Select this is you are developing the driver or - trying to diagnose a problem. + This option causes debugging code to be compiled into the + mthca driver. The output can be turned on via the + debug_level module parameter (which can also be set after + the driver is loaded through sysfs). diff --git a/drivers/infiniband/hw/mthca/Makefile b/drivers/infiniband/hw/mthca/Makefile index 47ec5a7..e388d95 100644 --- a/drivers/infiniband/hw/mthca/Makefile +++ b/drivers/infiniband/hw/mthca/Makefile @@ -1,7 +1,3 @@ -ifdef CONFIG_INFINIBAND_MTHCA_DEBUG -EXTRA_CFLAGS += -DDEBUG -endif - obj-$(CONFIG_INFINIBAND_MTHCA) += ib_mthca.o ib_mthca-y := mthca_main.o mthca_cmd.o mthca_profile.o mthca_reset.o \ diff --git a/drivers/infiniband/hw/mthca/mthca_av.c b/drivers/infiniband/hw/mthca/mthca_av.c index bc5bdcbe5..b12aa03 100644 --- a/drivers/infiniband/hw/mthca/mthca_av.c +++ b/drivers/infiniband/hw/mthca/mthca_av.c @@ -42,6 +42,20 @@ #include "mthca_dev.h" +enum { + MTHCA_RATE_TAVOR_FULL = 0, + MTHCA_RATE_TAVOR_1X = 1, + MTHCA_RATE_TAVOR_4X = 2, + MTHCA_RATE_TAVOR_1X_DDR = 3 +}; + +enum { + MTHCA_RATE_MEMFREE_FULL = 0, + MTHCA_RATE_MEMFREE_QUARTER = 1, + MTHCA_RATE_MEMFREE_EIGHTH = 2, + MTHCA_RATE_MEMFREE_HALF = 3 +}; + struct mthca_av { __be32 port_pd; u8 reserved1; @@ -55,6 +69,90 @@ struct mthca_av { __be32 dgid[4]; }; +static enum ib_rate memfree_rate_to_ib(u8 mthca_rate, u8 port_rate) +{ + switch (mthca_rate) { + case MTHCA_RATE_MEMFREE_EIGHTH: + return mult_to_ib_rate(port_rate >> 3); + case MTHCA_RATE_MEMFREE_QUARTER: + return mult_to_ib_rate(port_rate >> 2); + case MTHCA_RATE_MEMFREE_HALF: + return mult_to_ib_rate(port_rate >> 1); + case MTHCA_RATE_MEMFREE_FULL: + default: + return mult_to_ib_rate(port_rate); + } +} + +static enum ib_rate tavor_rate_to_ib(u8 mthca_rate, u8 port_rate) +{ + switch (mthca_rate) { + case MTHCA_RATE_TAVOR_1X: return IB_RATE_2_5_GBPS; + case MTHCA_RATE_TAVOR_1X_DDR: return IB_RATE_5_GBPS; + case MTHCA_RATE_TAVOR_4X: return IB_RATE_10_GBPS; + default: return port_rate; + } +} + +enum ib_rate mthca_rate_to_ib(struct mthca_dev *dev, u8 mthca_rate, u8 port) +{ + if (mthca_is_memfree(dev)) { + /* Handle old Arbel FW */ + if (dev->limits.stat_rate_support == 0x3 && mthca_rate) + return IB_RATE_2_5_GBPS; + + return memfree_rate_to_ib(mthca_rate, dev->rate[port - 1]); + } else + return tavor_rate_to_ib(mthca_rate, dev->rate[port - 1]); +} + +static u8 ib_rate_to_memfree(u8 req_rate, u8 cur_rate) +{ + if (cur_rate <= req_rate) + return 0; + + /* + * Inter-packet delay (IPD) to get from rate X down to a rate + * no more than Y is (X - 1) / Y. + */ + switch ((cur_rate - 1) / req_rate) { + case 0: return MTHCA_RATE_MEMFREE_FULL; + case 1: return MTHCA_RATE_MEMFREE_HALF; + case 2: /* fall through */ + case 3: return MTHCA_RATE_MEMFREE_QUARTER; + default: return MTHCA_RATE_MEMFREE_EIGHTH; + } +} + +static u8 ib_rate_to_tavor(u8 static_rate) +{ + switch (static_rate) { + case IB_RATE_2_5_GBPS: return MTHCA_RATE_TAVOR_1X; + case IB_RATE_5_GBPS: return MTHCA_RATE_TAVOR_1X_DDR; + case IB_RATE_10_GBPS: return MTHCA_RATE_TAVOR_4X; + default: return MTHCA_RATE_TAVOR_FULL; + } +} + +u8 mthca_get_rate(struct mthca_dev *dev, int static_rate, u8 port) +{ + u8 rate; + + if (!static_rate || ib_rate_to_mult(static_rate) >= dev->rate[port - 1]) + return 0; + + if (mthca_is_memfree(dev)) + rate = ib_rate_to_memfree(ib_rate_to_mult(static_rate), + dev->rate[port - 1]); + else + rate = ib_rate_to_tavor(static_rate); + + if (!(dev->limits.stat_rate_support & (1 << rate))) + rate = 1; + + return rate; +} + int mthca_create_ah(struct mthca_dev *dev, struct mthca_pd *pd, struct ib_ah_attr *ah_attr, @@ -107,7 +205,7 @@ on_hca_fail: av->g_slid = ah_attr->src_path_bits; av->dlid = cpu_to_be16(ah_attr->dlid); av->msg_sr = (3 << 4) | /* 2K message */ - ah_attr->static_rate; + mthca_get_rate(dev, ah_attr->static_rate, ah_attr->port_num); av->sl_tclass_flowlabel = cpu_to_be32(ah_attr->sl << 28); if (ah_attr->ah_flags & IB_AH_GRH) { av->g_slid |= 0x80; diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.c b/drivers/infiniband/hw/mthca/mthca_cmd.c index 343eca5..1985b5d 100644 --- a/drivers/infiniband/hw/mthca/mthca_cmd.c +++ b/drivers/infiniband/hw/mthca/mthca_cmd.c @@ -965,6 +965,7 @@ int mthca_QUERY_DEV_LIM(struct mthca_dev *dev, u32 *outbox; u8 field; u16 size; + u16 stat_rate; int err; #define QUERY_DEV_LIM_OUT_SIZE 0x100 @@ -995,6 +996,7 @@ int mthca_QUERY_DEV_LIM(struct mthca_dev *dev, #define QUERY_DEV_LIM_MTU_WIDTH_OFFSET 0x36 #define QUERY_DEV_LIM_VL_PORT_OFFSET 0x37 #define QUERY_DEV_LIM_MAX_GID_OFFSET 0x3b +#define QUERY_DEV_LIM_RATE_SUPPORT_OFFSET 0x3c #define QUERY_DEV_LIM_MAX_PKEY_OFFSET 0x3f #define QUERY_DEV_LIM_FLAGS_OFFSET 0x44 #define QUERY_DEV_LIM_RSVD_UAR_OFFSET 0x48 @@ -1086,6 +1088,8 @@ int mthca_QUERY_DEV_LIM(struct mthca_dev *dev, dev_lim->num_ports = field & 0xf; MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_GID_OFFSET); dev_lim->max_gids = 1 << (field & 0xf); + MTHCA_GET(stat_rate, outbox, QUERY_DEV_LIM_RATE_SUPPORT_OFFSET); + dev_lim->stat_rate_support = stat_rate; MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_PKEY_OFFSET); dev_lim->max_pkeys = 1 << (field & 0xf); MTHCA_GET(dev_lim->flags, outbox, QUERY_DEV_LIM_FLAGS_OFFSET); diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.h b/drivers/infiniband/hw/mthca/mthca_cmd.h index e4ec35c..2f976f2 100644 --- a/drivers/infiniband/hw/mthca/mthca_cmd.h +++ b/drivers/infiniband/hw/mthca/mthca_cmd.h @@ -146,6 +146,7 @@ struct mthca_dev_lim { int max_vl; int num_ports; int max_gids; + u16 stat_rate_support; int max_pkeys; u32 flags; int reserved_uars; diff --git a/drivers/infiniband/hw/mthca/mthca_dev.h b/drivers/infiniband/hw/mthca/mthca_dev.h index ad52edb..4c1dcb4 100644 --- a/drivers/infiniband/hw/mthca/mthca_dev.h +++ b/drivers/infiniband/hw/mthca/mthca_dev.h @@ -151,6 +151,7 @@ struct mthca_limits { int reserved_qps; int num_srqs; int max_srq_wqes; + int max_srq_sge; int reserved_srqs; int num_eecs; int reserved_eecs; @@ -172,6 +173,7 @@ struct mthca_limits { int reserved_pds; u32 page_size_cap; u32 flags; + u16 stat_rate_support; u8 port_width_cap; }; @@ -353,10 +355,24 @@ struct mthca_dev { struct ib_mad_agent *send_agent[MTHCA_MAX_PORTS][2]; struct ib_ah *sm_ah[MTHCA_MAX_PORTS]; spinlock_t sm_lock; + u8 rate[MTHCA_MAX_PORTS]; }; -#define mthca_dbg(mdev, format, arg...) \ - dev_dbg(&mdev->pdev->dev, format, ## arg) +#ifdef CONFIG_INFINIBAND_MTHCA_DEBUG +extern int mthca_debug_level; + +#define mthca_dbg(mdev, format, arg...) \ + do { \ + if (mthca_debug_level) \ + dev_printk(KERN_DEBUG, &mdev->pdev->dev, format, ## arg); \ + } while (0) + +#else /* CONFIG_INFINIBAND_MTHCA_DEBUG */ + +#define mthca_dbg(mdev, format, arg...) do { (void) mdev; } while (0) + +#endif /* CONFIG_INFINIBAND_MTHCA_DEBUG */ + #define mthca_err(mdev, format, arg...) \ dev_err(&mdev->pdev->dev, format, ## arg) #define mthca_info(mdev, format, arg...) \ @@ -492,6 +508,7 @@ void mthca_free_srq(struct mthca_dev *dev, struct mthca_srq *srq); int mthca_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, enum ib_srq_attr_mask attr_mask); int mthca_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr); +int mthca_max_srq_sge(struct mthca_dev *dev); void mthca_srq_event(struct mthca_dev *dev, u32 srqn, enum ib_event_type event_type); void mthca_free_srq_wqe(struct mthca_srq *srq, u32 wqe_addr); @@ -542,6 +559,8 @@ int mthca_read_ah(struct mthca_dev *dev, struct mthca_ah *ah, struct ib_ud_header *header); int mthca_ah_query(struct ib_ah *ibah, struct ib_ah_attr *attr); int mthca_ah_grh_present(struct mthca_ah *ah); +u8 mthca_get_rate(struct mthca_dev *dev, int static_rate, u8 port); +enum ib_rate mthca_rate_to_ib(struct mthca_dev *dev, u8 mthca_rate, u8 port); int mthca_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid); int mthca_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid); diff --git a/drivers/infiniband/hw/mthca/mthca_mad.c b/drivers/infiniband/hw/mthca/mthca_mad.c index dfb482e..4730863 100644 --- a/drivers/infiniband/hw/mthca/mthca_mad.c +++ b/drivers/infiniband/hw/mthca/mthca_mad.c @@ -49,6 +49,30 @@ enum { MTHCA_VENDOR_CLASS2 = 0xa }; +static int mthca_update_rate(struct mthca_dev *dev, u8 port_num) +{ + struct ib_port_attr *tprops = NULL; + int ret; + + tprops = kmalloc(sizeof *tprops, GFP_KERNEL); + if (!tprops) + return -ENOMEM; + + ret = ib_query_port(&dev->ib_dev, port_num, tprops); + if (ret) { + printk(KERN_WARNING "ib_query_port failed (%d) for %s port %d\n", + ret, dev->ib_dev.name, port_num); + goto out; + } + + dev->rate[port_num - 1] = tprops->active_speed * + ib_width_enum_to_int(tprops->active_width); + +out: + kfree(tprops); + return ret; +} + static void update_sm_ah(struct mthca_dev *dev, u8 port_num, u16 lid, u8 sl) { @@ -90,6 +114,7 @@ static void smp_snoop(struct ib_device *ibdev, mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) && mad->mad_hdr.method == IB_MGMT_METHOD_SET) { if (mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO) { + mthca_update_rate(to_mdev(ibdev), port_num); update_sm_ah(to_mdev(ibdev), port_num, be16_to_cpup((__be16 *) (mad->data + 58)), (*(u8 *) (mad->data + 76)) & 0xf); @@ -246,6 +271,7 @@ int mthca_create_agents(struct mthca_dev *dev) { struct ib_mad_agent *agent; int p, q; + int ret; spin_lock_init(&dev->sm_lock); @@ -255,11 +281,23 @@ int mthca_create_agents(struct mthca_dev *dev) q ? IB_QPT_GSI : IB_QPT_SMI, NULL, 0, send_handler, NULL, NULL); - if (IS_ERR(agent)) + if (IS_ERR(agent)) { + ret = PTR_ERR(agent); goto err; + } dev->send_agent[p][q] = agent; } + + for (p = 1; p <= dev->limits.num_ports; ++p) { + ret = mthca_update_rate(dev, p); + if (ret) { + mthca_err(dev, "Failed to obtain port %d rate." + " aborting.\n", p); + goto err; + } + } + return 0; err: @@ -268,7 +306,7 @@ err: if (dev->send_agent[p][q]) ib_unregister_mad_agent(dev->send_agent[p][q]); - return PTR_ERR(agent); + return ret; } void __devexit mthca_free_agents(struct mthca_dev *dev) diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c index 266f347..9b9ff7b 100644 --- a/drivers/infiniband/hw/mthca/mthca_main.c +++ b/drivers/infiniband/hw/mthca/mthca_main.c @@ -52,6 +52,14 @@ MODULE_DESCRIPTION("Mellanox InfiniBand HCA low-level driver"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_VERSION(DRV_VERSION); +#ifdef CONFIG_INFINIBAND_MTHCA_DEBUG + +int mthca_debug_level = 0; +module_param_named(debug_level, mthca_debug_level, int, 0644); +MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0"); + +#endif /* CONFIG_INFINIBAND_MTHCA_DEBUG */ + #ifdef CONFIG_PCI_MSI static int msi_x = 0; @@ -69,6 +77,10 @@ MODULE_PARM_DESC(msi, "attempt to use MSI if nonzero"); #endif /* CONFIG_PCI_MSI */ +static int tune_pci = 0; +module_param(tune_pci, int, 0444); +MODULE_PARM_DESC(tune_pci, "increase PCI burst from the default set by BIOS if nonzero"); + static const char mthca_version[] __devinitdata = DRV_NAME ": Mellanox InfiniBand HCA driver v" DRV_VERSION " (" DRV_RELDATE ")\n"; @@ -90,6 +102,9 @@ static int __devinit mthca_tune_pci(struct mthca_dev *mdev) int cap; u16 val; + if (!tune_pci) + return 0; + /* First try to max out Read Byte Count */ cap = pci_find_capability(mdev->pdev, PCI_CAP_ID_PCIX); if (cap) { @@ -176,6 +191,7 @@ static int __devinit mthca_dev_lim(struct mthca_dev *mdev, struct mthca_dev_lim mdev->limits.reserved_srqs = dev_lim->reserved_srqs; mdev->limits.reserved_eecs = dev_lim->reserved_eecs; mdev->limits.max_desc_sz = dev_lim->max_desc_sz; + mdev->limits.max_srq_sge = mthca_max_srq_sge(mdev); /* * Subtract 1 from the limit because we need to allocate a * spare CQE so the HCA HW can tell the difference between an @@ -191,6 +207,18 @@ static int __devinit mthca_dev_lim(struct mthca_dev *mdev, struct mthca_dev_lim mdev->limits.port_width_cap = dev_lim->max_port_width; mdev->limits.page_size_cap = ~(u32) (dev_lim->min_page_sz - 1); mdev->limits.flags = dev_lim->flags; + /* + * For old FW that doesn't return static rate support, use a + * value of 0x3 (only static rate values of 0 or 1 are handled), + * except on Sinai, where even old FW can handle static rate + * values of 2 and 3. + */ + if (dev_lim->stat_rate_support) + mdev->limits.stat_rate_support = dev_lim->stat_rate_support; + else if (mdev->mthca_flags & MTHCA_FLAG_SINAI_OPT) + mdev->limits.stat_rate_support = 0xf; + else + mdev->limits.stat_rate_support = 0x3; /* IB_DEVICE_RESIZE_MAX_WR not supported by driver. May be doable since hardware supports it for SRQ. diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index 2c250bc..565a24b 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -106,7 +106,7 @@ static int mthca_query_device(struct ib_device *ibdev, props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp; props->max_srq = mdev->limits.num_srqs - mdev->limits.reserved_srqs; props->max_srq_wr = mdev->limits.max_srq_wqes; - props->max_srq_sge = mdev->limits.max_sg; + props->max_srq_sge = mdev->limits.max_srq_sge; props->local_ca_ack_delay = mdev->limits.local_ca_ack_delay; props->atomic_cap = mdev->limits.flags & DEV_LIM_FLAG_ATOMIC ? IB_ATOMIC_HCA : IB_ATOMIC_NONE; diff --git a/drivers/infiniband/hw/mthca/mthca_provider.h b/drivers/infiniband/hw/mthca/mthca_provider.h index 2e7f521..6676a78 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.h +++ b/drivers/infiniband/hw/mthca/mthca_provider.h @@ -257,6 +257,8 @@ struct mthca_qp { atomic_t refcount; u32 qpn; int is_direct; + u8 port; /* for SQP and memfree use only */ + u8 alt_port; /* for memfree use only */ u8 transport; u8 state; u8 atomic_rd_en; @@ -278,7 +280,6 @@ struct mthca_qp { struct mthca_sqp { struct mthca_qp qp; - int port; int pkey_index; u32 qkey; u32 send_psn; diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c index 057c8e6..f37b0e3 100644 --- a/drivers/infiniband/hw/mthca/mthca_qp.c +++ b/drivers/infiniband/hw/mthca/mthca_qp.c @@ -248,6 +248,9 @@ void mthca_qp_event(struct mthca_dev *dev, u32 qpn, return; } + if (event_type == IB_EVENT_PATH_MIG) + qp->port = qp->alt_port; + event.device = &dev->ib_dev; event.event = event_type; event.element.qp = &qp->ibqp; @@ -392,10 +395,16 @@ static void to_ib_ah_attr(struct mthca_dev *dev, struct ib_ah_attr *ib_ah_attr, { memset(ib_ah_attr, 0, sizeof *path); ib_ah_attr->port_num = (be32_to_cpu(path->port_pkey) >> 24) & 0x3; + + if (ib_ah_attr->port_num == 0 || ib_ah_attr->port_num > dev->limits.num_ports) + return; + ib_ah_attr->dlid = be16_to_cpu(path->rlid); ib_ah_attr->sl = be32_to_cpu(path->sl_tclass_flowlabel) >> 28; ib_ah_attr->src_path_bits = path->g_mylmc & 0x7f; - ib_ah_attr->static_rate = path->static_rate & 0x7; + ib_ah_attr->static_rate = mthca_rate_to_ib(dev, + path->static_rate & 0x7, + ib_ah_attr->port_num); ib_ah_attr->ah_flags = (path->g_mylmc & (1 << 7)) ? IB_AH_GRH : 0; if (ib_ah_attr->ah_flags) { ib_ah_attr->grh.sgid_index = path->mgid_index & (dev->limits.gid_table_len - 1); @@ -455,8 +464,10 @@ int mthca_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_m qp_attr->cap.max_recv_sge = qp->rq.max_gs; qp_attr->cap.max_inline_data = qp->max_inline_data; - to_ib_ah_attr(dev, &qp_attr->ah_attr, &context->pri_path); - to_ib_ah_attr(dev, &qp_attr->alt_ah_attr, &context->alt_path); + if (qp->transport == RC || qp->transport == UC) { + to_ib_ah_attr(dev, &qp_attr->ah_attr, &context->pri_path); + to_ib_ah_attr(dev, &qp_attr->alt_ah_attr, &context->alt_path); + } qp_attr->pkey_index = be32_to_cpu(context->pri_path.port_pkey) & 0x7f; qp_attr->alt_pkey_index = be32_to_cpu(context->alt_path.port_pkey) & 0x7f; @@ -484,11 +495,11 @@ out: } static int mthca_path_set(struct mthca_dev *dev, struct ib_ah_attr *ah, - struct mthca_qp_path *path) + struct mthca_qp_path *path, u8 port) { path->g_mylmc = ah->src_path_bits & 0x7f; path->rlid = cpu_to_be16(ah->dlid); - path->static_rate = !!ah->static_rate; + path->static_rate = mthca_get_rate(dev, ah->static_rate, port); if (ah->ah_flags & IB_AH_GRH) { if (ah->grh.sgid_index >= dev->limits.gid_table_len) { @@ -634,7 +645,7 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask) if (qp->transport == MLX) qp_context->pri_path.port_pkey |= - cpu_to_be32(to_msqp(qp)->port << 24); + cpu_to_be32(qp->port << 24); else { if (attr_mask & IB_QP_PORT) { qp_context->pri_path.port_pkey |= @@ -657,7 +668,8 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask) } if (attr_mask & IB_QP_AV) { - if (mthca_path_set(dev, &attr->ah_attr, &qp_context->pri_path)) + if (mthca_path_set(dev, &attr->ah_attr, &qp_context->pri_path, + attr_mask & IB_QP_PORT ? attr->port_num : qp->port)) return -EINVAL; qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PRIMARY_ADDR_PATH); @@ -681,7 +693,8 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask) return -EINVAL; } - if (mthca_path_set(dev, &attr->alt_ah_attr, &qp_context->alt_path)) + if (mthca_path_set(dev, &attr->alt_ah_attr, &qp_context->alt_path, + attr->alt_ah_attr.port_num)) return -EINVAL; qp_context->alt_path.port_pkey |= cpu_to_be32(attr->alt_pkey_index | @@ -791,6 +804,10 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask) qp->atomic_rd_en = attr->qp_access_flags; if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) qp->resp_depth = attr->max_dest_rd_atomic; + if (attr_mask & IB_QP_PORT) + qp->port = attr->port_num; + if (attr_mask & IB_QP_ALT_PATH) + qp->alt_port = attr->alt_port_num; if (is_sqp(dev, qp)) store_attrs(to_msqp(qp), attr, attr_mask); @@ -802,13 +819,13 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask) if (is_qp0(dev, qp)) { if (cur_state != IB_QPS_RTR && new_state == IB_QPS_RTR) - init_port(dev, to_msqp(qp)->port); + init_port(dev, qp->port); if (cur_state != IB_QPS_RESET && cur_state != IB_QPS_ERR && (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR)) - mthca_CLOSE_IB(dev, to_msqp(qp)->port, &status); + mthca_CLOSE_IB(dev, qp->port, &status); } /* @@ -1212,6 +1229,9 @@ int mthca_alloc_qp(struct mthca_dev *dev, if (qp->qpn == -1) return -ENOMEM; + /* initialize port to zero for error-catching. */ + qp->port = 0; + err = mthca_alloc_qp_common(dev, pd, send_cq, recv_cq, send_policy, qp); if (err) { @@ -1261,7 +1281,7 @@ int mthca_alloc_sqp(struct mthca_dev *dev, if (err) goto err_out; - sqp->port = port; + sqp->qp.port = port; sqp->qp.qpn = mqpn; sqp->qp.transport = MLX; @@ -1404,10 +1424,10 @@ static int build_mlx_header(struct mthca_dev *dev, struct mthca_sqp *sqp, sqp->ud_header.lrh.source_lid = IB_LID_PERMISSIVE; sqp->ud_header.bth.solicited_event = !!(wr->send_flags & IB_SEND_SOLICITED); if (!sqp->qp.ibqp.qp_num) - ib_get_cached_pkey(&dev->ib_dev, sqp->port, + ib_get_cached_pkey(&dev->ib_dev, sqp->qp.port, sqp->pkey_index, &pkey); else - ib_get_cached_pkey(&dev->ib_dev, sqp->port, + ib_get_cached_pkey(&dev->ib_dev, sqp->qp.port, wr->wr.ud.pkey_index, &pkey); sqp->ud_header.bth.pkey = cpu_to_be16(pkey); sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->wr.ud.remote_qpn); diff --git a/drivers/infiniband/hw/mthca/mthca_srq.c b/drivers/infiniband/hw/mthca/mthca_srq.c index 2dd3aea..adcaf85 100644 --- a/drivers/infiniband/hw/mthca/mthca_srq.c +++ b/drivers/infiniband/hw/mthca/mthca_srq.c @@ -192,7 +192,7 @@ int mthca_alloc_srq(struct mthca_dev *dev, struct mthca_pd *pd, /* Sanity check SRQ size before proceeding */ if (attr->max_wr > dev->limits.max_srq_wqes || - attr->max_sge > dev->limits.max_sg) + attr->max_sge > dev->limits.max_srq_sge) return -EINVAL; srq->max = attr->max_wr; @@ -660,6 +660,31 @@ int mthca_arbel_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, return err; } +int mthca_max_srq_sge(struct mthca_dev *dev) +{ + if (mthca_is_memfree(dev)) + return dev->limits.max_sg; + + /* + * SRQ allocations are based on powers of 2 for Tavor, + * (although they only need to be multiples of 16 bytes). + * + * Therefore, we need to base the max number of sg entries on + * the largest power of 2 descriptor size that is <= to the + * actual max WQE descriptor size, rather than return the + * max_sg value given by the firmware (which is based on WQE + * sizes as multiples of 16, not powers of 2). + * + * If SRQ implementation is changed for Tavor to be based on + * multiples of 16, the calculation below can be deleted and + * the FW max_sg value returned. + */ + return min_t(int, dev->limits.max_sg, + ((1 << (fls(dev->limits.max_desc_sz) - 1)) - + sizeof (struct mthca_next_seg)) / + sizeof (struct mthca_data_seg)); +} + int __devinit mthca_init_srq_table(struct mthca_dev *dev) { int err; |