diff options
549 files changed, 30261 insertions, 5465 deletions
diff --git a/Documentation/Smack.txt b/Documentation/Smack.txt new file mode 100644 index 0000000..989c2fc --- /dev/null +++ b/Documentation/Smack.txt @@ -0,0 +1,493 @@ + + + "Good for you, you've decided to clean the elevator!" + - The Elevator, from Dark Star + +Smack is the Simplified Mandatory Access Control Kernel. +Smack is a kernel based implementation of mandatory access +control that includes simplicity in its primary design goals. + +Smack is not the only Mandatory Access Control scheme +available for Linux. Those new to Mandatory Access Control +are encouraged to compare Smack with the other mechanisms +available to determine which is best suited to the problem +at hand. + +Smack consists of three major components: + - The kernel + - A start-up script and a few modified applications + - Configuration data + +The kernel component of Smack is implemented as a Linux +Security Modules (LSM) module. It requires netlabel and +works best with file systems that support extended attributes, +although xattr support is not strictly required. +It is safe to run a Smack kernel under a "vanilla" distribution. +Smack kernels use the CIPSO IP option. Some network +configurations are intolerant of IP options and can impede +access to systems that use them as Smack does. + +The startup script etc-init.d-smack should be installed +in /etc/init.d/smack and should be invoked early in the +start-up process. On Fedora rc5.d/S02smack is recommended. +This script ensures that certain devices have the correct +Smack attributes and loads the Smack configuration if +any is defined. This script invokes two programs that +ensure configuration data is properly formatted. These +programs are /usr/sbin/smackload and /usr/sbin/smackcipso. +The system will run just fine without these programs, +but it will be difficult to set access rules properly. + +A version of "ls" that provides a "-M" option to display +Smack labels on long listing is available.
+ +A hacked version of sshd that allows network logins by users +with specific Smack labels is available. This version does +not work for scp. You must set the /etc/ssh/sshd_config +line: + UsePrivilegeSeparation no + +The format of /etc/smack/usr is: + + username smack + +In keeping with the intent of Smack, configuration data is +minimal and not strictly required. The most important +configuration step is mounting the smackfs pseudo filesystem. + +Add this line to /etc/fstab: + + smackfs /smack smackfs smackfsdef=* 0 0 + +and create the /smack directory for mounting. + +Smack uses extended attributes (xattrs) to store file labels. +The command to set a Smack label on a file is: + + # attr -S -s SMACK64 -V "value" path + +NOTE: Smack labels are limited to 23 characters. The attr command + does not enforce this restriction and can be used to set + invalid Smack labels on files. + +If you don't do anything special all users will get the floor ("_") +label when they log in. If you do want to log in via the hacked ssh +at other labels use the attr command to set the smack value on the +home directory and its contents. + +You can add access rules in /etc/smack/accesses. They take the form: + + subjectlabel objectlabel access + +access is a combination of the letters rwxa which specify the +kind of access permitted a subject with subjectlabel on an +object with objectlabel. If there is no rule no access is allowed. + +A process can see the smack label it is running with by +reading /proc/self/attr/current. A privileged process can +set the process smack by writing there. + +Look for additional programs on http://schaufler-ca.com + +From the Smack Whitepaper: + +The Simplified Mandatory Access Control Kernel + +Casey Schaufler +casey@schaufler-ca.com + +Mandatory Access Control + +Computer systems employ a variety of schemes to constrain how information is +shared among the people and services using the machine.
Some of these schemes +allow the program or user to decide what other programs or users are allowed +access to pieces of data. These schemes are called discretionary access +control mechanisms because the access control is specified at the discretion +of the user. Other schemes do not leave the decision regarding what a user or +program can access up to users or programs. These schemes are called mandatory +access control mechanisms because you don't have a choice regarding the users +or programs that have access to pieces of data. + +Bell & LaPadula + +From the middle of the 1980's until the turn of the century Mandatory Access +Control (MAC) was very closely associated with the Bell & LaPadula security +model, a mathematical description of the United States Department of Defense +policy for marking paper documents. MAC in this form enjoyed a following +within the Capital Beltway and Scandinavian supercomputer centers but was +often cited as failing to address general needs. + +Domain Type Enforcement + +Around the turn of the century Domain Type Enforcement (DTE) became popular. +This scheme organizes users, programs, and data into domains that are +protected from each other. This scheme has been widely deployed as a component +of popular Linux distributions. The administrative overhead required to +maintain this scheme and the detailed understanding of the whole system +necessary to provide a secure domain mapping leads to the scheme being +disabled or used in limited ways in the majority of cases. + +Smack + +Smack is a Mandatory Access Control mechanism designed to provide useful MAC +while avoiding the pitfalls of its predecessors. The limitations of Bell & +LaPadula are addressed by providing a scheme whereby access can be controlled +according to the requirements of the system and its purpose rather than those +imposed by an arcane government policy.
The complexity of Domain Type +Enforcement is avoided by defining access controls in terms of the access +modes already in use. + +Smack Terminology + +The jargon used to talk about Smack will be familiar to those who have dealt +with other MAC systems and shouldn't be too difficult for the uninitiated to +pick up. There are four terms that are used in a specific way and that are +especially important: + + Subject: A subject is an active entity on the computer system. + On Smack a subject is a task, which is in turn the basic unit + of execution. + + Object: An object is a passive entity on the computer system. + On Smack files of all types, IPC, and tasks can be objects. + + Access: Any attempt by a subject to put information into or get + information from an object is an access. + + Label: Data that identifies the Mandatory Access Control + characteristics of a subject or an object. + +These definitions are consistent with the traditional use in the security +community. There are also some terms from Linux that are likely to crop up: + + Capability: A task that possesses a capability has permission to + violate an aspect of the system security policy, as identified by + the specific capability. A task that possesses one or more + capabilities is a privileged task, whereas a task with no + capabilities is an unprivileged task. + + Privilege: A task that is allowed to violate the system security + policy is said to have privilege. As of this writing a task can + have privilege either by possessing capabilities or by having an + effective user of root. + +Smack Basics + +Smack is an extension to a Linux system. It enforces additional restrictions +on what subjects can access which objects, based on the labels attached to +each of the subject and the object. + +Labels + +Smack labels are ASCII character strings, one to twenty-three characters in +length.
Single character labels using special characters, that being anything +other than a letter or digit, are reserved for use by the Smack development +team. Smack labels are unstructured, case sensitive, and the only operation +ever performed on them is comparison for equality. Smack labels cannot +contain unprintable characters or the "/" (slash) character. + +There are some predefined labels: + + _ Pronounced "floor", a single underscore character. + ^ Pronounced "hat", a single circumflex character. + * Pronounced "star", a single asterisk character. + ? Pronounced "huh", a single question mark character. + +Every task on a Smack system is assigned a label. System tasks, such as +init(8) and systems daemons, are run with the floor ("_") label. User tasks +are assigned labels according to the specification found in the +/etc/smack/user configuration file. + +Access Rules + +Smack uses the traditional access modes of Linux. These modes are read, +execute, write, and occasionally append. There are a few cases where the +access mode may not be obvious. These include: + + Signals: A signal is a write operation from the subject task to + the object task. + Internet Domain IPC: Transmission of a packet is considered a + write operation from the source task to the destination task. + +Smack restricts access based on the label attached to a subject and the label +attached to the object it is trying to access. The rules enforced are, in +order: + + 1. Any access requested by a task labeled "*" is denied. + 2. A read or execute access requested by a task labeled "^" + is permitted. + 3. A read or execute access requested on an object labeled "_" + is permitted. + 4. Any access requested on an object labeled "*" is permitted. + 5. Any access requested by a task on an object with the same + label is permitted. + 6. Any access requested that is explicitly defined in the loaded + rule set is permitted. + 7. Any other access is denied. 
+ +Smack Access Rules + +With the isolation provided by Smack access separation is simple. There are +many interesting cases where limited access by subjects to objects with +different labels is desired. One example is the familiar spy model of +sensitivity, where a scientist working on a highly classified project would be +able to read documents of lower classifications and anything she writes will +be "born" highly classified. To accommodate such schemes Smack includes a +mechanism for specifying rules allowing access between labels. + +Access Rule Format + +The format of an access rule is: + + subject-label object-label access + +Where subject-label is the Smack label of the task, object-label is the Smack +label of the thing being accessed, and access is a string specifying the sort +of access allowed. The Smack labels are limited to 23 characters. The access +specification is searched for letters that describe access modes: + + a: indicates that append access should be granted. + r: indicates that read access should be granted. + w: indicates that write access should be granted. + x: indicates that execute access should be granted. + +Uppercase values for the specification letters are allowed as well. +Access mode specifications can be in any order. Examples of acceptable rules +are: + + TopSecret Secret rx + Secret Unclass R + Manager Game x + User HR w + New Old rRrRr + Closed Off - + +Examples of unacceptable rules are: + + Top Secret Secret rx + Ace Ace r + Odd spells waxbeans + +Spaces are not allowed in labels. Since a subject always has access to files +with the same label specifying a rule for that case is pointless. Only +valid letters (rwxaRWXA) and the dash ('-') character are allowed in +access specifications. The dash is a placeholder, so "a-r" is the same +as "ar". A lone dash is used to specify that no access should be allowed. 
+ +Applying Access Rules + +The developers of Linux rarely define new sorts of things, usually importing +schemes and concepts from other systems. Most often, the other systems are +variants of Unix. Unix has many endearing properties, but consistency of +access control models is not one of them. Smack strives to treat accesses as +uniformly as is sensible while keeping with the spirit of the underlying +mechanism. + +File system objects including files, directories, named pipes, symbolic links, +and devices require access permissions that closely match those used by mode +bit access. To open a file for reading read access is required on the file. To +search a directory requires execute access. Creating a file with write access +requires both read and write access on the containing directory. Deleting a +file requires read and write access to the file and to the containing +directory. It is possible that a user may be able to see that a file exists +but not any of its attributes by the circumstance of having read access to the +containing directory but not to the differently labeled file. This is an +artifact of the file name being data in the directory, not a part of the file. + +IPC objects, message queues, semaphore sets, and memory segments exist in flat +namespaces and access requests are only required to match the object in +question. + +Process objects reflect tasks on the system and the Smack label used to access +them is the same Smack label that the task would use for its own access +attempts. Sending a signal via the kill() system call is a write operation +from the signaler to the recipient. Debugging a process requires both reading +and writing. Creating a new task is an internal operation that results in two +tasks with identical Smack labels and requires no access checks. + +Sockets are data structures attached to processes and sending a packet from +one process to another requires that the sender have write access to the +receiver. 
The receiver is not required to have read access to the sender. + +Setting Access Rules + +The configuration file /etc/smack/accesses contains the rules to be set at +system startup. The contents are written to the special file /smack/load. +Rules can be written to /smack/load at any time and take effect immediately. +For any pair of subject and object labels there can be only one rule, with the +most recently specified overriding any earlier specification. + +The program smackload is provided to ensure data is formatted +properly when written to /smack/load. This program reads lines +of the form + + subjectlabel objectlabel mode. + +Task Attribute + +The Smack label of a process can be read from /proc/<pid>/attr/current. A +process can read its own Smack label from /proc/self/attr/current. A +privileged process can change its own Smack label by writing to +/proc/self/attr/current but not the label of another process. + +File Attribute + +The Smack label of a filesystem object is stored as an extended attribute +named SMACK64 on the file. This attribute is in the security namespace. It can +only be changed by a process with privilege. + +Privilege + +A process with CAP_MAC_OVERRIDE is privileged. + +Smack Networking + +As mentioned before, Smack enforces access control on network protocol +transmissions. Every packet sent by a Smack process is tagged with its Smack +label. This is done by adding a CIPSO tag to the header of the IP packet. Each +packet received is expected to have a CIPSO tag that identifies the label and +if it lacks such a tag the network ambient label is assumed. Before the packet +is delivered a check is made to determine that a subject with the label on the +packet has write access to the receiving process and if that is not the case +the packet is dropped. + +CIPSO Configuration + +It is normally unnecessary to specify the CIPSO configuration. The default +values used by the system handle all internal cases. 
Smack will compose CIPSO +label values to match the Smack labels being used without administrative +intervention. Unlabeled packets that come into the system will be given the +ambient label. + +Smack requires configuration in the case where packets from a system that is +not smack that speaks CIPSO may be encountered. Usually this will be a Trusted +Solaris system, but there are other, less widely deployed systems out there. +CIPSO provides 3 important values, a Domain Of Interpretation (DOI), a level, +and a category set with each packet. The DOI is intended to identify a group +of systems that use compatible labeling schemes, and the DOI specified on the +smack system must match that of the remote system or packets will be +discarded. The DOI is 3 by default. The value can be read from /smack/doi and +can be changed by writing to /smack/doi. + +The label and category set are mapped to a Smack label as defined in +/etc/smack/cipso. + +A Smack/CIPSO mapping has the form: + + smack level [category [category]*] + +Smack does not expect the level or category sets to be related in any +particular way and does not assume or assign accesses based on them. Some +examples of mappings: + + TopSecret 7 + TS:A,B 7 1 2 + SecBDE 5 2 4 6 + RAFTERS 7 12 26 + +The ":" and "," characters are permitted in a Smack label but have no special +meaning. + +The mapping of Smack labels to CIPSO values is defined by writing to +/smack/cipso. Again, the format of data written to this special file +is highly restrictive, so the program smackcipso is provided to +ensure the writes are done properly. This program takes mappings +on the standard input and sends them to /smack/cipso properly. + +In addition to explicit mappings Smack supports direct CIPSO mappings. One +CIPSO level is used to indicate that the category set passed in the packet is +in fact an encoding of the Smack label. The level used is 250 by default. 
The +value can be read from /smack/direct and changed by writing to /smack/direct. + +Socket Attributes + +There are two attributes that are associated with sockets. These attributes +can only be set by privileged tasks, but any task can read them for their own +sockets. + + SMACK64IPIN: The Smack label of the task object. A privileged + program that will enforce policy may set this to the star label. + + SMACK64IPOUT: The Smack label transmitted with outgoing packets. + A privileged program may set this to match the label of another + task with which it hopes to communicate. + +Writing Applications for Smack + +There are three sorts of applications that will run on a Smack system. How an +application interacts with Smack will determine what it will have to do to +work properly under Smack. + +Smack Ignorant Applications + +By far the majority of applications have no reason whatever to care about the +unique properties of Smack. Since invoking a program has no impact on the +Smack label associated with the process the only concern likely to arise is +whether the process has execute access to the program. + +Smack Relevant Applications + +Some programs can be improved by teaching them about Smack, but do not make +any security decisions themselves. The utility ls(1) is one example of such a +program. + +Smack Enforcing Applications + +These are special programs that not only know about Smack, but participate in +the enforcement of system policy. In most cases these are the programs that +set up user sessions. There are also network services that provide information +to processes running with various labels. + +File System Interfaces + +Smack maintains labels on file system objects using extended attributes. The +Smack label of a file, directory, or other file system object can be obtained +using getxattr(2). + + len = getxattr("/", "security.SMACK64", value, sizeof (value)); + +will put the Smack label of the root directory into value. 
A privileged +process can set the Smack label of a file system object with setxattr(2). + + len = strlen("Rubble"); + rc = setxattr("/foo", "security.SMACK64", "Rubble", len, 0); + +will set the Smack label of /foo to "Rubble" if the program has appropriate +privilege. + +Socket Interfaces + +The socket attributes can be read using fgetxattr(2). + +A privileged process can set the Smack label of outgoing packets with +fsetxattr(2). + + len = strlen("Rubble"); + rc = fsetxattr(fd, "security.SMACK64IPOUT", "Rubble", len, 0); + +will set the Smack label "Rubble" on packets going out from the socket if the +program has appropriate privilege. + + rc = fsetxattr(fd, "security.SMACK64IPIN", "*", strlen("*"), 0); + +will set the Smack label "*" as the object label against which incoming +packets will be checked if the program has appropriate privilege. + +Administration + +Smack supports some mount options: + + smackfsdef=label: specifies the label to give files that lack + the Smack label extended attribute. + + smackfsroot=label: specifies the label to assign the root of the + file system if it lacks the Smack extended attribute. + + smackfshat=label: specifies a label that must have read access to + all labels set on the filesystem. Not yet enforced. + + smackfsfloor=label: specifies a label to which all labels set on the + filesystem must have read access. Not yet enforced. + +These mount options apply to all file system types. + diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 0b1b0c0..e2799b5 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -1315,13 +1315,28 @@ for writeout by the pdflush daemons. It is expressed in 100'ths of a second. Data which has been dirty in-memory for longer than this interval will be written out next time a pdflush daemon wakes up. +highmem_is_dirtyable +-------------------- + +Only present if CONFIG_HIGHMEM is set.
+ +This defaults to 0 (false), meaning that the ratios set above are calculated +as a percentage of lowmem only. This protects against excessive scanning +in page reclaim, swapping and general VM distress. + +Setting this to 1 can be useful on 32 bit machines where you want to make +random changes within an MMAPed file that is larger than your available +lowmem without causing large quantities of random IO. It is safe if the +behavior of all programs running on the machine is known and memory will +not be otherwise stressed. + legacy_va_layout ---------------- If non-zero, this sysctl disables the new 32-bit mmap mmap layout - the kernel will use the legacy (2.4) layout for all processes. -lower_zone_protection +lowmem_reserve_ratio --------------------- For some specialised workloads on highmem machines it is dangerous for @@ -1341,25 +1356,71 @@ captured into pinned user memory. mechanism will also defend that region from allocations which could use highmem or lowmem). -The `lower_zone_protection' tunable determines how aggressive the kernel is -in defending these lower zones. The default value is zero - no -protection at all. +The `lowmem_reserve_ratio' tunable determines how aggressive the kernel is +in defending these lower zones. If you have a machine which uses highmem or ISA DMA and your applications are using mlock(), or if you are running with no swap then -you probably should increase the lower_zone_protection setting. - -The units of this tunable are fairly vague. It is approximately equal -to "megabytes," so setting lower_zone_protection=100 will protect around 100 -megabytes of the lowmem zone from user allocations. It will also make -those 100 megabytes unavailable for use by applications and by -pagecache, so there is a cost. - -The effects of this tunable may be observed by monitoring -/proc/meminfo:LowFree. Write a single huge file and observe the point -at which LowFree ceases to fall. - -A reasonable value for lower_zone_protection is 100.
+you probably should change the lowmem_reserve_ratio setting. + +The lowmem_reserve_ratio is an array. You can see them by reading this file. +- +% cat /proc/sys/vm/lowmem_reserve_ratio +256 256 32 +- +Note: the number of elements is one fewer than the number of zones, because the highest + zone's value is not necessary for the following calculation. + +But, these values are not used directly. The kernel calculates # of protection +pages for each zones from them. These are shown as array of protection pages +in /proc/zoneinfo like followings. (This is an example of x86-64 box). +Each zone has an array of protection pages like this. + +- +Node 0, zone DMA + pages free 1355 + min 3 + low 3 + high 4 + : + : + numa_other 0 + protection: (0, 2004, 2004, 2004) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + pagesets + cpu: 0 pcp: 0 + : +- +These protections are added to score to judge whether this zone should be used +for page allocation or should be reclaimed. + +In this example, if normal pages (index=2) are required to this DMA zone and +pages_high is used for watermark, the kernel judges this zone should not be +used because pages_free(1355) is smaller than watermark + protection[2] +(4 + 2004 = 2008). If this protection value is 0, this zone would be used for +normal page requirement. If requirement is DMA zone(index=0), protection[0] +(=0) is used. + +zone[i]'s protection[j] is calculated by the following expression. + +(i < j): + zone[i]->protection[j] + = (total sums of present_pages from zone[i+1] to zone[j] on the node) + / lowmem_reserve_ratio[i]; +(i = j): + (should not be protected. = 0) +(i > j): + (not necessary, but looks 0) + +The default values of lowmem_reserve_ratio[i] are + 256 (if zone[i] means DMA or DMA32 zone) + 32 (others). +As above expression, they are reciprocal number of ratio. +256 means 1/256. # of protection pages becomes about "0.39%" of total present +pages of higher zones on the node. + +If you would like to protect more pages, smaller values are effective.
+The minimum value is 1 (1/1 -> 100%). page-cluster ------------ diff --git a/Documentation/gpio.txt b/Documentation/gpio.txt index 6bc2ba2..8da724e 100644 --- a/Documentation/gpio.txt +++ b/Documentation/gpio.txt @@ -32,7 +32,7 @@ The exact capabilities of GPIOs vary between systems. Common options: - Input values are likewise readable (1, 0). Some chips support readback of pins configured as "output", which is very useful in such "wire-OR" cases (to support bidirectional signaling). GPIO controllers may have - input de-glitch logic, sometimes with software controls. + input de-glitch/debounce logic, sometimes with software controls. - Inputs can often be used as IRQ signals, often edge triggered but sometimes level triggered. Such IRQs may be configurable as system @@ -60,10 +60,13 @@ used on a board that's wired differently. Only least-common-denominator functionality can be very portable. Other features are platform-specific, and that can be critical for glue logic. -Plus, this doesn't define an implementation framework, just an interface. +Plus, this doesn't require any implementation framework, just an interface. One platform might implement it as simple inline functions accessing chip registers; another might implement it by delegating through abstractions -used for several very different kinds of GPIO controller. +used for several very different kinds of GPIO controller. (There is some +optional code supporting such an implementation strategy, described later +in this document, but drivers acting as clients to the GPIO interface must +not care how it's implemented.) That said, if the convention is supported on their platform, drivers should use it when possible. Platforms should declare GENERIC_GPIO support in @@ -121,6 +124,11 @@ before tasking is enabled, as part of early board setup. For output GPIOs, the value provided becomes the initial output value. This helps avoid signal glitching during system startup. 
+For compatibility with legacy interfaces to GPIOs, setting the direction +of a GPIO implicitly requests that GPIO (see below) if it has not been +requested already. That compatibility may be removed in the future; +explicitly requesting GPIOs is strongly preferred. + Setting the direction can fail if the GPIO number is invalid, or when that particular GPIO can't be used in that mode. It's generally a bad idea to rely on boot firmware to have set the direction correctly, since @@ -133,6 +141,7 @@ Spinlock-Safe GPIO access ------------------------- Most GPIO controllers can be accessed with memory read/write instructions. That doesn't need to sleep, and can safely be done from inside IRQ handlers. +(That includes hardirq contexts on RT kernels.) Use these calls to access such GPIOs: @@ -145,7 +154,7 @@ Use these calls to access such GPIOs: The values are boolean, zero for low, nonzero for high. When reading the value of an output pin, the value returned should be what's seen on the pin ... that won't always match the specified output value, because of -issues including wire-OR and output latencies. +issues including open-drain signaling and output latencies. The get/set calls have no error returns because "invalid GPIO" should have been reported earlier from gpio_direction_*(). However, note that not all @@ -170,7 +179,8 @@ get to the head of a queue to transmit a command and get its response. This requires sleeping, which can't be done from inside IRQ handlers. Platforms that support this type of GPIO distinguish them from other GPIOs -by returning nonzero from this call: +by returning nonzero from this call (which requires a valid GPIO number, +either explicitly or implicitly requested): int gpio_cansleep(unsigned gpio); @@ -209,8 +219,11 @@ before tasking is enabled, as part of early board setup. These calls serve two basic purposes. 
One is marking the signals which are actually in use as GPIOs, for better diagnostics; systems may have several hundred potential GPIOs, but often only a dozen are used on any -given board. Another is to catch conflicts between drivers, reporting -errors when drivers wrongly think they have exclusive use of that signal. +given board. Another is to catch conflicts, identifying errors when +(a) two or more drivers wrongly think they have exclusive use of that +signal, or (b) something wrongly believes it's safe to remove drivers +needed to manage a signal that's in active use. That is, requesting a +GPIO can serve as a kind of lock. These two calls are optional because not not all current Linux platforms offer such functionality in their GPIO support; a valid implementation @@ -223,6 +236,9 @@ Note that requesting a GPIO does NOT cause it to be configured in any way; it just marks that GPIO as in use. Separate code must handle any pin setup (e.g. controlling which pin the GPIO uses, pullup/pulldown). +Also note that it's your responsibility to have stopped using a GPIO +before you free it. + GPIOs mapped to IRQs -------------------- @@ -238,7 +254,7 @@ map between them using calls like: Those return either the corresponding number in the other namespace, or else a negative errno code if the mapping can't be done. (For example, -some GPIOs can't used as IRQs.) It is an unchecked error to use a GPIO +some GPIOs can't be used as IRQs.) It is an unchecked error to use a GPIO number that wasn't set up as an input using gpio_direction_input(), or to use an IRQ number that didn't originally come from gpio_to_irq(). @@ -299,17 +315,110 @@ Related to multiplexing is configuration and enabling of the pullups or pulldowns integrated on some platforms. Not all platforms support them, or support them in the same way; and any given board might use external pullups (or pulldowns) so that the on-chip ones should not be used. 
+(When a circuit needs 5 kOhm, on-chip 100 kOhm resistors won't do.) There are other system-specific mechanisms that are not specified here, like the aforementioned options for input de-glitching and wire-OR output. Hardware may support reading or writing GPIOs in gangs, but that's usually configuration dependent: for GPIOs sharing the same bank. (GPIOs are commonly grouped in banks of 16 or 32, with a given SOC having several such -banks.) Some systems can trigger IRQs from output GPIOs. Code relying on -such mechanisms will necessarily be nonportable. +banks.) Some systems can trigger IRQs from output GPIOs, or read values +from pins not managed as GPIOs. Code relying on such mechanisms will +necessarily be nonportable. -Dynamic definition of GPIOs is not currently supported; for example, as +Dynamic definition of GPIOs is not currently standard; for example, as a side effect of configuring an add-on board with some GPIO expanders. These calls are purely for kernel space, but a userspace API could be built -on top of it. +on top of them. + + +GPIO implementor's framework (OPTIONAL) +======================================= +As noted earlier, there is an optional implementation framework making it +easier for platforms to support different kinds of GPIO controller using +the same programming interface. + +As a debugging aid, if debugfs is available a /sys/kernel/debug/gpio file +will be found there. That will list all the controllers registered through +this framework, and the state of the GPIOs currently in use. 
+ + +Controller Drivers: gpio_chip +----------------------------- +In this framework each GPIO controller is packaged as a "struct gpio_chip" +with information common to each controller of that type: + + - methods to establish GPIO direction + - methods used to access GPIO values + - flag saying whether calls to its methods may sleep + - optional debugfs dump method (showing extra state like pullup config) + - label for diagnostics + +There is also per-instance data, which may come from device.platform_data: +the number of its first GPIO, and how many GPIOs it exposes. + +The code implementing a gpio_chip should support multiple instances of the +controller, possibly using the driver model. That code will configure each +gpio_chip and issue gpiochip_add(). Removing a GPIO controller should be +rare; use gpiochip_remove() when it is unavoidable. + +Most often a gpio_chip is part of an instance-specific structure with state +not exposed by the GPIO interfaces, such as addressing, power management, +and more. Chips such as codecs will have complex non-GPIO state. + +Any debugfs dump method should normally ignore signals which haven't been +requested as GPIOs. They can use gpiochip_is_requested(), which returns +either NULL or the label associated with that GPIO when it was requested. + + +Platform Support +---------------- +To support this framework, a platform's Kconfig will "select HAVE_GPIO_LIB" +and arrange that its <asm/gpio.h> includes <asm-generic/gpio.h> and defines +three functions: gpio_get_value(), gpio_set_value(), and gpio_cansleep(). +They may also want to provide a custom value for ARCH_NR_GPIOS.
+ +Trivial implementations of those functions can directly use framework +code, which always dispatches through the gpio_chip: + + #define gpio_get_value __gpio_get_value + #define gpio_set_value __gpio_set_value + #define gpio_cansleep __gpio_cansleep + +Fancier implementations could instead define those as inline functions with +logic optimizing access to specific SOC-based GPIOs. For example, if the +referenced GPIO is the constant "12", getting or setting its value could +cost as little as two or three instructions, never sleeping. When such an +optimization is not possible those calls must delegate to the framework +code, costing at least a few dozen instructions. For bitbanged I/O, such +instruction savings can be significant. + +For SOCs, platform-specific code defines and registers gpio_chip instances +for each bank of on-chip GPIOs. Those GPIOs should be numbered/labeled to +match chip vendor documentation, and directly match board schematics. They +may well start at zero and go up to a platform-specific limit. Such GPIOs +are normally integrated into platform initialization to make them always be +available, from arch_initcall() or earlier; they can often serve as IRQs. + + +Board Support +------------- +For external GPIO controllers -- such as I2C or SPI expanders, ASICs, multi +function devices, FPGAs or CPLDs -- most often board-specific code handles +registering controller devices and ensures that their drivers know what GPIO +numbers to use with gpiochip_add(). Their numbers often start right after +platform-specific GPIOs. + +For example, board setup code could create structures identifying the range +of GPIOs that chip will expose, and pass them to each GPIO expander chip +using platform_data. Then the chip driver's probe() routine could pass that +data to gpiochip_add(). + +Initialization order can be important. For example, when a device relies on +an I2C-based GPIO, its probe() routine should only be called after that GPIO +becomes available.
That may mean the device should not be registered until +calls for that GPIO can work. One way to address such dependencies is for +such gpio_chip controllers to provide setup() and teardown() callbacks to +board specific code; those board specific callbacks would register devices +once all the necessary resources are available. diff --git a/Documentation/i2c/chips/pca9539 b/Documentation/i2c/chips/pca9539 index c4fce6a..1d81c53 100644 --- a/Documentation/i2c/chips/pca9539 +++ b/Documentation/i2c/chips/pca9539 @@ -1,6 +1,9 @@ Kernel driver pca9539 ===================== +NOTE: this driver is deprecated and will be dropped soon, use +drivers/gpio/pca9539.c instead. + Supported chips: * Philips PCA9539 Prefix: 'pca9539' diff --git a/Documentation/pcmcia/driver-changes.txt b/Documentation/pcmcia/driver-changes.txt index 4739c5c..96f155e 100644 --- a/Documentation/pcmcia/driver-changes.txt +++ b/Documentation/pcmcia/driver-changes.txt @@ -33,8 +33,8 @@ This file details changes in 2.6 which affect PCMCIA card driver authors: and can be used (e.g. for SET_NETDEV_DEV) by using handle_to_dev(client_handle_t * handle). -* Convert internal I/O port addresses to unsigned long (as of 2.6.11) - ioaddr_t should be replaced by kio_addr_t in PCMCIA card drivers. +* Convert internal I/O port addresses to unsigned int (as of 2.6.11) + ioaddr_t should be replaced by unsigned int in PCMCIA card drivers. * irq_mask and irq_list parameters (as of 2.6.11) The irq_mask and irq_list parameters should no longer be used in diff --git a/Documentation/pm_qos_interface.txt b/Documentation/pm_qos_interface.txt new file mode 100644 index 0000000..49adb1a --- /dev/null +++ b/Documentation/pm_qos_interface.txt @@ -0,0 +1,59 @@ +PM quality of Service interface. + +This interface provides a kernel and user mode interface for registering +performance expectations by drivers, subsystems and user space applications on +one of the parameters. 
+ +Currently we have {cpu_dma_latency, network_latency, network_throughput} as the +initial set of pm_qos parameters. + +The infrastructure exposes multiple misc device nodes, one per implemented +parameter. The set of parameters implemented is defined by pm_qos_power_init() +and pm_qos_params.h. This is done because having the available parameters +being runtime configurable or changeable from a driver was seen as too easy to +abuse. + +For each parameter a list of performance requirements is maintained along with +an aggregated target value. The aggregated target value is updated with +changes to the requirement list or elements of the list. Typically the +aggregated target value is simply the max or min of the requirement values held +in the parameter list elements. + +From kernel mode the use of this interface is simple: +pm_qos_add_requirement(param_id, name, target_value): +Will insert a named element in the list for that identified PM_QOS parameter +with the target value. Upon change to this list the new target is recomputed +and any registered notifiers are called only if the target value is now +different. + +pm_qos_update_requirement(param_id, name, new_target_value): +Will search the list identified by the param_id for the named list element and +then update its target value, calling the notification tree if the aggregated +target is changed. This assumes a requirement with that name is already registered. + +pm_qos_remove_requirement(param_id, name): +Will search the identified list for the named element and remove it, after +removal it will update the aggregate target and call the notification tree if +the target was changed as a result of removing the named requirement. + + +From user mode: +Only processes can register a pm_qos requirement.
To provide for automatic +cleanup for process the interface requires the process to register its +parameter requirements in the following way: + +To register the default pm_qos target for the specific parameter, the process +must open one of /dev/[cpu_dma_latency, network_latency, network_throughput] + +As long as the device node is held open that process has a registered +requirement on the parameter. The name of the requirement is "process_<PID>" +derived from the current->pid from within the open system call. + +To change the requested target value the process needs to write a s32 value to +the open device node. This translates to a pm_qos_update_requirement call. + +To remove the user mode request for a target value simply close the device +node. + + + diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 6f31f0a..24eac1b 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -22,6 +22,7 @@ Currently, these files are in /proc/sys/vm: - dirty_background_ratio - dirty_expire_centisecs - dirty_writeback_centisecs +- highmem_is_dirtyable (only if CONFIG_HIGHMEM set) - max_map_count - min_free_kbytes - laptop_mode @@ -40,9 +41,9 @@ Currently, these files are in /proc/sys/vm: ============================================================== dirty_ratio, dirty_background_ratio, dirty_expire_centisecs, -dirty_writeback_centisecs, vfs_cache_pressure, laptop_mode, -block_dump, swap_token_timeout, drop-caches, -hugepages_treat_as_movable: +dirty_writeback_centisecs, highmem_is_dirtyable, +vfs_cache_pressure, laptop_mode, block_dump, swap_token_timeout, +drop-caches, hugepages_treat_as_movable: See Documentation/filesystems/proc.txt diff --git a/MAINTAINERS b/MAINTAINERS index da30a72..548df4b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2681,6 +2681,16 @@ M: James.Bottomley@HansenPartnership.com L: linux-scsi@vger.kernel.org S: Maintained +NETEFFECT IWARP RNIC DRIVER (IW_NES) +P: Faisal Latif +M: flatif@neteffect.com +P: Glenn 
Streiff +M: gstreiff@neteffect.com +L: general@lists.openfabrics.org +W: http://www.neteffect.com +S: Supported +F: drivers/infiniband/hw/nes/ + NETEM NETWORK EMULATOR P: Stephen Hemminger M: shemminger@linux-foundation.org diff --git a/arch/alpha/kernel/pci-noop.c b/arch/alpha/kernel/pci-noop.c index 468b76c..8ac0831 100644 --- a/arch/alpha/kernel/pci-noop.c +++ b/arch/alpha/kernel/pci-noop.c @@ -165,7 +165,7 @@ dma_alloc_coherent(struct device *dev, size_t size, ret = (void *)__get_free_pages(gfp, get_order(size)); if (ret) { memset(ret, 0, size); - *dma_handle = virt_to_bus(ret); + *dma_handle = virt_to_phys(ret); } return ret; } @@ -184,7 +184,7 @@ dma_map_sg(struct device *dev, struct scatterlist *sgl, int nents, BUG_ON(!sg_page(sg)); va = sg_virt(sg); - sg_dma_address(sg) = (dma_addr_t)virt_to_bus(va); + sg_dma_address(sg) = (dma_addr_t)virt_to_phys(va); sg_dma_len(sg) = sg->length; } diff --git a/arch/alpha/kernel/pci_iommu.c b/arch/alpha/kernel/pci_iommu.c index 2d00a08..26d3789 100644 --- a/arch/alpha/kernel/pci_iommu.c +++ b/arch/alpha/kernel/pci_iommu.c @@ -9,6 +9,7 @@ #include <linux/bootmem.h> #include <linux/scatterlist.h> #include <linux/log2.h> +#include <linux/dma-mapping.h> #include <asm/io.h> #include <asm/hwrpb.h> @@ -470,22 +471,29 @@ EXPORT_SYMBOL(pci_free_consistent); #define SG_ENT_PHYS_ADDRESS(SG) __pa(SG_ENT_VIRT_ADDRESS(SG)) static void -sg_classify(struct scatterlist *sg, struct scatterlist *end, int virt_ok) +sg_classify(struct device *dev, struct scatterlist *sg, struct scatterlist *end, + int virt_ok) { unsigned long next_paddr; struct scatterlist *leader; long leader_flag, leader_length; + unsigned int max_seg_size; leader = sg; leader_flag = 0; leader_length = leader->length; next_paddr = SG_ENT_PHYS_ADDRESS(leader) + leader_length; + /* we will not marge sg without device. */ + max_seg_size = dev ? 
dma_get_max_seg_size(dev) : 0; for (++sg; sg < end; ++sg) { unsigned long addr, len; addr = SG_ENT_PHYS_ADDRESS(sg); len = sg->length; + if (leader_length + len > max_seg_size) + goto new_segment; + if (next_paddr == addr) { sg->dma_address = -1; leader_length += len; @@ -494,6 +502,7 @@ sg_classify(struct scatterlist *sg, struct scatterlist *end, int virt_ok) leader_flag = 1; leader_length += len; } else { +new_segment: leader->dma_address = leader_flag; leader->dma_length = leader_length; leader = sg; @@ -512,7 +521,7 @@ sg_classify(struct scatterlist *sg, struct scatterlist *end, int virt_ok) in the blanks. */ static int -sg_fill(struct scatterlist *leader, struct scatterlist *end, +sg_fill(struct device *dev, struct scatterlist *leader, struct scatterlist *end, struct scatterlist *out, struct pci_iommu_arena *arena, dma_addr_t max_dma, int dac_allowed) { @@ -562,8 +571,8 @@ sg_fill(struct scatterlist *leader, struct scatterlist *end, /* Otherwise, break up the remaining virtually contiguous hunks into individual direct maps and retry. */ - sg_classify(leader, end, 0); - return sg_fill(leader, end, out, arena, max_dma, dac_allowed); + sg_classify(dev, leader, end, 0); + return sg_fill(dev, leader, end, out, arena, max_dma, dac_allowed); } out->dma_address = arena->dma_base + dma_ofs*PAGE_SIZE + paddr; @@ -619,12 +628,15 @@ pci_map_sg(struct pci_dev *pdev, struct scatterlist *sg, int nents, struct pci_iommu_arena *arena; dma_addr_t max_dma; int dac_allowed; + struct device *dev; if (direction == PCI_DMA_NONE) BUG(); dac_allowed = pdev ? pci_dac_dma_supported(pdev, pdev->dma_mask) : 0; + dev = pdev ? &pdev->dev : NULL; + /* Fast path single entry scatterlists. */ if (nents == 1) { sg->dma_length = sg->length; @@ -638,7 +650,7 @@ pci_map_sg(struct pci_dev *pdev, struct scatterlist *sg, int nents, end = sg + nents; /* First, prepare information about the entries. 
*/ - sg_classify(sg, end, alpha_mv.mv_pci_tbi != 0); + sg_classify(dev, sg, end, alpha_mv.mv_pci_tbi != 0); /* Second, figure out where we're going to map things. */ if (alpha_mv.mv_pci_tbi) { @@ -658,7 +670,7 @@ pci_map_sg(struct pci_dev *pdev, struct scatterlist *sg, int nents, for (out = sg; sg < end; ++sg) { if ((int) sg->dma_address < 0) continue; - if (sg_fill(sg, end, out, arena, max_dma, dac_allowed) < 0) + if (sg_fill(dev, sg, end, out, arena, max_dma, dac_allowed) < 0) goto error; out++; } diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c index bd5e68c..beff629 100644 --- a/arch/alpha/kernel/setup.c +++ b/arch/alpha/kernel/setup.c @@ -58,7 +58,6 @@ static struct notifier_block alpha_panic_block = { #include <asm/system.h> #include <asm/hwrpb.h> #include <asm/dma.h> -#include <asm/io.h> #include <asm/mmu_context.h> #include <asm/console.h> diff --git a/arch/alpha/kernel/systbls.S b/arch/alpha/kernel/systbls.S index 79de99e3..ba914af 100644 --- a/arch/alpha/kernel/systbls.S +++ b/arch/alpha/kernel/systbls.S @@ -495,7 +495,7 @@ sys_call_table: .quad sys_epoll_pwait .quad sys_utimensat /* 475 */ .quad sys_signalfd - .quad sys_timerfd + .quad sys_ni_syscall .quad sys_eventfd .size sys_call_table, . 
- sys_call_table diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 64d19ef..e19e774 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -385,6 +385,7 @@ config ARCH_PXA depends on MMU select ARCH_MTD_XIP select GENERIC_GPIO + select HAVE_GPIO_LIB select GENERIC_TIME select GENERIC_CLOCKEVENTS select TICK_ONESHOT @@ -1122,6 +1123,8 @@ source "drivers/i2c/Kconfig" source "drivers/spi/Kconfig" +source "drivers/gpio/Kconfig" + source "drivers/w1/Kconfig" source "drivers/power/Kconfig" diff --git a/arch/arm/kernel/calls.S b/arch/arm/kernel/calls.S index cecf658..283e14f 100644 --- a/arch/arm/kernel/calls.S +++ b/arch/arm/kernel/calls.S @@ -359,7 +359,7 @@ CALL(sys_kexec_load) CALL(sys_utimensat) CALL(sys_signalfd) -/* 350 */ CALL(sys_timerfd) +/* 350 */ CALL(sys_ni_syscall) CALL(sys_eventfd) CALL(sys_fallocate) #ifndef syscalls_counted diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index e9dfbab..eefae1d 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -150,7 +150,7 @@ int __cpuinit __cpu_up(unsigned int cpu) secondary_data.pgdir = 0; *pmd_offset(pgd, PHYS_OFFSET) = __pmd(0); - pgd_free(pgd); + pgd_free(&init_mm, pgd); if (ret) { printk(KERN_CRIT "CPU%u: processor failed to boot\n", cpu); diff --git a/arch/arm/mach-pxa/Makefile b/arch/arm/mach-pxa/Makefile index 8604938..6e0c4f5 100644 --- a/arch/arm/mach-pxa/Makefile +++ b/arch/arm/mach-pxa/Makefile @@ -3,7 +3,8 @@ # # Common support (must be linked before board specific support) -obj-y += clock.o devices.o generic.o irq.o dma.o time.o +obj-y += clock.o devices.o generic.o irq.o dma.o \ + time.o gpio.o obj-$(CONFIG_PXA25x) += pxa25x.o obj-$(CONFIG_PXA27x) += pxa27x.o obj-$(CONFIG_PXA3xx) += pxa3xx.o mfp.o smemc.o diff --git a/arch/arm/mach-pxa/generic.c b/arch/arm/mach-pxa/generic.c index 7697059..80721c6 100644 --- a/arch/arm/mach-pxa/generic.c +++ b/arch/arm/mach-pxa/generic.c @@ -32,7 +32,6 @@ #include <asm/mach/map.h> #include <asm/arch/pxa-regs.h> -#include <asm/arch/gpio.h> 
#include "generic.h" @@ -67,97 +66,6 @@ unsigned int get_memclk_frequency_10khz(void) EXPORT_SYMBOL(get_memclk_frequency_10khz); /* - * Handy function to set GPIO alternate functions - */ -int pxa_last_gpio; - -int pxa_gpio_mode(int gpio_mode) -{ - unsigned long flags; - int gpio = gpio_mode & GPIO_MD_MASK_NR; - int fn = (gpio_mode & GPIO_MD_MASK_FN) >> 8; - int gafr; - - if (gpio > pxa_last_gpio) - return -EINVAL; - - local_irq_save(flags); - if (gpio_mode & GPIO_DFLT_LOW) - GPCR(gpio) = GPIO_bit(gpio); - else if (gpio_mode & GPIO_DFLT_HIGH) - GPSR(gpio) = GPIO_bit(gpio); - if (gpio_mode & GPIO_MD_MASK_DIR) - GPDR(gpio) |= GPIO_bit(gpio); - else - GPDR(gpio) &= ~GPIO_bit(gpio); - gafr = GAFR(gpio) & ~(0x3 << (((gpio) & 0xf)*2)); - GAFR(gpio) = gafr | (fn << (((gpio) & 0xf)*2)); - local_irq_restore(flags); - - return 0; -} - -EXPORT_SYMBOL(pxa_gpio_mode); - -int gpio_direction_input(unsigned gpio) -{ - unsigned long flags; - u32 mask; - - if (gpio > pxa_last_gpio) - return -EINVAL; - - mask = GPIO_bit(gpio); - local_irq_save(flags); - GPDR(gpio) &= ~mask; - local_irq_restore(flags); - - return 0; -} -EXPORT_SYMBOL(gpio_direction_input); - -int gpio_direction_output(unsigned gpio, int value) -{ - unsigned long flags; - u32 mask; - - if (gpio > pxa_last_gpio) - return -EINVAL; - - mask = GPIO_bit(gpio); - local_irq_save(flags); - if (value) - GPSR(gpio) = mask; - else - GPCR(gpio) = mask; - GPDR(gpio) |= mask; - local_irq_restore(flags); - - return 0; -} -EXPORT_SYMBOL(gpio_direction_output); - -/* - * Return GPIO level - */ -int pxa_gpio_get_value(unsigned gpio) -{ - return __gpio_get_value(gpio); -} - -EXPORT_SYMBOL(pxa_gpio_get_value); - -/* - * Set output GPIO level - */ -void pxa_gpio_set_value(unsigned gpio, int value) -{ - __gpio_set_value(gpio, value); -} - -EXPORT_SYMBOL(pxa_gpio_set_value); - -/* * Routine to safely enable or disable a clock in the CKEN */ void __pxa_set_cken(int clock, int enable) @@ -172,7 +80,6 @@ void __pxa_set_cken(int clock, int 
enable) local_irq_restore(flags); } - EXPORT_SYMBOL(__pxa_set_cken); /* diff --git a/arch/arm/mach-pxa/generic.h b/arch/arm/mach-pxa/generic.h index 1a16ad3..b3d10b0 100644 --- a/arch/arm/mach-pxa/generic.h +++ b/arch/arm/mach-pxa/generic.h @@ -16,6 +16,7 @@ extern void __init pxa_init_irq_low(void); extern void __init pxa_init_irq_high(void); extern void __init pxa_init_irq_gpio(int gpio_nr); extern void __init pxa_init_irq_set_wake(int (*set_wake)(unsigned int, unsigned int)); +extern void __init pxa_init_gpio(int gpio_nr); extern void __init pxa25x_init_irq(void); extern void __init pxa27x_init_irq(void); extern void __init pxa3xx_init_irq(void); diff --git a/arch/arm/mach-pxa/gpio.c b/arch/arm/mach-pxa/gpio.c new file mode 100644 index 0000000..8638dd7 --- /dev/null +++ b/arch/arm/mach-pxa/gpio.c @@ -0,0 +1,197 @@ +/* + * linux/arch/arm/mach-pxa/gpio.c + * + * Generic PXA GPIO handling + * + * Author: Nicolas Pitre + * Created: Jun 15, 2001 + * Copyright: MontaVista Software Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/init.h> +#include <linux/module.h> + +#include <asm/gpio.h> +#include <asm/hardware.h> +#include <asm/io.h> +#include <asm/arch/pxa-regs.h> + +#include "generic.h" + + +struct pxa_gpio_chip { + struct gpio_chip chip; + void __iomem *regbase; +}; + +int pxa_last_gpio; + +/* + * Configure pins for GPIO or other functions + */ +int pxa_gpio_mode(int gpio_mode) +{ + unsigned long flags; + int gpio = gpio_mode & GPIO_MD_MASK_NR; + int fn = (gpio_mode & GPIO_MD_MASK_FN) >> 8; + int gafr; + + if (gpio > pxa_last_gpio) + return -EINVAL; + + local_irq_save(flags); + if (gpio_mode & GPIO_DFLT_LOW) + GPCR(gpio) = GPIO_bit(gpio); + else if (gpio_mode & GPIO_DFLT_HIGH) + GPSR(gpio) = GPIO_bit(gpio); + if (gpio_mode & GPIO_MD_MASK_DIR) + GPDR(gpio) |= GPIO_bit(gpio); + else + GPDR(gpio) &= ~GPIO_bit(gpio); + gafr = GAFR(gpio) & ~(0x3 << (((gpio) & 0xf)*2)); + GAFR(gpio) = gafr | (fn << (((gpio) & 0xf)*2)); + local_irq_restore(flags); + + return 0; +} +EXPORT_SYMBOL(pxa_gpio_mode); + +static int pxa_gpio_direction_input(struct gpio_chip *chip, unsigned offset) +{ + unsigned long flags; + u32 mask = 1 << offset; + u32 value; + struct pxa_gpio_chip *pxa; + void __iomem *gpdr; + + pxa = container_of(chip, struct pxa_gpio_chip, chip); + gpdr = pxa->regbase + GPDR_OFFSET; + local_irq_save(flags); + value = __raw_readl(gpdr); + value &= ~mask; + __raw_writel(value, gpdr); + local_irq_restore(flags); + + return 0; +} + +static int pxa_gpio_direction_output(struct gpio_chip *chip, + unsigned offset, int value) +{ + unsigned long flags; + u32 mask = 1 << offset; + u32 tmp; + struct pxa_gpio_chip *pxa; + void __iomem *gpdr; + + pxa = container_of(chip, struct pxa_gpio_chip, chip); + __raw_writel(mask, + pxa->regbase + (value ? 
GPSR_OFFSET : GPCR_OFFSET)); + gpdr = pxa->regbase + GPDR_OFFSET; + local_irq_save(flags); + tmp = __raw_readl(gpdr); + tmp |= mask; + __raw_writel(tmp, gpdr); + local_irq_restore(flags); + + return 0; +} + +/* + * Return GPIO level + */ +static int pxa_gpio_get(struct gpio_chip *chip, unsigned offset) +{ + u32 mask = 1 << offset; + struct pxa_gpio_chip *pxa; + + pxa = container_of(chip, struct pxa_gpio_chip, chip); + return __raw_readl(pxa->regbase + GPLR_OFFSET) & mask; +} + +/* + * Set output GPIO level + */ +static void pxa_gpio_set(struct gpio_chip *chip, unsigned offset, int value) +{ + u32 mask = 1 << offset; + struct pxa_gpio_chip *pxa; + + pxa = container_of(chip, struct pxa_gpio_chip, chip); + + if (value) + __raw_writel(mask, pxa->regbase + GPSR_OFFSET); + else + __raw_writel(mask, pxa->regbase + GPCR_OFFSET); +} + +static struct pxa_gpio_chip pxa_gpio_chip[] = { + [0] = { + .regbase = GPIO0_BASE, + .chip = { + .label = "gpio-0", + .direction_input = pxa_gpio_direction_input, + .direction_output = pxa_gpio_direction_output, + .get = pxa_gpio_get, + .set = pxa_gpio_set, + .base = 0, + .ngpio = 32, + }, + }, + [1] = { + .regbase = GPIO1_BASE, + .chip = { + .label = "gpio-1", + .direction_input = pxa_gpio_direction_input, + .direction_output = pxa_gpio_direction_output, + .get = pxa_gpio_get, + .set = pxa_gpio_set, + .base = 32, + .ngpio = 32, + }, + }, + [2] = { + .regbase = GPIO2_BASE, + .chip = { + .label = "gpio-2", + .direction_input = pxa_gpio_direction_input, + .direction_output = pxa_gpio_direction_output, + .get = pxa_gpio_get, + .set = pxa_gpio_set, + .base = 64, + .ngpio = 32, /* 21 for PXA25x */ + }, + }, +#if defined(CONFIG_PXA27x) || defined(CONFIG_PXA3xx) + [3] = { + .regbase = GPIO3_BASE, + .chip = { + .label = "gpio-3", + .direction_input = pxa_gpio_direction_input, + .direction_output = pxa_gpio_direction_output, + .get = pxa_gpio_get, + .set = pxa_gpio_set, + .base = 96, + .ngpio = 32, + }, + }, +#endif +}; + +void __init 
pxa_init_gpio(int gpio_nr) +{ + int i; + + /* add a GPIO chip for each register bank. + * the last PXA25x register only contains 21 GPIOs + */ + for (i = 0; i < gpio_nr; i += 32) { + if (i+32 > gpio_nr) + pxa_gpio_chip[i/32].chip.ngpio = gpio_nr - i; + gpiochip_add(&pxa_gpio_chip[i/32].chip); + } +} diff --git a/arch/arm/mach-pxa/irq.c b/arch/arm/mach-pxa/irq.c index 5a1d5ee..36c6a68 100644 --- a/arch/arm/mach-pxa/irq.c +++ b/arch/arm/mach-pxa/irq.c @@ -311,6 +311,8 @@ void __init pxa_init_irq_gpio(int gpio_nr) /* Install handler for GPIO>=2 edge detect interrupts */ set_irq_chip(IRQ_GPIO_2_x, &pxa_internal_chip_low); set_irq_chained_handler(IRQ_GPIO_2_x, pxa_gpio_demux_handler); + + pxa_init_gpio(gpio_nr); } void __init pxa_init_irq_set_wake(int (*set_wake)(unsigned int, unsigned int)) diff --git a/arch/arm/mm/ioremap.c b/arch/arm/mm/ioremap.c index 7595277..303a7ff 100644 --- a/arch/arm/mm/ioremap.c +++ b/arch/arm/mm/ioremap.c @@ -162,7 +162,7 @@ static void unmap_area_sections(unsigned long virt, unsigned long size) * Free the page table, if there was one. 
*/ if ((pmd_val(pmd) & PMD_TYPE_MASK) == PMD_TYPE_TABLE) - pte_free_kernel(pmd_page_vaddr(pmd)); + pte_free_kernel(&init_mm, pmd_page_vaddr(pmd)); } addr += PGDIR_SIZE; diff --git a/arch/arm/mm/pgd.c b/arch/arm/mm/pgd.c index 50b9aed..500c961 100644 --- a/arch/arm/mm/pgd.c +++ b/arch/arm/mm/pgd.c @@ -65,14 +65,14 @@ pgd_t *get_pgd_slow(struct mm_struct *mm) return new_pgd; no_pte: - pmd_free(new_pmd); + pmd_free(mm, new_pmd); no_pmd: free_pages((unsigned long)new_pgd, 2); no_pgd: return NULL; } -void free_pgd_slow(pgd_t *pgd) +void free_pgd_slow(struct mm_struct *mm, pgd_t *pgd) { pmd_t *pmd; struct page *pte; @@ -94,8 +94,8 @@ void free_pgd_slow(pgd_t *pgd) pmd_clear(pmd); dec_zone_page_state(virt_to_page((unsigned long *)pgd), NR_PAGETABLE); pte_lock_deinit(pte); - pte_free(pte); - pmd_free(pmd); + pte_free(mm, pte); + pmd_free(mm, pmd); free: free_pages((unsigned long) pgd, 2); } diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig index c816f29..28e0caf 100644 --- a/arch/avr32/Kconfig +++ b/arch/avr32/Kconfig @@ -82,6 +82,7 @@ config PLATFORM_AT32AP select SUBARCH_AVR32B select MMU select PERFORMANCE_COUNTERS + select HAVE_GPIO_LIB # # CPU types diff --git a/arch/avr32/mach-at32ap/pio.c b/arch/avr32/mach-at32ap/pio.c index d61a02d..38a8fa3 100644 --- a/arch/avr32/mach-at32ap/pio.c +++ b/arch/avr32/mach-at32ap/pio.c @@ -24,11 +24,11 @@ #define MAX_NR_PIO_DEVICES 8 struct pio_device { + struct gpio_chip chip; void __iomem *regs; const struct platform_device *pdev; struct clk *clk; u32 pinmux_mask; - u32 gpio_mask; char name[8]; }; @@ -64,7 +64,8 @@ void __init at32_select_periph(unsigned int pin, unsigned int periph, goto fail; } - if (unlikely(test_and_set_bit(pin_index, &pio->pinmux_mask))) { + if (unlikely(test_and_set_bit(pin_index, &pio->pinmux_mask) + || gpiochip_is_requested(&pio->chip, pin_index))) { printk("%s: pin %u is busy\n", pio->name, pin_index); goto fail; } @@ -79,9 +80,6 @@ void __init at32_select_periph(unsigned int pin, unsigned int periph, if 
(!(flags & AT32_GPIOF_PULLUP)) pio_writel(pio, PUDR, mask); - /* gpio_request NOT allowed */ - set_bit(pin_index, &pio->gpio_mask); - return; fail: @@ -130,9 +128,6 @@ void __init at32_select_gpio(unsigned int pin, unsigned long flags) pio_writel(pio, PER, mask); - /* gpio_request now allowed */ - clear_bit(pin_index, &pio->gpio_mask); - return; fail: @@ -166,96 +161,50 @@ fail: /* GPIO API */ -int gpio_request(unsigned int gpio, const char *label) +static int direction_input(struct gpio_chip *chip, unsigned offset) { - struct pio_device *pio; - unsigned int pin; - - pio = gpio_to_pio(gpio); - if (!pio) - return -ENODEV; + struct pio_device *pio = container_of(chip, struct pio_device, chip); + u32 mask = 1 << offset; - pin = gpio & 0x1f; - if (test_and_set_bit(pin, &pio->gpio_mask)) - return -EBUSY; + if (!(pio_readl(pio, PSR) & mask)) + return -EINVAL; + pio_writel(pio, ODR, mask); return 0; } -EXPORT_SYMBOL(gpio_request); -void gpio_free(unsigned int gpio) +static int gpio_get(struct gpio_chip *chip, unsigned offset) { - struct pio_device *pio; - unsigned int pin; + struct pio_device *pio = container_of(chip, struct pio_device, chip); - pio = gpio_to_pio(gpio); - if (!pio) { - printk(KERN_ERR - "gpio: attempted to free invalid pin %u\n", gpio); - return; - } - - pin = gpio & 0x1f; - if (!test_and_clear_bit(pin, &pio->gpio_mask)) - printk(KERN_ERR "gpio: freeing free or non-gpio pin %s-%u\n", - pio->name, pin); + return (pio_readl(pio, PDSR) >> offset) & 1; } -EXPORT_SYMBOL(gpio_free); -int gpio_direction_input(unsigned int gpio) -{ - struct pio_device *pio; - unsigned int pin; - - pio = gpio_to_pio(gpio); - if (!pio) - return -ENODEV; - - pin = gpio & 0x1f; - pio_writel(pio, ODR, 1 << pin); - - return 0; -} -EXPORT_SYMBOL(gpio_direction_input); +static void gpio_set(struct gpio_chip *chip, unsigned offset, int value); -int gpio_direction_output(unsigned int gpio, int value) +static int direction_output(struct gpio_chip *chip, unsigned offset, int value) { - 
struct pio_device *pio; - unsigned int pin; - - pio = gpio_to_pio(gpio); - if (!pio) - return -ENODEV; + struct pio_device *pio = container_of(chip, struct pio_device, chip); + u32 mask = 1 << offset; - gpio_set_value(gpio, value); - - pin = gpio & 0x1f; - pio_writel(pio, OER, 1 << pin); + if (!(pio_readl(pio, PSR) & mask)) + return -EINVAL; + gpio_set(chip, offset, value); + pio_writel(pio, OER, mask); return 0; } -EXPORT_SYMBOL(gpio_direction_output); -int gpio_get_value(unsigned int gpio) +static void gpio_set(struct gpio_chip *chip, unsigned offset, int value) { - struct pio_device *pio = &pio_dev[gpio >> 5]; + struct pio_device *pio = container_of(chip, struct pio_device, chip); + u32 mask = 1 << offset; - return (pio_readl(pio, PDSR) >> (gpio & 0x1f)) & 1; -} -EXPORT_SYMBOL(gpio_get_value); - -void gpio_set_value(unsigned int gpio, int value) -{ - struct pio_device *pio = &pio_dev[gpio >> 5]; - u32 mask; - - mask = 1 << (gpio & 0x1f); if (value) pio_writel(pio, SODR, mask); else pio_writel(pio, CODR, mask); } -EXPORT_SYMBOL(gpio_set_value); /*--------------------------------------------------------------------------*/ @@ -339,6 +288,63 @@ gpio_irq_setup(struct pio_device *pio, int irq, int gpio_irq) /*--------------------------------------------------------------------------*/ +#ifdef CONFIG_DEBUG_FS + +#include <linux/seq_file.h> + +/* + * This shows more info than the generic gpio dump code: + * pullups, deglitching, open drain drive. 
+ */ +static void pio_bank_show(struct seq_file *s, struct gpio_chip *chip) +{ + struct pio_device *pio = container_of(chip, struct pio_device, chip); + u32 psr, osr, imr, pdsr, pusr, ifsr, mdsr; + unsigned i; + u32 mask; + char bank; + + psr = pio_readl(pio, PSR); + osr = pio_readl(pio, OSR); + imr = pio_readl(pio, IMR); + pdsr = pio_readl(pio, PDSR); + pusr = pio_readl(pio, PUSR); + ifsr = pio_readl(pio, IFSR); + mdsr = pio_readl(pio, MDSR); + + bank = 'A' + pio->pdev->id; + + for (i = 0, mask = 1; i < 32; i++, mask <<= 1) { + const char *label; + + label = gpiochip_is_requested(chip, i); + if (!label) + continue; + + seq_printf(s, " gpio-%-3d P%c%-2d (%-12s) %s %s %s", + chip->base + i, bank, i, + label, + (osr & mask) ? "out" : "in ", + (mask & pdsr) ? "hi" : "lo", + (mask & pusr) ? " " : "up"); + if (ifsr & mask) + seq_printf(s, " deglitch"); + if ((osr & mdsr) & mask) + seq_printf(s, " open-drain"); + if (imr & mask) + seq_printf(s, " irq-%d edge-both", + gpio_to_irq(chip->base + i)); + seq_printf(s, "\n"); + } +} + +#else +#define pio_bank_show NULL +#endif + + +/*--------------------------------------------------------------------------*/ + static int __init pio_probe(struct platform_device *pdev) { struct pio_device *pio = NULL; @@ -349,6 +355,18 @@ static int __init pio_probe(struct platform_device *pdev) pio = &pio_dev[pdev->id]; BUG_ON(!pio->regs); + pio->chip.label = pio->name; + pio->chip.base = pdev->id * 32; + pio->chip.ngpio = 32; + + pio->chip.direction_input = direction_input; + pio->chip.get = gpio_get; + pio->chip.direction_output = direction_output; + pio->chip.set = gpio_set; + pio->chip.dbg_show = pio_bank_show; + + gpiochip_add(&pio->chip); + gpio_irq_setup(pio, irq, gpio_irq_base); platform_set_drvdata(pdev, pio); @@ -406,12 +424,6 @@ void __init at32_init_pio(struct platform_device *pdev) pio->pdev = pdev; pio->regs = ioremap(regs->start, regs->end - regs->start + 1); - /* - * request_gpio() is only valid for pins that have been - * 
explicitly configured as GPIO and not previously requested - */ - pio->gpio_mask = ~0UL; - /* start with irqs disabled and acked */ pio_writel(pio, IDR, ~0UL); (void) pio_readl(pio, ISR); diff --git a/arch/avr32/mach-at32ap/pio.h b/arch/avr32/mach-at32ap/pio.h index 50fa3ac..7795116 100644 --- a/arch/avr32/mach-at32ap/pio.h +++ b/arch/avr32/mach-at32ap/pio.h @@ -19,7 +19,7 @@ #define PIO_OSR 0x0018 #define PIO_IFER 0x0020 #define PIO_IFDR 0x0024 -#define PIO_ISFR 0x0028 +#define PIO_IFSR 0x0028 #define PIO_SODR 0x0030 #define PIO_CODR 0x0034 #define PIO_ODSR 0x0038 diff --git a/arch/blackfin/mach-common/entry.S b/arch/blackfin/mach-common/entry.S index 56ff51b..fdd9bf43 100644 --- a/arch/blackfin/mach-common/entry.S +++ b/arch/blackfin/mach-common/entry.S @@ -1373,7 +1373,7 @@ ENTRY(_sys_call_table) .long _sys_epoll_pwait .long _sys_utimensat .long _sys_signalfd - .long _sys_timerfd + .long _sys_ni_syscall .long _sys_eventfd /* 350 */ .long _sys_pread64 .long _sys_pwrite64 diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig index 7f0be4c..27b082a 100644 --- a/arch/cris/Kconfig +++ b/arch/cris/Kconfig @@ -150,6 +150,7 @@ config ETRAX_FLASH_BUSWIDTH Width in bytes of the Flash bus (1, 2 or 4). Is usually 2. 
source arch/cris/arch-v10/Kconfig +source arch/cris/arch-v32/Kconfig endmenu @@ -157,8 +158,8 @@ source "net/Kconfig" # bring in ETRAX built-in drivers menu "Drivers for built-in interfaces" -# arch/cris/arch is a symlink to correct arch (arch-v10 or arch-v32) -source arch/cris/arch/drivers/Kconfig +source arch/cris/arch-v10/drivers/Kconfig +source arch/cris/arch-v32/drivers/Kconfig endmenu diff --git a/arch/cris/arch-v10/Kconfig b/arch/cris/arch-v10/Kconfig index f1ce6f6..1d61fae 100644 --- a/arch/cris/arch-v10/Kconfig +++ b/arch/cris/arch-v10/Kconfig @@ -1,3 +1,5 @@ +if ETRAX_ARCH_V10 + # ETRAX 100LX v1 has a MMU "feature" requiring a low mapping config CRIS_LOW_MAP bool @@ -451,3 +453,5 @@ config ETRAX_POWERBUTTON_BIT default "25" help Configure where power button is connected. + +endif diff --git a/arch/cris/arch-v10/drivers/Kconfig b/arch/cris/arch-v10/drivers/Kconfig index e3c0f29..96740ef 100644 --- a/arch/cris/arch-v10/drivers/Kconfig +++ b/arch/cris/arch-v10/drivers/Kconfig @@ -1,3 +1,5 @@ +if ETRAX_ARCH_V10 + config ETRAX_ETHERNET bool "Ethernet support" depends on ETRAX_ARCH_V10 @@ -806,3 +808,5 @@ config ETRAX_DS1302_TRICKLE_CHARGE 1 = 2kohm, 2 = 4kohm, 3 = 4kohm 4 = 1 diode, 8 = 2 diodes Allowed values are (increasing current): 0, 11, 10, 9, 7, 6, 5 + +endif diff --git a/arch/cris/arch-v10/kernel/entry.S b/arch/cris/arch-v10/kernel/entry.S index ec62c95..d1361dc 100644 --- a/arch/cris/arch-v10/kernel/entry.S +++ b/arch/cris/arch-v10/kernel/entry.S @@ -1167,7 +1167,7 @@ sys_call_table: .long sys_epoll_pwait .long sys_utimensat /* 320 */ .long sys_signalfd - .long sys_timerfd + .long sys_ni_syscall .long sys_eventfd .long sys_fallocate diff --git a/arch/cris/arch-v32/Kconfig b/arch/cris/arch-v32/Kconfig index 4f79d8e..d8acaa9 100644 --- a/arch/cris/arch-v32/Kconfig +++ b/arch/cris/arch-v32/Kconfig @@ -1,3 +1,5 @@ +if ETRAX_ARCH_V32 + config ETRAX_DRAM_VIRTUAL_BASE hex depends on ETRAX_ARCH_V32 @@ -294,3 +296,5 @@ config ETRAX_DEF_GIO_PE_OUT help 
Configures the initial data for the general port E bits. Most products should use 00000 here. + +endif diff --git a/arch/cris/arch-v32/drivers/Kconfig b/arch/cris/arch-v32/drivers/Kconfig index 9bccb5e2..c329cce 100644 --- a/arch/cris/arch-v32/drivers/Kconfig +++ b/arch/cris/arch-v32/drivers/Kconfig @@ -1,3 +1,5 @@ +if ETRAX_ARCH_V32 + config ETRAX_ETHERNET bool "Ethernet support" depends on ETRAX_ARCH_V32 @@ -610,3 +612,5 @@ config ETRAX_STREAMCOPROC help This option enables a driver for the stream co-processor for cryptographic operations. + +endif diff --git a/arch/cris/arch-v32/drivers/pci/dma.c b/arch/cris/arch-v32/drivers/pci/dma.c index 66f9500..e036465 100644 --- a/arch/cris/arch-v32/drivers/pci/dma.c +++ b/arch/cris/arch-v32/drivers/pci/dma.c @@ -93,7 +93,7 @@ int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr, dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); if (!dev->dma_mem) - goto out; + goto iounmap_out; dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL); if (!dev->dma_mem->bitmap) goto free1_out; @@ -110,6 +110,8 @@ int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr, free1_out: kfree(dev->dma_mem); + iounmap_out: + iounmap(mem_base); out: return 0; } diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig index bf0468c..96f7d70 100644 --- a/arch/frv/Kconfig +++ b/arch/frv/Kconfig @@ -138,6 +138,15 @@ config UCPAGE_OFFSET_C0000000 endchoice +config PAGE_OFFSET + hex + default 0x20000000 if UCPAGE_OFFSET_20000000 + default 0x40000000 if UCPAGE_OFFSET_40000000 + default 0x60000000 if UCPAGE_OFFSET_60000000 + default 0x80000000 if UCPAGE_OFFSET_80000000 + default 0xA0000000 if UCPAGE_OFFSET_A0000000 + default 0xC0000000 + config PROTECT_KERNEL bool "Protect core kernel against userspace" depends on !MMU diff --git a/arch/frv/kernel/vmlinux.lds.S b/arch/frv/kernel/vmlinux.lds.S index f42b328..ef7527b 100644 --- a/arch/frv/kernel/vmlinux.lds.S +++ b/arch/frv/kernel/vmlinux.lds.S @@ -13,7 +13,7 @@ 
ENTRY(_start) jiffies = jiffies_64 + 4; -__page_offset = 0xc0000000; /* start of area covered by struct pages */ +__page_offset = CONFIG_PAGE_OFFSET; /* start of area covered by struct pages */ __kernel_image_start = __page_offset; /* address at which kernel image resides */ SECTIONS diff --git a/arch/frv/mm/mmu-context.c b/arch/frv/mm/mmu-context.c index 1530a411..81757d5 100644 --- a/arch/frv/mm/mmu-context.c +++ b/arch/frv/mm/mmu-context.c @@ -181,7 +181,7 @@ int cxn_pin_by_pid(pid_t pid) /* get a handle on the mm_struct */ read_lock(&tasklist_lock); - tsk = find_task_by_pid(pid); + tsk = find_task_by_vpid(pid); if (tsk) { ret = -EINVAL; diff --git a/arch/frv/mm/pgalloc.c b/arch/frv/mm/pgalloc.c index 7787c3c..1a2e5c8 100644 --- a/arch/frv/mm/pgalloc.c +++ b/arch/frv/mm/pgalloc.c @@ -140,7 +140,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) return pgd; } -void pgd_free(pgd_t *pgd) +void pgd_free(struct mm_struct *mm, pgd_t *pgd) { /* in the non-PAE case, clear_page_tables() clears user pgd entries */ quicklist_free(0, pgd_dtor, pgd); diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c index 45bf04e..c412fe6 100644 --- a/arch/ia64/hp/common/sba_iommu.c +++ b/arch/ia64/hp/common/sba_iommu.c @@ -1265,7 +1265,7 @@ sba_fill_pdir( * the sglist do both. */ static SBA_INLINE int -sba_coalesce_chunks( struct ioc *ioc, +sba_coalesce_chunks(struct ioc *ioc, struct device *dev, struct scatterlist *startsg, int nents) { @@ -1275,6 +1275,7 @@ sba_coalesce_chunks( struct ioc *ioc, struct scatterlist *dma_sg; /* next DMA stream head */ unsigned long dma_offset, dma_len; /* start/len of DMA stream */ int n_mappings = 0; + unsigned int max_seg_size = dma_get_max_seg_size(dev); while (nents > 0) { unsigned long vaddr = (unsigned long) sba_sg_address(startsg); @@ -1314,6 +1315,9 @@ sba_coalesce_chunks( struct ioc *ioc, > DMA_CHUNK_SIZE) break; + if (dma_len + startsg->length > max_seg_size) + break; + /* ** Then look for virtually contiguous blocks. 
** @@ -1441,7 +1445,7 @@ int sba_map_sg(struct device *dev, struct scatterlist *sglist, int nents, int di ** w/o this association, we wouldn't have coherent DMA! ** Access to the virtual address is what forces a two pass algorithm. */ - coalesced = sba_coalesce_chunks(ioc, sglist, nents); + coalesced = sba_coalesce_chunks(ioc, dev, sglist, nents); /* ** Program the I/O Pdir diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S index c36f43c..f5d3efb 100644 --- a/arch/ia64/kernel/entry.S +++ b/arch/ia64/kernel/entry.S @@ -1586,7 +1586,7 @@ sys_call_table: data8 sys_epoll_pwait // 1305 data8 sys_utimensat data8 sys_signalfd - data8 sys_timerfd + data8 sys_ni_syscall data8 sys_eventfd .org sys_call_table + 8*NR_syscalls // guard against failures to increase NR_syscalls diff --git a/arch/m32r/boot/compressed/m32r_sio.c b/arch/m32r/boot/compressed/m32r_sio.c index ee3c8be..01d877c 100644 --- a/arch/m32r/boot/compressed/m32r_sio.c +++ b/arch/m32r/boot/compressed/m32r_sio.c @@ -17,7 +17,7 @@ static int puts(const char *s) return 0; } -#if defined(CONFIG_PLAT_M32700UT_Alpha) || defined(CONFIG_PLAT_M32700UT) || defined(CONFIG_PLAT_OPSPUT) +#if defined(CONFIG_PLAT_M32700UT) || defined(CONFIG_PLAT_OPSPUT) #include <asm/m32r.h> #include <asm/io.h> @@ -52,7 +52,7 @@ static void putc(char c) } *BOOT_SIO0TXB = c; } -#else /* !(CONFIG_PLAT_M32700UT_Alpha) && !(CONFIG_PLAT_M32700UT) */ +#else /* !(CONFIG_PLAT_M32700UT) */ #if defined(CONFIG_PLAT_MAPPI2) #define SIO0STS (volatile unsigned short *)(0xa0efd000 + 14) #define SIO0TXB (volatile unsigned short *)(0xa0efd000 + 30) diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index 8236e42..ffabd01 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -577,20 +577,6 @@ config MAC_HID depends on INPUT_ADBHID default y -config MAC_ADBKEYCODES - bool "Support for ADB raw keycodes" - depends on INPUT_ADBHID - help - This provides support for sending raw ADB keycodes to console - devices. 
This is the default up to 2.4.0, but in future this may be - phased out in favor of generic Linux keycodes. If you say Y here, - you can dynamically switch via the - /proc/sys/dev/mac_hid/keyboard_sends_linux_keycodes - sysctl and with the "keyboard_sends_linux_keycodes=" kernel - argument. - - If unsure, say Y here. - config ADB_KEYBOARD bool "Support for ADB keyboard (old driver)" depends on MAC && !INPUT_ADBHID diff --git a/arch/m68k/Makefile b/arch/m68k/Makefile index 4a1bd44..2cba605 100644 --- a/arch/m68k/Makefile +++ b/arch/m68k/Makefile @@ -13,16 +13,15 @@ # Copyright (C) 1994 by Hamish Macdonald # -# test for cross compiling -COMPILE_ARCH = $(shell uname -m) - # override top level makefile AS += -m68020 LDFLAGS := -m m68kelf LDFLAGS_MODULE += -T $(srctree)/arch/m68k/kernel/module.lds -ifneq ($(COMPILE_ARCH),$(ARCH)) - # prefix for cross-compiling binaries - CROSS_COMPILE = m68k-linux-gnu- +ifneq ($(SUBARCH),$(ARCH)) + ifeq ($(CROSS_COMPILE),) + CROSS_COMPILE := $(call cc-cross-prefix, \ + m68k-linux-gnu- m68k-linux- m68k-unknown-linux-gnu-) + endif endif ifdef CONFIG_SUN3 diff --git a/arch/m68k/amiga/Makefile b/arch/m68k/amiga/Makefile index 8b41565..6a0d765 100644 --- a/arch/m68k/amiga/Makefile +++ b/arch/m68k/amiga/Makefile @@ -2,6 +2,6 @@ # Makefile for Linux arch/m68k/amiga source directory # -obj-y := config.o amiints.o cia.o chipram.o amisound.o amiga_ksyms.o +obj-y := config.o amiints.o cia.o chipram.o amisound.o obj-$(CONFIG_AMIGA_PCMCIA) += pcmcia.o diff --git a/arch/m68k/amiga/amiga_ksyms.c b/arch/m68k/amiga/amiga_ksyms.c deleted file mode 100644 index 7fdcf6b..0000000 --- a/arch/m68k/amiga/amiga_ksyms.c +++ /dev/null @@ -1,33 +0,0 @@ -#include <linux/module.h> -#include <linux/types.h> -#include <asm/ptrace.h> -#include <asm/amigahw.h> -#include <asm/amigaints.h> -#include <asm/amipcmcia.h> - -extern volatile u_short amiga_audio_min_period; -extern u_short amiga_audio_period; - -/* - * Add things here when you find the need for it. 
- */ -EXPORT_SYMBOL(amiga_model); -EXPORT_SYMBOL(amiga_chipset); -EXPORT_SYMBOL(amiga_hw_present); -EXPORT_SYMBOL(amiga_eclock); -EXPORT_SYMBOL(amiga_colorclock); -EXPORT_SYMBOL(amiga_chip_alloc); -EXPORT_SYMBOL(amiga_chip_free); -EXPORT_SYMBOL(amiga_chip_avail); -EXPORT_SYMBOL(amiga_chip_size); -EXPORT_SYMBOL(amiga_audio_period); -EXPORT_SYMBOL(amiga_audio_min_period); - -#ifdef CONFIG_AMIGA_PCMCIA - EXPORT_SYMBOL(pcmcia_reset); - EXPORT_SYMBOL(pcmcia_copy_tuple); - EXPORT_SYMBOL(pcmcia_program_voltage); - EXPORT_SYMBOL(pcmcia_access_speed); - EXPORT_SYMBOL(pcmcia_write_enable); - EXPORT_SYMBOL(pcmcia_write_disable); -#endif diff --git a/arch/m68k/amiga/amisound.c b/arch/m68k/amiga/amisound.c index 1f5bfb5..61e5c54 100644 --- a/arch/m68k/amiga/amisound.c +++ b/arch/m68k/amiga/amisound.c @@ -12,6 +12,7 @@ #include <linux/timer.h> #include <linux/init.h> #include <linux/string.h> +#include <linux/module.h> #include <asm/system.h> #include <asm/amigahw.h> @@ -21,7 +22,7 @@ static const signed char sine_data[] = { 0, 39, 75, 103, 121, 127, 121, 103, 75, 39, 0, -39, -75, -103, -121, -127, -121, -103, -75, -39 }; -#define DATA_SIZE (sizeof(sine_data)/sizeof(sine_data[0])) +#define DATA_SIZE ARRAY_SIZE(sine_data) #define custom amiga_custom @@ -31,6 +32,7 @@ static const signed char sine_data[] = { */ volatile unsigned short amiga_audio_min_period = 124; /* Default for pre-OCS */ +EXPORT_SYMBOL(amiga_audio_min_period); #define MAX_PERIOD (65535) @@ -40,6 +42,7 @@ volatile unsigned short amiga_audio_min_period = 124; /* Default for pre-OCS */ */ unsigned short amiga_audio_period = MAX_PERIOD; +EXPORT_SYMBOL(amiga_audio_period); static unsigned long clock_constant; diff --git a/arch/m68k/amiga/chipram.c b/arch/m68k/amiga/chipram.c index fa015d8..d10726f 100644 --- a/arch/m68k/amiga/chipram.c +++ b/arch/m68k/amiga/chipram.c @@ -13,10 +13,13 @@ #include <linux/ioport.h> #include <linux/slab.h> #include <linux/string.h> +#include <linux/module.h> + #include <asm/page.h> 
#include <asm/amigahw.h> unsigned long amiga_chip_size; +EXPORT_SYMBOL(amiga_chip_size); static struct resource chipram_res = { .name = "Chip RAM", .start = CHIP_PHYSADDR @@ -67,6 +70,7 @@ void *amiga_chip_alloc(unsigned long size, const char *name) #endif return (void *)ZTWO_VADDR(res->start); } +EXPORT_SYMBOL(amiga_chip_alloc); /* @@ -120,6 +124,7 @@ void amiga_chip_free(void *ptr) } printk("amiga_chip_free: trying to free nonexistent region at %p\n", ptr); } +EXPORT_SYMBOL(amiga_chip_free); unsigned long amiga_chip_avail(void) @@ -129,3 +134,5 @@ unsigned long amiga_chip_avail(void) #endif return chipavail; } +EXPORT_SYMBOL(amiga_chip_avail); + diff --git a/arch/m68k/amiga/config.c b/arch/m68k/amiga/config.c index 3574853..50f5daa 100644 --- a/arch/m68k/amiga/config.c +++ b/arch/m68k/amiga/config.c @@ -23,6 +23,7 @@ #include <linux/delay.h> #include <linux/interrupt.h> #include <linux/zorro.h> +#include <linux/module.h> #include <asm/bootinfo.h> #include <asm/setup.h> @@ -36,13 +37,24 @@ #include <asm/io.h> unsigned long amiga_model; +EXPORT_SYMBOL(amiga_model); + unsigned long amiga_eclock; +EXPORT_SYMBOL(amiga_eclock); + unsigned long amiga_masterclock; + unsigned long amiga_colorclock; +EXPORT_SYMBOL(amiga_colorclock); + unsigned long amiga_chipset; +EXPORT_SYMBOL(amiga_chipset); + unsigned char amiga_vblank; unsigned char amiga_psfreq; + struct amiga_hw_present amiga_hw_present; +EXPORT_SYMBOL(amiga_hw_present); static char s_a500[] __initdata = "A500"; static char s_a500p[] __initdata = "A500+"; diff --git a/arch/m68k/amiga/pcmcia.c b/arch/m68k/amiga/pcmcia.c index 186662c..7106f0c 100644 --- a/arch/m68k/amiga/pcmcia.c +++ b/arch/m68k/amiga/pcmcia.c @@ -15,6 +15,8 @@ #include <linux/types.h> #include <linux/jiffies.h> #include <linux/timer.h> +#include <linux/module.h> + #include <asm/amigayle.h> #include <asm/amipcmcia.h> @@ -30,6 +32,7 @@ void pcmcia_reset(void) while (time_before(jiffies, reset_start_time + 1*HZ/100)); b = gayle_reset; } 
+EXPORT_SYMBOL(pcmcia_reset); /* copy a tuple, including tuple header. return nb bytes copied */ @@ -61,6 +64,7 @@ int pcmcia_copy_tuple(unsigned char tuple_id, void *tuple, int max_len) return 0; } +EXPORT_SYMBOL(pcmcia_copy_tuple); void pcmcia_program_voltage(int voltage) { @@ -84,6 +88,7 @@ void pcmcia_program_voltage(int voltage) gayle.config = cfg_byte; } +EXPORT_SYMBOL(pcmcia_program_voltage); void pcmcia_access_speed(int speed) { @@ -101,13 +106,17 @@ void pcmcia_access_speed(int speed) cfg_byte = (cfg_byte & 0xf3) | s; gayle.config = cfg_byte; } +EXPORT_SYMBOL(pcmcia_access_speed); void pcmcia_write_enable(void) { gayle.cardstatus = GAYLE_CS_WR|GAYLE_CS_DA; } +EXPORT_SYMBOL(pcmcia_write_enable); void pcmcia_write_disable(void) { gayle.cardstatus = 0; } +EXPORT_SYMBOL(pcmcia_write_disable); + diff --git a/arch/m68k/atari/Makefile b/arch/m68k/atari/Makefile index 2cb8619..2cd905e 100644 --- a/arch/m68k/atari/Makefile +++ b/arch/m68k/atari/Makefile @@ -3,7 +3,7 @@ # obj-y := config.o time.o debug.o ataints.o stdma.o \ - atasound.o stram.o atari_ksyms.o + atasound.o stram.o ifeq ($(CONFIG_PCI),y) obj-$(CONFIG_HADES) += hades-pci.o diff --git a/arch/m68k/atari/ataints.c b/arch/m68k/atari/ataints.c index b85ca22..b45593a 100644 --- a/arch/m68k/atari/ataints.c +++ b/arch/m68k/atari/ataints.c @@ -40,6 +40,7 @@ #include <linux/kernel_stat.h> #include <linux/init.h> #include <linux/seq_file.h> +#include <linux/module.h> #include <asm/system.h> #include <asm/traps.h> @@ -446,6 +447,7 @@ unsigned long atari_register_vme_int(void) free_vme_vec_bitmap |= 1 << i; return VME_SOURCE_BASE + i; } +EXPORT_SYMBOL(atari_register_vme_int); void atari_unregister_vme_int(unsigned long irq) @@ -455,5 +457,6 @@ void atari_unregister_vme_int(unsigned long irq) free_vme_vec_bitmap &= ~(1 << irq); } } +EXPORT_SYMBOL(atari_unregister_vme_int); diff --git a/arch/m68k/atari/atari_ksyms.c b/arch/m68k/atari/atari_ksyms.c deleted file mode 100644 index a047571..0000000 --- 
a/arch/m68k/atari/atari_ksyms.c +++ /dev/null @@ -1,35 +0,0 @@ -#include <linux/module.h> - -#include <asm/ptrace.h> -#include <asm/traps.h> -#include <asm/atarihw.h> -#include <asm/atariints.h> -#include <asm/atarikb.h> -#include <asm/atari_joystick.h> -#include <asm/atari_stdma.h> -#include <asm/atari_stram.h> - -extern void atari_microwire_cmd( int cmd ); -extern int atari_MFP_init_done; -extern int atari_SCC_init_done; -extern int atari_SCC_reset_done; - -EXPORT_SYMBOL(atari_mch_cookie); -EXPORT_SYMBOL(atari_mch_type); -EXPORT_SYMBOL(atari_hw_present); -EXPORT_SYMBOL(atari_switches); -EXPORT_SYMBOL(atari_dont_touch_floppy_select); -EXPORT_SYMBOL(atari_register_vme_int); -EXPORT_SYMBOL(atari_unregister_vme_int); -EXPORT_SYMBOL(stdma_lock); -EXPORT_SYMBOL(stdma_release); -EXPORT_SYMBOL(stdma_others_waiting); -EXPORT_SYMBOL(stdma_islocked); -EXPORT_SYMBOL(atari_stram_alloc); -EXPORT_SYMBOL(atari_stram_free); - -EXPORT_SYMBOL(atari_MFP_init_done); -EXPORT_SYMBOL(atari_SCC_init_done); -EXPORT_SYMBOL(atari_SCC_reset_done); - -EXPORT_SYMBOL(atari_microwire_cmd); diff --git a/arch/m68k/atari/atasound.c b/arch/m68k/atari/atasound.c index ee04250..d266fe8 100644 --- a/arch/m68k/atari/atasound.c +++ b/arch/m68k/atari/atasound.c @@ -22,6 +22,7 @@ #include <linux/fcntl.h> #include <linux/errno.h> #include <linux/mm.h> +#include <linux/module.h> #include <asm/atarihw.h> #include <asm/system.h> @@ -43,6 +44,7 @@ void atari_microwire_cmd (int cmd) while( tt_microwire.mask != 0x7ff) ; } +EXPORT_SYMBOL(atari_microwire_cmd); /* PSG base frequency */ diff --git a/arch/m68k/atari/config.c b/arch/m68k/atari/config.c index e40e5dc..5945e15 100644 --- a/arch/m68k/atari/config.c +++ b/arch/m68k/atari/config.c @@ -31,6 +31,7 @@ #include <linux/delay.h> #include <linux/ioport.h> #include <linux/vt_kern.h> +#include <linux/module.h> #include <asm/bootinfo.h> #include <asm/setup.h> @@ -43,10 +44,20 @@ #include <asm/io.h> u_long atari_mch_cookie; +EXPORT_SYMBOL(atari_mch_cookie); + u_long 
atari_mch_type; +EXPORT_SYMBOL(atari_mch_type); + struct atari_hw_present atari_hw_present; +EXPORT_SYMBOL(atari_hw_present); + u_long atari_switches; +EXPORT_SYMBOL(atari_switches); + int atari_dont_touch_floppy_select; +EXPORT_SYMBOL(atari_dont_touch_floppy_select); + int atari_rtc_year_offset; /* local function prototypes */ diff --git a/arch/m68k/atari/debug.c b/arch/m68k/atari/debug.c index fbeed8c..043ddbc 100644 --- a/arch/m68k/atari/debug.c +++ b/arch/m68k/atari/debug.c @@ -15,17 +15,23 @@ #include <linux/console.h> #include <linux/init.h> #include <linux/delay.h> +#include <linux/module.h> #include <asm/atarihw.h> #include <asm/atariints.h> /* Flag that Modem1 port is already initialized and used */ int atari_MFP_init_done; +EXPORT_SYMBOL(atari_MFP_init_done); + /* Flag that Modem1 port is already initialized and used */ int atari_SCC_init_done; +EXPORT_SYMBOL(atari_SCC_init_done); + /* Can be set somewhere, if a SCC master reset has already be done and should * not be repeated; used by kgdb */ int atari_SCC_reset_done; +EXPORT_SYMBOL(atari_SCC_reset_done); static struct console atari_console_driver = { .name = "debug", diff --git a/arch/m68k/atari/hades-pci.c b/arch/m68k/atari/hades-pci.c index bee2b14..2bbabc0 100644 --- a/arch/m68k/atari/hades-pci.c +++ b/arch/m68k/atari/hades-pci.c @@ -376,8 +376,8 @@ struct pci_bus_info * __init init_hades_pci(void) */ bus = kzalloc(sizeof(struct pci_bus_info), GFP_KERNEL); - if (!bus) - return NULL; + if (unlikely(!bus)) + goto iounmap_base_virt; /* * Claim resources. The m68k has no separate I/O space, both @@ -385,43 +385,25 @@ struct pci_bus_info * __init init_hades_pci(void) * the I/O resources are requested in memory space as well. 
*/ - if (request_resource(&iomem_resource, &config_space) != 0) - { - kfree(bus); - return NULL; - } + if (unlikely(request_resource(&iomem_resource, &config_space) != 0)) + goto free_bus; - if (request_resource(&iomem_resource, &io_space) != 0) - { - release_resource(&config_space); - kfree(bus); - return NULL; - } + if (unlikely(request_resource(&iomem_resource, &io_space) != 0)) + goto release_config_space; bus->mem_space.start = HADES_MEM_BASE; bus->mem_space.end = HADES_MEM_BASE + HADES_MEM_SIZE - 1; bus->mem_space.name = pci_mem_name; #if 1 - if (request_resource(&iomem_resource, &bus->mem_space) != 0) - { - release_resource(&io_space); - release_resource(&config_space); - kfree(bus); - return NULL; - } + if (unlikely(request_resource(&iomem_resource, &bus->mem_space) != 0)) + goto release_io_space; #endif bus->io_space.start = pci_io_base_virt; bus->io_space.end = pci_io_base_virt + HADES_VIRT_IO_SIZE - 1; bus->io_space.name = pci_io_name; #if 1 - if (request_resource(&ioport_resource, &bus->io_space) != 0) - { - release_resource(&bus->mem_space); - release_resource(&io_space); - release_resource(&config_space); - kfree(bus); - return NULL; - } + if (unlikely(request_resource(&ioport_resource, &bus->io_space) != 0)) + goto release_bus_mem_space; #endif /* * Set hardware dependent functions. 
@@ -438,5 +420,21 @@ struct pci_bus_info * __init init_hades_pci(void) tt_mfp.active_edge &= ~0x27; return bus; + +release_bus_mem_space: + release_resource(&bus->mem_space); +release_io_space: + release_resource(&io_space); +release_config_space: + release_resource(&config_space); +free_bus: + kfree(bus); +iounmap_base_virt: + iounmap((void *)pci_io_base_virt); + + for (i = 0; i < N_SLOTS; i++) + iounmap((void *)pci_conf_base_virt[i]); + + return NULL; } #endif diff --git a/arch/m68k/atari/stdma.c b/arch/m68k/atari/stdma.c index ab3fd52..d1bd029 100644 --- a/arch/m68k/atari/stdma.c +++ b/arch/m68k/atari/stdma.c @@ -35,6 +35,7 @@ #include <linux/init.h> #include <linux/interrupt.h> #include <linux/wait.h> +#include <linux/module.h> #include <asm/atari_stdma.h> #include <asm/atariints.h> @@ -91,6 +92,7 @@ void stdma_lock(irq_handler_t handler, void *data) stdma_isr_data = data; local_irq_restore(flags); } +EXPORT_SYMBOL(stdma_lock); /* @@ -117,6 +119,7 @@ void stdma_release(void) local_irq_restore(flags); } +EXPORT_SYMBOL(stdma_release); /* @@ -134,6 +137,7 @@ int stdma_others_waiting(void) { return waitqueue_active(&stdma_wait); } +EXPORT_SYMBOL(stdma_others_waiting); /* @@ -155,6 +159,7 @@ int stdma_islocked(void) { return stdma_locked; } +EXPORT_SYMBOL(stdma_islocked); /* diff --git a/arch/m68k/atari/stram.c b/arch/m68k/atari/stram.c index bf4588c..8dda651 100644 --- a/arch/m68k/atari/stram.c +++ b/arch/m68k/atari/stram.c @@ -20,6 +20,7 @@ #include <linux/bootmem.h> #include <linux/mount.h> #include <linux/blkdev.h> +#include <linux/module.h> #include <asm/setup.h> #include <asm/machdep.h> @@ -208,6 +209,7 @@ void *atari_stram_alloc(long size, const char *owner) } return( addr ); } +EXPORT_SYMBOL(atari_stram_alloc); void atari_stram_free( void *addr ) @@ -237,6 +239,7 @@ void atari_stram_free( void *addr ) printk( KERN_ERR "atari_stram_free: cannot free block at %p " "(called from %p)\n", addr, __builtin_return_address(0) ); } +EXPORT_SYMBOL(atari_stram_free); /* 
------------------------------------------------------------------------ */ diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig index 15b80ab..ff9dffa 100644 --- a/arch/m68k/configs/mac_defconfig +++ b/arch/m68k/configs/mac_defconfig @@ -678,7 +678,6 @@ CONFIG_LOGO_MAC_CLUT224=y # CONFIG_MAC_SCC=y CONFIG_MAC_HID=y -CONFIG_MAC_ADBKEYCODES=y CONFIG_SERIAL_CONSOLE=y # diff --git a/arch/m68k/hp300/Makefile b/arch/m68k/hp300/Makefile index 288b9c6..96d4244 100644 --- a/arch/m68k/hp300/Makefile +++ b/arch/m68k/hp300/Makefile @@ -2,4 +2,4 @@ # Makefile for Linux arch/m68k/hp300 source directory # -obj-y := ksyms.o config.o time.o reboot.o +obj-y := config.o time.o reboot.o diff --git a/arch/m68k/hp300/ksyms.c b/arch/m68k/hp300/ksyms.c deleted file mode 100644 index 8202830..0000000 --- a/arch/m68k/hp300/ksyms.c +++ /dev/null @@ -1,9 +0,0 @@ -/* - * linux/arch/m68k/hp300/ksyms.c - * - * Copyright (C) 1998 Philip Blundell <philb@gnu.org> - * - * This file contains the HP300-specific kernel symbols. None yet. 
:-) - */ - -#include <linux/module.h> diff --git a/arch/m68k/kernel/entry.S b/arch/m68k/kernel/entry.S index 918f5db..6dfa3b3 100644 --- a/arch/m68k/kernel/entry.S +++ b/arch/m68k/kernel/entry.S @@ -742,7 +742,7 @@ sys_call_table: .long sys_epoll_pwait /* 315 */ .long sys_utimensat .long sys_signalfd - .long sys_timerfd + .long sys_ni_syscall .long sys_eventfd .long sys_fallocate /* 320 */ diff --git a/arch/m68k/mac/Makefile b/arch/m68k/mac/Makefile index 995a09d9..1d265ba 100644 --- a/arch/m68k/mac/Makefile +++ b/arch/m68k/mac/Makefile @@ -3,4 +3,4 @@ # obj-y := config.o bootparse.o macints.o iop.o via.o oss.o psc.o \ - baboon.o macboing.o debug.o misc.o mac_ksyms.o + baboon.o macboing.o debug.o misc.o diff --git a/arch/m68k/mac/config.c b/arch/m68k/mac/config.c index 01b468b..735a49b 100644 --- a/arch/m68k/mac/config.c +++ b/arch/m68k/mac/config.c @@ -58,8 +58,6 @@ extern struct mem_info m68k_memory[NUM_MEMINFO]; extern struct mem_info m68k_ramdisk; -extern char m68k_command_line[CL_SIZE]; - void *mac_env; /* Loaded by the boot asm */ /* The phys. video addr. 
- might be bogus on some machines */ diff --git a/arch/m68k/mac/mac_ksyms.c b/arch/m68k/mac/mac_ksyms.c deleted file mode 100644 index 6e37ceb..0000000 --- a/arch/m68k/mac/mac_ksyms.c +++ /dev/null @@ -1,8 +0,0 @@ -#include <linux/module.h> -#include <asm/ptrace.h> -#include <asm/traps.h> - -/* Says whether we're using A/UX interrupts or not */ -extern int via_alt_mapping; - -EXPORT_SYMBOL(via_alt_mapping); diff --git a/arch/m68k/mac/via.c b/arch/m68k/mac/via.c index 8df270e..fa485df 100644 --- a/arch/m68k/mac/via.c +++ b/arch/m68k/mac/via.c @@ -28,6 +28,7 @@ #include <linux/delay.h> #include <linux/init.h> #include <linux/ide.h> +#include <linux/module.h> #include <asm/bootinfo.h> #include <asm/macintosh.h> @@ -41,7 +42,9 @@ volatile __u8 *via1, *via2; /* See note in mac_via.h about how this is possibly not useful */ volatile long *via_memory_bogon=(long *)&via_memory_bogon; #endif -int rbv_present, via_alt_mapping; +int rbv_present; +int via_alt_mapping; +EXPORT_SYMBOL(via_alt_mapping); __u8 rbv_clear; /* diff --git a/arch/m68k/mvme16x/Makefile b/arch/m68k/mvme16x/Makefile index 950e82f..edb3f6e 100644 --- a/arch/m68k/mvme16x/Makefile +++ b/arch/m68k/mvme16x/Makefile @@ -2,4 +2,4 @@ # Makefile for Linux arch/m68k/mvme16x source directory # -obj-y := config.o rtc.o mvme16x_ksyms.o +obj-y := config.o rtc.o diff --git a/arch/m68k/mvme16x/config.c b/arch/m68k/mvme16x/config.c index daa7851..24cbc30 100644 --- a/arch/m68k/mvme16x/config.c +++ b/arch/m68k/mvme16x/config.c @@ -25,6 +25,7 @@ #include <linux/genhd.h> #include <linux/rtc.h> #include <linux/interrupt.h> +#include <linux/module.h> #include <asm/bootinfo.h> #include <asm/system.h> @@ -58,6 +59,7 @@ static irq_handler_t tick_handler; unsigned short mvme16x_config; +EXPORT_SYMBOL(mvme16x_config); int mvme16x_parse_bootinfo(const struct bi_record *bi) diff --git a/arch/m68k/mvme16x/mvme16x_ksyms.c b/arch/m68k/mvme16x/mvme16x_ksyms.c deleted file mode 100644 index 4a8a363..0000000 --- 
a/arch/m68k/mvme16x/mvme16x_ksyms.c +++ /dev/null @@ -1,6 +0,0 @@ -#include <linux/module.h> -#include <linux/types.h> -#include <asm/ptrace.h> -#include <asm/mvme16xhw.h> - -EXPORT_SYMBOL(mvme16x_config); diff --git a/arch/m68knommu/Kconfig.debug b/arch/m68knommu/Kconfig.debug index 9ff47bd..ed6d9a83 100644 --- a/arch/m68knommu/Kconfig.debug +++ b/arch/m68knommu/Kconfig.debug @@ -21,13 +21,6 @@ config BOOTPARAM_STRING default 'console=ttyS0,19200' depends on BOOTPARAM -config DUMPTOFLASH - bool "Panic/Dump to FLASH" - depends on COLDFIRE - help - Dump any panic of trap output into a flash memory segment - for later analysis. - config NO_KERNEL_MSG bool "Suppress Kernel BUG Messages" help diff --git a/arch/m68knommu/defconfig b/arch/m68knommu/defconfig index 5a0ecaa..6481130 100644 --- a/arch/m68knommu/defconfig +++ b/arch/m68knommu/defconfig @@ -597,7 +597,6 @@ CONFIG_MSDOS_PARTITION=y # CONFIG_FULLDEBUG is not set # CONFIG_HIGHPROFILE is not set # CONFIG_BOOTPARAM is not set -# CONFIG_DUMPTOFLASH is not set # CONFIG_NO_KERNEL_MSG is not set # CONFIG_BDM_DISABLE is not set diff --git a/arch/m68knommu/kernel/m68k_ksyms.c b/arch/m68knommu/kernel/m68k_ksyms.c index f795062..53fad14 100644 --- a/arch/m68knommu/kernel/m68k_ksyms.c +++ b/arch/m68knommu/kernel/m68k_ksyms.c @@ -24,14 +24,6 @@ extern int dump_fpu(struct pt_regs *, elf_fpregset_t *); EXPORT_SYMBOL(__ioremap); EXPORT_SYMBOL(iounmap); EXPORT_SYMBOL(dump_fpu); -EXPORT_SYMBOL(strnlen); -EXPORT_SYMBOL(strrchr); -EXPORT_SYMBOL(strstr); -EXPORT_SYMBOL(strchr); -EXPORT_SYMBOL(strcat); -EXPORT_SYMBOL(strlen); -EXPORT_SYMBOL(strcmp); -EXPORT_SYMBOL(strncmp); EXPORT_SYMBOL(ip_fast_csum); @@ -46,9 +38,6 @@ EXPORT_SYMBOL(csum_partial_copy_nocheck); it's OK to leave it out of version control. 
*/ EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(memset); -EXPORT_SYMBOL(memcmp); -EXPORT_SYMBOL(memscan); -EXPORT_SYMBOL(memmove); EXPORT_SYMBOL(__down_failed); EXPORT_SYMBOL(__down_failed_interruptible); diff --git a/arch/m68knommu/kernel/setup.c b/arch/m68knommu/kernel/setup.c index 332345d..81507c5 100644 --- a/arch/m68knommu/kernel/setup.c +++ b/arch/m68knommu/kernel/setup.c @@ -64,9 +64,6 @@ void (*mach_power_off)(void); #ifdef CONFIG_M68VZ328 #define CPU "MC68VZ328" #endif -#ifdef CONFIG_M68332 - #define CPU "MC68332" -#endif #ifdef CONFIG_M68360 #define CPU "MC68360" #endif diff --git a/arch/m68knommu/kernel/syscalltable.S b/arch/m68knommu/kernel/syscalltable.S index 9620093..1b02b88 100644 --- a/arch/m68knommu/kernel/syscalltable.S +++ b/arch/m68knommu/kernel/syscalltable.S @@ -336,7 +336,7 @@ ENTRY(sys_call_table) .long sys_epoll_pwait /* 315 */ .long sys_utimensat .long sys_signalfd - .long sys_timerfd + .long sys_ni_syscall .long sys_eventfd .long sys_fallocate /* 320 */ diff --git a/arch/mips/kernel/scall32-o32.S b/arch/mips/kernel/scall32-o32.S index 82480a1..f798139 100644 --- a/arch/mips/kernel/scall32-o32.S +++ b/arch/mips/kernel/scall32-o32.S @@ -660,7 +660,7 @@ einval: li v0, -EINVAL sys sys_ioprio_get 2 /* 4315 */ sys sys_utimensat 4 sys sys_signalfd 3 - sys sys_timerfd 4 + sys sys_ni_syscall 0 sys sys_eventfd 1 sys sys_fallocate 6 /* 4320 */ .endm diff --git a/arch/mips/kernel/scall64-64.S b/arch/mips/kernel/scall64-64.S index c2c1087..a626be6 100644 --- a/arch/mips/kernel/scall64-64.S +++ b/arch/mips/kernel/scall64-64.S @@ -475,7 +475,7 @@ sys_call_table: PTR sys_ioprio_get PTR sys_utimensat /* 5275 */ PTR sys_signalfd - PTR sys_timerfd + PTR sys_ni_syscall PTR sys_eventfd PTR sys_fallocate .size sys_call_table,.-sys_call_table diff --git a/arch/mips/kernel/scall64-n32.S b/arch/mips/kernel/scall64-n32.S index 01993ec..9d5bcaf 100644 --- a/arch/mips/kernel/scall64-n32.S +++ b/arch/mips/kernel/scall64-n32.S @@ -401,7 +401,7 @@ EXPORT(sysn32_call_table) 
PTR sys_ioprio_get PTR compat_sys_utimensat PTR compat_sys_signalfd /* 5280 */ - PTR compat_sys_timerfd + PTR sys_ni_syscall PTR sys_eventfd PTR sys_fallocate .size sysn32_call_table,.-sysn32_call_table diff --git a/arch/mips/kernel/scall64-o32.S b/arch/mips/kernel/scall64-o32.S index dd68afc..fd2019c 100644 --- a/arch/mips/kernel/scall64-o32.S +++ b/arch/mips/kernel/scall64-o32.S @@ -523,7 +523,7 @@ sys_call_table: PTR sys_ioprio_get /* 4315 */ PTR compat_sys_utimensat PTR compat_sys_signalfd - PTR compat_sys_timerfd + PTR sys_ni_syscall PTR sys_eventfd PTR sys32_fallocate /* 4320 */ .size sys_call_table,.-sys_call_table diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index b94d450..cf030b0 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -256,6 +256,9 @@ config IOMMU_VMERGE Most drivers don't have this problem; it is safe to say Y here. +config IOMMU_HELPER + def_bool PPC64 + config HOTPLUG_CPU bool "Support for enabling/disabling CPUs" depends on SMP && HOTPLUG && EXPERIMENTAL && (PPC_PSERIES || PPC_PMAC) diff --git a/arch/powerpc/kernel/dma_64.c b/arch/powerpc/kernel/dma_64.c index 8423907..3a317cb 100644 --- a/arch/powerpc/kernel/dma_64.c +++ b/arch/powerpc/kernel/dma_64.c @@ -31,8 +31,8 @@ static inline unsigned long device_to_mask(struct device *dev) static void *dma_iommu_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag) { - return iommu_alloc_coherent(dev->archdata.dma_data, size, dma_handle, - device_to_mask(dev), flag, + return iommu_alloc_coherent(dev, dev->archdata.dma_data, size, + dma_handle, device_to_mask(dev), flag, dev->archdata.numa_node); } @@ -52,7 +52,7 @@ static dma_addr_t dma_iommu_map_single(struct device *dev, void *vaddr, size_t size, enum dma_data_direction direction) { - return iommu_map_single(dev->archdata.dma_data, vaddr, size, + return iommu_map_single(dev, dev->archdata.dma_data, vaddr, size, device_to_mask(dev), direction); } @@ -68,7 +68,7 @@ static void 
dma_iommu_unmap_single(struct device *dev, dma_addr_t dma_handle, static int dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist, int nelems, enum dma_data_direction direction) { - return iommu_map_sg(dev->archdata.dma_data, sglist, nelems, + return iommu_map_sg(dev, sglist, nelems, device_to_mask(dev), direction); } diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index a3c406a..8f1f4e5 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -31,6 +31,7 @@ #include <linux/string.h> #include <linux/dma-mapping.h> #include <linux/bitops.h> +#include <linux/iommu-helper.h> #include <asm/io.h> #include <asm/prom.h> #include <asm/iommu.h> @@ -81,17 +82,19 @@ static int __init setup_iommu(char *str) __setup("protect4gb=", setup_protect4gb); __setup("iommu=", setup_iommu); -static unsigned long iommu_range_alloc(struct iommu_table *tbl, +static unsigned long iommu_range_alloc(struct device *dev, + struct iommu_table *tbl, unsigned long npages, unsigned long *handle, unsigned long mask, unsigned int align_order) { - unsigned long n, end, i, start; + unsigned long n, end, start; unsigned long limit; int largealloc = npages > 15; int pass = 0; unsigned long align_mask; + unsigned long boundary_size; align_mask = 0xffffffffffffffffl >> (64 - align_order); @@ -136,14 +139,17 @@ static unsigned long iommu_range_alloc(struct iommu_table *tbl, start &= mask; } - n = find_next_zero_bit(tbl->it_map, limit, start); - - /* Align allocation */ - n = (n + align_mask) & ~align_mask; - - end = n + npages; + if (dev) + boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, + 1 << IOMMU_PAGE_SHIFT); + else + boundary_size = ALIGN(1UL << 32, 1 << IOMMU_PAGE_SHIFT); + /* 4GB boundary for iseries_hv_alloc and iseries_hv_map */ - if (unlikely(end >= limit)) { + n = iommu_area_alloc(tbl->it_map, limit, start, npages, + tbl->it_offset, boundary_size >> IOMMU_PAGE_SHIFT, + align_mask); + if (n == -1) { if (likely(pass < 2)) { /* First 
failure, just rescan the half of the table. * Second failure, rescan the other half of the table. @@ -158,14 +164,7 @@ static unsigned long iommu_range_alloc(struct iommu_table *tbl, } } - for (i = n; i < end; i++) - if (test_bit(i, tbl->it_map)) { - start = i+1; - goto again; - } - - for (i = n; i < end; i++) - __set_bit(i, tbl->it_map); + end = n + npages; /* Bump the hint to a new block for small allocs. */ if (largealloc) { @@ -184,16 +183,17 @@ static unsigned long iommu_range_alloc(struct iommu_table *tbl, return n; } -static dma_addr_t iommu_alloc(struct iommu_table *tbl, void *page, - unsigned int npages, enum dma_data_direction direction, - unsigned long mask, unsigned int align_order) +static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, + void *page, unsigned int npages, + enum dma_data_direction direction, + unsigned long mask, unsigned int align_order) { unsigned long entry, flags; dma_addr_t ret = DMA_ERROR_CODE; spin_lock_irqsave(&(tbl->it_lock), flags); - entry = iommu_range_alloc(tbl, npages, NULL, mask, align_order); + entry = iommu_range_alloc(dev, tbl, npages, NULL, mask, align_order); if (unlikely(entry == DMA_ERROR_CODE)) { spin_unlock_irqrestore(&(tbl->it_lock), flags); @@ -224,7 +224,6 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, unsigned int npages) { unsigned long entry, free_entry; - unsigned long i; entry = dma_addr >> IOMMU_PAGE_SHIFT; free_entry = entry - tbl->it_offset; @@ -246,9 +245,7 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, } ppc_md.tce_free(tbl, entry, npages); - - for (i = 0; i < npages; i++) - __clear_bit(free_entry+i, tbl->it_map); + iommu_area_free(tbl->it_map, free_entry, npages); } static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, @@ -270,16 +267,18 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, spin_unlock_irqrestore(&(tbl->it_lock), flags); } -int iommu_map_sg(struct iommu_table *tbl, struct 
scatterlist *sglist, +int iommu_map_sg(struct device *dev, struct scatterlist *sglist, int nelems, unsigned long mask, enum dma_data_direction direction) { + struct iommu_table *tbl = dev->archdata.dma_data; dma_addr_t dma_next = 0, dma_addr; unsigned long flags; struct scatterlist *s, *outs, *segstart; int outcount, incount, i; unsigned int align; unsigned long handle; + unsigned int max_seg_size; BUG_ON(direction == DMA_NONE); @@ -298,6 +297,7 @@ int iommu_map_sg(struct iommu_table *tbl, struct scatterlist *sglist, spin_lock_irqsave(&(tbl->it_lock), flags); + max_seg_size = dma_get_max_seg_size(dev); for_each_sg(sglist, s, nelems, i) { unsigned long vaddr, npages, entry, slen; @@ -314,7 +314,7 @@ int iommu_map_sg(struct iommu_table *tbl, struct scatterlist *sglist, if (IOMMU_PAGE_SHIFT < PAGE_SHIFT && slen >= PAGE_SIZE && (vaddr & ~PAGE_MASK) == 0) align = PAGE_SHIFT - IOMMU_PAGE_SHIFT; - entry = iommu_range_alloc(tbl, npages, &handle, + entry = iommu_range_alloc(dev, tbl, npages, &handle, mask >> IOMMU_PAGE_SHIFT, align); DBG(" - vaddr: %lx, size: %lx\n", vaddr, slen); @@ -344,7 +344,8 @@ int iommu_map_sg(struct iommu_table *tbl, struct scatterlist *sglist, /* We cannot merge if: * - allocated dma_addr isn't contiguous to previous allocation */ - if (novmerge || (dma_addr != dma_next)) { + if (novmerge || (dma_addr != dma_next) || + (outs->dma_length + s->length > max_seg_size)) { /* Can't merge: create a new segment */ segstart = s; outcount++; @@ -452,9 +453,6 @@ void iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist, struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid) { unsigned long sz; - unsigned long start_index, end_index; - unsigned long entries_per_4g; - unsigned long index; static int welcomed = 0; struct page *page; @@ -476,6 +474,7 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid) #ifdef CONFIG_CRASH_DUMP if (ppc_md.tce_get) { + unsigned long index; unsigned long tceval; unsigned long 
tcecount = 0; @@ -506,23 +505,6 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid) ppc_md.tce_free(tbl, tbl->it_offset, tbl->it_size); #endif - /* - * DMA cannot cross 4 GB boundary. Mark last entry of each 4 - * GB chunk as reserved. - */ - if (protect4gb) { - entries_per_4g = 0x100000000l >> IOMMU_PAGE_SHIFT; - - /* Mark the last bit before a 4GB boundary as used */ - start_index = tbl->it_offset | (entries_per_4g - 1); - start_index -= tbl->it_offset; - - end_index = tbl->it_size; - - for (index = start_index; index < end_index - 1; index += entries_per_4g) - __set_bit(index, tbl->it_map); - } - if (!welcomed) { printk(KERN_INFO "IOMMU table initialized, virtual merging %s\n", novmerge ? "disabled" : "enabled"); @@ -570,9 +552,9 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name) * need not be page aligned, the dma_addr_t returned will point to the same * byte within the page as vaddr. */ -dma_addr_t iommu_map_single(struct iommu_table *tbl, void *vaddr, - size_t size, unsigned long mask, - enum dma_data_direction direction) +dma_addr_t iommu_map_single(struct device *dev, struct iommu_table *tbl, + void *vaddr, size_t size, unsigned long mask, + enum dma_data_direction direction) { dma_addr_t dma_handle = DMA_ERROR_CODE; unsigned long uaddr; @@ -589,7 +571,7 @@ dma_addr_t iommu_map_single(struct iommu_table *tbl, void *vaddr, ((unsigned long)vaddr & ~PAGE_MASK) == 0) align = PAGE_SHIFT - IOMMU_PAGE_SHIFT; - dma_handle = iommu_alloc(tbl, vaddr, npages, direction, + dma_handle = iommu_alloc(dev, tbl, vaddr, npages, direction, mask >> IOMMU_PAGE_SHIFT, align); if (dma_handle == DMA_ERROR_CODE) { if (printk_ratelimit()) { @@ -621,8 +603,9 @@ void iommu_unmap_single(struct iommu_table *tbl, dma_addr_t dma_handle, * Returns the virtual address of the buffer and sets dma_handle * to the dma address (mapping) of the first page. 
*/ -void *iommu_alloc_coherent(struct iommu_table *tbl, size_t size, - dma_addr_t *dma_handle, unsigned long mask, gfp_t flag, int node) +void *iommu_alloc_coherent(struct device *dev, struct iommu_table *tbl, + size_t size, dma_addr_t *dma_handle, + unsigned long mask, gfp_t flag, int node) { void *ret = NULL; dma_addr_t mapping; @@ -656,7 +639,7 @@ void *iommu_alloc_coherent(struct iommu_table *tbl, size_t size, /* Set up tces to cover the allocated range */ nio_pages = size >> IOMMU_PAGE_SHIFT; io_order = get_iommu_order(size); - mapping = iommu_alloc(tbl, ret, nio_pages, DMA_BIDIRECTIONAL, + mapping = iommu_alloc(dev, tbl, ret, nio_pages, DMA_BIDIRECTIONAL, mask >> IOMMU_PAGE_SHIFT, io_order); if (mapping == DMA_ERROR_CODE) { free_pages((unsigned long)ret, order); diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index 6448872..f80f90c 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -86,7 +86,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) return ret; } -void pgd_free(pgd_t *pgd) +void pgd_free(struct mm_struct *mm, pgd_t *pgd) { free_pages((unsigned long)pgd, PGDIR_ORDER); } @@ -123,7 +123,7 @@ struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) return ptepage; } -void pte_free_kernel(pte_t *pte) +void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { #ifdef CONFIG_SMP hash_page_sync(); @@ -131,7 +131,7 @@ void pte_free_kernel(pte_t *pte) free_page((unsigned long)pte); } -void pte_free(struct page *ptepage) +void pte_free(struct mm_struct *mm, struct page *ptepage) { #ifdef CONFIG_SMP hash_page_sync(); diff --git a/arch/powerpc/platforms/iseries/iommu.c b/arch/powerpc/platforms/iseries/iommu.c index 6a0c6f6..11fa3c7 100644 --- a/arch/powerpc/platforms/iseries/iommu.c +++ b/arch/powerpc/platforms/iseries/iommu.c @@ -199,7 +199,7 @@ static struct iommu_table vio_iommu_table; void *iseries_hv_alloc(size_t size, dma_addr_t *dma_handle, gfp_t flag) { - return 
iommu_alloc_coherent(&vio_iommu_table, size, dma_handle, + return iommu_alloc_coherent(NULL, &vio_iommu_table, size, dma_handle, DMA_32BIT_MASK, flag, -1); } EXPORT_SYMBOL_GPL(iseries_hv_alloc); @@ -213,7 +213,7 @@ EXPORT_SYMBOL_GPL(iseries_hv_free); dma_addr_t iseries_hv_map(void *vaddr, size_t size, enum dma_data_direction direction) { - return iommu_map_single(&vio_iommu_table, vaddr, size, + return iommu_map_single(NULL, &vio_iommu_table, vaddr, size, DMA_32BIT_MASK, direction); } diff --git a/arch/ppc/mm/pgtable.c b/arch/ppc/mm/pgtable.c index fadacfd..409fcaa 100644 --- a/arch/ppc/mm/pgtable.c +++ b/arch/ppc/mm/pgtable.c @@ -74,7 +74,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) return ret; } -void pgd_free(pgd_t *pgd) +void pgd_free(struct mm_struct *mm, pgd_t *pgd) { free_pages((unsigned long)pgd, PGDIR_ORDER); } @@ -111,7 +111,7 @@ struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) return ptepage; } -void pte_free_kernel(pte_t *pte) +void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { #ifdef CONFIG_SMP hash_page_sync(); @@ -119,7 +119,7 @@ void pte_free_kernel(pte_t *pte) free_page((unsigned long)pte); } -void pte_free(struct page *ptepage) +void pte_free(struct mm_struct *mm, struct page *ptepage) { #ifdef CONFIG_SMP hash_page_sync(); diff --git a/arch/s390/kernel/compat_wrapper.S b/arch/s390/kernel/compat_wrapper.S index 6ee1bed..062c3d4 100644 --- a/arch/s390/kernel/compat_wrapper.S +++ b/arch/s390/kernel/compat_wrapper.S @@ -1698,14 +1698,6 @@ compat_sys_signalfd_wrapper: llgfr %r4,%r4 # compat_size_t jg compat_sys_signalfd - .globl compat_sys_timerfd_wrapper -compat_sys_timerfd_wrapper: - lgfr %r2,%r2 # int - lgfr %r3,%r3 # int - lgfr %r4,%r4 # int - llgtr %r5,%r5 # struct compat_itimerspec * - jg compat_sys_timerfd - .globl sys_eventfd_wrapper sys_eventfd_wrapper: llgfr %r2,%r2 # unsigned int diff --git a/arch/s390/kernel/syscalls.S b/arch/s390/kernel/syscalls.S index 9e26ed9..25eac78 100644 --- a/arch/s390/kernel/syscalls.S 
+++ b/arch/s390/kernel/syscalls.S @@ -325,5 +325,5 @@ SYSCALL(sys_utimes,sys_utimes,compat_sys_utimes_wrapper) SYSCALL(s390_fallocate,sys_fallocate,sys_fallocate_wrapper) SYSCALL(sys_utimensat,sys_utimensat,compat_sys_utimensat_wrapper) /* 315 */ SYSCALL(sys_signalfd,sys_signalfd,compat_sys_signalfd_wrapper) -SYSCALL(sys_timerfd,sys_timerfd,compat_sys_timerfd_wrapper) +NI_SYSCALL /* 317 old sys_timer_fd */ SYSCALL(sys_eventfd,sys_eventfd,sys_eventfd_wrapper) diff --git a/arch/sparc/kernel/systbls.S b/arch/sparc/kernel/systbls.S index 5572284..ee010f4 100644 --- a/arch/sparc/kernel/systbls.S +++ b/arch/sparc/kernel/systbls.S @@ -79,7 +79,7 @@ sys_call_table: /*295*/ .long sys_fchmodat, sys_faccessat, sys_pselect6, sys_ppoll, sys_unshare /*300*/ .long sys_set_robust_list, sys_get_robust_list, sys_migrate_pages, sys_mbind, sys_get_mempolicy /*305*/ .long sys_set_mempolicy, sys_kexec_load, sys_move_pages, sys_getcpu, sys_epoll_pwait -/*310*/ .long sys_utimensat, sys_signalfd, sys_timerfd, sys_eventfd, sys_fallocate +/*310*/ .long sys_utimensat, sys_signalfd, sys_ni_syscall, sys_eventfd, sys_fallocate #ifdef CONFIG_SUNOS_EMUL /* Now the SunOS syscall table. */ diff --git a/arch/sparc64/kernel/iommu.c b/arch/sparc64/kernel/iommu.c index 070a484..4b9115a 100644 --- a/arch/sparc64/kernel/iommu.c +++ b/arch/sparc64/kernel/iommu.c @@ -580,7 +580,7 @@ static int dma_4u_map_sg(struct device *dev, struct scatterlist *sglist, /* Step 1: Prepare scatter list. */ - npages = prepare_sg(sglist, nelems); + npages = prepare_sg(dev, sglist, nelems); /* Step 2: Allocate a cluster and context, if necessary. */ diff --git a/arch/sparc64/kernel/iommu_common.c b/arch/sparc64/kernel/iommu_common.c index efd5dff..72a4acf 100644 --- a/arch/sparc64/kernel/iommu_common.c +++ b/arch/sparc64/kernel/iommu_common.c @@ -4,6 +4,7 @@ * Copyright (C) 1999 David S. 
Miller (davem@redhat.com) */ +#include <linux/dma-mapping.h> #include "iommu_common.h" /* You are _strongly_ advised to enable the following debugging code @@ -201,21 +202,24 @@ void verify_sglist(struct scatterlist *sglist, int nents, iopte_t *iopte, int np } #endif -unsigned long prepare_sg(struct scatterlist *sg, int nents) +unsigned long prepare_sg(struct device *dev, struct scatterlist *sg, int nents) { struct scatterlist *dma_sg = sg; unsigned long prev; u32 dent_addr, dent_len; + unsigned int max_seg_size; prev = (unsigned long) sg_virt(sg); prev += (unsigned long) (dent_len = sg->length); dent_addr = (u32) ((unsigned long)(sg_virt(sg)) & (IO_PAGE_SIZE - 1UL)); + max_seg_size = dma_get_max_seg_size(dev); while (--nents) { unsigned long addr; sg = sg_next(sg); addr = (unsigned long) sg_virt(sg); - if (! VCONTIG(prev, addr)) { + if (! VCONTIG(prev, addr) || + dent_len + sg->length > max_seg_size) { dma_sg->dma_address = dent_addr; dma_sg->dma_length = dent_len; dma_sg = sg_next(dma_sg); diff --git a/arch/sparc64/kernel/iommu_common.h b/arch/sparc64/kernel/iommu_common.h index 75b5a58..a90d046e 100644 --- a/arch/sparc64/kernel/iommu_common.h +++ b/arch/sparc64/kernel/iommu_common.h @@ -9,6 +9,7 @@ #include <linux/sched.h> #include <linux/mm.h> #include <linux/scatterlist.h> +#include <linux/device.h> #include <asm/iommu.h> #include <asm/scatterlist.h> @@ -46,4 +47,4 @@ extern void verify_sglist(struct scatterlist *sg, int nents, iopte_t *iopte, int #define VCONTIG(__X, __Y) (((__X) == (__Y)) || \ (((__X) | (__Y)) << (64UL - PAGE_SHIFT)) == 0UL) -extern unsigned long prepare_sg(struct scatterlist *sg, int nents); +extern unsigned long prepare_sg(struct device *dev, struct scatterlist *sg, int nents); diff --git a/arch/sparc64/kernel/pci_sun4v.c b/arch/sparc64/kernel/pci_sun4v.c index 1aa8e04..67d6dce 100644 --- a/arch/sparc64/kernel/pci_sun4v.c +++ b/arch/sparc64/kernel/pci_sun4v.c @@ -490,7 +490,7 @@ static int dma_4v_map_sg(struct device *dev, struct 
scatterlist *sglist, goto bad; /* Step 1: Prepare scatter list. */ - npages = prepare_sg(sglist, nelems); + npages = prepare_sg(dev, sglist, nelems); /* Step 2: Allocate a cluster and context, if necessary. */ spin_lock_irqsave(&iommu->lock, flags); diff --git a/arch/sparc64/kernel/systbls.S b/arch/sparc64/kernel/systbls.S index 06d1090..b805890 100644 --- a/arch/sparc64/kernel/systbls.S +++ b/arch/sparc64/kernel/systbls.S @@ -80,7 +80,7 @@ sys_call_table32: .word sys_fchmodat, sys_faccessat, compat_sys_pselect6, compat_sys_ppoll, sys_unshare /*300*/ .word compat_sys_set_robust_list, compat_sys_get_robust_list, compat_sys_migrate_pages, compat_sys_mbind, compat_sys_get_mempolicy .word compat_sys_set_mempolicy, compat_sys_kexec_load, compat_sys_move_pages, sys_getcpu, compat_sys_epoll_pwait -/*310*/ .word compat_sys_utimensat, compat_sys_signalfd, compat_sys_timerfd, sys_eventfd, compat_sys_fallocate +/*310*/ .word compat_sys_utimensat, compat_sys_signalfd, sys_ni_syscall, sys_eventfd, compat_sys_fallocate #endif /* CONFIG_COMPAT */ @@ -152,7 +152,7 @@ sys_call_table: .word sys_fchmodat, sys_faccessat, sys_pselect6, sys_ppoll, sys_unshare /*300*/ .word sys_set_robust_list, sys_get_robust_list, sys_migrate_pages, sys_mbind, sys_get_mempolicy .word sys_set_mempolicy, sys_kexec_load, sys_move_pages, sys_getcpu, sys_epoll_pwait -/*310*/ .word sys_utimensat, sys_signalfd, sys_timerfd, sys_eventfd, sys_fallocate +/*310*/ .word sys_utimensat, sys_signalfd, sys_ni_syscall, sys_eventfd, sys_fallocate #if defined(CONFIG_SUNOS_EMUL) || defined(CONFIG_SOLARIS_EMUL) || \ defined(CONFIG_SOLARIS_EMUL_MODULE) diff --git a/arch/um/Kconfig b/arch/um/Kconfig index 55945db..99e51d0 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -68,6 +68,10 @@ config IRQ_RELEASE_METHOD bool default y +config HZ + int + default 100 + menu "UML-specific options" config STATIC_LINK @@ -95,23 +99,6 @@ config LD_SCRIPT_DYN default y depends on !LD_SCRIPT_STATIC -config NET - bool "Networking support" 
- help - Unless you really know what you are doing, you should say Y here. - The reason is that some programs need kernel networking support even - when running on a stand-alone machine that isn't connected to any - other computer. If you are upgrading from an older kernel, you - should consider updating your networking tools too because changes - in the kernel and the tools often go hand in hand. The tools are - contained in the package net-tools, the location and version number - of which are given in <file:Documentation/Changes>. - - For a general introduction to Linux networking, it is highly - recommended to read the NET-HOWTO, available from - <http://www.tldp.org/docs.html#howto>. - - source "fs/Kconfig.binfmt" config HOSTFS @@ -145,7 +132,7 @@ config HPPFS by removing or changing anything in /proc which gives away the identity of a UML. - See <http://user-mode-linux.sf.net/hppfs.html> for more information. + See <http://user-mode-linux.sf.net/old/hppfs.html> for more information. You only need this if you are setting up a UML honeypot. Otherwise, it is safe to say 'N' here. @@ -189,8 +176,7 @@ config MAGIC_SYSRQ config SMP bool "Symmetric multi-processing support (EXPERIMENTAL)" default n - #SMP_BROKEN is for x86_64. - depends on EXPERIMENTAL && (!SMP_BROKEN || (BROKEN && SMP_BROKEN)) + depends on BROKEN help This option enables UML SMP support. It is NOT related to having a real SMP box. Not directly, at least. diff --git a/arch/um/Kconfig.char b/arch/um/Kconfig.char index 9a78d35..3a4b396 100644 --- a/arch/um/Kconfig.char +++ b/arch/um/Kconfig.char @@ -18,7 +18,7 @@ config SSL lines on the UML that are usually made to show up on the host as ttys or ptys. - See <http://user-mode-linux.sourceforge.net/input.html> for more + See <http://user-mode-linux.sourceforge.net/old/input.html> for more information and command line examples of how to use this facility. Unless you have a specific reason for disabling this, say Y. 
diff --git a/arch/um/Kconfig.debug b/arch/um/Kconfig.debug index 1f6462f..8fce5e5 100644 --- a/arch/um/Kconfig.debug +++ b/arch/um/Kconfig.debug @@ -4,12 +4,12 @@ source "lib/Kconfig.debug" config GPROF bool "Enable gprof support" - depends on DEBUG_INFO + depends on DEBUG_INFO && FRAME_POINTER help This allows profiling of a User-Mode Linux kernel with the gprof utility. - See <http://user-mode-linux.sourceforge.net/gprof.html> for more + See <http://user-mode-linux.sourceforge.net/old/gprof.html> for more details. If you're involved in UML kernel development and want to use gprof, @@ -22,7 +22,7 @@ config GCOV This option allows developers to retrieve coverage data from a UML session. - See <http://user-mode-linux.sourceforge.net/gprof.html> for more + See <http://user-mode-linux.sourceforge.net/old/gprof.html> for more details. If you're involved in UML kernel development and want to use gcov, diff --git a/arch/um/Kconfig.net b/arch/um/Kconfig.net index 66e5002..9e9a4aa 100644 --- a/arch/um/Kconfig.net +++ b/arch/um/Kconfig.net @@ -14,7 +14,7 @@ config UML_NET For more information, including explanations of the networking and sample configurations, see - <http://user-mode-linux.sourceforge.net/networking.html>. + <http://user-mode-linux.sourceforge.net/old/networking.html>. If you'd like to be able to enable networking in the User-Mode linux environment, say Y; otherwise say N. Note that you must @@ -38,7 +38,7 @@ config UML_NET_ETHERTAP CONFIG_NETLINK_DEV configured as Y or M. For more information, see - <http://user-mode-linux.sourceforge.net/networking.html> That site + <http://user-mode-linux.sourceforge.net/old/networking.html> That site has examples of the UML command line to use to enable Ethertap networking. @@ -72,7 +72,7 @@ config UML_NET_SLIP To use this, your host must support slip devices. For more information, see - <http://user-mode-linux.sourceforge.net/networking.html>. That site + <http://user-mode-linux.sourceforge.net/old/networking.html>. 
has examples of the UML command line to use to enable slip networking, and details of a few quirks with it. @@ -96,7 +96,7 @@ config UML_NET_DAEMON networking daemon on the host. For more information, see - <http://user-mode-linux.sourceforge.net/networking.html> That site + <http://user-mode-linux.sourceforge.net/old/networking.html> That site has examples of the UML command line to use to enable Daemon networking. @@ -144,7 +144,7 @@ config UML_NET_MCAST To use this, your host kernel(s) must support IP Multicasting. For more information, see - <http://user-mode-linux.sourceforge.net/networking.html> That site + <http://user-mode-linux.sourceforge.net/old/networking.html> That site has examples of the UML command line to use to enable Multicast networking, and notes about the security of this approach. @@ -165,7 +165,7 @@ config UML_NET_PCAP installed in order to build the pcap transport into UML. For more information, see - <http://user-mode-linux.sourceforge.net/networking.html> That site + <http://user-mode-linux.sourceforge.net/old/networking.html> That site has examples of the UML command line to use to enable this option. If you intend to use UML as a network monitor for the host, say diff --git a/arch/um/Makefile b/arch/um/Makefile index ba6813a..cb4af9b 100644 --- a/arch/um/Makefile +++ b/arch/um/Makefile @@ -49,7 +49,7 @@ SYS_DIR := $(ARCH_DIR)/include/sysdep-$(SUBARCH) # # These apply to USER_CFLAGS to. 
-KBUILD_CFLAGS += $(CFLAGS-y) -D__arch_um__ -DSUBARCH=\"$(SUBARCH)\" \ +KBUILD_CFLAGS += $(CFLAGS) $(CFLAGS-y) -D__arch_um__ -DSUBARCH=\"$(SUBARCH)\" \ $(ARCH_INCLUDE) $(MODE_INCLUDE) -Dvmap=kernel_vmap \ -Din6addr_loopback=kernel_in6addr_loopback \ -Din6addr_any=kernel_in6addr_any @@ -58,7 +58,7 @@ KBUILD_AFLAGS += $(ARCH_INCLUDE) USER_CFLAGS = $(patsubst $(KERNEL_DEFINES),,$(patsubst -D__KERNEL__,,\ $(patsubst -I%,,$(KBUILD_CFLAGS)))) $(ARCH_INCLUDE) $(MODE_INCLUDE) \ - -D_FILE_OFFSET_BITS=64 + $(filter -I%,$(CFLAGS)) -D_FILE_OFFSET_BITS=64 include $(srctree)/$(ARCH_DIR)/Makefile-$(SUBARCH) @@ -130,7 +130,9 @@ CPPFLAGS_vmlinux.lds = -U$(SUBARCH) -DSTART=$(START) -DELF_ARCH=$(ELF_ARCH) \ # The wrappers will select whether using "malloc" or the kernel allocator. LINK_WRAPS = -Wl,--wrap,malloc -Wl,--wrap,free -Wl,--wrap,calloc -CFLAGS_vmlinux := $(LINK-y) $(LINK_WRAPS) +LD_FLAGS_CMDLINE = $(foreach opt,$(LDFLAGS),-Wl,$(opt)) + +CFLAGS_vmlinux := $(LINK-y) $(LINK_WRAPS) $(LD_FLAGS_CMDLINE) define cmd_vmlinux__ $(CC) $(CFLAGS_vmlinux) -o $@ \ -Wl,-T,$(vmlinux-lds) $(vmlinux-init) \ @@ -158,7 +160,7 @@ ifneq ($(KBUILD_SRC),) $(Q)mkdir -p $(objtree)/include/asm-um $(Q)ln -fsn $(srctree)/include/asm-um/$(basename $(notdir $@))-$(SUBARCH)$(suffix $@) $@ else - $(Q)cd $(TOPDIR)/$(dir $@) ; \ + $(Q)cd $(srctree)/$(dir $@) ; \ ln -sf $(basename $(notdir $@))-$(SUBARCH)$(suffix $@) $(notdir $@) endif @@ -168,7 +170,7 @@ ifneq ($(KBUILD_SRC),) $(Q)mkdir -p $(objtree)/include/asm-um $(Q)ln -fsn $(srctree)/include/asm-$(HEADER_ARCH) include/asm-um/arch else - $(Q)cd $(TOPDIR)/include/asm-um && ln -fsn ../asm-$(HEADER_ARCH) arch + $(Q)cd $(srctree)/include/asm-um && ln -fsn ../asm-$(HEADER_ARCH) arch endif $(objtree)/$(ARCH_DIR)/include: diff --git a/arch/um/Makefile-tt b/arch/um/Makefile-tt deleted file mode 100644 index 03f7b10..0000000 --- a/arch/um/Makefile-tt +++ /dev/null @@ -1,5 +0,0 @@ -# -# Copyright (C) 2002 Jeff Dike (jdike@karaya.com) -# Licensed under the GPL -# - 
diff --git a/arch/um/defconfig b/arch/um/defconfig index f609ede..86db286 100644 --- a/arch/um/defconfig +++ b/arch/um/defconfig @@ -77,7 +77,7 @@ CONFIG_LD_SCRIPT_DYN=y CONFIG_NET=y CONFIG_BINFMT_ELF=y CONFIG_BINFMT_MISC=m -# CONFIG_HOSTFS is not set +CONFIG_HOSTFS=y # CONFIG_HPPFS is not set CONFIG_MCONSOLE=y CONFIG_MAGIC_SYSRQ=y @@ -188,7 +188,7 @@ CONFIG_CON_CHAN="xterm" CONFIG_SSL_CHAN="pts" CONFIG_UNIX98_PTYS=y CONFIG_LEGACY_PTYS=y -CONFIG_LEGACY_PTY_COUNT=256 +CONFIG_LEGACY_PTY_COUNT=32 # CONFIG_WATCHDOG is not set CONFIG_UML_SOUND=m CONFIG_SOUND=m @@ -508,7 +508,7 @@ CONFIG_DEBUG_KERNEL=y CONFIG_LOG_BUF_SHIFT=14 CONFIG_DETECT_SOFTLOCKUP=y # CONFIG_SCHEDSTATS is not set -CONFIG_DEBUG_SLAB=y +# CONFIG_DEBUG_SLAB is not set # CONFIG_DEBUG_SLAB_LEAK is not set # CONFIG_DEBUG_MUTEXES is not set # CONFIG_DEBUG_SPINLOCK is not set diff --git a/arch/um/drivers/line.c b/arch/um/drivers/line.c index 83bf15a..2c898c4 100644 --- a/arch/um/drivers/line.c +++ b/arch/um/drivers/line.c @@ -8,6 +8,7 @@ #include "chan_kern.h" #include "irq_kern.h" #include "irq_user.h" +#include "kern_util.h" #include "os.h" #define LINE_BUFSIZE 4096 @@ -48,7 +49,7 @@ static int write_room(struct line *line) n = line->head - line->tail; if (n <= 0) - n = LINE_BUFSIZE + n; /* The other case */ + n += LINE_BUFSIZE; /* The other case */ return n - 1; } @@ -58,17 +59,10 @@ int line_write_room(struct tty_struct *tty) unsigned long flags; int room; - if (tty->stopped) - return 0; - spin_lock_irqsave(&line->lock, flags); room = write_room(line); spin_unlock_irqrestore(&line->lock, flags); - /*XXX: Warning to remove */ - if (0 == room) - printk(KERN_DEBUG "%s: %s: no room left in buffer\n", - __FUNCTION__,tty->name); return room; } @@ -79,8 +73,7 @@ int line_chars_in_buffer(struct tty_struct *tty) int ret; spin_lock_irqsave(&line->lock, flags); - - /*write_room subtracts 1 for the needed NULL, so we readd it.*/ + /* write_room subtracts 1 for the needed NULL, so we readd it.*/ ret = LINE_BUFSIZE - 
(write_room(line) + 1); spin_unlock_irqrestore(&line->lock, flags); @@ -184,10 +177,6 @@ void line_flush_buffer(struct tty_struct *tty) unsigned long flags; int err; - /*XXX: copied from line_write, verify if it is correct!*/ - if (tty->stopped) - return; - spin_lock_irqsave(&line->lock, flags); err = flush_buffer(line); spin_unlock_irqrestore(&line->lock, flags); @@ -213,9 +202,6 @@ int line_write(struct tty_struct *tty, const unsigned char *buf, int len) unsigned long flags; int n, ret = 0; - if (tty->stopped) - return 0; - spin_lock_irqsave(&line->lock, flags); if (line->head != line->tail) ret = buffer_data(line, buf, len); @@ -788,9 +774,11 @@ static irqreturn_t winch_interrupt(int irq, void *data) tty = winch->tty; if (tty != NULL) { line = tty->driver_data; - chan_window_size(&line->chan_list, &tty->winsize.ws_row, - &tty->winsize.ws_col); - kill_pgrp(tty->pgrp, SIGWINCH, 1); + if (line != NULL) { + chan_window_size(&line->chan_list, &tty->winsize.ws_row, + &tty->winsize.ws_col); + kill_pgrp(tty->pgrp, SIGWINCH, 1); + } } out: if (winch->fd != -1) diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c index 0f3c7d1..ebb265c 100644 --- a/arch/um/drivers/mconsole_kern.c +++ b/arch/um/drivers/mconsole_kern.c @@ -1,23 +1,25 @@ /* * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Copyright (C) 2001 - 2008 Jeff Dike (jdike@{addtoit,linux.intel}.com) * Licensed under the GPL */ -#include "linux/console.h" -#include "linux/ctype.h" -#include "linux/interrupt.h" -#include "linux/list.h" -#include "linux/mm.h" -#include "linux/module.h" -#include "linux/notifier.h" -#include "linux/reboot.h" -#include "linux/proc_fs.h" -#include "linux/slab.h" -#include "linux/syscalls.h" -#include "linux/utsname.h" -#include "linux/workqueue.h" -#include "asm/uaccess.h" +#include <linux/console.h> +#include <linux/ctype.h> +#include <linux/interrupt.h> +#include <linux/list.h> 
+#include <linux/mm.h> +#include <linux/module.h> +#include <linux/notifier.h> +#include <linux/reboot.h> +#include <linux/proc_fs.h> +#include <linux/slab.h> +#include <linux/syscalls.h> +#include <linux/utsname.h> +#include <linux/workqueue.h> +#include <linux/mutex.h> +#include <asm/uaccess.h> + #include "init.h" #include "irq_kern.h" #include "irq_user.h" @@ -305,7 +307,9 @@ void mconsole_stop(struct mc_request *req) deactivate_fd(req->originating_fd, MCONSOLE_IRQ); os_set_fd_block(req->originating_fd, 1); mconsole_reply(req, "stopped", 0, 0); - while (mconsole_get_request(req->originating_fd, req)) { + for (;;) { + if (!mconsole_get_request(req->originating_fd, req)) + continue; if (req->cmd->handler == mconsole_go) break; if (req->cmd->handler == mconsole_stop) { @@ -358,7 +362,7 @@ struct unplugged_pages { void *pages[UNPLUGGED_PER_PAGE]; }; -static DECLARE_MUTEX(plug_mem_mutex); +static DEFINE_MUTEX(plug_mem_mutex); static unsigned long long unplugged_pages_count = 0; static LIST_HEAD(unplugged_pages); static int unplug_index = UNPLUGGED_PER_PAGE; @@ -394,7 +398,7 @@ static int mem_config(char *str, char **error_out) diff /= PAGE_SIZE; - down(&plug_mem_mutex); + mutex_lock(&plug_mem_mutex); for (i = 0; i < diff; i++) { struct unplugged_pages *unplugged; void *addr; @@ -451,7 +455,7 @@ static int mem_config(char *str, char **error_out) err = 0; out_unlock: - up(&plug_mem_mutex); + mutex_unlock(&plug_mem_mutex); out: return err; } @@ -741,7 +745,6 @@ void mconsole_stack(struct mc_request *req) { char *ptr = req->request.data; int pid_requested= -1; - struct task_struct *from = NULL; struct task_struct *to = NULL; /* @@ -763,9 +766,7 @@ void mconsole_stack(struct mc_request *req) return; } - from = current; - - to = find_task_by_pid(pid_requested); + to = find_task_by_pid_ns(pid_requested, &init_pid_ns); if ((to == NULL) || (pid_requested == 0)) { mconsole_reply(req, "Couldn't find that pid", 1, 0); return; @@ -795,6 +796,8 @@ static int __init 
mconsole_init(void) printk(KERN_ERR "Failed to initialize management console\n"); return 1; } + if (os_set_fd_block(sock, 0)) + goto out; register_reboot_notifier(&reboot_notifier); @@ -803,7 +806,7 @@ static int __init mconsole_init(void) "mconsole", (void *)sock); if (err) { printk(KERN_ERR "Failed to get IRQ for management console\n"); - return 1; + goto out; } if (notify_socket != NULL) { @@ -819,6 +822,10 @@ static int __init mconsole_init(void) printk(KERN_INFO "mconsole (version %d) initialized on %s\n", MCONSOLE_VERSION, mconsole_socket_name); return 0; + + out: + os_close_file(sock); + return 1; } __initcall(mconsole_init); diff --git a/arch/um/drivers/mconsole_user.c b/arch/um/drivers/mconsole_user.c index 430c024..13af2f0 100644 --- a/arch/um/drivers/mconsole_user.c +++ b/arch/um/drivers/mconsole_user.c @@ -83,9 +83,8 @@ int mconsole_get_request(int fd, struct mc_request *req) int len; req->originlen = sizeof(req->origin); - req->len = recvfrom(fd, &req->request, sizeof(req->request), - MSG_DONTWAIT, (struct sockaddr *) req->origin, - &req->originlen); + req->len = recvfrom(fd, &req->request, sizeof(req->request), 0, + (struct sockaddr *) req->origin, &req->originlen); if (req->len < 0) return 0; diff --git a/arch/um/drivers/net_kern.c b/arch/um/drivers/net_kern.c index 3c6c44c..1e8f41a 100644 --- a/arch/um/drivers/net_kern.c +++ b/arch/um/drivers/net_kern.c @@ -318,7 +318,7 @@ static void setup_etheraddr(char *str, unsigned char *addr, char *name) if (str == NULL) goto random; - for (i = 0;i < 6; i++) { + for (i = 0; i < 6; i++) { addr[i] = simple_strtoul(str, &end, 16); if ((end == str) || ((*end != ':') && (*end != ',') && (*end != '\0'))) { @@ -343,14 +343,13 @@ static void setup_etheraddr(char *str, unsigned char *addr, char *name) } if (!is_local_ether_addr(addr)) { printk(KERN_WARNING - "Warning: attempt to assign a globally valid ethernet " + "Warning: Assigning a globally valid ethernet " "address to a device\n"); - printk(KERN_WARNING "You 
should better enable the 2nd " - "rightmost bit in the first byte of the MAC,\n"); + printk(KERN_WARNING "You should set the 2nd rightmost bit in " + "the first byte of the MAC,\n"); printk(KERN_WARNING "i.e. %02x:%02x:%02x:%02x:%02x:%02x\n", addr[0] | 0x02, addr[1], addr[2], addr[3], addr[4], addr[5]); - goto random; } return; @@ -368,7 +367,6 @@ static struct platform_driver uml_net_driver = { .name = DRIVER_NAME, }, }; -static int driver_registered; static void net_device_release(struct device *dev) { @@ -383,6 +381,12 @@ static void net_device_release(struct device *dev) free_netdev(netdev); } +/* + * Ensures that platform_driver_register is called only once by + * eth_configure. Will be set in an initcall. + */ +static int driver_registered; + static void eth_configure(int n, void *init, char *mac, struct transport *transport) { diff --git a/arch/um/drivers/net_user.c b/arch/um/drivers/net_user.c index 29185ca..abf2653 100644 --- a/arch/um/drivers/net_user.c +++ b/arch/um/drivers/net_user.c @@ -201,7 +201,7 @@ static int change_tramp(char **argv, char *output, int output_len) close(fds[1]); if (pid > 0) - helper_wait(pid, 0, "change_tramp"); + helper_wait(pid); return pid; } diff --git a/arch/um/drivers/port_kern.c b/arch/um/drivers/port_kern.c index 330543b..1993008 100644 --- a/arch/um/drivers/port_kern.c +++ b/arch/um/drivers/port_kern.c @@ -6,6 +6,7 @@ #include "linux/completion.h" #include "linux/interrupt.h" #include "linux/list.h" +#include "linux/mutex.h" #include "asm/atomic.h" #include "init.h" #include "irq_kern.h" @@ -120,7 +121,7 @@ static int port_accept(struct port_list *port) return 0; } -static DECLARE_MUTEX(ports_sem); +static DEFINE_MUTEX(ports_mutex); static LIST_HEAD(ports); static void port_work_proc(struct work_struct *unused) @@ -161,7 +162,7 @@ void *port_data(int port_num) struct port_dev *dev = NULL; int fd; - down(&ports_sem); + mutex_lock(&ports_mutex); list_for_each(ele, &ports) { port = list_entry(ele, struct port_list, list); if 
(port->port == port_num) @@ -216,7 +217,7 @@ void *port_data(int port_num) out_free: kfree(port); out: - up(&ports_sem); + mutex_unlock(&ports_mutex); return dev; } diff --git a/arch/um/drivers/random.c b/arch/um/drivers/random.c index e942e83..71f0959 100644 --- a/arch/um/drivers/random.c +++ b/arch/um/drivers/random.c @@ -5,6 +5,7 @@ * This software may be used and distributed according to the terms * of the GNU General Public License, incorporated herein by reference. */ +#include <linux/sched.h> #include <linux/module.h> #include <linux/fs.h> #include <linux/miscdevice.h> diff --git a/arch/um/drivers/slip_user.c b/arch/um/drivers/slip_user.c index b8711e5..8b80505 100644 --- a/arch/um/drivers/slip_user.c +++ b/arch/um/drivers/slip_user.c @@ -109,7 +109,7 @@ static int slip_tramp(char **argv, int fd) read_output(fds[0], output, output_len); printk("%s", output); - err = helper_wait(pid, 0, argv[0]); + err = helper_wait(pid); close(fds[0]); out_free: diff --git a/arch/um/drivers/slirp_user.c b/arch/um/drivers/slirp_user.c index 89c1be2..a0ada8f 100644 --- a/arch/um/drivers/slirp_user.c +++ b/arch/um/drivers/slirp_user.c @@ -98,7 +98,7 @@ static void slirp_close(int fd, void *data) "(%d)\n", pri->pid, errno); } #endif - err = helper_wait(pri->pid, 1, "slirp_close"); + err = helper_wait(pri->pid); if (err < 0) return; diff --git a/arch/um/drivers/ssl.c b/arch/um/drivers/ssl.c index 875d60d..f1786e6 100644 --- a/arch/um/drivers/ssl.c +++ b/arch/um/drivers/ssl.c @@ -15,7 +15,6 @@ #include "line.h" #include "ssl.h" #include "chan_kern.h" -#include "kern_util.h" #include "kern.h" #include "init.h" #include "irq_user.h" diff --git a/arch/um/drivers/stdio_console.c b/arch/um/drivers/stdio_console.c index 656036e..cec0c33 100644 --- a/arch/um/drivers/stdio_console.c +++ b/arch/um/drivers/stdio_console.c @@ -22,7 +22,6 @@ #include "stdio_console.h" #include "line.h" #include "chan_kern.h" -#include "kern_util.h" #include "irq_user.h" #include "mconsole_kern.h" #include 
"init.h" diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index 99f9f96..be3a279 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -49,6 +49,7 @@ #include "irq_user.h" #include "irq_kern.h" #include "ubd_user.h" +#include "kern_util.h" #include "os.h" #include "mem.h" #include "mem_kern.h" @@ -229,7 +230,7 @@ static int proc_ide_read_media(char *page, char **start, off_t off, int count, return len; } -static void make_ide_entries(char *dev_name) +static void make_ide_entries(const char *dev_name) { struct proc_dir_entry *dir, *ent; char name[64]; @@ -244,7 +245,7 @@ static void make_ide_entries(char *dev_name) ent->data = NULL; ent->read_proc = proc_ide_read_media; ent->write_proc = NULL; - sprintf(name,"ide0/%s", dev_name); + snprintf(name, sizeof(name), "ide0/%s", dev_name); proc_symlink(dev_name, proc_ide_root, name); } @@ -437,7 +438,10 @@ __uml_help(ubd_setup, " machine by running 'dd' on the device. <n> must be in the range\n" " 0 to 7. Appending an 'r' to the number will cause that device\n" " to be mounted read-only. For example ubd1r=./ext_fs. Appending\n" -" an 's' will cause data to be written to disk on the host immediately.\n\n" +" an 's' will cause data to be written to disk on the host immediately.\n" +" 'c' will cause the device to be treated as being shared between multiple\n" +" UMLs and file locking will be turned off - this is appropriate for a\n" +" cluster filesystem and inappropriate at almost all other times.\n\n" ); static int udb_setup(char *str) @@ -456,20 +460,6 @@ __uml_help(udb_setup, " in the boot output.\n\n" ); -static int fakehd_set = 0; -static int fakehd(char *str) -{ - printk(KERN_INFO "fakehd : Changing ubd name to \"hd\".\n"); - fakehd_set = 1; - return 1; -} - -__setup("fakehd", fakehd); -__uml_help(fakehd, -"fakehd\n" -" Change the ubd device name to \"hd\".\n\n" -); - static void do_ubd_request(struct request_queue * q); /* Only changed by ubd_init, which is an initcall. 
*/ @@ -718,8 +708,10 @@ static int ubd_add(int n, char **error_out) ubd_disk_register(fake_major, ubd_dev->size, n, &fake_gendisk[n]); - /* perhaps this should also be under the "if (fake_major)" above */ - /* using the fake_disk->disk_name and also the fakehd_set name */ + /* + * Perhaps this should also be under the "if (fake_major)" above + * using the fake_disk->disk_name + */ if (fake_ide) make_ide_entries(ubd_gendisk[n]->disk_name); diff --git a/arch/um/drivers/ubd_user.c b/arch/um/drivers/ubd_user.c index 48fc745..b591bb9 100644 --- a/arch/um/drivers/ubd_user.c +++ b/arch/um/drivers/ubd_user.c @@ -16,7 +16,6 @@ #include <sys/mman.h> #include <sys/param.h> #include "asm/types.h" -#include "kern_util.h" #include "user.h" #include "ubd_user.h" #include "os.h" diff --git a/arch/um/drivers/vde_user.c b/arch/um/drivers/vde_user.c index d9941fe..56533db 100644 --- a/arch/um/drivers/vde_user.c +++ b/arch/um/drivers/vde_user.c @@ -80,7 +80,7 @@ void vde_init_libstuff(struct vde_data *vpri, struct vde_init *init) vpri->args = kmalloc(sizeof(struct vde_open_args), UM_GFP_KERNEL); if (vpri->args == NULL) { - printk(UM_KERN_ERR "vde_init_libstuff - vde_open_args" + printk(UM_KERN_ERR "vde_init_libstuff - vde_open_args " "allocation failed"); return; } diff --git a/arch/um/include/arch.h b/arch/um/include/arch.h index 49c601ff..2de92a0 100644 --- a/arch/um/include/arch.h +++ b/arch/um/include/arch.h @@ -10,6 +10,6 @@ extern void arch_check_bugs(void); extern int arch_fixup(unsigned long address, struct uml_pt_regs *regs); -extern int arch_handle_signal(int sig, struct uml_pt_regs *regs); +extern void arch_examine_signal(int sig, struct uml_pt_regs *regs); #endif diff --git a/arch/um/include/as-layout.h b/arch/um/include/as-layout.h index a5cdf95..606bb5c 100644 --- a/arch/um/include/as-layout.h +++ b/arch/um/include/as-layout.h @@ -10,23 +10,31 @@ #include "kern_constants.h" /* - * Assembly doesn't want any casting, but C does, so define these - * without casts here, and 
define new symbols with casts inside the C - * section. + * Stolen from linux/const.h, which can't be directly included since + * this is used in userspace code, which has no access to the kernel + * headers. Changed to be suitable for adding casts to the start, + * rather than "UL" to the end. */ -#define ASM_STUB_CODE (UML_CONFIG_TOP_ADDR - 2 * UM_KERN_PAGE_SIZE) -#define ASM_STUB_DATA (UML_CONFIG_TOP_ADDR - UM_KERN_PAGE_SIZE) -#define ASM_STUB_START ASM_STUB_CODE -/* - * This file is included by the assembly stubs, which just want the - * definitions above. +/* Some constant macros are used in both assembler and + * C code. Therefore we cannot annotate them always with + * 'UL' and other type specifiers unilaterally. We + * use the following macros to deal with this. */ -#ifndef __ASSEMBLY__ -#define STUB_CODE ((unsigned long) ASM_STUB_CODE) -#define STUB_DATA ((unsigned long) ASM_STUB_DATA) -#define STUB_START ((unsigned long) ASM_STUB_START) +#ifdef __ASSEMBLY__ +#define _AC(X, Y) (Y) +#else +#define __AC(X, Y) (X (Y)) +#define _AC(X, Y) __AC(X, Y) +#endif + +#define STUB_START _AC(, 0x100000) +#define STUB_CODE _AC((unsigned long), STUB_START) +#define STUB_DATA _AC((unsigned long), STUB_CODE + UM_KERN_PAGE_SIZE) +#define STUB_END _AC((unsigned long), STUB_DATA + UM_KERN_PAGE_SIZE) + +#ifndef __ASSEMBLY__ #include "sysdep/ptrace.h" diff --git a/arch/um/include/chan_user.h b/arch/um/include/chan_user.h index 5a2263e..9b9ced8 100644 --- a/arch/um/include/chan_user.h +++ b/arch/um/include/chan_user.h @@ -48,7 +48,7 @@ extern void register_winch_irq(int fd, int tty_fd, int pid, #define __channel_help(fn, prefix) \ __uml_help(fn, prefix "[0-9]*=<channel description>\n" \ " Attach a console or serial line to a host channel. 
See\n" \ -" http://user-mode-linux.sourceforge.net/input.html for a complete\n" \ +" http://user-mode-linux.sourceforge.net/old/input.html for a complete\n" \ " description of this switch.\n\n" \ ); diff --git a/arch/um/include/common-offsets.h b/arch/um/include/common-offsets.h index 0edab69..b54bd35 100644 --- a/arch/um/include/common-offsets.h +++ b/arch/um/include/common-offsets.h @@ -18,6 +18,7 @@ DEFINE_STR(UM_KERN_WARNING, KERN_WARNING); DEFINE_STR(UM_KERN_NOTICE, KERN_NOTICE); DEFINE_STR(UM_KERN_INFO, KERN_INFO); DEFINE_STR(UM_KERN_DEBUG, KERN_DEBUG); +DEFINE_STR(UM_KERN_CONT, KERN_CONT); DEFINE(UM_ELF_CLASS, ELF_CLASS); DEFINE(UM_ELFCLASS32, ELFCLASS32); diff --git a/arch/um/include/init.h b/arch/um/include/init.h index cebc6ca..b00a957 100644 --- a/arch/um/include/init.h +++ b/arch/um/include/init.h @@ -40,6 +40,20 @@ typedef int (*initcall_t)(void); typedef void (*exitcall_t)(void); +#ifndef __KERNEL__ +#ifndef __section +# define __section(S) __attribute__ ((__section__(#S))) +#endif + +#if __GNUC_MINOR__ >= 3 +# define __used __attribute__((__used__)) +#else +# define __used __attribute__((__unused__)) +#endif + +#else +#include <linux/compiler.h> +#endif /* These are for everybody (although not all archs will actually discard it in modules) */ #define __init __section(.init.text) @@ -127,14 +141,3 @@ extern struct uml_param __uml_setup_start, __uml_setup_end; #endif #endif /* _LINUX_UML_INIT_H */ - -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. 
- * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/arch/um/include/irq_user.h b/arch/um/include/irq_user.h index 884a9c1..e60b318 100644 --- a/arch/um/include/irq_user.h +++ b/arch/um/include/irq_user.h @@ -14,7 +14,6 @@ struct irq_fd { int fd; int type; int irq; - int pid; int events; int current_events; }; diff --git a/arch/um/include/kern_util.h b/arch/um/include/kern_util.h index 74ce8e5..3c34122 100644 --- a/arch/um/include/kern_util.h +++ b/arch/um/include/kern_util.h @@ -9,107 +9,61 @@ #include "sysdep/ptrace.h" #include "sysdep/faultinfo.h" -typedef void (*kern_hndl)(int, struct uml_pt_regs *); - -struct kern_handlers { - kern_hndl relay_signal; - kern_hndl winch; - kern_hndl bus_handler; - kern_hndl page_fault; - kern_hndl sigio_handler; - kern_hndl timer_handler; -}; - -extern const struct kern_handlers handlinfo_kern; +extern int uml_exitcode; extern int ncpus; -extern char *gdb_init; extern int kmalloc_ok; -extern int jail; -extern int nsyscalls; -#define UML_ROUND_DOWN(addr) ((void *)(((unsigned long) addr) & PAGE_MASK)) #define UML_ROUND_UP(addr) \ - UML_ROUND_DOWN(((unsigned long) addr) + PAGE_SIZE - 1) + ((((unsigned long) addr) + PAGE_SIZE - 1) & PAGE_MASK) -extern int kernel_fork(unsigned long flags, int (*fn)(void *), void * arg); -extern int kernel_thread_proc(void *data); -extern void syscall_segv(int sig); -extern int current_pid(void); extern unsigned long alloc_stack(int order, int atomic); +extern void free_stack(unsigned long stack, int order); + extern int do_signal(void); -extern int is_stack_fault(unsigned long sp); +extern void copy_sc(struct uml_pt_regs *regs, void *from); +extern void interrupt_end(void); +extern void relay_signal(int sig, struct uml_pt_regs *regs); + extern unsigned long segv(struct faultinfo fi, unsigned long ip, int is_user, struct uml_pt_regs *regs); extern int handle_page_fault(unsigned long address, unsigned 
long ip, int is_write, int is_user, int *code_out); -extern void syscall_ready(void); -extern void set_tracing(void *t, int tracing); -extern int is_tracing(void *task); -extern int segv_syscall(void); -extern void kern_finish_exec(void *task, int new_pid, unsigned long stack); -extern unsigned long page_mask(void); -extern int need_finish_fork(void); -extern void free_stack(unsigned long stack, int order); -extern void add_input_request(int op, void (*proc)(int), void *arg); -extern char *current_cmd(void); -extern void timer_handler(int sig, struct uml_pt_regs *regs); -extern int set_signals(int enable); -extern int pid_to_processor_id(int pid); -extern void deliver_signals(void *t); -extern int next_trap_index(int max); -extern void default_idle(void); -extern void finish_fork(void); -extern void paging_init(void); -extern void init_flush_vm(void); -extern void *syscall_sp(void *t); -extern void syscall_trace(struct uml_pt_regs *regs, int entryexit); + extern unsigned int do_IRQ(int irq, struct uml_pt_regs *regs); -extern void interrupt_end(void); -extern void initial_thread_cb(void (*proc)(void *), void *arg); -extern int debugger_signal(int status, int pid); -extern void debugger_parent_signal(int status, int pid); -extern void child_signal(int pid, int status); -extern int init_ptrace_proxy(int idle_pid, int startup, int stop); -extern int init_parent_proxy(int pid); -extern int singlestepping(void *t); -extern void check_stack_overflow(void *ptr); -extern void relay_signal(int sig, struct uml_pt_regs *regs); -extern int user_context(unsigned long sp); -extern void timer_irq(struct uml_pt_regs *regs); -extern void do_uml_exitcalls(void); -extern int attach_debugger(int idle_pid, int pid, int stop); -extern int config_gdb(char *str); -extern int remove_gdb(void); -extern char *uml_strdup(char *string); -extern void unprotect_kernel_mem(void); -extern void protect_kernel_mem(void); -extern void uml_cleanup(void); -extern void lock_signalled_task(void *t); 
-extern void IPI_handler(int cpu); -extern int jail_setup(char *line, int *add); -extern void *get_init_task(void); -extern int clear_user_proc(void *buf, int size); -extern int copy_to_user_proc(void *to, void *from, int size); -extern int copy_from_user_proc(void *to, void *from, int size); -extern int strlen_user_proc(char *str); -extern long execute_syscall(void *r); extern int smp_sigio_handler(void); -extern void *get_current(void); -extern struct task_struct *get_task(int pid, int require); -extern void machine_halt(void); +extern void initial_thread_cb(void (*proc)(void *), void *arg); extern int is_syscall(unsigned long addr); +extern void timer_handler(int sig, struct uml_pt_regs *regs); -extern void free_irq(unsigned int, void *); -extern int cpu(void); +extern void timer_handler(int sig, struct uml_pt_regs *regs); -extern void time_init_kern(void); +extern int start_uml(void); +extern void paging_init(void); -/* Are we disallowed to sleep? Used to choose between GFP_KERNEL and GFP_ATOMIC. */ +extern void uml_cleanup(void); +extern void do_uml_exitcalls(void); + +/* + * Are we disallowed to sleep? Used to choose between GFP_KERNEL and + * GFP_ATOMIC. 
+ */ extern int __cant_sleep(void); -extern void sigio_handler(int sig, struct uml_pt_regs *regs); -extern void copy_sc(struct uml_pt_regs *regs, void *from); +extern void *get_current(void); +extern int copy_from_user_proc(void *to, void *from, int size); +extern int cpu(void); +extern char *uml_strdup(const char *string); + extern unsigned long to_irq_stack(unsigned long *mask_out); -unsigned long from_irq_stack(int nested); -extern int start_uml(void); +extern unsigned long from_irq_stack(int nested); + +extern void syscall_trace(struct uml_pt_regs *regs, int entryexit); +extern int singlestepping(void *t); + +extern void segv_handler(int sig, struct uml_pt_regs *regs); +extern void bus_handler(int sig, struct uml_pt_regs *regs); +extern void winch(int sig, struct uml_pt_regs *regs); +extern void fatal_sigsegv(void) __attribute__ ((noreturn)); + + #endif diff --git a/arch/um/include/mem_user.h b/arch/um/include/mem_user.h index a54514d..46384ac 100644 --- a/arch/um/include/mem_user.h +++ b/arch/um/include/mem_user.h @@ -46,9 +46,6 @@ extern int iomem_size; #define ROUND_4M(n) ((((unsigned long) (n)) + (1 << 22)) & ~((1 << 22) - 1)) -extern unsigned long host_task_size; -extern unsigned long task_size; - extern int init_mem_user(void); extern void setup_memory(void *entry); extern unsigned long find_iomem(char *driver, unsigned long *len_out); @@ -59,9 +56,7 @@ extern void setup_physmem(unsigned long start, unsigned long usable, unsigned long len, unsigned long long highmem); extern void add_iomem(char *name, int fd, unsigned long size); extern unsigned long phys_offset(unsigned long phys); -extern void unmap_physmem(void); extern void map_memory(unsigned long virt, unsigned long phys, unsigned long len, int r, int w, int x); -extern unsigned long get_kmem_end(void); #endif diff --git a/arch/um/include/misc_constants.h b/arch/um/include/misc_constants.h deleted file mode 100644 index 989bc08..0000000 --- a/arch/um/include/misc_constants.h +++ /dev/null @@ -1,6 
+0,0 @@ -#ifndef __MISC_CONSTANT_H_ -#define __MISC_CONSTANT_H_ - -#include <user_constants.h> - -#endif diff --git a/arch/um/include/os.h b/arch/um/include/os.h index 6f0d1c7..0b6b627 100644 --- a/arch/um/include/os.h +++ b/arch/um/include/os.h @@ -8,7 +8,6 @@ #include <stdarg.h> #include "irq_user.h" -#include "kern_util.h" #include "longjmp.h" #include "mm_id.h" #include "sysdep/tls.h" @@ -128,33 +127,31 @@ static inline struct openflags of_cloexec(struct openflags flags) extern int os_stat_file(const char *file_name, struct uml_stat *buf); extern int os_stat_fd(const int fd, struct uml_stat *buf); extern int os_access(const char *file, int mode); -extern int os_get_exec_close(int fd, int *close_on_exec); extern int os_set_exec_close(int fd); extern int os_ioctl_generic(int fd, unsigned int cmd, unsigned long arg); extern int os_get_ifname(int fd, char *namebuf); extern int os_set_slip(int fd); -extern int os_set_owner(int fd, int pid); extern int os_mode_fd(int fd, int mode); extern int os_seek_file(int fd, unsigned long long offset); -extern int os_open_file(char *file, struct openflags flags, int mode); +extern int os_open_file(const char *file, struct openflags flags, int mode); extern int os_read_file(int fd, void *buf, int len); extern int os_write_file(int fd, const void *buf, int count); -extern int os_file_size(char *file, unsigned long long *size_out); -extern int os_file_modtime(char *file, unsigned long *modtime); +extern int os_file_size(const char *file, unsigned long long *size_out); +extern int os_file_modtime(const char *file, unsigned long *modtime); extern int os_pipe(int *fd, int stream, int close_on_exec); -extern int os_set_fd_async(int fd, int owner); +extern int os_set_fd_async(int fd); extern int os_clear_fd_async(int fd); extern int os_set_fd_block(int fd, int blocking); extern int os_accept_connection(int fd); -extern int os_create_unix_socket(char *file, int len, int close_on_exec); +extern int os_create_unix_socket(const char *file, 
int len, int close_on_exec); extern int os_shutdown_socket(int fd, int r, int w); extern void os_close_file(int fd); extern int os_rcv_fd(int fd, int *helper_pid_out); extern int create_unix_socket(char *file, int len, int close_on_exec); -extern int os_connect_socket(char *name); +extern int os_connect_socket(const char *name); extern int os_file_type(char *file); -extern int os_file_mode(char *file, struct openflags *mode_out); +extern int os_file_mode(const char *file, struct openflags *mode_out); extern int os_lock_file(int fd, int excl); extern void os_flush_stdout(void); extern int os_stat_filesystem(char *path, long *bsize_out, @@ -168,14 +165,10 @@ extern int os_fchange_dir(int fd); /* start_up.c */ extern void os_early_checks(void); -extern int can_do_skas(void); +extern void can_do_skas(void); extern void os_check_bugs(void); extern void check_host_supports_tls(int *supports_tls, int *tls_min); -/* Make sure they are clear when running in TT mode. Required by - * SEGV_MAYBE_FIXABLE */ -#define clear_can_do_skas() do { ptrace_faultinfo = proc_mm = 0; } while (0) - /* mem.c */ extern int create_mem_file(unsigned long long len); @@ -214,7 +207,7 @@ extern int execvp_noalloc(char *buf, const char *file, char *const argv[]); extern int run_helper(void (*pre_exec)(void *), void *pre_data, char **argv); extern int run_helper_thread(int (*proc)(void *), void *arg, unsigned int flags, unsigned long *stack_out); -extern int helper_wait(int pid, int nohang, char *pname); +extern int helper_wait(int pid); /* tls.c */ @@ -237,16 +230,12 @@ extern void unblock_signals(void); extern int get_signals(void); extern int set_signals(int enable); -/* trap.c */ -extern void os_fill_handlinfo(struct kern_handlers h); - /* util.c */ extern void stack_protections(unsigned long address); extern int raw(int fd); extern void setup_machinename(char *machine_out); extern void setup_hostinfo(char *buf, int len); -extern int setjmp_wrapper(void (*proc)(void *, void *), ...); -extern 
void os_dump_core(void); +extern void os_dump_core(void) __attribute__ ((noreturn)); /* time.c */ extern void idle_sleep(unsigned long long nsecs); @@ -275,11 +264,9 @@ extern int protect(struct mm_id * mm_idp, unsigned long addr, extern int is_skas_winch(int pid, int fd, void *data); extern int start_userspace(unsigned long stub_stack); extern int copy_context_skas0(unsigned long stack, int pid); -extern void save_registers(int pid, struct uml_pt_regs *regs); -extern void restore_registers(int pid, struct uml_pt_regs *regs); extern void userspace(struct uml_pt_regs *regs); -extern void map_stub_pages(int fd, unsigned long code, - unsigned long data, unsigned long stack); +extern int map_stub_pages(int fd, unsigned long code, unsigned long data, + unsigned long stack); extern void new_thread(void *stack, jmp_buf *buf, void (*handler)(void)); extern void switch_threads(jmp_buf *me, jmp_buf *you); extern int start_idle_thread(void *stack, jmp_buf *switch_buf); @@ -298,16 +285,12 @@ extern void os_free_irq_later(struct irq_fd *active_fds, extern int os_get_pollfd(int i); extern void os_set_pollfd(int i, int fd); extern void os_set_ioignore(void); -extern void init_irq_signals(int on_sigstack); /* sigio.c */ extern int add_sigio_fd(int fd); extern int ignore_sigio_fd(int fd); extern void maybe_sigio_broken(int fd, int read); -/* skas/trap */ -extern void sig_handler_common_skas(int sig, void *sc_ptr); - /* sys-x86_64/prctl.c */ extern int os_arch_prctl(int pid, int code, unsigned long *addr); diff --git a/arch/um/include/ptrace_user.h b/arch/um/include/ptrace_user.h index f3450e6..4bce6e0 100644 --- a/arch/um/include/ptrace_user.h +++ b/arch/um/include/ptrace_user.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) + * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) * Licensed under the GPL */ @@ -10,12 +10,6 @@ extern int ptrace_getregs(long pid, unsigned long *regs_out); extern int ptrace_setregs(long pid, 
unsigned long *regs_in); -extern int ptrace_getfpregs(long pid, unsigned long *regs_out); -extern int ptrace_setfpregs(long pid, unsigned long *regs); -extern void arch_enter_kernel(void *task, int pid); -extern void arch_leave_kernel(void *task, int pid); -extern void ptrace_pokeuser(unsigned long addr, unsigned long data); - /* syscall emulation path in ptrace */ @@ -54,7 +48,8 @@ extern int sysemu_supported; (((int[3][3] ) { \ { PTRACE_SYSCALL, PTRACE_SYSCALL, PTRACE_SINGLESTEP }, \ { PTRACE_SYSEMU, PTRACE_SYSEMU, PTRACE_SINGLESTEP }, \ - { PTRACE_SYSEMU, PTRACE_SYSEMU_SINGLESTEP, PTRACE_SYSEMU_SINGLESTEP }}) \ + { PTRACE_SYSEMU, PTRACE_SYSEMU_SINGLESTEP, \ + PTRACE_SYSEMU_SINGLESTEP } }) \ [sysemu_mode][singlestep_mode]) #endif diff --git a/arch/um/include/registers.h b/arch/um/include/registers.h index 0e27406..9ea1ae3 100644 --- a/arch/um/include/registers.h +++ b/arch/um/include/registers.h @@ -9,14 +9,13 @@ #include "sysdep/ptrace.h" #include "sysdep/archsetjmp.h" -extern void init_thread_registers(struct uml_pt_regs *to); extern int save_fp_registers(int pid, unsigned long *fp_regs); extern int restore_fp_registers(int pid, unsigned long *fp_regs); extern int save_fpx_registers(int pid, unsigned long *fp_regs); extern int restore_fpx_registers(int pid, unsigned long *fp_regs); -extern void save_registers(int pid, struct uml_pt_regs *regs); -extern void restore_registers(int pid, struct uml_pt_regs *regs); -extern void init_registers(int pid); +extern int save_registers(int pid, struct uml_pt_regs *regs); +extern int restore_registers(int pid, struct uml_pt_regs *regs); +extern int init_registers(int pid); extern void get_safe_registers(unsigned long *regs); extern unsigned long get_thread_reg(int reg, jmp_buf *buf); diff --git a/arch/um/include/signal_kern.h b/arch/um/include/signal_kern.h deleted file mode 100644 index aeb5d5a..0000000 --- a/arch/um/include/signal_kern.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright (C) 2001, 2002 Jeff Dike 
(jdike@karaya.com) - * Licensed under the GPL - */ - -#ifndef __SIGNAL_KERN_H__ -#define __SIGNAL_KERN_H__ - -extern int have_signals(void *t); - -#endif - -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/arch/um/include/skas/mode-skas.h b/arch/um/include/skas/mode-skas.h deleted file mode 100644 index e065feb..0000000 --- a/arch/um/include/skas/mode-skas.h +++ /dev/null @@ -1,11 +0,0 @@ -/* - * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{linux.intel,addtoit}.com) - * Licensed under the GPL - */ - -#ifndef __MODE_SKAS_H__ -#define __MODE_SKAS_H__ - -extern void kill_off_processes_skas(void); - -#endif diff --git a/arch/um/include/sysdep-i386/syscalls.h b/arch/um/include/sysdep-i386/syscalls.h index 57bd79e..9056981 100644 --- a/arch/um/include/sysdep-i386/syscalls.h +++ b/arch/um/include/sysdep-i386/syscalls.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2000 Jeff Dike (jdike@karaya.com) + * Copyright (C) 2000 - 2008 Jeff Dike (jdike@{addtoit,linux.intel}.com) * Licensed under the GPL */ @@ -18,7 +18,8 @@ extern syscall_handler_t old_mmap_i386; extern syscall_handler_t *sys_call_table[]; #define EXECUTE_SYSCALL(syscall, regs) \ - ((long (*)(struct syscall_args)) (*sys_call_table[syscall]))(SYSCALL_ARGS(&regs->regs)) + ((long (*)(struct syscall_args)) \ + (*sys_call_table[syscall]))(SYSCALL_ARGS(&regs->regs)) extern long sys_mmap2(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, diff --git a/arch/um/include/sysdep-x86_64/kernel-offsets.h b/arch/um/include/sysdep-x86_64/kernel-offsets.h index c978b58..a307237 100644 --- a/arch/um/include/sysdep-x86_64/kernel-offsets.h +++ 
b/arch/um/include/sysdep-x86_64/kernel-offsets.h @@ -17,16 +17,7 @@ #define OFFSET(sym, str, mem) \ DEFINE(sym, offsetof(struct str, mem)); -#define __NO_STUBS 1 -#undef __SYSCALL -#undef _ASM_X86_64_UNISTD_H_ -#define __SYSCALL(nr, sym) [nr] = 1, -static char syscalls[] = { -#include <asm/arch/unistd.h> -}; - void foo(void) { #include <common-offsets.h> -DEFINE(UM_NR_syscall_max, sizeof(syscalls) - 1); } diff --git a/arch/um/include/sysdep-x86_64/syscalls.h b/arch/um/include/sysdep-x86_64/syscalls.h index cf72256..7cfb0b08 100644 --- a/arch/um/include/sysdep-x86_64/syscalls.h +++ b/arch/um/include/sysdep-x86_64/syscalls.h @@ -30,6 +30,4 @@ extern long old_mmap(unsigned long addr, unsigned long len, extern syscall_handler_t sys_modify_ldt; extern syscall_handler_t sys_arch_prctl; -#define NR_syscalls (UM_NR_syscall_max + 1) - #endif diff --git a/arch/um/include/um_mmu.h b/arch/um/include/um_mmu.h index 8855d8d..82865fc 100644 --- a/arch/um/include/um_mmu.h +++ b/arch/um/include/um_mmu.h @@ -12,10 +12,6 @@ typedef struct mm_context { struct mm_id id; - unsigned long last_page_table; -#ifdef CONFIG_3_LEVEL_PGTABLES - unsigned long last_pmd; -#endif struct uml_ldt ldt; } mm_context_t; diff --git a/arch/um/include/um_uaccess.h b/arch/um/include/um_uaccess.h index fdfc06b..2b6fc8e 100644 --- a/arch/um/include/um_uaccess.h +++ b/arch/um/include/um_uaccess.h @@ -6,7 +6,9 @@ #ifndef __ARCH_UM_UACCESS_H #define __ARCH_UM_UACCESS_H -#include "asm/fixmap.h" +#include <asm/elf.h> +#include <asm/fixmap.h> +#include "sysdep/archsetjmp.h" #define __under_task_size(addr, size) \ (((unsigned long) (addr) < TASK_SIZE) && \ diff --git a/arch/um/kernel/exec.c b/arch/um/kernel/exec.c index 8196450..76a62c0 100644 --- a/arch/um/kernel/exec.c +++ b/arch/um/kernel/exec.c @@ -19,12 +19,13 @@ void flush_thread(void) { void *data = NULL; - unsigned long end = proc_mm ? 
task_size : STUB_START; int ret; arch_flush_thread(&current->thread.arch); - ret = unmap(&current->mm->context.id, 0, end, 1, &data); + ret = unmap(&current->mm->context.id, 0, STUB_START, 0, &data); + ret = ret || unmap(&current->mm->context.id, STUB_END, + TASK_SIZE - STUB_END, 1, &data); if (ret) { printk(KERN_ERR "flush_thread - clearing address space failed, " "err = %d\n", ret); diff --git a/arch/um/kernel/exitcode.c b/arch/um/kernel/exitcode.c index c716b5a..984f80e 100644 --- a/arch/um/kernel/exitcode.c +++ b/arch/um/kernel/exitcode.c @@ -1,15 +1,17 @@ /* - * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) + * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) * Licensed under the GPL */ -#include "linux/kernel.h" -#include "linux/init.h" -#include "linux/ctype.h" -#include "linux/proc_fs.h" -#include "asm/uaccess.h" +#include <linux/ctype.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/proc_fs.h> +#include <linux/types.h> +#include <asm/uaccess.h> -/* If read and write race, the read will still atomically read a valid +/* + * If read and write race, the read will still atomically read a valid * value. */ int uml_exitcode = 0; @@ -19,18 +21,19 @@ static int read_proc_exitcode(char *page, char **start, off_t off, { int len, val; - /* Save uml_exitcode in a local so that we don't need to guarantee + /* + * Save uml_exitcode in a local so that we don't need to guarantee * that sprintf accesses it atomically. 
*/ val = uml_exitcode; len = sprintf(page, "%d\n", val); len -= off; - if(len <= off+count) + if (len <= off+count) *eof = 1; *start = page + off; - if(len > count) + if (len > count) len = count; - if(len < 0) + if (len < 0) len = 0; return len; } @@ -41,11 +44,11 @@ static int write_proc_exitcode(struct file *file, const char __user *buffer, char *end, buf[sizeof("nnnnn\0")]; int tmp; - if(copy_from_user(buf, buffer, count)) + if (copy_from_user(buf, buffer, count)) return -EFAULT; tmp = simple_strtol(buf, &end, 0); - if((*end != '\0') && !isspace(*end)) + if ((*end != '\0') && !isspace(*end)) return -EINVAL; uml_exitcode = tmp; @@ -57,7 +60,7 @@ static int make_proc_exitcode(void) struct proc_dir_entry *ent; ent = create_proc_entry("exitcode", 0600, &proc_root); - if(ent == NULL){ + if (ent == NULL) { printk(KERN_WARNING "make_proc_exitcode : Failed to register " "/proc/exitcode\n"); return 0; diff --git a/arch/um/kernel/gmon_syms.c b/arch/um/kernel/gmon_syms.c index 734f873..72eccd2 100644 --- a/arch/um/kernel/gmon_syms.c +++ b/arch/um/kernel/gmon_syms.c @@ -1,5 +1,5 @@ -/* - * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com) +/* + * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) * Licensed under the GPL */ @@ -8,12 +8,13 @@ extern void __bb_init_func(void *) __attribute__((weak)); EXPORT_SYMBOL(__bb_init_func); -/* This is defined (and referred to in profiling stub code) only by some GCC +/* + * This is defined (and referred to in profiling stub code) only by some GCC * versions in libgcov. * * Since SuSE backported the fix, we cannot handle it depending on GCC version. - * So, unconditionally export it. But also give it a weak declaration, which will - * be overridden by any other one. + * So, unconditionally export it. But also give it a weak declaration, which + * will be overridden by any other one. 
*/ extern void __gcov_init(void *) __attribute__((weak)); diff --git a/arch/um/kernel/gprof_syms.c b/arch/um/kernel/gprof_syms.c index 9244f01..e2f043d 100644 --- a/arch/um/kernel/gprof_syms.c +++ b/arch/um/kernel/gprof_syms.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com) + * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) * Licensed under the GPL */ @@ -7,14 +7,3 @@ extern void mcount(void); EXPORT_SYMBOL(mcount); - -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/arch/um/kernel/initrd.c b/arch/um/kernel/initrd.c index 16dc43e..fa01556 100644 --- a/arch/um/kernel/initrd.c +++ b/arch/um/kernel/initrd.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) + * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) * Licensed under the GPL */ @@ -7,7 +7,6 @@ #include "linux/bootmem.h" #include "linux/initrd.h" #include "asm/types.h" -#include "kern_util.h" #include "initrd.h" #include "init.h" #include "os.h" @@ -21,18 +20,27 @@ static int __init read_initrd(void) long long size; int err; - if(initrd == NULL) + if (initrd == NULL) return 0; err = os_file_size(initrd, &size); - if(err) + if (err) return 0; + /* + * This is necessary because alloc_bootmem craps out if you + * ask for no memory. 
+ */ + if (size == 0) { + printk(KERN_ERR "\"%\" is a zero-size initrd\n"); + return 0; + } + area = alloc_bootmem(size); - if(area == NULL) + if (area == NULL) return 0; - if(load_initrd(initrd, area, size) == -1) + if (load_initrd(initrd, area, size) == -1) return 0; initrd_start = (unsigned long) area; @@ -59,13 +67,15 @@ int load_initrd(char *filename, void *buf, int size) int fd, n; fd = os_open_file(filename, of_read(OPENFLAGS()), 0); - if(fd < 0){ - printk("Opening '%s' failed - err = %d\n", filename, -fd); + if (fd < 0) { + printk(KERN_ERR "Opening '%s' failed - err = %d\n", filename, + -fd); return -1; } n = os_read_file(fd, buf, size); - if(n != size){ - printk("Read of %d bytes from '%s' failed, err = %d\n", size, + if (n != size) { + printk(KERN_ERR "Read of %d bytes from '%s' failed, " + "err = %d\n", size, filename, -n); return -1; } diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c index ba11ccd..91587f8 100644 --- a/arch/um/kernel/irq.c +++ b/arch/um/kernel/irq.c @@ -107,10 +107,9 @@ int activate_fd(int irq, int fd, int type, void *dev_id) struct pollfd *tmp_pfd; struct irq_fd *new_fd, *irq_fd; unsigned long flags; - int pid, events, err, n; + int events, err, n; - pid = os_getpid(); - err = os_set_fd_async(fd, pid); + err = os_set_fd_async(fd); if (err < 0) goto out; @@ -127,7 +126,6 @@ int activate_fd(int irq, int fd, int type, void *dev_id) .fd = fd, .type = type, .irq = irq, - .pid = pid, .events = events, .current_events = 0 } ); diff --git a/arch/um/kernel/ksyms.c b/arch/um/kernel/ksyms.c index 7c7142b..5311ee9 100644 --- a/arch/um/kernel/ksyms.c +++ b/arch/um/kernel/ksyms.c @@ -18,15 +18,11 @@ EXPORT_SYMBOL(set_signals); EXPORT_SYMBOL(get_signals); EXPORT_SYMBOL(kernel_thread); EXPORT_SYMBOL(sys_waitpid); -EXPORT_SYMBOL(task_size); EXPORT_SYMBOL(flush_tlb_range); -EXPORT_SYMBOL(host_task_size); EXPORT_SYMBOL(arch_validate); -EXPORT_SYMBOL(get_kmem_end); EXPORT_SYMBOL(high_physmem); EXPORT_SYMBOL(empty_zero_page); 
-EXPORT_SYMBOL(um_virt_to_phys); EXPORT_SYMBOL(handle_page_fault); EXPORT_SYMBOL(find_iomem); @@ -40,7 +36,6 @@ EXPORT_SYMBOL(uml_strdup); EXPORT_SYMBOL(os_stat_fd); EXPORT_SYMBOL(os_stat_file); EXPORT_SYMBOL(os_access); -EXPORT_SYMBOL(os_get_exec_close); EXPORT_SYMBOL(os_set_exec_close); EXPORT_SYMBOL(os_getpid); EXPORT_SYMBOL(os_open_file); diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index 59822dee..d872fdc 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -1,49 +1,41 @@ /* - * Copyright (C) 2000 - 2003 Jeff Dike (jdike@addtoit.com) + * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) * Licensed under the GPL */ -#include "linux/stddef.h" -#include "linux/kernel.h" -#include "linux/mm.h" -#include "linux/bootmem.h" -#include "linux/swap.h" -#include "linux/highmem.h" -#include "linux/gfp.h" -#include "asm/page.h" -#include "asm/fixmap.h" -#include "asm/pgalloc.h" -#include "kern_util.h" +#include <linux/stddef.h> +#include <linux/bootmem.h> +#include <linux/gfp.h> +#include <linux/highmem.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <asm/fixmap.h> +#include <asm/page.h> #include "as-layout.h" +#include "init.h" #include "kern.h" +#include "kern_util.h" #include "mem_user.h" -#include "um_uaccess.h" #include "os.h" -#include "linux/types.h" -#include "linux/string.h" -#include "init.h" -#include "kern_constants.h" /* allocated in paging_init, zeroed in mem_init, and unchanged thereafter */ unsigned long *empty_zero_page = NULL; /* allocated in paging_init and unchanged thereafter */ unsigned long *empty_bad_page = NULL; + +/* + * Initialized during boot, and readonly for initializing page tables + * afterwards + */ pgd_t swapper_pg_dir[PTRS_PER_PGD]; + +/* Initialized at boot time, and readonly after that */ unsigned long long highmem; int kmalloc_ok = 0; +/* Used during early boot */ static unsigned long brk_end; -void unmap_physmem(void) -{ - os_unmap_memory((void *) brk_end, uml_reserved - 
brk_end); -} - -static void map_cb(void *unused) -{ - map_memory(brk_end, __pa(brk_end), uml_reserved - brk_end, 1, 1, 0); -} - #ifdef CONFIG_HIGHMEM static void setup_highmem(unsigned long highmem_start, unsigned long highmem_len) @@ -53,7 +45,7 @@ static void setup_highmem(unsigned long highmem_start, int i; highmem_pfn = __pa(highmem_start) >> PAGE_SHIFT; - for(i = 0; i < highmem_len >> PAGE_SHIFT; i++){ + for (i = 0; i < highmem_len >> PAGE_SHIFT; i++) { page = &mem_map[highmem_pfn + i]; ClearPageReserved(page); init_page_count(page); @@ -65,14 +57,13 @@ static void setup_highmem(unsigned long highmem_start, void __init mem_init(void) { /* clear the zero-page */ - memset((void *) empty_zero_page, 0, PAGE_SIZE); + memset(empty_zero_page, 0, PAGE_SIZE); /* Map in the area just after the brk now that kmalloc is about * to be turned on. */ brk_end = (unsigned long) UML_ROUND_UP(sbrk(0)); - map_cb(NULL); - initial_thread_cb(map_cb, NULL); + map_memory(brk_end, __pa(brk_end), uml_reserved - brk_end, 1, 1, 0); free_bootmem(__pa(brk_end), uml_reserved - brk_end); uml_reserved = brk_end; @@ -85,7 +76,7 @@ void __init mem_init(void) #endif num_physpages = totalram_pages; max_pfn = totalram_pages; - printk(KERN_INFO "Memory: %luk available\n", + printk(KERN_INFO "Memory: %luk available\n", (unsigned long) nr_free_pages() << (PAGE_SHIFT-10)); kmalloc_ok = 1; @@ -119,7 +110,7 @@ static void __init one_md_table_init(pud_t *pud) #endif } -static void __init fixrange_init(unsigned long start, unsigned long end, +static void __init fixrange_init(unsigned long start, unsigned long end, pgd_t *pgd_base) { pgd_t *pgd; @@ -138,7 +129,7 @@ static void __init fixrange_init(unsigned long start, unsigned long end, if (pud_none(*pud)) one_md_table_init(pud); pmd = pmd_offset(pud, vaddr); - for (; (j < PTRS_PER_PMD) && (vaddr != end); pmd++, j++) { + for (; (j < PTRS_PER_PMD) && (vaddr < end); pmd++, j++) { one_page_table_init(pmd); vaddr += PMD_SIZE; } @@ -152,7 +143,7 @@ pgprot_t 
kmap_prot; #define kmap_get_fixmap_pte(vaddr) \ pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), (vaddr)),\ - (vaddr)), (vaddr)) + (vaddr)), (vaddr)) static void __init kmap_init(void) { @@ -197,21 +188,23 @@ static void __init fixaddr_user_init( void) pud_t *pud; pmd_t *pmd; pte_t *pte; - unsigned long paddr, vaddr = FIXADDR_USER_START; + phys_t p; + unsigned long v, vaddr = FIXADDR_USER_START; - if ( ! size ) + if (!size) return; fixrange_init( FIXADDR_USER_START, FIXADDR_USER_END, swapper_pg_dir); - paddr = (unsigned long)alloc_bootmem_low_pages( size); - memcpy( (void *)paddr, (void *)FIXADDR_USER_START, size); - paddr = __pa(paddr); - for ( ; size > 0; size-=PAGE_SIZE, vaddr+=PAGE_SIZE, paddr+=PAGE_SIZE){ + v = (unsigned long) alloc_bootmem_low_pages(size); + memcpy((void *) v , (void *) FIXADDR_USER_START, size); + p = __pa(v); + for ( ; size > 0; size -= PAGE_SIZE, vaddr += PAGE_SIZE, + p += PAGE_SIZE) { pgd = swapper_pg_dir + pgd_index(vaddr); pud = pud_offset(pgd, vaddr); pmd = pmd_offset(pud, vaddr); pte = pte_offset_kernel(pmd, vaddr); - pte_set_val( (*pte), paddr, PAGE_READONLY); + pte_set_val(*pte, p, PAGE_READONLY); } #endif } @@ -223,7 +216,7 @@ void __init paging_init(void) empty_zero_page = (unsigned long *) alloc_bootmem_low_pages(PAGE_SIZE); empty_bad_page = (unsigned long *) alloc_bootmem_low_pages(PAGE_SIZE); - for(i = 0; i < ARRAY_SIZE(zones_size); i++) + for (i = 0; i < ARRAY_SIZE(zones_size); i++) zones_size[i] = 0; zones_size[ZONE_NORMAL] = (end_iomem >> PAGE_SHIFT) - @@ -253,32 +246,33 @@ struct page *arch_validate(struct page *page, gfp_t mask, int order) int i; again: - if(page == NULL) + if (page == NULL) return page; - if(PageHighMem(page)) + if (PageHighMem(page)) return page; addr = (unsigned long) page_address(page); - for(i = 0; i < (1 << order); i++){ + for (i = 0; i < (1 << order); i++) { current->thread.fault_addr = (void *) addr; - if(__do_copy_to_user((void __user *) addr, &zero, + if (__do_copy_to_user((void 
__user *) addr, &zero, sizeof(zero), &current->thread.fault_addr, - &current->thread.fault_catcher)){ - if(!(mask & __GFP_WAIT)) + &current->thread.fault_catcher)) { + if (!(mask & __GFP_WAIT)) return NULL; else break; } addr += PAGE_SIZE; } - if(i == (1 << order)) + if (i == (1 << order)) return page; page = alloc_pages(mask, order); goto again; } -/* This can't do anything because nothing in the kernel image can be freed +/* + * This can't do anything because nothing in the kernel image can be freed * since it's not in kernel physical memory. */ @@ -290,8 +284,8 @@ void free_initmem(void) void free_initrd_mem(unsigned long start, unsigned long end) { if (start < end) - printk ("Freeing initrd memory: %ldk freed\n", - (end - start) >> 10); + printk(KERN_INFO "Freeing initrd memory: %ldk freed\n", + (end - start) >> 10); for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); init_page_count(virt_to_page(start)); @@ -308,32 +302,31 @@ void show_mem(void) int highmem = 0; struct page *page; - printk("Mem-info:\n"); + printk(KERN_INFO "Mem-info:\n"); show_free_areas(); - printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); + printk(KERN_INFO "Free swap: %6ldkB\n", + nr_swap_pages<<(PAGE_SHIFT-10)); pfn = max_mapnr; - while(pfn-- > 0) { + while (pfn-- > 0) { page = pfn_to_page(pfn); total++; - if(PageHighMem(page)) + if (PageHighMem(page)) highmem++; - if(PageReserved(page)) + if (PageReserved(page)) reserved++; - else if(PageSwapCache(page)) + else if (PageSwapCache(page)) cached++; - else if(page_count(page)) + else if (page_count(page)) shared += page_count(page) - 1; } - printk("%d pages of RAM\n", total); - printk("%d pages of HIGHMEM\n", highmem); - printk("%d reserved pages\n", reserved); - printk("%d pages shared\n", shared); - printk("%d pages swap cached\n", cached); + printk(KERN_INFO "%d pages of RAM\n", total); + printk(KERN_INFO "%d pages of HIGHMEM\n", highmem); + printk(KERN_INFO "%d reserved pages\n", reserved); + 
printk(KERN_INFO 
"%d pages shared\n", shared); + printk(KERN_INFO "%d pages swap cached\n", cached); } -/* - * Allocate and free page tables. - */ +/* Allocate and free page tables. */ pgd_t *pgd_alloc(struct mm_struct *mm) { @@ -341,14 +334,14 @@ pgd_t *pgd_alloc(struct mm_struct *mm) if (pgd) { memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t)); - memcpy(pgd + USER_PTRS_PER_PGD, - swapper_pg_dir + USER_PTRS_PER_PGD, + memcpy(pgd + USER_PTRS_PER_PGD, + swapper_pg_dir + USER_PTRS_PER_PGD, (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); } return pgd; } -void pgd_free(pgd_t *pgd) +void pgd_free(struct mm_struct *mm, pgd_t *pgd) { free_page((unsigned long) pgd); } @@ -368,3 +361,15 @@ struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) pte = alloc_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); return pte; } + +#ifdef CONFIG_3_LEVEL_PGTABLES +pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address) +{ + pmd_t *pmd = (pmd_t *) __get_free_page(GFP_KERNEL); + + if (pmd) + memset(pmd, 0, PAGE_SIZE); + + return pmd; +} +#endif diff --git a/arch/um/kernel/physmem.c b/arch/um/kernel/physmem.c index e66432f..9757085 100644 --- a/arch/um/kernel/physmem.c +++ b/arch/um/kernel/physmem.c @@ -55,16 +55,6 @@ int __init init_maps(unsigned long physmem, unsigned long iomem, return 0; } -/* Changed during early boot */ -static unsigned long kmem_top = 0; - -unsigned long get_kmem_end(void) -{ - if (kmem_top == 0) - kmem_top = host_task_size - 1024 * 1024; - return kmem_top; -} - void map_memory(unsigned long virt, unsigned long phys, unsigned long len, int r, int w, int x) { @@ -174,10 +164,10 @@ __uml_setup("iomem=", parse_iomem, * setup_iomem, both of which run during early boot. Afterwards, it's * unchanged. 
*/ -struct iomem_region *iomem_regions = NULL; +struct iomem_region *iomem_regions; -/* Initialized in parse_iomem */ -int iomem_size = 0; +/* Initialized in parse_iomem and unchanged thereafter */ +int iomem_size; unsigned long find_iomem(char *driver, unsigned long *len_out) { diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index 0eae00b..c07961b 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -4,19 +4,21 @@ * Licensed under the GPL */ -#include "linux/stddef.h" -#include "linux/err.h" -#include "linux/hardirq.h" -#include "linux/mm.h" -#include "linux/personality.h" -#include "linux/proc_fs.h" -#include "linux/ptrace.h" -#include "linux/random.h" -#include "linux/sched.h" -#include "linux/tick.h" -#include "linux/threads.h" -#include "asm/pgtable.h" -#include "asm/uaccess.h" +#include <linux/stddef.h> +#include <linux/err.h> +#include <linux/hardirq.h> +#include <linux/gfp.h> +#include <linux/mm.h> +#include <linux/personality.h> +#include <linux/proc_fs.h> +#include <linux/ptrace.h> +#include <linux/random.h> +#include <linux/sched.h> +#include <linux/tick.h> +#include <linux/threads.h> +#include <asm/current.h> +#include <asm/pgtable.h> +#include <asm/uaccess.h> #include "as-layout.h" #include "kern_util.h" #include "os.h" @@ -30,7 +32,7 @@ */ struct cpu_task cpu_tasks[NR_CPUS] = { [0 ... 
NR_CPUS - 1] = { -1, NULL } }; -static inline int external_pid(struct task_struct *task) +static inline int external_pid(void) { /* FIXME: Need to look up userspace_pid by cpu */ return userspace_pid[0]; @@ -40,7 +42,7 @@ int pid_to_processor_id(int pid) { int i; - for(i = 0; i < ncpus; i++) { + for (i = 0; i < ncpus; i++) { if (cpu_tasks[i].pid == pid) return i; } @@ -60,8 +62,6 @@ unsigned long alloc_stack(int order, int atomic) if (atomic) flags = GFP_ATOMIC; page = __get_free_pages(flags, order); - if (page == 0) - return 0; return page; } @@ -80,15 +80,15 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) static inline void set_current(struct task_struct *task) { cpu_tasks[task_thread_info(task)->cpu] = ((struct cpu_task) - { external_pid(task), task }); + { external_pid(), task }); } -extern void arch_switch_to(struct task_struct *from, struct task_struct *to); +extern void arch_switch_to(struct task_struct *to); void *_switch_to(void *prev, void *next, void *last) { struct task_struct *from = prev; - struct task_struct *to= next; + struct task_struct *to = next; to->thread.prev_sched = from; set_current(to); @@ -99,13 +99,13 @@ void *_switch_to(void *prev, void *next, void *last) switch_threads(&from->thread.switch_buf, &to->thread.switch_buf); - arch_switch_to(current->thread.prev_sched, current); + arch_switch_to(current); if (current->thread.saved_task) show_regs(&(current->thread.regs)); - next= current->thread.saved_task; - prev= current; - } while(current->thread.saved_task); + to = current->thread.saved_task; + from = current; + } while (current->thread.saved_task); return current->thread.prev_sched; @@ -163,8 +163,6 @@ void new_thread_handler(void) void fork_handler(void) { force_flush_all(); - if (current->thread.prev_sched == NULL) - panic("blech"); schedule_tail(current->thread.prev_sched); @@ -173,7 +171,7 @@ void fork_handler(void) * arch_switch_to isn't needed. We could want to apply this to * improve performance. 
-bb */ - arch_switch_to(current->thread.prev_sched, current); + arch_switch_to(current); current->thread.prev_sched = NULL; @@ -204,7 +202,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, arch_copy_thread(&current->thread.arch, &p->thread.arch); } else { - init_thread_registers(&p->thread.regs.regs); + get_safe_registers(p->thread.regs.regs.gp); p->thread.request.u.thread = current->thread.request.u.thread; handler = new_thread_handler; } @@ -237,7 +235,7 @@ void default_idle(void) { unsigned long long nsecs; - while(1) { + while (1) { /* endless idle loop with no priority at all */ /* @@ -256,53 +254,10 @@ void default_idle(void) void cpu_idle(void) { - cpu_tasks[current_thread->cpu].pid = os_getpid(); + cpu_tasks[current_thread_info()->cpu].pid = os_getpid(); default_idle(); } -void *um_virt_to_phys(struct task_struct *task, unsigned long addr, - pte_t *pte_out) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - pte_t ptent; - - if (task->mm == NULL) - return ERR_PTR(-EINVAL); - pgd = pgd_offset(task->mm, addr); - if (!pgd_present(*pgd)) - return ERR_PTR(-EINVAL); - - pud = pud_offset(pgd, addr); - if (!pud_present(*pud)) - return ERR_PTR(-EINVAL); - - pmd = pmd_offset(pud, addr); - if (!pmd_present(*pmd)) - return ERR_PTR(-EINVAL); - - pte = pte_offset_kernel(pmd, addr); - ptent = *pte; - if (!pte_present(ptent)) - return ERR_PTR(-EINVAL); - - if (pte_out != NULL) - *pte_out = ptent; - return (void *) (pte_val(ptent) & PAGE_MASK) + (addr & ~PAGE_MASK); -} - -char *current_cmd(void) -{ -#if defined(CONFIG_SMP) || defined(CONFIG_HIGHMEM) - return "(Unknown)"; -#else - void *addr = um_virt_to_phys(current, current->mm->arg_start, NULL); - return IS_ERR(addr) ? 
"(Unknown)": __va((unsigned long) addr); -#endif -} - void dump_thread(struct pt_regs *regs, struct user *u) { } @@ -317,7 +272,7 @@ int user_context(unsigned long sp) unsigned long stack; stack = sp & (PAGE_MASK << CONFIG_KERNEL_STACK_ORDER); - return stack != (unsigned long) current_thread; + return stack != (unsigned long) current_thread_info(); } extern exitcall_t __uml_exitcall_begin, __uml_exitcall_end; @@ -331,7 +286,7 @@ void do_uml_exitcalls(void) (*call)(); } -char *uml_strdup(char *string) +char *uml_strdup(const char *string) { return kstrdup(string, GFP_KERNEL); } @@ -359,7 +314,7 @@ int strlen_user_proc(char __user *str) int smp_sigio_handler(void) { #ifdef CONFIG_SMP - int cpu = current_thread->cpu; + int cpu = current_thread_info()->cpu; IPI_handler(cpu); if (cpu != 0) return 1; @@ -369,7 +324,7 @@ int smp_sigio_handler(void) int cpu(void) { - return current_thread->cpu; + return current_thread_info()->cpu; } static atomic_t using_sysemu = ATOMIC_INIT(0); @@ -435,7 +390,7 @@ int singlestepping(void * t) { struct task_struct *task = t ? t : current; - if ( ! 
(task->ptrace & PT_DTRACE) ) + if (!(task->ptrace & PT_DTRACE)) return 0; if (task->thread.singlestep_syscall) @@ -459,3 +414,46 @@ unsigned long arch_align_stack(unsigned long sp) return sp & ~0xf; } #endif + +unsigned long get_wchan(struct task_struct *p) +{ + unsigned long stack_page, sp, ip; + bool seen_sched = 0; + + if ((p == NULL) || (p == current) || (p->state == TASK_RUNNING)) + return 0; + + stack_page = (unsigned long) task_stack_page(p); + /* Bail if the process has no kernel stack for some reason */ + if (stack_page == 0) + return 0; + + sp = p->thread.switch_buf->JB_SP; + /* + * Bail if the stack pointer is below the bottom of the kernel + * stack for some reason + */ + if (sp < stack_page) + return 0; + + while (sp < stack_page + THREAD_SIZE) { + ip = *((unsigned long *) sp); + if (in_sched_functions(ip)) + /* Ignore everything until we're above the scheduler */ + seen_sched = 1; + else if (kernel_text_address(ip) && seen_sched) + return ip; + + sp += sizeof(unsigned long); + } + + return 0; +} + +int elf_core_copy_fpregs(struct task_struct *t, elf_fpregset_t *fpu) +{ + int cpu = current_thread_info()->cpu; + + return save_fp_registers(userspace_pid[cpu], (unsigned long *) fpu); +} + diff --git a/arch/um/kernel/reboot.c b/arch/um/kernel/reboot.c index 04cebcf..00197d3 100644 --- a/arch/um/kernel/reboot.c +++ b/arch/um/kernel/reboot.c @@ -4,6 +4,7 @@ */ #include "linux/sched.h" +#include "kern_util.h" #include "os.h" #include "skas.h" @@ -11,7 +12,7 @@ void (*pm_power_off)(void); static void kill_off_processes(void) { - if(proc_mm) + if (proc_mm) /* * FIXME: need to loop over userspace_pids */ @@ -21,8 +22,8 @@ static void kill_off_processes(void) int pid, me; me = os_getpid(); - for_each_process(p){ - if(p->mm == NULL) + for_each_process(p) { + if (p->mm == NULL) continue; pid = p->mm->context.id.u.pid; diff --git a/arch/um/kernel/sigio.c b/arch/um/kernel/sigio.c index 89f9866..2b272b6 100644 --- a/arch/um/kernel/sigio.c +++ b/arch/um/kernel/sigio.c 
@@ -1,18 +1,12 @@ /* - * Copyright (C) 2002 - 2003 Jeff Dike (jdike@addtoit.com) + * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{linux.intel,addtoit}.com) * Licensed under the GPL */ -#include "linux/kernel.h" -#include "linux/list.h" -#include "linux/slab.h" -#include "linux/signal.h" -#include "linux/interrupt.h" -#include "init.h" -#include "sigio.h" -#include "irq_user.h" +#include <linux/interrupt.h> #include "irq_kern.h" #include "os.h" +#include "sigio.h" /* Protected by sigio_lock() called from write_sigio_workaround */ static int sigio_irq_fd = -1; @@ -33,9 +27,9 @@ int write_sigio_irq(int fd) err = um_request_irq(SIGIO_WRITE_IRQ, fd, IRQ_READ, sigio_interrupt, IRQF_DISABLED|IRQF_SAMPLE_RANDOM, "write sigio", NULL); - if(err){ - printk("write_sigio_irq : um_request_irq failed, err = %d\n", - err); + if (err) { + printk(KERN_ERR "write_sigio_irq : um_request_irq failed, " + "err = %d\n", err); return -1; } sigio_irq_fd = fd; diff --git a/arch/um/kernel/signal.c b/arch/um/kernel/signal.c index 19cb977..b0fce72 100644 --- a/arch/um/kernel/signal.c +++ b/arch/um/kernel/signal.c @@ -3,12 +3,12 @@ * Licensed under the GPL */ -#include "linux/module.h" -#include "linux/ptrace.h" -#include "linux/sched.h" -#include "asm/siginfo.h" -#include "asm/signal.h" -#include "asm/unistd.h" +#include <linux/module.h> +#include <linux/ptrace.h> +#include <linux/sched.h> +#include <asm/siginfo.h> +#include <asm/signal.h> +#include <asm/unistd.h> #include "frame_kern.h" #include "kern_util.h" #include "sigcontext.h" @@ -36,7 +36,7 @@ static int handle_signal(struct pt_regs *regs, unsigned long signr, /* Did we come from a system call? */ if (PT_REGS_SYSCALL_NR(regs) >= 0) { /* If so, check system call restarting.. 
*/ - switch(PT_REGS_SYSCALL_RET(regs)) { + switch (PT_REGS_SYSCALL_RET(regs)) { case -ERESTART_RESTARTBLOCK: case -ERESTARTNOHAND: PT_REGS_SYSCALL_RET(regs) = -EINTR; @@ -116,7 +116,7 @@ static int kern_do_signal(struct pt_regs *regs) /* Did we come from a system call? */ if (!handled_sig && (PT_REGS_SYSCALL_NR(regs) >= 0)) { /* Restart the system call - no handlers present */ - switch(PT_REGS_SYSCALL_RET(regs)) { + switch (PT_REGS_SYSCALL_RET(regs)) { case -ERESTARTNOHAND: case -ERESTARTSYS: case -ERESTARTNOINTR: diff --git a/arch/um/kernel/skas/clone.c b/arch/um/kernel/skas/clone.c index 8d07a7a..2c8583c 100644 --- a/arch/um/kernel/skas/clone.c +++ b/arch/um/kernel/skas/clone.c @@ -1,17 +1,20 @@ -#include <sched.h> +/* + * Copyright (C) 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Licensed under the GPL + */ + #include <signal.h> -#include <sys/mman.h> -#include <sys/time.h> +#include <sched.h> #include <asm/unistd.h> +#include <sys/time.h> #include "as-layout.h" +#include "kern_constants.h" #include "ptrace_user.h" -#include "skas.h" #include "stub-data.h" -#include "uml-config.h" #include "sysdep/stub.h" -#include "kern_constants.h" -/* This is in a separate file because it needs to be compiled with any +/* + * This is in a separate file because it needs to be compiled with any * extraneous gcc flags (-pg, -fprofile-arcs, -ftest-coverage) disabled * * Use UM_KERN_PAGE_SIZE instead of PAGE_SIZE because that calls getpagesize @@ -26,25 +29,26 @@ stub_clone_handler(void) err = stub_syscall2(__NR_clone, CLONE_PARENT | CLONE_FILES | SIGCHLD, STUB_DATA + UM_KERN_PAGE_SIZE / 2 - sizeof(void *)); - if(err != 0) + if (err != 0) goto out; err = stub_syscall4(__NR_ptrace, PTRACE_TRACEME, 0, 0, 0); - if(err) + if (err) goto out; - err = stub_syscall3(__NR_setitimer, ITIMER_VIRTUAL, + err = stub_syscall3(__NR_setitimer, ITIMER_VIRTUAL, (long) &data->timer, 0); - if(err) + if (err) goto out; remap_stack(data->fd, data->offset); goto done; out: - /* save current 
result. - * Parent: pid; - * child: retcode of mmap already saved and it jumps around this + /* + * save current result. + * Parent: pid; + * child: retcode of mmap already saved and it jumps around this * assignment */ data->err = err; diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c index f859ec3..78b3e9f 100644 --- a/arch/um/kernel/skas/mmu.c +++ b/arch/um/kernel/skas/mmu.c @@ -34,33 +34,14 @@ static int init_stub_pte(struct mm_struct *mm, unsigned long proc, if (!pte) goto out_pte; - /* - * There's an interaction between the skas0 stub pages, stack - * randomization, and the BUG at the end of exit_mmap. exit_mmap - * checks that the number of page tables freed is the same as had - * been allocated. If the stack is on the last page table page, - * then the stack pte page will be freed, and if not, it won't. To - * avoid having to know where the stack is, or if the process mapped - * something at the top of its address space for some other reason, - * we set TASK_SIZE to end at the start of the last page table. - * This keeps exit_mmap off the last page, but introduces a leak - * of that page. So, we hang onto it here and free it in - * destroy_context_skas. - */ - - mm->context.last_page_table = pmd_page_vaddr(*pmd); -#ifdef CONFIG_3_LEVEL_PGTABLES - mm->context.last_pmd = (unsigned long) __va(pud_val(*pud)); -#endif - *pte = mk_pte(virt_to_page(kernel), __pgprot(_PAGE_PRESENT)); *pte = pte_mkread(*pte); return 0; out_pmd: - pud_free(pud); + pud_free(mm, pud); out_pte: - pmd_free(pmd); + pmd_free(mm, pmd); out: return -ENOMEM; } @@ -76,24 +57,6 @@ int init_new_context(struct task_struct *task, struct mm_struct *mm) stack = get_zeroed_page(GFP_KERNEL); if (stack == 0) goto out; - - /* - * This zeros the entry that pgd_alloc didn't, needed since - * we are about to reinitialize it, and want mm.nr_ptes to - * be accurate. 
- */ - mm->pgd[USER_PTRS_PER_PGD] = __pgd(0); - - ret = init_stub_pte(mm, STUB_CODE, - (unsigned long) &__syscall_stub_start); - if (ret) - goto out_free; - - ret = init_stub_pte(mm, STUB_DATA, stack); - if (ret) - goto out_free; - - mm->nr_ptes--; } to_mm->id.stack = stack; @@ -114,6 +77,11 @@ int init_new_context(struct task_struct *task, struct mm_struct *mm) to_mm->id.u.pid = copy_context_skas0(stack, from_mm->id.u.pid); else to_mm->id.u.pid = start_userspace(stack); + + if (to_mm->id.u.pid < 0) { + ret = to_mm->id.u.pid; + goto out_free; + } } ret = init_new_ldt(to_mm, from_mm); @@ -132,24 +100,87 @@ int init_new_context(struct task_struct *task, struct mm_struct *mm) return ret; } +void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) +{ + struct page **pages; + int err, ret; + + if (!skas_needs_stub) + return; + + ret = init_stub_pte(mm, STUB_CODE, + (unsigned long) &__syscall_stub_start); + if (ret) + goto out; + + ret = init_stub_pte(mm, STUB_DATA, mm->context.id.stack); + if (ret) + goto out; + + pages = kmalloc(2 * sizeof(struct page *), GFP_KERNEL); + if (pages == NULL) { + printk(KERN_ERR "arch_dup_mmap failed to allocate 2 page " + "pointers\n"); + goto out; + } + + pages[0] = virt_to_page(&__syscall_stub_start); + pages[1] = virt_to_page(mm->context.id.stack); + + /* dup_mmap already holds mmap_sem */ + err = install_special_mapping(mm, STUB_START, STUB_END - STUB_START, + VM_READ | VM_MAYREAD | VM_EXEC | + VM_MAYEXEC | VM_DONTCOPY, pages); + if (err) { + printk(KERN_ERR "install_special_mapping returned %d\n", err); + goto out_free; + } + return; + +out_free: + kfree(pages); +out: + force_sigsegv(SIGSEGV, current); +} + +void arch_exit_mmap(struct mm_struct *mm) +{ + pte_t *pte; + + pte = virt_to_pte(mm, STUB_CODE); + if (pte != NULL) + pte_clear(mm, STUB_CODE, pte); + + pte = virt_to_pte(mm, STUB_DATA); + if (pte == NULL) + return; + + pte_clear(mm, STUB_DATA, pte); +} + void destroy_context(struct mm_struct *mm) { struct mm_context 
*mmu = &mm->context; if (proc_mm) os_close_file(mmu->id.u.mm_fd); - else + else { + /* + * If init_new_context wasn't called, this will be + * zero, resulting in a kill(0), which will result in the + * whole UML suddenly dying. Also, cover negative and + * 1 cases, since they shouldn't happen either. + */ + if (mmu->id.u.pid < 2) { + printk(KERN_ERR "corrupt mm_context - pid = %d\n", + mmu->id.u.pid); + return; + } os_kill_ptraced_process(mmu->id.u.pid, 1); + } - if (!proc_mm || !ptrace_faultinfo) { + if (skas_needs_stub) free_page(mmu->id.stack); - pte_lock_deinit(virt_to_page(mmu->last_page_table)); - pte_free_kernel((pte_t *) mmu->last_page_table); - dec_zone_page_state(virt_to_page(mmu->last_page_table), NR_PAGETABLE); -#ifdef CONFIG_3_LEVEL_PGTABLES - pmd_free((pmd_t *) mmu->last_pmd); -#endif - } free_ldt(mmu); } diff --git a/arch/um/kernel/skas/process.c b/arch/um/kernel/skas/process.c index fce389c..2e9852c 100644 --- a/arch/um/kernel/skas/process.c +++ b/arch/um/kernel/skas/process.c @@ -6,19 +6,25 @@ #include "linux/init.h" #include "linux/sched.h" #include "as-layout.h" +#include "kern.h" #include "os.h" #include "skas.h" int new_mm(unsigned long stack) { - int fd; + int fd, err; fd = os_open_file("/proc/mm", of_cloexec(of_write(OPENFLAGS())), 0); if (fd < 0) return fd; - if (skas_needs_stub) - map_stub_pages(fd, STUB_CODE, STUB_DATA, stack); + if (skas_needs_stub) { + err = map_stub_pages(fd, STUB_CODE, STUB_DATA, stack); + if (err) { + os_close_file(fd); + return err; + } + } return fd; } @@ -49,8 +55,14 @@ int __init start_uml(void) { stack_protections((unsigned long) &cpu0_irqstack); set_sigstack(cpu0_irqstack, THREAD_SIZE); - if (proc_mm) + if (proc_mm) { userspace_pid[0] = start_userspace(0); + if (userspace_pid[0] < 0) { + printf("start_uml - start_userspace returned %d\n", + userspace_pid[0]); + exit(1); + } + } init_new_thread_signals(); diff --git a/arch/um/kernel/skas/syscall.c b/arch/um/kernel/skas/syscall.c index 50b476f..4e3b820 100644 --- 
a/arch/um/kernel/skas/syscall.c +++ b/arch/um/kernel/skas/syscall.c @@ -9,6 +9,9 @@ #include "sysdep/ptrace.h" #include "sysdep/syscalls.h" +extern int syscall_table_size; +#define NR_syscalls (syscall_table_size / sizeof(void *)) + void handle_syscall(struct uml_pt_regs *r) { struct pt_regs *regs = container_of(r, struct pt_regs, regs); @@ -17,9 +20,6 @@ void handle_syscall(struct uml_pt_regs *r) syscall_trace(r, 0); - current->thread.nsyscalls++; - nsyscalls++; - /* * This should go in the declaration of syscall, but when I do that, * strace -f -c bash -c 'ls ; ls' breaks, sometimes not tracing diff --git a/arch/um/kernel/skas/uaccess.c b/arch/um/kernel/skas/uaccess.c index 1d8b119..e22c969 100644 --- a/arch/um/kernel/skas/uaccess.c +++ b/arch/um/kernel/skas/uaccess.c @@ -3,128 +3,130 @@ * Licensed under the GPL */ -#include "linux/err.h" -#include "linux/highmem.h" -#include "linux/mm.h" -#include "asm/current.h" -#include "asm/page.h" -#include "asm/pgtable.h" +#include <linux/err.h> +#include <linux/highmem.h> +#include <linux/mm.h> +#include <linux/sched.h> +#include <asm/current.h> +#include <asm/page.h> +#include <asm/pgtable.h> #include "kern_util.h" #include "os.h" -extern void *um_virt_to_phys(struct task_struct *task, unsigned long addr, - pte_t *pte_out); - -static unsigned long maybe_map(unsigned long virt, int is_write) +pte_t *virt_to_pte(struct mm_struct *mm, unsigned long addr) { - pte_t pte; - int err; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + + if (mm == NULL) + return NULL; + + pgd = pgd_offset(mm, addr); + if (!pgd_present(*pgd)) + return NULL; + + pud = pud_offset(pgd, addr); + if (!pud_present(*pud)) + return NULL; - void *phys = um_virt_to_phys(current, virt, &pte); - int dummy_code; + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) + return NULL; + + return pte_offset_kernel(pmd, addr); +} + +static pte_t *maybe_map(unsigned long virt, int is_write) +{ + pte_t *pte = virt_to_pte(current->mm, virt); + int err, dummy_code; - if 
(IS_ERR(phys) || (is_write && !pte_write(pte))) { + if ((pte == NULL) || !pte_present(*pte) || + (is_write && !pte_write(*pte))) { err = handle_page_fault(virt, 0, is_write, 1, &dummy_code); if (err) - return -1UL; - phys = um_virt_to_phys(current, virt, NULL); + return NULL; + pte = virt_to_pte(current->mm, virt); } - if (IS_ERR(phys)) - phys = (void *) -1; + if (!pte_present(*pte)) + pte = NULL; - return (unsigned long) phys; + return pte; } static int do_op_one_page(unsigned long addr, int len, int is_write, int (*op)(unsigned long addr, int len, void *arg), void *arg) { + jmp_buf buf; struct page *page; - int n; + pte_t *pte; + int n, faulted; - addr = maybe_map(addr, is_write); - if (addr == -1UL) + pte = maybe_map(addr, is_write); + if (pte == NULL) return -1; - page = phys_to_page(addr); + page = pte_page(*pte); addr = (unsigned long) kmap_atomic(page, KM_UML_USERCOPY) + (addr & ~PAGE_MASK); - n = (*op)(addr, len, arg); + current->thread.fault_catcher = &buf; + + faulted = UML_SETJMP(&buf); + if (faulted == 0) + n = (*op)(addr, len, arg); + else + n = -1; + + current->thread.fault_catcher = NULL; kunmap_atomic(page, KM_UML_USERCOPY); return n; } -static void do_buffer_op(void *jmpbuf, void *arg_ptr) +static int buffer_op(unsigned long addr, int len, int is_write, + int (*op)(unsigned long, int, void *), void *arg) { - va_list args; - unsigned long addr; - int len, is_write, size, remain, n; - int (*op)(unsigned long, int, void *); - void *arg; - int *res; - - va_copy(args, *(va_list *)arg_ptr); - addr = va_arg(args, unsigned long); - len = va_arg(args, int); - is_write = va_arg(args, int); - op = va_arg(args, void *); - arg = va_arg(args, void *); - res = va_arg(args, int *); - va_end(args); + int size, remain, n; + size = min(PAGE_ALIGN(addr) - addr, (unsigned long) len); remain = len; - current->thread.fault_catcher = jmpbuf; n = do_op_one_page(addr, size, is_write, op, arg); if (n != 0) { - *res = (n < 0 ? remain : 0); + remain = (n < 0 ? 
remain : 0); goto out; } addr += size; remain -= size; - if (remain == 0) { - *res = 0; + if (remain == 0) goto out; - } - while(addr < ((addr + remain) & PAGE_MASK)) { + while (addr < ((addr + remain) & PAGE_MASK)) { n = do_op_one_page(addr, PAGE_SIZE, is_write, op, arg); if (n != 0) { - *res = (n < 0 ? remain : 0); + remain = (n < 0 ? remain : 0); goto out; } addr += PAGE_SIZE; remain -= PAGE_SIZE; } - if (remain == 0) { - *res = 0; + if (remain == 0) goto out; - } n = do_op_one_page(addr, remain, is_write, op, arg); - if (n != 0) - *res = (n < 0 ? remain : 0); - else *res = 0; - out: - current->thread.fault_catcher = NULL; -} - -static int buffer_op(unsigned long addr, int len, int is_write, - int (*op)(unsigned long addr, int len, void *arg), - void *arg) -{ - int faulted, res; - - faulted = setjmp_wrapper(do_buffer_op, addr, len, is_write, op, arg, - &res); - if (!faulted) - return res; + if (n != 0) { + remain = (n < 0 ? remain : 0); + goto out; + } - return addr + len - (unsigned long) current->thread.fault_addr; + return 0; + out: + return remain; } static int copy_chunk_from_user(unsigned long from, int len, void *arg) diff --git a/arch/um/kernel/smp.c b/arch/um/kernel/smp.c index 36d89cf..e1062ec 100644 --- a/arch/um/kernel/smp.c +++ b/arch/um/kernel/smp.c @@ -21,7 +21,6 @@ DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); #include "asm/smp.h" #include "asm/processor.h" #include "asm/spinlock.h" -#include "kern_util.h" #include "kern.h" #include "irq_user.h" #include "os.h" @@ -61,7 +60,7 @@ void smp_send_stop(void) continue; os_write_file(cpu_data[i].ipi_pipe[1], "S", 1); } - printk(KERN_INFO "done\n"); + printk(KERN_CONT "done\n"); } static cpumask_t smp_commenced_mask = CPU_MASK_NONE; @@ -75,8 +74,7 @@ static int idle_proc(void *cpup) if (err < 0) panic("CPU#%d failed to create IPI pipe, err = %d", cpu, -err); - os_set_fd_async(cpu_data[cpu].ipi_pipe[0], - current->thread.mode.tt.extern_pid); + os_set_fd_async(cpu_data[cpu].ipi_pipe[0]); wmb(); if 
(cpu_test_and_set(cpu, cpu_callin_map)) { @@ -129,8 +127,7 @@ void smp_prepare_cpus(unsigned int maxcpus) if (err < 0) panic("CPU#0 failed to create IPI pipe, errno = %d", -err); - os_set_fd_async(cpu_data[me].ipi_pipe[0], - current->thread.mode.tt.extern_pid); + os_set_fd_async(cpu_data[me].ipi_pipe[0]); for (cpu = 1; cpu < ncpus; cpu++) { printk(KERN_INFO "Booting processor %d...\n", cpu); @@ -143,9 +140,8 @@ void smp_prepare_cpus(unsigned int maxcpus) while (waittime-- && !cpu_isset(cpu, cpu_callin_map)) cpu_relax(); - if (cpu_isset(cpu, cpu_callin_map)) - printk(KERN_INFO "done\n"); - else printk(KERN_INFO "failed\n"); + printk(KERN_INFO "%s\n", + cpu_isset(cpu, cpu_calling_map) ? "done" : "failed"); } } diff --git a/arch/um/kernel/syscall.c b/arch/um/kernel/syscall.c index b9d92b2..9cffc62 100644 --- a/arch/um/kernel/syscall.c +++ b/arch/um/kernel/syscall.c @@ -13,9 +13,6 @@ #include "asm/uaccess.h" #include "asm/unistd.h" -/* Unlocked, I don't care if this is a bit off */ -int nsyscalls = 0; - long sys_fork(void) { long ret; diff --git a/arch/um/kernel/sysrq.c b/arch/um/kernel/sysrq.c index 9326357..56d43d0 100644 --- a/arch/um/kernel/sysrq.c +++ b/arch/um/kernel/sysrq.c @@ -1,38 +1,37 @@ -/* - * Copyright (C) 2001 Jeff Dike (jdike@karaya.com) +/* + * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) * Licensed under the GPL */ -#include "linux/sched.h" -#include "linux/kernel.h" -#include "linux/module.h" -#include "linux/kallsyms.h" -#include "asm/page.h" -#include "asm/processor.h" +#include <linux/kallsyms.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/sched.h> #include "sysrq.h" /* Catch non-i386 SUBARCH's. 
*/ #if !defined(CONFIG_UML_X86) || defined(CONFIG_64BIT) void show_trace(struct task_struct *task, unsigned long * stack) { - unsigned long addr; + unsigned long addr; - if (!stack) { + if (!stack) { stack = (unsigned long*) &stack; WARN_ON(1); } - printk("Call Trace: \n"); - while (((long) stack & (THREAD_SIZE-1)) != 0) { - addr = *stack; + printk(KERN_INFO "Call Trace: \n"); + while (((long) stack & (THREAD_SIZE-1)) != 0) { + addr = *stack; if (__kernel_text_address(addr)) { - printk("%08lx: [<%08lx>]", (unsigned long) stack, addr); - print_symbol(" %s", addr); - printk("\n"); - } - stack++; - } - printk("\n"); + printk(KERN_INFO "%08lx: [<%08lx>]", + (unsigned long) stack, addr); + print_symbol(KERN_CONT " %s", addr); + printk(KERN_CONT "\n"); + } + stack++; + } + printk(KERN_INFO "\n"); } #endif @@ -67,14 +66,13 @@ void show_stack(struct task_struct *task, unsigned long *esp) } stack = esp; - for(i = 0; i < kstack_depth_to_print; i++) { + for (i = 0; i < kstack_depth_to_print; i++) { if (kstack_end(stack)) break; if (i && ((i % 8) == 0)) - printk("\n "); + printk("\n" KERN_INFO " "); printk("%08lx ", *stack++); } - printk("Call Trace: \n"); show_trace(task, esp); } diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c index 1ac746a..e066e84 100644 --- a/arch/um/kernel/time.c +++ b/arch/um/kernel/time.c @@ -3,12 +3,12 @@ * Licensed under the GPL */ -#include "linux/clockchips.h" -#include "linux/interrupt.h" -#include "linux/jiffies.h" -#include "linux/threads.h" -#include "asm/irq.h" -#include "asm/param.h" +#include <linux/clockchips.h> +#include <linux/interrupt.h> +#include <linux/jiffies.h> +#include <linux/threads.h> +#include <asm/irq.h> +#include <asm/param.h> #include "kern_util.h" #include "os.h" @@ -32,7 +32,7 @@ void timer_handler(int sig, struct uml_pt_regs *regs) static void itimer_set_mode(enum clock_event_mode mode, struct clock_event_device *evt) { - switch(mode) { + switch (mode) { case CLOCK_EVT_MODE_PERIODIC: set_interval(); break; diff 
--git a/arch/um/kernel/tlb.c b/arch/um/kernel/tlb.c index f4a0e40..d175d05 100644 --- a/arch/um/kernel/tlb.c +++ b/arch/um/kernel/tlb.c @@ -3,9 +3,10 @@ * Licensed under the GPL */ -#include "linux/mm.h" -#include "asm/pgtable.h" -#include "asm/tlbflush.h" +#include <linux/mm.h> +#include <linux/sched.h> +#include <asm/pgtable.h> +#include <asm/tlbflush.h> #include "as-layout.h" #include "mem_user.h" #include "os.h" @@ -56,7 +57,7 @@ static int do_ops(struct host_vm_change *hvc, int end, for (i = 0; i < end && !ret; i++) { op = &hvc->ops[i]; - switch(op->type) { + switch (op->type) { case MMAP: ret = map(hvc->id, op->u.mmap.addr, op->u.mmap.len, op->u.mmap.prot, op->u.mmap.fd, @@ -183,27 +184,30 @@ static inline int update_pte_range(pmd_t *pmd, unsigned long addr, pte = pte_offset_kernel(pmd, addr); do { + if ((addr >= STUB_START) && (addr < STUB_END)) + continue; + r = pte_read(*pte); w = pte_write(*pte); x = pte_exec(*pte); if (!pte_young(*pte)) { r = 0; w = 0; - } else if (!pte_dirty(*pte)) { + } else if (!pte_dirty(*pte)) w = 0; - } + prot = ((r ? UM_PROT_READ : 0) | (w ? UM_PROT_WRITE : 0) | (x ? 
UM_PROT_EXEC : 0)); if (hvc->force || pte_newpage(*pte)) { if (pte_present(*pte)) ret = add_mmap(addr, pte_val(*pte) & PAGE_MASK, PAGE_SIZE, prot, hvc); - else ret = add_munmap(addr, PAGE_SIZE, hvc); - } - else if (pte_newprot(*pte)) + else + ret = add_munmap(addr, PAGE_SIZE, hvc); + } else if (pte_newprot(*pte)) ret = add_mprotect(addr, PAGE_SIZE, prot, hvc); *pte = pte_mkuptodate(*pte); - } while (pte++, addr += PAGE_SIZE, ((addr != end) && !ret)); + } while (pte++, addr += PAGE_SIZE, ((addr < end) && !ret)); return ret; } @@ -225,7 +229,7 @@ static inline int update_pmd_range(pud_t *pud, unsigned long addr, } } else ret = update_pte_range(pmd, addr, next, hvc); - } while (pmd++, addr = next, ((addr != end) && !ret)); + } while (pmd++, addr = next, ((addr < end) && !ret)); return ret; } @@ -247,7 +251,7 @@ static inline int update_pud_range(pgd_t *pgd, unsigned long addr, } } else ret = update_pmd_range(pud, addr, next, hvc); - } while (pud++, addr = next, ((addr != end) && !ret)); + } while (pud++, addr = next, ((addr < end) && !ret)); return ret; } @@ -270,7 +274,7 @@ void fix_range_common(struct mm_struct *mm, unsigned long start_addr, } } else ret = update_pud_range(pgd, addr, next, &hvc); - } while (pgd++, addr = next, ((addr != end_addr) && !ret)); + } while (pgd++, addr = next, ((addr < end_addr) && !ret)); if (!ret) ret = do_ops(&hvc, hvc.index, 1); @@ -485,9 +489,6 @@ void __flush_tlb_one(unsigned long addr) static void fix_range(struct mm_struct *mm, unsigned long start_addr, unsigned long end_addr, int force) { - if (!proc_mm && (end_addr > STUB_START)) - end_addr = STUB_START; - fix_range_common(mm, start_addr, end_addr, force); } @@ -499,10 +500,9 @@ void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, else fix_range(vma->vm_mm, start, end, 0); } -void flush_tlb_mm(struct mm_struct *mm) +void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, + unsigned long end) { - unsigned long end; - /* * Don't bother flushing if this 
address space is about to be * destroyed. @@ -510,8 +510,17 @@ void flush_tlb_mm(struct mm_struct *mm) if (atomic_read(&mm->mm_users) == 0) return; - end = proc_mm ? task_size : STUB_START; - fix_range(mm, 0, end, 0); + fix_range(mm, start, end, 0); +} + +void flush_tlb_mm(struct mm_struct *mm) +{ + struct vm_area_struct *vma = mm->mmap; + + while (vma != NULL) { + fix_range(mm, vma->vm_start, vma->vm_end, 0); + vma = vma->vm_next; + } } void force_flush_all(void) diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index cb3321f..44e4904 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -13,6 +13,7 @@ #include "as-layout.h" #include "kern_util.h" #include "os.h" +#include "skas.h" #include "sysdep/sigcontext.h" /* @@ -128,7 +129,19 @@ static void bad_segv(struct faultinfo fi, unsigned long ip) force_sig_info(SIGSEGV, &si, current); } -static void segv_handler(int sig, struct uml_pt_regs *regs) +void fatal_sigsegv(void) +{ + force_sigsegv(SIGSEGV, current); + do_signal(); + /* + * This is to tell gcc that we're not returning - do_signal + * can, in general, return, but in this case, it's not, since + * we just got a fatal SIGSEGV queued. 
+ */ + os_dump_core(); +} + +void segv_handler(int sig, struct uml_pt_regs *regs) { struct faultinfo * fi = UPT_FAULTINFO(regs); @@ -216,9 +229,6 @@ unsigned long segv(struct faultinfo fi, unsigned long ip, int is_user, void relay_signal(int sig, struct uml_pt_regs *regs) { - if (arch_handle_signal(sig, regs)) - return; - if (!UPT_IS_USER(regs)) { if (sig == SIGBUS) printk(KERN_ERR "Bus error - the host /dev/shm or /tmp " @@ -226,31 +236,24 @@ void relay_signal(int sig, struct uml_pt_regs *regs) panic("Kernel mode signal %d", sig); } + arch_examine_signal(sig, regs); + current->thread.arch.faultinfo = *UPT_FAULTINFO(regs); force_sig(sig, current); } -static void bus_handler(int sig, struct uml_pt_regs *regs) +void bus_handler(int sig, struct uml_pt_regs *regs) { if (current->thread.fault_catcher != NULL) UML_LONGJMP(current->thread.fault_catcher, 1); else relay_signal(sig, regs); } -static void winch(int sig, struct uml_pt_regs *regs) +void winch(int sig, struct uml_pt_regs *regs) { do_IRQ(WINCH_IRQ, regs); } -const struct kern_handlers handlinfo_kern = { - .relay_signal = relay_signal, - .winch = winch, - .bus_handler = bus_handler, - .page_fault = segv_handler, - .sigio_handler = sigio_handler, - .timer_handler = timer_handler -}; - void trap_init(void) { } diff --git a/arch/um/kernel/uaccess.c b/arch/um/kernel/uaccess.c index d7436aa..f0f4b04 100644 --- a/arch/um/kernel/uaccess.c +++ b/arch/um/kernel/uaccess.c @@ -1,10 +1,11 @@ /* * Copyright (C) 2001 Chris Emerson (cemerson@chiark.greenend.org.uk) - * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com) + * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) * Licensed under the GPL */ -/* These are here rather than tt/uaccess.c because skas mode needs them in +/* + * These are here rather than tt/uaccess.c because skas mode needs them in * order to do SIGBUS recovery when a tmpfs mount runs out of room. 
*/ @@ -25,6 +26,8 @@ int __do_copy_to_user(void *to, const void *from, int n, fault = __do_user_copy(to, from, n, fault_addr, fault_catcher, __do_copy, &faulted); - if(!faulted) return(0); - else return(n - (fault - (unsigned long) to)); + if (!faulted) + return 0; + else + return n - (fault - (unsigned long) to); } diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index f1c7139..468aba9 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -3,22 +3,23 @@ * Licensed under the GPL */ -#include "linux/delay.h" -#include "linux/mm.h" -#include "linux/module.h" -#include "linux/seq_file.h" -#include "linux/string.h" -#include "linux/utsname.h" -#include "asm/pgtable.h" -#include "asm/processor.h" -#include "asm/setup.h" -#include "arch.h" +#include <linux/delay.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/seq_file.h> +#include <linux/string.h> +#include <linux/utsname.h> +#include <asm/pgtable.h> +#include <asm/processor.h> +#include <asm/setup.h> #include "as-layout.h" +#include "arch.h" #include "init.h" #include "kern.h" +#include "kern_util.h" #include "mem_user.h" #include "os.h" -#include "skas.h" #define DEFAULT_COMMAND_LINE "root=98:0" @@ -100,8 +101,6 @@ const struct seq_operations cpuinfo_op = { }; /* Set in linux_main */ -unsigned long host_task_size; -unsigned long task_size; unsigned long uml_physmem; unsigned long uml_reserved; /* Also modified in mem_init */ unsigned long start_vm; @@ -197,20 +196,19 @@ __uml_setup("--help", Usage, " Prints this message.\n\n" ); -static int __init uml_checksetup(char *line, int *add) +static void __init uml_checksetup(char *line, int *add) { struct uml_param *p; p = &__uml_setup_start; - while(p < &__uml_setup_end) { + while (p < &__uml_setup_end) { int n; n = strlen(p->str); if (!strncmp(line, p->str, n) && p->setup_func(line + n, add)) - return 1; + return; p++; } - return 0; } static void __init uml_postsetup(void) @@ -218,13 +216,30 @@ 
static void __init uml_postsetup(void) initcall_t *p; p = &__uml_postsetup_start; - while(p < &__uml_postsetup_end) { + while (p < &__uml_postsetup_end) { (*p)(); p++; } return; } +static int panic_exit(struct notifier_block *self, unsigned long unused1, + void *unused2) +{ + bust_spinlocks(1); + show_regs(&(current->thread.regs)); + bust_spinlocks(0); + uml_exitcode = 1; + os_dump_core(); + return 0; +} + +static struct notifier_block panic_exit_notifier = { + .notifier_call = panic_exit, + .next = NULL, + .priority = 0 +}; + /* Set during early boot */ unsigned long brk_start; unsigned long end_iomem; @@ -234,20 +249,6 @@ EXPORT_SYMBOL(end_iomem); extern char __binary_start; -static unsigned long set_task_sizes_skas(unsigned long *task_size_out) -{ - /* Round up to the nearest 4M */ - unsigned long host_task_size = ROUND_4M((unsigned long) - &host_task_size); - - if (!skas_needs_stub) - *task_size_out = host_task_size; - else - *task_size_out = STUB_START & PGDIR_MASK; - - return host_task_size; -} - int __init linux_main(int argc, char **argv) { unsigned long avail, diff; @@ -278,13 +279,6 @@ int __init linux_main(int argc, char **argv) printf("UML running in %s mode\n", mode); - host_task_size = set_task_sizes_skas(&task_size); - - /* - * Setting up handlers to 'sig_info' struct - */ - os_fill_handlinfo(handlinfo_kern); - brk_start = (unsigned long) sbrk(0); /* @@ -309,7 +303,7 @@ int __init linux_main(int argc, char **argv) highmem = 0; iomem_size = (iomem_size + PAGE_SIZE - 1) & PAGE_MASK; - max_physmem = get_kmem_end() - uml_physmem - iomem_size - MIN_VMALLOC; + max_physmem = CONFIG_TOP_ADDR - uml_physmem - iomem_size - MIN_VMALLOC; /* * Zones have to begin on a 1 << MAX_ORDER page boundary, @@ -341,7 +335,7 @@ int __init linux_main(int argc, char **argv) } virtmem_size = physmem_size; - avail = get_kmem_end() - start_vm; + avail = CONFIG_TOP_ADDR - start_vm; if (physmem_size > avail) virtmem_size = avail; end_vm = start_vm + virtmem_size; @@ -350,6 +344,9 
@@ int __init linux_main(int argc, char **argv) printf("Kernel virtual memory size shrunk to %lu bytes\n", virtmem_size); + atomic_notifier_chain_register(&panic_notifier_list, + &panic_exit_notifier); + uml_postsetup(); stack_protections((unsigned long) &init_thread_info); @@ -358,29 +355,8 @@ int __init linux_main(int argc, char **argv) return start_uml(); } -extern int uml_exitcode; - -static int panic_exit(struct notifier_block *self, unsigned long unused1, - void *unused2) -{ - bust_spinlocks(1); - show_regs(&(current->thread.regs)); - bust_spinlocks(0); - uml_exitcode = 1; - os_dump_core(); - return 0; -} - -static struct notifier_block panic_exit_notifier = { - .notifier_call = panic_exit, - .next = NULL, - .priority = 0 -}; - void __init setup_arch(char **cmdline_p) { - atomic_notifier_chain_register(&panic_notifier_list, - &panic_exit_notifier); paging_init(); strlcpy(boot_command_line, command_line, COMMAND_LINE_SIZE); *cmdline_p = command_line; diff --git a/arch/um/kernel/umid.c b/arch/um/kernel/umid.c index 039e16e..81e07e2b 100644 --- a/arch/um/kernel/umid.c +++ b/arch/um/kernel/umid.c @@ -1,13 +1,12 @@ -/* - * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com) +/* + * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) * Licensed under the GPL */ -#include "asm/errno.h" +#include <asm/errno.h> #include "init.h" -#include "os.h" #include "kern.h" -#include "linux/kernel.h" +#include "os.h" /* Changed by set_umid_arg */ static int umid_inited = 0; @@ -16,16 +15,16 @@ static int __init set_umid_arg(char *name, int *add) { int err; - if(umid_inited){ + if (umid_inited) { printf("umid already set\n"); return 0; } *add = 0; err = set_umid(name); - if(err == -EEXIST) + if (err == -EEXIST) printf("umid '%s' already in use\n", name); - else if(!err) + else if (!err) umid_inited = 1; return 0; diff --git a/arch/um/os-Linux/Makefile b/arch/um/os-Linux/Makefile index 8e129af..8a48d6a 100644 --- a/arch/um/os-Linux/Makefile +++ 
b/arch/um/os-Linux/Makefile @@ -4,7 +4,7 @@ # obj-y = aio.o elf_aux.o execvp.o file.o helper.o irq.o main.o mem.o process.o \ - registers.o sigio.o signal.o start_up.o time.o trap.o tty.o uaccess.o \ + registers.o sigio.o signal.o start_up.o time.o tty.o uaccess.o \ umid.o tls.o user_syms.o util.o drivers/ sys-$(SUBARCH)/ skas/ obj-$(CONFIG_TTY_LOG) += tty_log.o @@ -12,7 +12,7 @@ user-objs-$(CONFIG_TTY_LOG) += tty_log.o USER_OBJS := $(user-objs-y) aio.o elf_aux.o execvp.o file.o helper.o irq.o \ main.o mem.o process.o registers.o sigio.o signal.o start_up.o time.o \ - trap.o tty.o tls.o uaccess.o umid.o util.o + tty.o tls.o uaccess.o umid.o util.o CFLAGS_user_syms.o += -DSUBARCH_$(SUBARCH) diff --git a/arch/um/os-Linux/aio.c b/arch/um/os-Linux/aio.c index 93dc0c8..b8d8c9c 100644 --- a/arch/um/os-Linux/aio.c +++ b/arch/um/os-Linux/aio.c @@ -12,6 +12,7 @@ #include "aio.h" #include "init.h" #include "kern_constants.h" +#include "kern_util.h" #include "os.h" #include "user.h" diff --git a/arch/um/os-Linux/drivers/ethertap_user.c b/arch/um/os-Linux/drivers/ethertap_user.c index 07ca0cb..6fb0b17 100644 --- a/arch/um/os-Linux/drivers/ethertap_user.c +++ b/arch/um/os-Linux/drivers/ethertap_user.c @@ -131,7 +131,7 @@ static int etap_tramp(char *dev, char *gate, int control_me, } if (c != 1) { printk(UM_KERN_ERR "etap_tramp : uml_net failed\n"); - err = helper_wait(pid, 0, "uml_net"); + err = helper_wait(pid); } return err; } diff --git a/arch/um/os-Linux/drivers/tuntap_user.c b/arch/um/os-Linux/drivers/tuntap_user.c index 1037a3b6..2448be0 100644 --- a/arch/um/os-Linux/drivers/tuntap_user.c +++ b/arch/um/os-Linux/drivers/tuntap_user.c @@ -14,6 +14,7 @@ #include <sys/wait.h> #include <sys/uio.h> #include "kern_constants.h" +#include "kern_util.h" #include "os.h" #include "tuntap.h" #include "user.h" @@ -107,7 +108,7 @@ static int tuntap_open_tramp(char *gate, int *fd_out, int me, int remote, "errno = %d\n", errno); return err; } - helper_wait(pid, 0, "tuntap_open_tramp"); + 
helper_wait(pid); cmsg = CMSG_FIRSTHDR(&msg); if (cmsg == NULL) { @@ -148,7 +149,7 @@ static int tuntap_open(void *data) memset(&ifr, 0, sizeof(ifr)); ifr.ifr_flags = IFF_TAP | IFF_NO_PI; strlcpy(ifr.ifr_name, pri->dev_name, sizeof(ifr.ifr_name)); - if (ioctl(pri->fd, TUNSETIFF, (void *) &ifr) < 0) { + if (ioctl(pri->fd, TUNSETIFF, &ifr) < 0) { err = -errno; printk(UM_KERN_ERR "TUNSETIFF failed, errno = %d\n", errno); diff --git a/arch/um/os-Linux/file.c b/arch/um/os-Linux/file.c index f834627..b5afcfd 100644 --- a/arch/um/os-Linux/file.c +++ b/arch/um/os-Linux/file.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) + * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) * Licensed under the GPL */ @@ -8,18 +8,16 @@ #include <errno.h> #include <fcntl.h> #include <signal.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <sys/socket.h> -#include <sys/un.h> #include <sys/ioctl.h> #include <sys/mount.h> -#include <sys/uio.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/un.h> +#include "kern_constants.h" #include "os.h" #include "user.h" -#include "kern_util.h" -static void copy_stat(struct uml_stat *dst, struct stat64 *src) +static void copy_stat(struct uml_stat *dst, const struct stat64 *src) { *dst = ((struct uml_stat) { .ust_dev = src->st_dev, /* device */ @@ -43,10 +41,10 @@ int os_stat_fd(const int fd, struct uml_stat *ubuf) int err; CATCH_EINTR(err = fstat64(fd, &sbuf)); - if(err < 0) + if (err < 0) return -errno; - if(ubuf != NULL) + if (ubuf != NULL) copy_stat(ubuf, &sbuf); return err; } @@ -56,27 +54,26 @@ int os_stat_file(const char *file_name, struct uml_stat *ubuf) struct stat64 sbuf; int err; - do { - err = stat64(file_name, &sbuf); - } while((err < 0) && (errno == EINTR)) ; - - if(err < 0) + CATCH_EINTR(err = stat64(file_name, &sbuf)); + if (err < 0) return -errno; - if(ubuf != NULL) + if (ubuf != NULL) copy_stat(ubuf, &sbuf); return err; } -int os_access(const char* file, int mode) +int 
os_access(const char *file, int mode) { int amode, err; - amode=(mode&OS_ACC_R_OK ? R_OK : 0) | (mode&OS_ACC_W_OK ? W_OK : 0) | - (mode&OS_ACC_X_OK ? X_OK : 0) | (mode&OS_ACC_F_OK ? F_OK : 0) ; + amode = (mode & OS_ACC_R_OK ? R_OK : 0) | + (mode & OS_ACC_W_OK ? W_OK : 0) | + (mode & OS_ACC_X_OK ? X_OK : 0) | + (mode & OS_ACC_F_OK ? F_OK : 0); err = access(file, amode); - if(err < 0) + if (err < 0) return -errno; return 0; @@ -88,7 +85,7 @@ int os_ioctl_generic(int fd, unsigned int cmd, unsigned long arg) int err; err = ioctl(fd, cmd, arg); - if(err < 0) + if (err < 0) return -errno; return err; @@ -97,7 +94,7 @@ int os_ioctl_generic(int fd, unsigned int cmd, unsigned long arg) /* FIXME: ensure namebuf in os_get_if_name is big enough */ int os_get_ifname(int fd, char* namebuf) { - if(ioctl(fd, SIOCGIFNAME, namebuf) < 0) + if (ioctl(fd, SIOCGIFNAME, namebuf) < 0) return -errno; return 0; @@ -108,37 +105,22 @@ int os_set_slip(int fd) int disc, sencap; disc = N_SLIP; - if(ioctl(fd, TIOCSETD, &disc) < 0) + if (ioctl(fd, TIOCSETD, &disc) < 0) return -errno; sencap = 0; - if(ioctl(fd, SIOCSIFENCAP, &sencap) < 0) + if (ioctl(fd, SIOCSIFENCAP, &sencap) < 0) return -errno; return 0; } -int os_set_owner(int fd, int pid) -{ - if(fcntl(fd, F_SETOWN, pid) < 0){ - int save_errno = errno; - - if(fcntl(fd, F_GETOWN, 0) != pid) - return -save_errno; - } - - return 0; -} - int os_mode_fd(int fd, int mode) { int err; - do { - err = fchmod(fd, mode); - } while((err < 0) && (errno==EINTR)) ; - - if(err < 0) + CATCH_EINTR(err = fchmod(fd, mode)); + if (err < 0) return -errno; return 0; @@ -150,64 +132,73 @@ int os_file_type(char *file) int err; err = os_stat_file(file, &buf); - if(err < 0) + if (err < 0) return err; - if(S_ISDIR(buf.ust_mode)) + if (S_ISDIR(buf.ust_mode)) return OS_TYPE_DIR; - else if(S_ISLNK(buf.ust_mode)) + else if (S_ISLNK(buf.ust_mode)) return OS_TYPE_SYMLINK; - else if(S_ISCHR(buf.ust_mode)) + else if (S_ISCHR(buf.ust_mode)) return OS_TYPE_CHARDEV; - else 
if(S_ISBLK(buf.ust_mode)) + else if (S_ISBLK(buf.ust_mode)) return OS_TYPE_BLOCKDEV; - else if(S_ISFIFO(buf.ust_mode)) + else if (S_ISFIFO(buf.ust_mode)) return OS_TYPE_FIFO; - else if(S_ISSOCK(buf.ust_mode)) + else if (S_ISSOCK(buf.ust_mode)) return OS_TYPE_SOCK; else return OS_TYPE_FILE; } -int os_file_mode(char *file, struct openflags *mode_out) +int os_file_mode(const char *file, struct openflags *mode_out) { int err; *mode_out = OPENFLAGS(); err = access(file, W_OK); - if(err && (errno != EACCES)) + if (err && (errno != EACCES)) return -errno; - else if(!err) + else if (!err) *mode_out = of_write(*mode_out); err = access(file, R_OK); - if(err && (errno != EACCES)) + if (err && (errno != EACCES)) return -errno; - else if(!err) + else if (!err) *mode_out = of_read(*mode_out); return err; } -int os_open_file(char *file, struct openflags flags, int mode) +int os_open_file(const char *file, struct openflags flags, int mode) { int fd, err, f = 0; - if(flags.r && flags.w) f = O_RDWR; - else if(flags.r) f = O_RDONLY; - else if(flags.w) f = O_WRONLY; + if (flags.r && flags.w) + f = O_RDWR; + else if (flags.r) + f = O_RDONLY; + else if (flags.w) + f = O_WRONLY; else f = 0; - if(flags.s) f |= O_SYNC; - if(flags.c) f |= O_CREAT; - if(flags.t) f |= O_TRUNC; - if(flags.e) f |= O_EXCL; + if (flags.s) + f |= O_SYNC; + if (flags.c) + f |= O_CREAT; + if (flags.t) + f |= O_TRUNC; + if (flags.e) + f |= O_EXCL; + if (flags.a) + f |= O_APPEND; fd = open64(file, f, mode); - if(fd < 0) + if (fd < 0) return -errno; - if(flags.cl && fcntl(fd, F_SETFD, 1)){ + if (flags.cl && fcntl(fd, F_SETFD, 1)) { err = -errno; close(fd); return err; @@ -216,7 +207,7 @@ int os_open_file(char *file, struct openflags flags, int mode) return fd; } -int os_connect_socket(char *name) +int os_connect_socket(const char *name) { struct sockaddr_un sock; int fd, err; @@ -225,13 +216,13 @@ int os_connect_socket(char *name) snprintf(sock.sun_path, sizeof(sock.sun_path), "%s", name); fd = socket(AF_UNIX, 
SOCK_STREAM, 0); - if(fd < 0) { + if (fd < 0) { err = -errno; goto out; } err = connect(fd, (struct sockaddr *) &sock, sizeof(sock)); - if(err) { + if (err) { err = -errno; goto out_close; } @@ -254,7 +245,7 @@ int os_seek_file(int fd, unsigned long long offset) unsigned long long actual; actual = lseek64(fd, offset, SEEK_SET); - if(actual != offset) + if (actual != offset) return -errno; return 0; } @@ -263,7 +254,7 @@ int os_read_file(int fd, void *buf, int len) { int n = read(fd, buf, len); - if(n < 0) + if (n < 0) return -errno; return n; } @@ -272,37 +263,38 @@ int os_write_file(int fd, const void *buf, int len) { int n = write(fd, (void *) buf, len); - if(n < 0) + if (n < 0) return -errno; return n; } -int os_file_size(char *file, unsigned long long *size_out) +int os_file_size(const char *file, unsigned long long *size_out) { struct uml_stat buf; int err; err = os_stat_file(file, &buf); - if(err < 0){ - printk("Couldn't stat \"%s\" : err = %d\n", file, -err); + if (err < 0) { + printk(UM_KERN_ERR "Couldn't stat \"%s\" : err = %d\n", file, + -err); return err; } - if(S_ISBLK(buf.ust_mode)){ + if (S_ISBLK(buf.ust_mode)) { int fd; long blocks; fd = open(file, O_RDONLY, 0); - if(fd < 0) { + if (fd < 0) { err = -errno; - printk("Couldn't open \"%s\", errno = %d\n", file, - errno); + printk(UM_KERN_ERR "Couldn't open \"%s\", " + "errno = %d\n", file, errno); return err; } - if(ioctl(fd, BLKGETSIZE, &blocks) < 0){ + if (ioctl(fd, BLKGETSIZE, &blocks) < 0) { err = -errno; - printk("Couldn't get the block size of \"%s\", " - "errno = %d\n", file, errno); + printk(UM_KERN_ERR "Couldn't get the block size of " + "\"%s\", errno = %d\n", file, errno); close(fd); return err; } @@ -314,14 +306,15 @@ int os_file_size(char *file, unsigned long long *size_out) return 0; } -int os_file_modtime(char *file, unsigned long *modtime) +int os_file_modtime(const char *file, unsigned long *modtime) { struct uml_stat buf; int err; err = os_stat_file(file, &buf); - if(err < 0){ - 
printk("Couldn't stat \"%s\" : err = %d\n", file, -err); + if (err < 0) { + printk(UM_KERN_ERR "Couldn't stat \"%s\" : err = %d\n", file, + -err); return err; } @@ -329,26 +322,13 @@ int os_file_modtime(char *file, unsigned long *modtime) return 0; } -int os_get_exec_close(int fd, int *close_on_exec) -{ - int ret; - - CATCH_EINTR(ret = fcntl(fd, F_GETFD)); - - if(ret < 0) - return -errno; - - *close_on_exec = (ret & FD_CLOEXEC) ? 1 : 0; - return ret; -} - int os_set_exec_close(int fd) { int err; CATCH_EINTR(err = fcntl(fd, F_SETFD, FD_CLOEXEC)); - if(err < 0) + if (err < 0) return -errno; return err; } @@ -358,53 +338,51 @@ int os_pipe(int *fds, int stream, int close_on_exec) int err, type = stream ? SOCK_STREAM : SOCK_DGRAM; err = socketpair(AF_UNIX, type, 0, fds); - if(err < 0) + if (err < 0) return -errno; - if(!close_on_exec) + if (!close_on_exec) return 0; err = os_set_exec_close(fds[0]); - if(err < 0) + if (err < 0) goto error; err = os_set_exec_close(fds[1]); - if(err < 0) + if (err < 0) goto error; return 0; error: - printk("os_pipe : Setting FD_CLOEXEC failed, err = %d\n", -err); + printk(UM_KERN_ERR "os_pipe : Setting FD_CLOEXEC failed, err = %d\n", + -err); close(fds[1]); close(fds[0]); return err; } -int os_set_fd_async(int fd, int owner) +int os_set_fd_async(int fd) { - int err; + int err, flags; + + flags = fcntl(fd, F_GETFL); + if (flags < 0) + return -errno; - /* XXX This should do F_GETFL first */ - if(fcntl(fd, F_SETFL, O_ASYNC | O_NONBLOCK) < 0){ + flags |= O_ASYNC | O_NONBLOCK; + if (fcntl(fd, F_SETFL, flags) < 0) { err = -errno; - printk("os_set_fd_async : failed to set O_ASYNC and " - "O_NONBLOCK on fd # %d, errno = %d\n", fd, errno); + printk(UM_KERN_ERR "os_set_fd_async : failed to set O_ASYNC " + "and O_NONBLOCK on fd # %d, errno = %d\n", fd, errno); return err; } -#ifdef notdef - if(fcntl(fd, F_SETFD, 1) < 0){ - printk("os_set_fd_async : Setting FD_CLOEXEC failed, " - "errno = %d\n", errno); - } -#endif - if((fcntl(fd, F_SETSIG, SIGIO) < 
0) || - (fcntl(fd, F_SETOWN, owner) < 0)){ + if ((fcntl(fd, F_SETSIG, SIGIO) < 0) || + (fcntl(fd, F_SETOWN, os_getpid()) < 0)) { err = -errno; - printk("os_set_fd_async : Failed to fcntl F_SETOWN " - "(or F_SETSIG) fd %d to pid %d, errno = %d\n", fd, - owner, errno); + printk(UM_KERN_ERR "os_set_fd_async : Failed to fcntl F_SETOWN " + "(or F_SETSIG) fd %d, errno = %d\n", fd, errno); return err; } @@ -413,10 +391,14 @@ int os_set_fd_async(int fd, int owner) int os_clear_fd_async(int fd) { - int flags = fcntl(fd, F_GETFL); + int flags; + + flags = fcntl(fd, F_GETFL); + if (flags < 0) + return -errno; flags &= ~(O_ASYNC | O_NONBLOCK); - if(fcntl(fd, F_SETFL, flags) < 0) + if (fcntl(fd, F_SETFL, flags) < 0) return -errno; return 0; } @@ -426,11 +408,15 @@ int os_set_fd_block(int fd, int blocking) int flags; flags = fcntl(fd, F_GETFL); + if (flags < 0) + return -errno; - if(blocking) flags &= ~O_NONBLOCK; - else flags |= O_NONBLOCK; + if (blocking) + flags &= ~O_NONBLOCK; + else + flags |= O_NONBLOCK; - if(fcntl(fd, F_SETFL, flags) < 0) + if (fcntl(fd, F_SETFL, flags) < 0) return -errno; return 0; @@ -441,7 +427,7 @@ int os_accept_connection(int fd) int new; new = accept(fd, NULL, 0); - if(new < 0) + if (new < 0) return -errno; return new; } @@ -462,15 +448,17 @@ int os_shutdown_socket(int fd, int r, int w) { int what, err; - if(r && w) what = SHUT_RDWR; - else if(r) what = SHUT_RD; - else if(w) what = SHUT_WR; - else { - printk("os_shutdown_socket : neither r or w was set\n"); + if (r && w) + what = SHUT_RDWR; + else if (r) + what = SHUT_RD; + else if (w) + what = SHUT_WR; + else return -EINVAL; - } + err = shutdown(fd, what); - if(err < 0) + if (err < 0) return -errno; return 0; } @@ -494,19 +482,20 @@ int os_rcv_fd(int fd, int *helper_pid_out) msg.msg_flags = 0; n = recvmsg(fd, &msg, 0); - if(n < 0) + if (n < 0) return -errno; - else if(n != iov.iov_len) + else if (n != iov.iov_len) *helper_pid_out = -1; cmsg = CMSG_FIRSTHDR(&msg); - if(cmsg == NULL){ - 
printk("rcv_fd didn't receive anything, error = %d\n", errno); + if (cmsg == NULL) { + printk(UM_KERN_ERR "rcv_fd didn't receive anything, " + "error = %d\n", errno); return -1; } - if((cmsg->cmsg_level != SOL_SOCKET) || - (cmsg->cmsg_type != SCM_RIGHTS)){ - printk("rcv_fd didn't receive a descriptor\n"); + if ((cmsg->cmsg_level != SOL_SOCKET) || + (cmsg->cmsg_type != SCM_RIGHTS)) { + printk(UM_KERN_ERR "rcv_fd didn't receive a descriptor\n"); return -1; } @@ -514,29 +503,28 @@ int os_rcv_fd(int fd, int *helper_pid_out) return new; } -int os_create_unix_socket(char *file, int len, int close_on_exec) +int os_create_unix_socket(const char *file, int len, int close_on_exec) { struct sockaddr_un addr; int sock, err; sock = socket(PF_UNIX, SOCK_DGRAM, 0); - if(sock < 0) + if (sock < 0) return -errno; - if(close_on_exec) { + if (close_on_exec) { err = os_set_exec_close(sock); - if(err < 0) - printk("create_unix_socket : close_on_exec failed, " - "err = %d", -err); + if (err < 0) + printk(UM_KERN_ERR "create_unix_socket : " + "close_on_exec failed, err = %d", -err); } addr.sun_family = AF_UNIX; - /* XXX Be more careful about overflow */ snprintf(addr.sun_path, len, "%s", file); err = bind(sock, (struct sockaddr *) &addr, sizeof(addr)); - if(err < 0) + if (err < 0) return -errno; return sock; @@ -557,17 +545,18 @@ int os_lock_file(int fd, int excl) int err, save; err = fcntl(fd, F_SETLK, &lock); - if(!err) + if (!err) goto out; save = -errno; err = fcntl(fd, F_GETLK, &lock); - if(err){ + if (err) { err = -errno; goto out; } - printk("F_SETLK failed, file already locked by pid %d\n", lock.l_pid); + printk(UM_KERN_ERR "F_SETLK failed, file already locked by pid %d\n", + lock.l_pid); err = save; out: return err; diff --git a/arch/um/os-Linux/helper.c b/arch/um/os-Linux/helper.c index fba3f0f..f4bd349 100644 --- a/arch/um/os-Linux/helper.c +++ b/arch/um/os-Linux/helper.c @@ -1,22 +1,19 @@ /* - * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) + * Copyright (C) 2002 - 2007 Jeff 
Dike (jdike@{addtoit,linux.intel}.com) * Licensed under the GPL */ -#include <stdio.h> #include <stdlib.h> #include <unistd.h> #include <errno.h> #include <sched.h> -#include <limits.h> -#include <sys/signal.h> -#include <sys/wait.h> #include <sys/socket.h> -#include "user.h" +#include <sys/wait.h> +#include "kern_constants.h" #include "kern_util.h" #include "os.h" #include "um_malloc.h" -#include "kern_constants.h" +#include "user.h" struct helper_data { void (*pre_exec)(void*); @@ -30,21 +27,19 @@ static int helper_child(void *arg) { struct helper_data *data = arg; char **argv = data->argv; - int errval; + int err; if (data->pre_exec != NULL) (*data->pre_exec)(data->pre_data); - errval = execvp_noalloc(data->buf, argv[0], argv); - printk("helper_child - execvp of '%s' failed - errno = %d\n", argv[0], - -errval); - write(data->fd, &errval, sizeof(errval)); - kill(os_getpid(), SIGKILL); + err = execvp_noalloc(data->buf, argv[0], argv); + + /* If the exec succeeds, we don't get here */ + write(data->fd, &err, sizeof(err)); + return 0; } -/* Returns either the pid of the child process we run or -E* on failure. - * XXX The alloc_stack here breaks if this is called in the tracing thread, so - * we need to receive a preallocated stack (a local buffer is ok). */ +/* Returns either the pid of the child process we run or -E* on failure. 
*/ int run_helper(void (*pre_exec)(void *), void *pre_data, char **argv) { struct helper_data data; @@ -58,14 +53,15 @@ int run_helper(void (*pre_exec)(void *), void *pre_data, char **argv) ret = socketpair(AF_UNIX, SOCK_STREAM, 0, fds); if (ret < 0) { ret = -errno; - printk("run_helper : pipe failed, errno = %d\n", errno); + printk(UM_KERN_ERR "run_helper : pipe failed, errno = %d\n", + errno); goto out_free; } ret = os_set_exec_close(fds[1]); if (ret < 0) { - printk("run_helper : setting FD_CLOEXEC failed, ret = %d\n", - -ret); + printk(UM_KERN_ERR "run_helper : setting FD_CLOEXEC failed, " + "ret = %d\n", -ret); goto out_close; } @@ -79,7 +75,8 @@ int run_helper(void (*pre_exec)(void *), void *pre_data, char **argv) pid = clone(helper_child, (void *) sp, CLONE_VM, &data); if (pid < 0) { ret = -errno; - printk("run_helper : clone failed, errno = %d\n", errno); + printk(UM_KERN_ERR "run_helper : clone failed, errno = %d\n", + errno); goto out_free2; } @@ -96,10 +93,9 @@ int run_helper(void (*pre_exec)(void *), void *pre_data, char **argv) } else { if (n < 0) { n = -errno; - printk("run_helper : read on pipe failed, ret = %d\n", - -n); + printk(UM_KERN_ERR "run_helper : read on pipe failed, " + "ret = %d\n", -n); ret = n; - kill(pid, SIGKILL); } CATCH_EINTR(waitpid(pid, NULL, __WCLONE)); } @@ -129,50 +125,40 @@ int run_helper_thread(int (*proc)(void *), void *arg, unsigned int flags, pid = clone(proc, (void *) sp, flags, arg); if (pid < 0) { err = -errno; - printk("run_helper_thread : clone failed, errno = %d\n", - errno); + printk(UM_KERN_ERR "run_helper_thread : clone failed, " + "errno = %d\n", errno); return err; } if (stack_out == NULL) { CATCH_EINTR(pid = waitpid(pid, &status, __WCLONE)); if (pid < 0) { err = -errno; - printk("run_helper_thread - wait failed, errno = %d\n", - errno); + printk(UM_KERN_ERR "run_helper_thread - wait failed, " + "errno = %d\n", errno); pid = err; } if (!WIFEXITED(status) || (WEXITSTATUS(status) != 0)) - printk("run_helper_thread 
- thread returned status " - "0x%x\n", status); + printk(UM_KERN_ERR "run_helper_thread - thread " + "returned status 0x%x\n", status); free_stack(stack, 0); } else *stack_out = stack; return pid; } -int helper_wait(int pid, int nohang, char *pname) +int helper_wait(int pid) { int ret, status; int wflags = __WCLONE; - if (nohang) - wflags |= WNOHANG; - - if (!pname) - pname = "helper_wait"; - CATCH_EINTR(ret = waitpid(pid, &status, wflags)); if (ret < 0) { - printk(UM_KERN_ERR "%s : waitpid process %d failed, " - "errno = %d\n", pname, pid, errno); + printk(UM_KERN_ERR "helper_wait : waitpid process %d failed, " + "errno = %d\n", pid, errno); return -errno; - } else if (nohang && ret == 0) { - printk(UM_KERN_ERR "%s : process %d has not exited\n", - pname, pid); - return -ECHILD; } else if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { - printk(UM_KERN_ERR "%s : process %d didn't exit with " - "status 0\n", pname, pid); + printk(UM_KERN_ERR "helper_wait : process %d exited with " + "status 0x%x\n", pid, status); return -ECHILD; } else return 0; diff --git a/arch/um/os-Linux/irq.c b/arch/um/os-Linux/irq.c index 6aa6f95..0348b97 100644 --- a/arch/um/os-Linux/irq.c +++ b/arch/um/os-Linux/irq.c @@ -1,23 +1,19 @@ /* - * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) + * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) * Licensed under the GPL */ #include <stdlib.h> -#include <unistd.h> #include <errno.h> +#include <poll.h> #include <signal.h> #include <string.h> -#include <sys/poll.h> -#include <sys/types.h> -#include <sys/time.h> -#include "kern_util.h" -#include "user.h" -#include "process.h" -#include "sigio.h" #include "irq_user.h" +#include "kern_constants.h" #include "os.h" +#include "process.h" #include "um_malloc.h" +#include "user.h" /* * Locked by irq_lock in arch/um/kernel/irq.c. 
Changed by os_create_pollfd @@ -36,7 +32,7 @@ int os_waiting_for_events(struct irq_fd *active_fds) if (n < 0) { err = -errno; if (errno != EINTR) - printk("sigio_handler: os_waiting_for_events:" + printk(UM_KERN_ERR "os_waiting_for_events:" " poll returned %d, errno = %d\n", n, errno); return err; } @@ -95,24 +91,26 @@ void os_free_irq_by_cb(int (*test)(struct irq_fd *, void *), void *arg, struct irq_fd *old_fd = *prev; if ((pollfds[i].fd != -1) && (pollfds[i].fd != (*prev)->fd)) { - printk("os_free_irq_by_cb - mismatch between " - "active_fds and pollfds, fd %d vs %d\n", + printk(UM_KERN_ERR "os_free_irq_by_cb - " + "mismatch between active_fds and " + "pollfds, fd %d vs %d\n", (*prev)->fd, pollfds[i].fd); goto out; } pollfds_num--; - /* This moves the *whole* array after pollfds[i] + /* + * This moves the *whole* array after pollfds[i] * (though it doesn't spot as such)! */ memmove(&pollfds[i], &pollfds[i + 1], (pollfds_num - i) * sizeof(pollfds[0])); - if(*last_irq_ptr2 == &old_fd->next) + if (*last_irq_ptr2 == &old_fd->next) *last_irq_ptr2 = prev; *prev = (*prev)->next; - if(old_fd->type == IRQ_WRITE) + if (old_fd->type == IRQ_WRITE) ignore_sigio_fd(old_fd->fd); kfree(old_fd); continue; @@ -138,14 +136,3 @@ void os_set_ioignore(void) { signal(SIGIO, SIG_IGN); } - -void init_irq_signals(int on_sigstack) -{ - int flags; - - flags = on_sigstack ? 
SA_ONSTACK : 0; - - set_handler(SIGIO, (__sighandler_t) sig_handler, flags | SA_RESTART, - SIGUSR1, SIGIO, SIGWINCH, SIGVTALRM, -1); - signal(SIGWINCH, SIG_IGN); -} diff --git a/arch/um/os-Linux/main.c b/arch/um/os-Linux/main.c index 82c3778..abb9b0f 100644 --- a/arch/um/os-Linux/main.c +++ b/arch/um/os-Linux/main.c @@ -73,7 +73,7 @@ static void install_fatal_handler(int sig) action.sa_handler = last_ditch_exit; if (sigaction(sig, &action, NULL) < 0) { printf("failed to install handler for signal %d - errno = %d\n", - errno); + sig, errno); exit(1); } } @@ -92,7 +92,8 @@ static void setup_env_path(void) * just use the default + /usr/lib/uml */ if (!old_path || (path_len = strlen(old_path)) == 0) { - putenv("PATH=:/bin:/usr/bin/" UML_LIB_PATH); + if (putenv("PATH=:/bin:/usr/bin/" UML_LIB_PATH)) + perror("couldn't putenv"); return; } @@ -100,15 +101,16 @@ static void setup_env_path(void) path_len += strlen("PATH=" UML_LIB_PATH) + 1; new_path = malloc(path_len); if (!new_path) { - perror("coudn't malloc to set a new PATH"); + perror("couldn't malloc to set a new PATH"); return; } snprintf(new_path, path_len, "PATH=%s" UML_LIB_PATH, old_path); - putenv(new_path); + if (putenv(new_path)) { + perror("couldn't putenv to set a new PATH"); + free(new_path); + } } -extern int uml_exitcode; - extern void scan_elf_aux( char **envp); int __init main(int argc, char **argv, char **envp) diff --git a/arch/um/os-Linux/mem.c b/arch/um/os-Linux/mem.c index 436f8d2..eedc2d8 100644 --- a/arch/um/os-Linux/mem.c +++ b/arch/um/os-Linux/mem.c @@ -9,7 +9,6 @@ #include <sys/types.h> #include <sys/mman.h> #include <sys/statfs.h> -#include "kern_util.h" #include "user.h" #include "mem_user.h" #include "init.h" @@ -30,7 +29,7 @@ static char *tempdir = NULL; static void __init find_tempdir(void) { - char *dirs[] = { "TMP", "TEMP", "TMPDIR", NULL }; + const char *dirs[] = { "TMP", "TEMP", "TMPDIR", NULL }; int i; char *dir = NULL; @@ -59,9 +58,10 @@ static void __init find_tempdir(void) * read 
the file as needed. If there's an error, -errno is returned; * if the end of the file is reached, 0 is returned. */ -static int next(int fd, char *buf, int size, char c) +static int next(int fd, char *buf, size_t size, char c) { - int n, len; + ssize_t n; + size_t len; char *ptr; while((ptr = strchr(buf, c)) == NULL){ @@ -172,13 +172,15 @@ int __init make_tempfile(const char *template, char **out_tempname, which_tmpdir(); tempname = malloc(MAXPATHLEN); + if (!tempname) + goto out; find_tempdir(); if (template[0] != '/') strcpy(tempname, tempdir); else tempname[0] = '\0'; - strcat(tempname, template); + strncat(tempname, template, MAXPATHLEN-1-strlen(tempname)); fd = mkstemp(tempname); if(fd < 0){ fprintf(stderr, "open - cannot create %s: %s\n", tempname, @@ -268,6 +270,7 @@ void __init check_tmpexec(void) if(addr == MAP_FAILED){ err = errno; perror("failed"); + close(fd); if(err == EPERM) printf("%s must be not mounted noexec\n",tempdir); exit(1); diff --git a/arch/um/os-Linux/process.c b/arch/um/os-Linux/process.c index bda5c31..abf6bea 100644 --- a/arch/um/os-Linux/process.c +++ b/arch/um/os-Linux/process.c @@ -249,7 +249,10 @@ void init_new_thread_signals(void) SIGUSR1, SIGIO, SIGWINCH, SIGVTALRM, -1); signal(SIGHUP, SIG_IGN); - init_irq_signals(1); + set_handler(SIGIO, (__sighandler_t) sig_handler, + SA_ONSTACK | SA_RESTART, SIGUSR1, SIGIO, SIGWINCH, SIGALRM, + SIGVTALRM, -1); + signal(SIGWINCH, SIG_IGN); } int run_kernel_thread(int (*fn)(void *), void *arg, jmp_buf **jmp_ptr) diff --git a/arch/um/os-Linux/registers.c b/arch/um/os-Linux/registers.c index a32ba6a..830fe6a 100644 --- a/arch/um/os-Linux/registers.c +++ b/arch/um/os-Linux/registers.c @@ -8,47 +8,41 @@ #include <string.h> #include <sys/ptrace.h> #include "sysdep/ptrace.h" -#include "user.h" -/* This is set once at boot time and not changed thereafter */ - -static unsigned long exec_regs[MAX_REG_NR]; - -void init_thread_registers(struct uml_pt_regs *to) -{ - memcpy(to->gp, exec_regs, sizeof(to->gp)); 
-} - -void save_registers(int pid, struct uml_pt_regs *regs) +int save_registers(int pid, struct uml_pt_regs *regs) { int err; err = ptrace(PTRACE_GETREGS, pid, 0, regs->gp); if (err < 0) - panic("save_registers - saving registers failed, errno = %d\n", - errno); + return -errno; + return 0; } -void restore_registers(int pid, struct uml_pt_regs *regs) +int restore_registers(int pid, struct uml_pt_regs *regs) { int err; err = ptrace(PTRACE_SETREGS, pid, 0, regs->gp); if (err < 0) - panic("restore_registers - saving registers failed, " - "errno = %d\n", errno); + return -errno; + return 0; } -void init_registers(int pid) +/* This is set once at boot time and not changed thereafter */ + +static unsigned long exec_regs[MAX_REG_NR]; + +int init_registers(int pid) { int err; err = ptrace(PTRACE_GETREGS, pid, 0, exec_regs); - if (err) - panic("check_ptrace : PTRACE_GETREGS failed, errno = %d", - errno); + if (err < 0) + return -errno; arch_init_registers(pid); + return 0; } void get_safe_registers(unsigned long *regs) diff --git a/arch/um/os-Linux/sigio.c b/arch/um/os-Linux/sigio.c index dc03e9c..abf47a7c 100644 --- a/arch/um/os-Linux/sigio.c +++ b/arch/um/os-Linux/sigio.c @@ -1,34 +1,33 @@ /* - * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) + * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) * Licensed under the GPL */ #include <unistd.h> -#include <stdlib.h> -#include <termios.h> +#include <errno.h> +#include <fcntl.h> +#include <poll.h> #include <pty.h> +#include <sched.h> #include <signal.h> -#include <fcntl.h> -#include <errno.h> #include <string.h> -#include <sched.h> -#include <sys/socket.h> -#include <sys/poll.h> -#include "init.h" -#include "user.h" +#include "kern_constants.h" #include "kern_util.h" -#include "sigio.h" +#include "init.h" #include "os.h" +#include "sigio.h" #include "um_malloc.h" -#include "init.h" +#include "user.h" -/* Protected by sigio_lock(), also used by sigio_cleanup, which is an +/* + * Protected by sigio_lock(), 
also used by sigio_cleanup, which is an * exitcall. */ static int write_sigio_pid = -1; static unsigned long write_sigio_stack; -/* These arrays are initialized before the sigio thread is started, and +/* + * These arrays are initialized before the sigio thread is started, and * the descriptors closed after it is killed. So, it can't see them change. * On the UML side, they are changed under the sigio_lock. */ @@ -43,7 +42,8 @@ struct pollfds { int used; }; -/* Protected by sigio_lock(). Used by the sigio thread, but the UML thread +/* + * Protected by sigio_lock(). Used by the sigio thread, but the UML thread * synchronizes with it. */ static struct pollfds current_poll; @@ -57,23 +57,26 @@ static int write_sigio_thread(void *unused) int i, n, respond_fd; char c; - signal(SIGWINCH, SIG_IGN); + signal(SIGWINCH, SIG_IGN); fds = &current_poll; - while(1){ + while (1) { n = poll(fds->poll, fds->used, -1); - if(n < 0){ - if(errno == EINTR) continue; - printk("write_sigio_thread : poll returned %d, " - "errno = %d\n", n, errno); + if (n < 0) { + if (errno == EINTR) + continue; + printk(UM_KERN_ERR "write_sigio_thread : poll returned " + "%d, errno = %d\n", n, errno); } - for(i = 0; i < fds->used; i++){ + for (i = 0; i < fds->used; i++) { p = &fds->poll[i]; - if(p->revents == 0) continue; - if(p->fd == sigio_private[1]){ + if (p->revents == 0) + continue; + if (p->fd == sigio_private[1]) { CATCH_EINTR(n = read(sigio_private[1], &c, sizeof(c))); - if(n != sizeof(c)) - printk("write_sigio_thread : " + if (n != sizeof(c)) + printk(UM_KERN_ERR + "write_sigio_thread : " "read on socket failed, " "err = %d\n", errno); tmp = current_poll; @@ -89,9 +92,10 @@ static int write_sigio_thread(void *unused) } CATCH_EINTR(n = write(respond_fd, &c, sizeof(c))); - if(n != sizeof(c)) - printk("write_sigio_thread : write on socket " - "failed, err = %d\n", errno); + if (n != sizeof(c)) + printk(UM_KERN_ERR "write_sigio_thread : " + "write on socket failed, err = %d\n", + errno); } } @@ -102,12
+106,13 @@ static int need_poll(struct pollfds *polls, int n) { struct pollfd *new; - if(n <= polls->size) + if (n <= polls->size) return 0; new = kmalloc(n * sizeof(struct pollfd), UM_GFP_ATOMIC); - if(new == NULL){ - printk("need_poll : failed to allocate new pollfds\n"); + if (new == NULL) { + printk(UM_KERN_ERR "need_poll : failed to allocate new " + "pollfds\n"); return -ENOMEM; } @@ -119,7 +124,8 @@ static int need_poll(struct pollfds *polls, int n) return 0; } -/* Must be called with sigio_lock held, because it's needed by the marked +/* + * Must be called with sigio_lock held, because it's needed by the marked * critical section. */ static void update_thread(void) @@ -129,15 +135,17 @@ static void update_thread(void) char c; flags = set_signals(0); - n = write(sigio_private[0], &c, sizeof(c)); - if(n != sizeof(c)){ - printk("update_thread : write failed, err = %d\n", errno); + CATCH_EINTR(n = write(sigio_private[0], &c, sizeof(c))); + if (n != sizeof(c)) { + printk(UM_KERN_ERR "update_thread : write failed, err = %d\n", + errno); goto fail; } CATCH_EINTR(n = read(sigio_private[0], &c, sizeof(c))); - if(n != sizeof(c)){ - printk("update_thread : read failed, err = %d\n", errno); + if (n != sizeof(c)) { + printk(UM_KERN_ERR "update_thread : read failed, err = %d\n", + errno); goto fail; } @@ -164,23 +172,23 @@ int add_sigio_fd(int fd) int err = 0, i, n; sigio_lock(); - for(i = 0; i < all_sigio_fds.used; i++){ - if(all_sigio_fds.poll[i].fd == fd) + for (i = 0; i < all_sigio_fds.used; i++) { + if (all_sigio_fds.poll[i].fd == fd) break; } - if(i == all_sigio_fds.used) + if (i == all_sigio_fds.used) goto out; p = &all_sigio_fds.poll[i]; - for(i = 0; i < current_poll.used; i++){ - if(current_poll.poll[i].fd == fd) + for (i = 0; i < current_poll.used; i++) { + if (current_poll.poll[i].fd == fd) goto out; } n = current_poll.used; err = need_poll(&next_poll, n + 1); - if(err) + if (err) goto out; memcpy(next_poll.poll, current_poll.poll, @@ -198,27 +206,29 @@ int 
ignore_sigio_fd(int fd) struct pollfd *p; int err = 0, i, n = 0; - /* This is called from exitcalls elsewhere in UML - if + /* + * This is called from exitcalls elsewhere in UML - if * sigio_cleanup has already run, then update_thread will hang * or fail because the thread is no longer running. */ - if(write_sigio_pid == -1) + if (write_sigio_pid == -1) return -EIO; sigio_lock(); - for(i = 0; i < current_poll.used; i++){ - if(current_poll.poll[i].fd == fd) break; + for (i = 0; i < current_poll.used; i++) { + if (current_poll.poll[i].fd == fd) + break; } - if(i == current_poll.used) + if (i == current_poll.used) goto out; err = need_poll(&next_poll, current_poll.used - 1); - if(err) + if (err) goto out; - for(i = 0; i < current_poll.used; i++){ + for (i = 0; i < current_poll.used; i++) { p = &current_poll.poll[i]; - if(p->fd != fd) + if (p->fd != fd) next_poll.poll[n++] = *p; } next_poll.used = current_poll.used - 1; @@ -235,7 +245,8 @@ static struct pollfd *setup_initial_poll(int fd) p = kmalloc(sizeof(struct pollfd), UM_GFP_KERNEL); if (p == NULL) { - printk("setup_initial_poll : failed to allocate poll\n"); + printk(UM_KERN_ERR "setup_initial_poll : failed to allocate " + "poll\n"); return NULL; } *p = ((struct pollfd) { .fd = fd, @@ -261,27 +272,29 @@ static void write_sigio_workaround(void) return; err = os_pipe(l_write_sigio_fds, 1, 1); - if(err < 0){ - printk("write_sigio_workaround - os_pipe 1 failed, " + if (err < 0) { + printk(UM_KERN_ERR "write_sigio_workaround - os_pipe 1 failed, " "err = %d\n", -err); return; } err = os_pipe(l_sigio_private, 1, 1); - if(err < 0){ - printk("write_sigio_workaround - os_pipe 2 failed, " + if (err < 0) { + printk(UM_KERN_ERR "write_sigio_workaround - os_pipe 2 failed, " "err = %d\n", -err); goto out_close1; } p = setup_initial_poll(l_sigio_private[1]); - if(!p) + if (!p) goto out_close2; sigio_lock(); - /* Did we race? Don't try to optimize this, please, it's not so likely - * to happen, and no more than once at the boot.
*/ - if(write_sigio_pid != -1) + /* + * Did we race? Don't try to optimize this, please, it's not so likely + * to happen, and no more than once at the boot. + */ + if (write_sigio_pid != -1) goto out_free; current_poll = ((struct pollfds) { .poll = p, @@ -333,19 +346,19 @@ void maybe_sigio_broken(int fd, int read) { int err; - if(!isatty(fd)) + if (!isatty(fd)) return; - if((read || pty_output_sigio) && (!read || pty_close_sigio)) + if ((read || pty_output_sigio) && (!read || pty_close_sigio)) return; write_sigio_workaround(); sigio_lock(); err = need_poll(&all_sigio_fds, all_sigio_fds.used + 1); - if(err){ - printk("maybe_sigio_broken - failed to add pollfd for " - "descriptor %d\n", fd); + if (err) { + printk(UM_KERN_ERR "maybe_sigio_broken - failed to add pollfd " + "for descriptor %d\n", fd); goto out; } @@ -388,7 +401,7 @@ static void openpty_cb(void *arg) struct openpty_arg *info = arg; info->err = 0; - if(openpty(&info->master, &info->slave, NULL, NULL, NULL)) + if (openpty(&info->master, &info->slave, NULL, NULL, NULL)) info->err = -errno; } @@ -397,17 +410,17 @@ static int async_pty(int master, int slave) int flags; flags = fcntl(master, F_GETFL); - if(flags < 0) + if (flags < 0) return -errno; - if((fcntl(master, F_SETFL, flags | O_NONBLOCK | O_ASYNC) < 0) || - (fcntl(master, F_SETOWN, os_getpid()) < 0)) + if ((fcntl(master, F_SETFL, flags | O_NONBLOCK | O_ASYNC) < 0) || + (fcntl(master, F_SETOWN, os_getpid()) < 0)) return -errno; - if((fcntl(slave, F_SETFL, flags | O_NONBLOCK) < 0)) + if ((fcntl(slave, F_SETFL, flags | O_NONBLOCK) < 0)) return -errno; - return(0); + return 0; } static void __init check_one_sigio(void (*proc)(int, int)) @@ -417,34 +430,49 @@ static void __init check_one_sigio(void (*proc)(int, int)) int master, slave, err; initial_thread_cb(openpty_cb, &pty); - if(pty.err){ - printk("openpty failed, errno = %d\n", -pty.err); + if (pty.err) { + printk(UM_KERN_ERR "check_one_sigio failed, errno = %d\n", + -pty.err); return; } master = 
pty.master; slave = pty.slave; - if((master == -1) || (slave == -1)){ - printk("openpty failed to allocate a pty\n"); + if ((master == -1) || (slave == -1)) { + printk(UM_KERN_ERR "check_one_sigio failed to allocate a " + "pty\n"); return; } /* Not now, but complain so we now where we failed. */ err = raw(master); - if (err < 0) - panic("check_sigio : __raw failed, errno = %d\n", -err); + if (err < 0) { + printk(UM_KERN_ERR "check_one_sigio : raw failed, errno = %d\n", + -err); + return; + } err = async_pty(master, slave); - if(err < 0) - panic("tty_fds : sigio_async failed, err = %d\n", -err); + if (err < 0) { + printk(UM_KERN_ERR "check_one_sigio : sigio_async failed, " + "err = %d\n", -err); + return; + } + + if (sigaction(SIGIO, NULL, &old) < 0) { + printk(UM_KERN_ERR "check_one_sigio : sigaction 1 failed, " + "errno = %d\n", errno); + return; + } - if(sigaction(SIGIO, NULL, &old) < 0) - panic("check_sigio : sigaction 1 failed, errno = %d\n", errno); new = old; new.sa_handler = handler; - if(sigaction(SIGIO, &new, NULL) < 0) - panic("check_sigio : sigaction 2 failed, errno = %d\n", errno); + if (sigaction(SIGIO, &new, NULL) < 0) { + printk(UM_KERN_ERR "check_one_sigio : sigaction 2 failed, " + "errno = %d\n", errno); + return; + } got_sigio = 0; (*proc)(master, slave); @@ -452,8 +480,9 @@ static void __init check_one_sigio(void (*proc)(int, int)) close(master); close(slave); - if(sigaction(SIGIO, &old, NULL) < 0) - panic("check_sigio : sigaction 3 failed, errno = %d\n", errno); + if (sigaction(SIGIO, &old, NULL) < 0) + printk(UM_KERN_ERR "check_one_sigio : sigaction 3 failed, " + "errno = %d\n", errno); } static void tty_output(int master, int slave) @@ -461,42 +490,45 @@ static void tty_output(int master, int slave) int n; char buf[512]; - printk("Checking that host ptys support output SIGIO..."); + printk(UM_KERN_INFO "Checking that host ptys support output SIGIO..."); memset(buf, 0, sizeof(buf)); - while(write(master, buf, sizeof(buf)) > 0) ; - if(errno != 
EAGAIN) - panic("tty_output : write failed, errno = %d\n", errno); - while(((n = read(slave, buf, sizeof(buf))) > 0) && !got_sigio) ; + while (write(master, buf, sizeof(buf)) > 0) ; + if (errno != EAGAIN) + printk(UM_KERN_ERR "tty_output : write failed, errno = %d\n", + errno); + while (((n = read(slave, buf, sizeof(buf))) > 0) && !got_sigio) + ; - if(got_sigio){ - printk("Yes\n"); + if (got_sigio) { + printk(UM_KERN_CONT "Yes\n"); pty_output_sigio = 1; - } - else if(n == -EAGAIN) - printk("No, enabling workaround\n"); - else panic("tty_output : read failed, err = %d\n", n); + } else if (n == -EAGAIN) + printk(UM_KERN_CONT "No, enabling workaround\n"); + else + printk(UM_KERN_CONT "tty_output : read failed, err = %d\n", n); } static void tty_close(int master, int slave) { - printk("Checking that host ptys support SIGIO on close..."); + printk(UM_KERN_INFO "Checking that host ptys support SIGIO on " + "close..."); close(slave); - if(got_sigio){ - printk("Yes\n"); + if (got_sigio) { + printk(UM_KERN_CONT "Yes\n"); pty_close_sigio = 1; - } - else printk("No, enabling workaround\n"); + } else + printk(UM_KERN_CONT "No, enabling workaround\n"); } void __init check_sigio(void) { - if((os_access("/dev/ptmx", OS_ACC_R_OK) < 0) && - (os_access("/dev/ptyp0", OS_ACC_R_OK) < 0)){ - printk("No pseudo-terminals available - skipping pty SIGIO " - "check\n"); + if ((access("/dev/ptmx", R_OK) < 0) && + (access("/dev/ptyp0", R_OK) < 0)) { + printk(UM_KERN_WARNING "No pseudo-terminals available - " + "skipping pty SIGIO check\n"); return; } check_one_sigio(tty_output); diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c index e9800b0..0fb0cc8 100644 --- a/arch/um/os-Linux/signal.c +++ b/arch/um/os-Linux/signal.c @@ -9,11 +9,47 @@ #include <errno.h> #include <signal.h> #include <strings.h> +#include "as-layout.h" +#include "kern_util.h" #include "os.h" #include "sysdep/barrier.h" #include "sysdep/sigcontext.h" #include "user.h" +/* Copied from linux/compiler-gcc.h since 
we can't include it directly */ +#define barrier() __asm__ __volatile__("": : :"memory") + +void (*sig_info[NSIG])(int, struct uml_pt_regs *) = { + [SIGTRAP] = relay_signal, + [SIGFPE] = relay_signal, + [SIGILL] = relay_signal, + [SIGWINCH] = winch, + [SIGBUS] = bus_handler, + [SIGSEGV] = segv_handler, + [SIGIO] = sigio_handler, + [SIGVTALRM] = timer_handler }; + +static void sig_handler_common(int sig, struct sigcontext *sc) +{ + struct uml_pt_regs r; + int save_errno = errno; + + r.is_user = 0; + if (sig == SIGSEGV) { + /* For segfaults, we want the data from the sigcontext. */ + copy_sc(&r, sc); + GET_FAULTINFO_FROM_SC(r.faultinfo, sc); + } + + /* enable signals if sig isn't IRQ signal */ + if ((sig != SIGIO) && (sig != SIGWINCH) && (sig != SIGVTALRM)) + unblock_signals(); + + (*sig_info[sig])(sig, &r); + + errno = save_errno; +} + /* * These are the asynchronous signals. SIGPROF is excluded because we want to * be able to profile all of UML, not just the non-critical sections. If @@ -26,13 +62,8 @@ #define SIGVTALRM_BIT 1 #define SIGVTALRM_MASK (1 << SIGVTALRM_BIT) -/* - * These are used by both the signal handlers and - * block/unblock_signals. I don't want modifications cached in a - * register - they must go straight to memory. 
- */ -static volatile int signals_enabled = 1; -static volatile int pending = 0; +static int signals_enabled; +static unsigned int signals_pending; void sig_handler(int sig, struct sigcontext *sc) { @@ -40,13 +71,13 @@ void sig_handler(int sig, struct sigcontext *sc) enabled = signals_enabled; if (!enabled && (sig == SIGIO)) { - pending |= SIGIO_MASK; + signals_pending |= SIGIO_MASK; return; } block_signals(); - sig_handler_common_skas(sig, sc); + sig_handler_common(sig, sc); set_signals(enabled); } @@ -68,7 +99,7 @@ void alarm_handler(int sig, struct sigcontext *sc) enabled = signals_enabled; if (!signals_enabled) { - pending |= SIGVTALRM_MASK; + signals_pending |= SIGVTALRM_MASK; return; } @@ -94,16 +125,6 @@ void set_sigstack(void *sig_stack, int size) panic("enabling signal stack failed, errno = %d\n", errno); } -void remove_sigstack(void) -{ - stack_t stack = ((stack_t) { .ss_flags = SS_DISABLE, - .ss_sp = NULL, - .ss_size = 0 }); - - if (sigaltstack(&stack, NULL) != 0) - panic("disabling signal stack failed, errno = %d\n", errno); -} - void (*handlers[_NSIG])(int sig, struct sigcontext *sc); void handle_signal(int sig, struct sigcontext *sc) @@ -166,6 +187,9 @@ void set_handler(int sig, void (*handler)(int), int flags, ...) sigaddset(&action.sa_mask, mask); va_end(ap); + if (sig == SIGSEGV) + flags |= SA_NODEFER; + action.sa_flags = flags; action.sa_restorer = NULL; if (sigaction(sig, &action, NULL) < 0) @@ -179,12 +203,14 @@ void set_handler(int sig, void (*handler)(int), int flags, ...) int change_sig(int signal, int on) { - sigset_t sigset, old; + sigset_t sigset; sigemptyset(&sigset); sigaddset(&sigset, signal); - sigprocmask(on ? SIG_UNBLOCK : SIG_BLOCK, &sigset, &old); - return !sigismember(&old, signal); + if (sigprocmask(on ? 
SIG_UNBLOCK : SIG_BLOCK, &sigset, NULL) < 0) + return -errno; + + return 0; } void block_signals(void) @@ -196,7 +222,7 @@ void block_signals(void) * This might matter if gcc figures out how to inline this and * decides to shuffle this code into the caller. */ - mb(); + barrier(); } void unblock_signals(void) @@ -209,36 +235,26 @@ void unblock_signals(void) /* * We loop because the IRQ handler returns with interrupts off. So, * interrupts may have arrived and we need to re-enable them and - * recheck pending. + * recheck signals_pending. */ while(1) { /* * Save and reset save_pending after enabling signals. This - * way, pending won't be changed while we're reading it. + * way, signals_pending won't be changed while we're reading it. */ signals_enabled = 1; /* - * Setting signals_enabled and reading pending must + * Setting signals_enabled and reading signals_pending must * happen in this order. */ - mb(); - - save_pending = pending; - if (save_pending == 0) { - /* - * This must return with signals enabled, so - * this barrier ensures that writes are - * flushed out before the return. This might - * matter if gcc figures out how to inline - * this (unlikely, given its size) and decides - * to shuffle this code into the caller. - */ - mb(); + barrier(); + + save_pending = signals_pending; + if (save_pending == 0) return; - } - pending = 0; + signals_pending = 0; /* * We have pending interrupts, so disable signals, as the @@ -254,7 +270,7 @@ void unblock_signals(void) * back here. 
*/ if (save_pending & SIGIO_MASK) - sig_handler_common_skas(SIGIO, NULL); + sig_handler_common(SIGIO, NULL); if (save_pending & SIGVTALRM_MASK) real_alarm_handler(NULL); diff --git a/arch/um/os-Linux/skas/Makefile b/arch/um/os-Linux/skas/Makefile index 5fd8d4d..d2ea340 100644 --- a/arch/um/os-Linux/skas/Makefile +++ b/arch/um/os-Linux/skas/Makefile @@ -1,10 +1,10 @@ # -# Copyright (C) 2002 - 2004 Jeff Dike (jdike@addtoit.com) +# Copyright (C) 2002 - 2007 Jeff Dike (jdike@{linux.intel,addtoit}.com) # Licensed under the GPL # -obj-y := mem.o process.o trap.o +obj-y := mem.o process.o -USER_OBJS := mem.o process.o trap.o +USER_OBJS := $(obj-y) include arch/um/scripts/Makefile.rules diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c index e8b7a97..d36c89c 100644 --- a/arch/um/os-Linux/skas/process.c +++ b/arch/um/os-Linux/skas/process.c @@ -15,6 +15,7 @@ #include "as-layout.h" #include "chan_user.h" #include "kern_constants.h" +#include "kern_util.h" #include "mem.h" #include "os.h" #include "process.h" @@ -37,27 +38,27 @@ int is_skas_winch(int pid, int fd, void *data) static int ptrace_dump_regs(int pid) { - unsigned long regs[MAX_REG_NR]; - int i; + unsigned long regs[MAX_REG_NR]; + int i; - if (ptrace(PTRACE_GETREGS, pid, 0, regs) < 0) - return -errno; + if (ptrace(PTRACE_GETREGS, pid, 0, regs) < 0) + return -errno; printk(UM_KERN_ERR "Stub registers -\n"); for (i = 0; i < ARRAY_SIZE(regs); i++) printk(UM_KERN_ERR "\t%d - %lx\n", i, regs[i]); - return 0; + return 0; } /* * Signals that are OK to receive in the stub - we'll just continue it. * SIGWINCH will happen when UML is inside a detached screen. 
*/ -#define STUB_SIG_MASK ((1 << SIGVTALRM) | (1 << SIGWINCH)) +#define STUB_SIG_MASK (1 << SIGVTALRM) /* Signals that the stub will finish with - anything else is an error */ -#define STUB_DONE_MASK ((1 << SIGUSR1) | (1 << SIGTRAP)) +#define STUB_DONE_MASK (1 << SIGTRAP) void wait_stub_done(int pid) { @@ -72,9 +73,11 @@ void wait_stub_done(int pid) break; err = ptrace(PTRACE_CONT, pid, 0, 0); - if (err) - panic("wait_stub_done : continue failed, errno = %d\n", - errno); + if (err) { + printk(UM_KERN_ERR "wait_stub_done : continue failed, " + "errno = %d\n", errno); + fatal_sigsegv(); + } } if (((1 << WSTOPSIG(status)) & STUB_DONE_MASK) != 0) @@ -85,8 +88,10 @@ bad_wait: if (err) printk(UM_KERN_ERR "Failed to get registers from stub, " "errno = %d\n", -err); - panic("wait_stub_done : failed to wait for SIGUSR1/SIGTRAP, pid = %d, " - "n = %d, errno = %d, status = 0x%x\n", pid, n, errno, status); + printk(UM_KERN_ERR "wait_stub_done : failed to wait for SIGTRAP, " + "pid = %d, n = %d, errno = %d, status = 0x%x\n", pid, n, errno, + status); + fatal_sigsegv(); } extern unsigned long current_stub_stack(void); @@ -97,9 +102,11 @@ void get_skas_faultinfo(int pid, struct faultinfo * fi) if (ptrace_faultinfo) { err = ptrace(PTRACE_FAULTINFO, pid, 0, fi); - if (err) - panic("get_skas_faultinfo - PTRACE_FAULTINFO failed, " - "errno = %d\n", errno); + if (err) { + printk(UM_KERN_ERR "get_skas_faultinfo - " + "PTRACE_FAULTINFO failed, errno = %d\n", errno); + fatal_sigsegv(); + } /* Special handling for i386, which has different structs */ if (sizeof(struct ptrace_faultinfo) < sizeof(struct faultinfo)) @@ -109,9 +116,11 @@ void get_skas_faultinfo(int pid, struct faultinfo * fi) } else { err = ptrace(PTRACE_CONT, pid, 0, SIGSEGV); - if (err) - panic("Failed to continue stub, pid = %d, errno = %d\n", - pid, errno); + if (err) { + printk(UM_KERN_ERR "Failed to continue stub, pid = %d, " + "errno = %d\n", pid, errno); + fatal_sigsegv(); + } wait_stub_done(pid); /* @@ -137,6 +146,9 
@@ static void handle_trap(int pid, struct uml_pt_regs *regs, { int err, status; + if ((UPT_IP(regs) >= STUB_START) && (UPT_IP(regs) < STUB_END)) + fatal_sigsegv(); + /* Mark this as a syscall */ UPT_SYSCALL_NR(regs) = PT_SYSCALL_NR(regs->gp); @@ -144,25 +156,31 @@ static void handle_trap(int pid, struct uml_pt_regs *regs, { err = ptrace(PTRACE_POKEUSR, pid, PT_SYSCALL_NR_OFFSET, __NR_getpid); - if (err < 0) - panic("handle_trap - nullifying syscall failed, " - "errno = %d\n", errno); + if (err < 0) { + printk(UM_KERN_ERR "handle_trap - nullifying syscall " + "failed, errno = %d\n", errno); + fatal_sigsegv(); + } err = ptrace(PTRACE_SYSCALL, pid, 0, 0); - if (err < 0) - panic("handle_trap - continuing to end of syscall " - "failed, errno = %d\n", errno); + if (err < 0) { + printk(UM_KERN_ERR "handle_trap - continuing to end of " + "syscall failed, errno = %d\n", errno); + fatal_sigsegv(); + } CATCH_EINTR(err = waitpid(pid, &status, WUNTRACED | __WALL)); if ((err < 0) || !WIFSTOPPED(status) || - (WSTOPSIG(status) != SIGTRAP + 0x80)) { - err = ptrace_dump_regs(pid); - if (err) - printk(UM_KERN_ERR "Failed to get registers " + (WSTOPSIG(status) != SIGTRAP + 0x80)) { + err = ptrace_dump_regs(pid); + if (err) + printk(UM_KERN_ERR "Failed to get registers " "from process, errno = %d\n", -err); - panic("handle_trap - failed to wait at end of syscall, " - "errno = %d, status = %d\n", errno, status); - } + printk(UM_KERN_ERR "handle_trap - failed to wait at " + "end of syscall, errno = %d, status = %d\n", + errno, status); + fatal_sigsegv(); + } } handle_syscall(regs); @@ -178,10 +196,13 @@ static int userspace_tramp(void *stack) ptrace(PTRACE_TRACEME, 0, 0, 0); signal(SIGTERM, SIG_DFL); + signal(SIGWINCH, SIG_IGN); err = set_interval(); - if (err) - panic("userspace_tramp - setting timer failed, errno = %d\n", - err); + if (err) { + printk(UM_KERN_ERR "userspace_tramp - setting timer failed, " + "errno = %d\n", err); + exit(1); + } if (!proc_mm) { /* @@ -221,16 +242,14 @@ 
static int userspace_tramp(void *stack) set_sigstack((void *) STUB_DATA, UM_KERN_PAGE_SIZE); sigemptyset(&sa.sa_mask); - sigaddset(&sa.sa_mask, SIGIO); - sigaddset(&sa.sa_mask, SIGWINCH); - sigaddset(&sa.sa_mask, SIGVTALRM); - sigaddset(&sa.sa_mask, SIGUSR1); - sa.sa_flags = SA_ONSTACK; + sa.sa_flags = SA_ONSTACK | SA_NODEFER; sa.sa_handler = (void *) v; sa.sa_restorer = NULL; - if (sigaction(SIGSEGV, &sa, NULL) < 0) - panic("userspace_tramp - setting SIGSEGV handler " - "failed - errno = %d\n", errno); + if (sigaction(SIGSEGV, &sa, NULL) < 0) { + printk(UM_KERN_ERR "userspace_tramp - setting SIGSEGV " + "handler failed - errno = %d\n", errno); + exit(1); + } } kill(os_getpid(), SIGSTOP); @@ -246,13 +265,18 @@ int start_userspace(unsigned long stub_stack) { void *stack; unsigned long sp; - int pid, status, n, flags; + int pid, status, n, flags, err; stack = mmap(NULL, UM_KERN_PAGE_SIZE, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (stack == MAP_FAILED) - panic("start_userspace : mmap failed, errno = %d", errno); + if (stack == MAP_FAILED) { + err = -errno; + printk(UM_KERN_ERR "start_userspace : mmap failed, " + "errno = %d\n", errno); + return err; + } + sp = (unsigned long) stack + UM_KERN_PAGE_SIZE - sizeof(void *); flags = CLONE_FILES; @@ -262,29 +286,50 @@ int start_userspace(unsigned long stub_stack) flags |= SIGCHLD; pid = clone(userspace_tramp, (void *) sp, flags, (void *) stub_stack); - if (pid < 0) - panic("start_userspace : clone failed, errno = %d", errno); + if (pid < 0) { + err = -errno; + printk(UM_KERN_ERR "start_userspace : clone failed, " + "errno = %d\n", errno); + return err; + } do { CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED | __WALL)); - if (n < 0) - panic("start_userspace : wait failed, errno = %d", - errno); + if (n < 0) { + err = -errno; + printk(UM_KERN_ERR "start_userspace : wait failed, " + "errno = %d\n", errno); + goto out_kill; + } } while (WIFSTOPPED(status) && (WSTOPSIG(status) == SIGVTALRM)); 
- if (!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGSTOP)) - panic("start_userspace : expected SIGSTOP, got status = %d", - status); + if (!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGSTOP)) { + err = -EINVAL; + printk(UM_KERN_ERR "start_userspace : expected SIGSTOP, got " + "status = %d\n", status); + goto out_kill; + } if (ptrace(PTRACE_OLDSETOPTIONS, pid, NULL, - (void *) PTRACE_O_TRACESYSGOOD) < 0) - panic("start_userspace : PTRACE_OLDSETOPTIONS failed, " - "errno = %d\n", errno); + (void *) PTRACE_O_TRACESYSGOOD) < 0) { + err = -errno; + printk(UM_KERN_ERR "start_userspace : PTRACE_OLDSETOPTIONS " + "failed, errno = %d\n", errno); + goto out_kill; + } - if (munmap(stack, UM_KERN_PAGE_SIZE) < 0) - panic("start_userspace : munmap failed, errno = %d\n", errno); + if (munmap(stack, UM_KERN_PAGE_SIZE) < 0) { + err = -errno; + printk(UM_KERN_ERR "start_userspace : munmap failed, " + "errno = %d\n", errno); + goto out_kill; + } return pid; + + out_kill: + os_kill_ptraced_process(pid, 1); + return err; } void userspace(struct uml_pt_regs *regs) @@ -302,7 +347,16 @@ void userspace(struct uml_pt_regs *regs) nsecs += os_nsecs(); while (1) { - restore_registers(pid, regs); + /* + * This can legitimately fail if the process loads a + * bogus value into a segment register. It will + * segfault and PTRACE_GETREGS will read that value + * out of the process. However, PTRACE_SETREGS will + * fail. In this case, there is nothing to do but + * just kill the process. 
+ */ + if (ptrace(PTRACE_SETREGS, pid, 0, regs->gp)) + fatal_sigsegv(); /* Now we set local_using_sysemu to be used for one loop */ local_using_sysemu = get_using_sysemu(); @@ -310,19 +364,26 @@ void userspace(struct uml_pt_regs *regs) op = SELECT_PTRACE_OPERATION(local_using_sysemu, singlestepping(NULL)); - err = ptrace(op, pid, 0, 0); - if (err) - panic("userspace - could not resume userspace process, " - "pid=%d, ptrace operation = %d, errno = %d\n", - pid, op, errno); + if (ptrace(op, pid, 0, 0)) { + printk(UM_KERN_ERR "userspace - ptrace continue " + "failed, op = %d, errno = %d\n", op, errno); + fatal_sigsegv(); + } CATCH_EINTR(err = waitpid(pid, &status, WUNTRACED | __WALL)); - if (err < 0) - panic("userspace - waitpid failed, errno = %d\n", - errno); + if (err < 0) { + printk(UM_KERN_ERR "userspace - wait failed, " + "errno = %d\n", errno); + fatal_sigsegv(); + } regs->is_user = 1; - save_registers(pid, regs); + if (ptrace(PTRACE_GETREGS, pid, 0, regs->gp)) { + printk(UM_KERN_ERR "userspace - PTRACE_GETREGS failed, " + "errno = %d\n", errno); + fatal_sigsegv(); + } + UPT_SYSCALL_NR(regs) = -1; /* Assume: It's not a syscall */ if (WIFSTOPPED(status)) { @@ -345,7 +406,7 @@ void userspace(struct uml_pt_regs *regs) break; case SIGVTALRM: now = os_nsecs(); - if(now < nsecs) + if (now < nsecs) break; block_signals(); (*sig_info[sig])(sig, regs); @@ -368,6 +429,7 @@ void userspace(struct uml_pt_regs *regs) default: printk(UM_KERN_ERR "userspace - child stopped " "with signal %d\n", sig); + fatal_sigsegv(); } pid = userspace_pid[0]; interrupt_end(); @@ -419,9 +481,12 @@ int copy_context_skas0(unsigned long new_stack, int pid) .it_interval = tv }) }); err = ptrace_setregs(pid, thread_regs); - if (err < 0) - panic("copy_context_skas0 : PTRACE_SETREGS failed, " - "pid = %d, errno = %d\n", pid, -err); + if (err < 0) { + err = -errno; + printk(UM_KERN_ERR "copy_context_skas0 : PTRACE_SETREGS " + "failed, pid = %d, errno = %d\n", pid, -err); + return err; + } /* set a 
well known return code for detection of child write failure */ child_data->err = 12345678; @@ -431,31 +496,47 @@ int copy_context_skas0(unsigned long new_stack, int pid) * parent's stack, and check, if bad result. */ err = ptrace(PTRACE_CONT, pid, 0, 0); - if (err) - panic("Failed to continue new process, pid = %d, " - "errno = %d\n", pid, errno); + if (err) { + err = -errno; + printk(UM_KERN_ERR "Failed to continue new process, pid = %d, " + "errno = %d\n", pid, errno); + return err; + } + wait_stub_done(pid); pid = data->err; - if (pid < 0) - panic("copy_context_skas0 - stub-parent reports error %d\n", - -pid); + if (pid < 0) { + printk(UM_KERN_ERR "copy_context_skas0 - stub-parent reports " + "error %d\n", -pid); + return pid; + } /* * Wait, until child has finished too: read child's result from * child's stack and check it. */ wait_stub_done(pid); - if (child_data->err != STUB_DATA) - panic("copy_context_skas0 - stub-child reports error %ld\n", - child_data->err); + if (child_data->err != STUB_DATA) { + printk(UM_KERN_ERR "copy_context_skas0 - stub-child reports " + "error %ld\n", child_data->err); + err = child_data->err; + goto out_kill; + } if (ptrace(PTRACE_OLDSETOPTIONS, pid, NULL, - (void *)PTRACE_O_TRACESYSGOOD) < 0) - panic("copy_context_skas0 : PTRACE_OLDSETOPTIONS failed, " - "errno = %d\n", errno); + (void *)PTRACE_O_TRACESYSGOOD) < 0) { + err = -errno; + printk(UM_KERN_ERR "copy_context_skas0 : PTRACE_OLDSETOPTIONS " + "failed, errno = %d\n", errno); + goto out_kill; + } return pid; + + out_kill: + os_kill_ptraced_process(pid, 1); + return err; } /* @@ -463,8 +544,8 @@ int copy_context_skas0(unsigned long new_stack, int pid) * available. Opening /proc/mm creates a new mm_context, which lacks * the stub-pages. 
Thus, we map them using /proc/mm-fd */ -void map_stub_pages(int fd, unsigned long code, - unsigned long data, unsigned long stack) +int map_stub_pages(int fd, unsigned long code, unsigned long data, + unsigned long stack) { struct proc_mm_op mmop; int n; @@ -488,8 +569,9 @@ void map_stub_pages(int fd, unsigned long code, printk(UM_KERN_ERR "mmap args - addr = 0x%lx, fd = %d, " "offset = %llx\n", code, code_fd, (unsigned long long) code_offset); - panic("map_stub_pages : /proc/mm map for code failed, " - "err = %d\n", n); + printk(UM_KERN_ERR "map_stub_pages : /proc/mm map for code " + "failed, err = %d\n", n); + return -n; } if (stack) { @@ -507,10 +589,15 @@ void map_stub_pages(int fd, unsigned long code, .offset = map_offset } } }); CATCH_EINTR(n = write(fd, &mmop, sizeof(mmop))); - if (n != sizeof(mmop)) - panic("map_stub_pages : /proc/mm map for data failed, " - "err = %d\n", errno); + if (n != sizeof(mmop)) { + n = errno; + printk(UM_KERN_ERR "map_stub_pages : /proc/mm map for " + "data failed, err = %d\n", n); + return -n; + } } + + return 0; } void new_thread(void *stack, jmp_buf *buf, void (*handler)(void)) @@ -571,7 +658,9 @@ int start_idle_thread(void *stack, jmp_buf *switch_buf) kmalloc_ok = 0; return 1; default: - panic("Bad sigsetjmp return in start_idle_thread - %d\n", n); + printk(UM_KERN_ERR "Bad sigsetjmp return in " + "start_idle_thread - %d\n", n); + fatal_sigsegv(); } longjmp(*switch_buf, 1); } @@ -614,9 +703,11 @@ void __switch_mm(struct mm_id *mm_idp) if (proc_mm) { err = ptrace(PTRACE_SWITCH_MM, userspace_pid[0], 0, mm_idp->u.mm_fd); - if (err) - panic("__switch_mm - PTRACE_SWITCH_MM failed, " - "errno = %d\n", errno); + if (err) { + printk(UM_KERN_ERR "__switch_mm - PTRACE_SWITCH_MM " + "failed, errno = %d\n", errno); + fatal_sigsegv(); + } } else userspace_pid[0] = mm_idp->u.pid; } diff --git a/arch/um/os-Linux/skas/trap.c b/arch/um/os-Linux/skas/trap.c deleted file mode 100644 index 3b1b924..0000000 --- a/arch/um/os-Linux/skas/trap.c +++ 
/dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL - */ - -#if 0 -#include "kern_util.h" -#include "skas.h" -#include "ptrace_user.h" -#include "sysdep/ptrace_user.h" -#endif - -#include <errno.h> -#include <signal.h> -#include "sysdep/ptrace.h" -#include "kern_constants.h" -#include "as-layout.h" -#include "os.h" -#include "sigcontext.h" -#include "task.h" - -static struct uml_pt_regs ksig_regs[UM_NR_CPUS]; - -void sig_handler_common_skas(int sig, void *sc_ptr) -{ - struct sigcontext *sc = sc_ptr; - struct uml_pt_regs *r; - void (*handler)(int, struct uml_pt_regs *); - int save_user, save_errno = errno; - - /* - * This is done because to allow SIGSEGV to be delivered inside a SEGV - * handler. This can happen in copy_user, and if SEGV is disabled, - * the process will die. - * XXX Figure out why this is better than SA_NODEFER - */ - if (sig == SIGSEGV) { - change_sig(SIGSEGV, 1); - /* - * For segfaults, we want the data from the - * sigcontext. In this case, we don't want to mangle - * the process registers, so use a static set of - * registers. For other signals, the process - * registers are OK. - */ - r = &ksig_regs[cpu()]; - copy_sc(r, sc_ptr); - } - else r = TASK_REGS(get_current()); - - save_user = r->is_user; - r->is_user = 0; - if ((sig == SIGFPE) || (sig == SIGSEGV) || (sig == SIGBUS) || - (sig == SIGILL) || (sig == SIGTRAP)) - GET_FAULTINFO_FROM_SC(r->faultinfo, sc); - - change_sig(SIGUSR1, 1); - - handler = sig_info[sig]; - - /* unblock SIGVTALRM, SIGIO if sig isn't IRQ signal */ - if ((sig != SIGIO) && (sig != SIGWINCH) && (sig != SIGVTALRM)) - unblock_signals(); - - handler(sig, r); - - errno = save_errno; - r->is_user = save_user; -} diff --git a/arch/um/os-Linux/start_up.c b/arch/um/os-Linux/start_up.c index 7b81f6c..b616e15 100644 --- a/arch/um/os-Linux/start_up.c +++ b/arch/um/os-Linux/start_up.c @@ -60,10 +60,11 @@ static int ptrace_child(void) * the UML code itself. 
*/ ret = 2; - _exit(ret); + + exit(ret); } -static void fatal_perror(char *str) +static void fatal_perror(const char *str) { perror(str); exit(1); @@ -341,6 +342,8 @@ static void __init check_coredump_limit(void) void __init os_early_checks(void) { + int pid; + /* Print out the core dump limits early */ check_coredump_limit(); @@ -350,6 +353,11 @@ void __init os_early_checks(void) * kernel is running. */ check_tmpexec(); + + pid = start_ptraced_child(); + if (init_registers(pid)) + fatal("Failed to initialize default registers"); + stop_ptraced_child(pid, 1, 1); } static int __init noprocmm_cmd_param(char *str, int* add) @@ -411,7 +419,6 @@ static inline void check_skas3_ptrace_faultinfo(void) non_fatal("found\n"); } - init_registers(pid); stop_ptraced_child(pid, 1, 1); } @@ -466,7 +473,7 @@ static inline void check_skas3_proc_mm(void) else non_fatal("found\n"); } -int can_do_skas(void) +void can_do_skas(void) { non_fatal("Checking for the skas3 patch in the host:\n"); @@ -476,8 +483,6 @@ int can_do_skas(void) if (!proc_mm || !ptrace_faultinfo || !ptrace_ldt) skas_needs_stub = 1; - - return 1; } int __init parse_iomem(char *str, int *add) diff --git a/arch/um/os-Linux/trap.c b/arch/um/os-Linux/trap.c deleted file mode 100644 index 2a1c984..0000000 --- a/arch/um/os-Linux/trap.c +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL - */ - -#include <signal.h> -#include "os.h" -#include "sysdep/ptrace.h" - -/* Initialized from linux_main() */ -void (*sig_info[NSIG])(int, struct uml_pt_regs *); - -void os_fill_handlinfo(struct kern_handlers h) -{ - sig_info[SIGTRAP] = h.relay_signal; - sig_info[SIGFPE] = h.relay_signal; - sig_info[SIGILL] = h.relay_signal; - sig_info[SIGWINCH] = h.winch; - sig_info[SIGBUS] = h.bus_handler; - sig_info[SIGSEGV] = h.page_fault; - sig_info[SIGIO] = h.sigio_handler; - sig_info[SIGVTALRM] = h.timer_handler; -} diff --git a/arch/um/os-Linux/tty.c 
b/arch/um/os-Linux/tty.c index 4cfdd18..b09ff66 100644 --- a/arch/um/os-Linux/tty.c +++ b/arch/um/os-Linux/tty.c @@ -1,13 +1,16 @@ -/* - * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) +/* + * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) * Licensed under the GPL */ #include <stdlib.h> +#include <unistd.h> #include <errno.h> +#include <fcntl.h> +#include "kern_constants.h" +#include "kern_util.h" #include "os.h" #include "user.h" -#include "kern_util.h" struct grantpt_info { int fd; @@ -26,36 +29,34 @@ static void grantpt_cb(void *arg) int get_pty(void) { struct grantpt_info info; - int fd; - - fd = os_open_file("/dev/ptmx", of_rdwr(OPENFLAGS()), 0); - if(fd < 0){ - printk("get_pty : Couldn't open /dev/ptmx - err = %d\n", -fd); - return(fd); + int fd, err; + + fd = open("/dev/ptmx", O_RDWR); + if (fd < 0) { + err = -errno; + printk(UM_KERN_ERR "get_pty : Couldn't open /dev/ptmx - " + "err = %d\n", errno); + return err; } info.fd = fd; initial_thread_cb(grantpt_cb, &info); - if(info.res < 0){ - printk("get_pty : Couldn't grant pty - errno = %d\n", - -info.err); - return(-1); + if (info.res < 0) { + err = -info.err; + printk(UM_KERN_ERR "get_pty : Couldn't grant pty - " + "errno = %d\n", -info.err); + goto out; } - if(unlockpt(fd) < 0){ - printk("get_pty : Couldn't unlock pty - errno = %d\n", errno); - return(-1); + + if (unlockpt(fd) < 0) { + err = -errno; + printk(UM_KERN_ERR "get_pty : Couldn't unlock pty - " + "errno = %d\n", errno); + goto out; } - return(fd); + return fd; +out: + close(fd); + return err; } - -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. 
- * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/arch/um/os-Linux/tty_log.c b/arch/um/os-Linux/tty_log.c index d11a55b..cc648e6 100644 --- a/arch/um/os-Linux/tty_log.c +++ b/arch/um/os-Linux/tty_log.c @@ -12,7 +12,6 @@ #include <sys/time.h> #include "init.h" #include "user.h" -#include "kern_util.h" #include "os.h" #define TTY_LOG_DIR "./" diff --git a/arch/um/os-Linux/util.c b/arch/um/os-Linux/util.c index 3e058ce..a6f31d4 100644 --- a/arch/um/os-Linux/util.c +++ b/arch/um/os-Linux/util.c @@ -88,21 +88,6 @@ void setup_hostinfo(char *buf, int len) host.release, host.version, host.machine); } -int setjmp_wrapper(void (*proc)(void *, void *), ...) -{ - va_list args; - jmp_buf buf; - int n; - - n = UML_SETJMP(&buf); - if(n == 0){ - va_start(args, proc); - (*proc)(&buf, &args); - } - va_end(args); - return n; -} - void os_dump_core(void) { int pid; diff --git a/arch/um/sys-i386/bug.c b/arch/um/sys-i386/bug.c index a4360b5..8d4f273 100644 --- a/arch/um/sys-i386/bug.c +++ b/arch/um/sys-i386/bug.c @@ -4,6 +4,7 @@ */ #include <linux/uaccess.h> +#include <asm/errno.h> /* Mostly copied from i386/x86_86 - eliminated the eip < PAGE_OFFSET because * that's not relevant in skas mode. 
diff --git a/arch/um/sys-i386/bugs.c b/arch/um/sys-i386/bugs.c index 806895d..a74442d 100644 --- a/arch/um/sys-i386/bugs.c +++ b/arch/um/sys-i386/bugs.c @@ -3,171 +3,47 @@ * Licensed under the GPL */ -#include <errno.h> #include <signal.h> -#include <string.h> #include "kern_constants.h" -#include "os.h" +#include "kern_util.h" +#include "longjmp.h" #include "task.h" #include "user.h" - -#define MAXTOKEN 64 +#include "sysdep/ptrace.h" /* Set during early boot */ int host_has_cmov = 1; -int host_has_xmm = 0; +static jmp_buf cmov_test_return; -static char token(int fd, char *buf, int len, char stop) +static void cmov_sigill_test_handler(int sig) { - int n; - char *ptr, *end, c; - - ptr = buf; - end = &buf[len]; - do { - n = os_read_file(fd, ptr, sizeof(*ptr)); - c = *ptr++; - if (n != sizeof(*ptr)) { - if (n == 0) - return 0; - printk(UM_KERN_ERR "Reading /proc/cpuinfo failed, " - "err = %d\n", -n); - if (n < 0) - return n; - else return -EIO; - } - } while ((c != '\n') && (c != stop) && (ptr < end)); - - if (ptr == end) { - printk(UM_KERN_ERR "Failed to find '%c' in /proc/cpuinfo\n", - stop); - return -1; - } - *(ptr - 1) = '\0'; - return c; -} - -static int find_cpuinfo_line(int fd, char *key, char *scratch, int len) -{ - int n; - char c; - - scratch[len - 1] = '\0'; - while (1) { - c = token(fd, scratch, len - 1, ':'); - if (c <= 0) - return 0; - else if (c != ':') { - printk(UM_KERN_ERR "Failed to find ':' in " - "/proc/cpuinfo\n"); - return 0; - } - - if (!strncmp(scratch, key, strlen(key))) - return 1; - - do { - n = os_read_file(fd, &c, sizeof(c)); - if (n != sizeof(c)) { - printk(UM_KERN_ERR "Failed to find newline in " - "/proc/cpuinfo, err = %d\n", -n); - return 0; - } - } while (c != '\n'); - } - return 0; + host_has_cmov = 0; + longjmp(cmov_test_return, 1); } -static int check_cpu_flag(char *feature, int *have_it) -{ - char buf[MAXTOKEN], c; - int fd, len = ARRAY_SIZE(buf); - - printk(UM_KERN_INFO "Checking for host processor %s support...", - feature); - 
fd = os_open_file("/proc/cpuinfo", of_read(OPENFLAGS()), 0); - if (fd < 0) { - printk(UM_KERN_ERR "Couldn't open /proc/cpuinfo, err = %d\n", - -fd); - return 0; - } - - *have_it = 0; - if (!find_cpuinfo_line(fd, "flags", buf, ARRAY_SIZE(buf))) - goto out; - - c = token(fd, buf, len - 1, ' '); - if (c < 0) - goto out; - else if (c != ' ') { - printk(UM_KERN_ERR "Failed to find ' ' in /proc/cpuinfo\n"); - goto out; - } - - while (1) { - c = token(fd, buf, len - 1, ' '); - if (c < 0) - goto out; - else if (c == '\n') - break; - - if (!strcmp(buf, feature)) { - *have_it = 1; - goto out; - } - } - out: - if (*have_it == 0) - printk("No\n"); - else if (*have_it == 1) - printk("Yes\n"); - os_close_file(fd); - return 1; -} - -#if 0 /* - * This doesn't work in tt mode, plus it's causing compilation problems - * for some people. - */ -static void disable_lcall(void) +void arch_check_bugs(void) { - struct modify_ldt_ldt_s ldt; - int err; + struct sigaction old, new; - bzero(&ldt, sizeof(ldt)); - ldt.entry_number = 7; - ldt.base_addr = 0; - ldt.limit = 0; - err = modify_ldt(1, &ldt, sizeof(ldt)); - if (err) - printk(UM_KERN_ERR "Failed to disable lcall7 - errno = %d\n", - errno); -} -#endif + printk(UM_KERN_INFO "Checking for host processor cmov support..."); + new.sa_handler = cmov_sigill_test_handler; -void arch_init_thread(void) -{ -#if 0 - disable_lcall(); -#endif -} + /* Make sure that SIGILL is enabled after the handler longjmps back */ + new.sa_flags = SA_NODEFER; + sigemptyset(&new.sa_mask); + sigaction(SIGILL, &new, &old); -void arch_check_bugs(void) -{ - int have_it; + if (setjmp(cmov_test_return) == 0) { + unsigned long foo = 0; + __asm__ __volatile__("cmovz %0, %1" : "=r" (foo) : "0" (foo)); + printk(UM_KERN_CONT "Yes\n"); + } else + printk(UM_KERN_CONT "No\n"); - if (os_access("/proc/cpuinfo", OS_ACC_R_OK) < 0) { - printk(UM_KERN_ERR "/proc/cpuinfo not available - skipping CPU " - "capability checks\n"); - return; - } - if (check_cpu_flag("cmov", &have_it)) - 
host_has_cmov = have_it; - if (check_cpu_flag("xmm", &have_it)) - host_has_xmm = have_it; + sigaction(SIGILL, &old, &new); } -int arch_handle_signal(int sig, struct uml_pt_regs *regs) +void arch_examine_signal(int sig, struct uml_pt_regs *regs) { unsigned char tmp[2]; @@ -176,24 +52,25 @@ int arch_handle_signal(int sig, struct uml_pt_regs *regs) * SIGILL in init. */ if ((sig != SIGILL) || (TASK_PID(get_current()) != 1)) - return 0; + return; + + if (copy_from_user_proc(tmp, (void *) UPT_IP(regs), 2)) { + printk(UM_KERN_ERR "SIGILL in init, could not read " + "instructions!\n"); + return; + } - if (copy_from_user_proc(tmp, (void *) UPT_IP(regs), 2)) - panic("SIGILL in init, could not read instructions!\n"); if ((tmp[0] != 0x0f) || ((tmp[1] & 0xf0) != 0x40)) - return 0; + return; if (host_has_cmov == 0) - panic("SIGILL caused by cmov, which this processor doesn't " - "implement, boot a filesystem compiled for older " - "processors"); + printk(UM_KERN_ERR "SIGILL caused by cmov, which this " + "processor doesn't implement. 
Boot a filesystem " + "compiled for older processors"); else if (host_has_cmov == 1) - panic("SIGILL caused by cmov, which this processor claims to " - "implement"); - else if (host_has_cmov == -1) - panic("SIGILL caused by cmov, couldn't tell if this processor " - "implements it, boot a filesystem compiled for older " - "processors"); - else panic("Bad value for host_has_cmov (%d)", host_has_cmov); - return 0; + printk(UM_KERN_ERR "SIGILL caused by cmov, which this " + "processor claims to implement"); + else + printk(UM_KERN_ERR "Bad value for host_has_cmov (%d)", + host_has_cmov); } diff --git a/arch/um/sys-i386/ldt.c b/arch/um/sys-i386/ldt.c index 67c0958..a34263e 100644 --- a/arch/um/sys-i386/ldt.c +++ b/arch/um/sys-i386/ldt.c @@ -3,8 +3,9 @@ * Licensed under the GPL */ -#include "linux/mm.h" -#include "asm/unistd.h" +#include <linux/mm.h> +#include <linux/sched.h> +#include <asm/unistd.h> #include "os.h" #include "proc_mm.h" #include "skas.h" @@ -146,7 +147,7 @@ static int read_ldt(void __user * ptr, unsigned long bytecount) if (ptrace_ldt) return read_ldt_from_host(ptr, bytecount); - down(&ldt->semaphore); + mutex_lock(&ldt->lock); if (ldt->entry_count <= LDT_DIRECT_ENTRIES) { size = LDT_ENTRY_SIZE*LDT_DIRECT_ENTRIES; if (size > bytecount) @@ -170,7 +171,7 @@ static int read_ldt(void __user * ptr, unsigned long bytecount) ptr += size; } } - up(&ldt->semaphore); + mutex_unlock(&ldt->lock); if (bytecount == 0 || err == -EFAULT) goto out; @@ -228,7 +229,7 @@ static int write_ldt(void __user * ptr, unsigned long bytecount, int func) } if (!ptrace_ldt) - down(&ldt->semaphore); + mutex_lock(&ldt->lock); err = write_ldt_entry(mm_idp, func, &ldt_info, &addr, 1); if (err) @@ -288,7 +289,7 @@ static int write_ldt(void __user * ptr, unsigned long bytecount, int func) err = 0; out_unlock: - up(&ldt->semaphore); + mutex_unlock(&ldt->lock); out: return err; } @@ -395,7 +396,7 @@ long init_new_ldt(struct mm_context *new_mm, struct mm_context *from_mm) if (!ptrace_ldt) - 
init_MUTEX(&new_mm->ldt.semaphore); + mutex_init(&new_mm->ldt.lock); if (!from_mm) { memset(&desc, 0, sizeof(desc)); @@ -455,7 +456,7 @@ long init_new_ldt(struct mm_context *new_mm, struct mm_context *from_mm) * i.e., we have to use the stub for modify_ldt, which * can't handle the big read buffer of up to 64kB. */ - down(&from_mm->ldt.semaphore); + mutex_lock(&from_mm->ldt.lock); if (from_mm->ldt.entry_count <= LDT_DIRECT_ENTRIES) memcpy(new_mm->ldt.u.entries, from_mm->ldt.u.entries, sizeof(new_mm->ldt.u.entries)); @@ -474,7 +475,7 @@ long init_new_ldt(struct mm_context *new_mm, struct mm_context *from_mm) } } new_mm->ldt.entry_count = from_mm->ldt.entry_count; - up(&from_mm->ldt.semaphore); + mutex_unlock(&from_mm->ldt.lock); } out: diff --git a/arch/um/sys-i386/ptrace.c b/arch/um/sys-i386/ptrace.c index bd3da8a..6b44999 100644 --- a/arch/um/sys-i386/ptrace.c +++ b/arch/um/sys-i386/ptrace.c @@ -8,11 +8,11 @@ #include "asm/uaccess.h" #include "skas.h" -extern int arch_switch_tls(struct task_struct *from, struct task_struct *to); +extern int arch_switch_tls(struct task_struct *to); -void arch_switch_to(struct task_struct *from, struct task_struct *to) +void arch_switch_to(struct task_struct *to) { - int err = arch_switch_tls(from, to); + int err = arch_switch_tls(to); if (!err) return; diff --git a/arch/um/sys-i386/ptrace_user.c b/arch/um/sys-i386/ptrace_user.c index 5cf97bc..0b10c3e 100644 --- a/arch/um/sys-i386/ptrace_user.c +++ b/arch/um/sys-i386/ptrace_user.c @@ -19,17 +19,3 @@ int ptrace_setregs(long pid, unsigned long *regs) return -errno; return 0; } - -int ptrace_getfpregs(long pid, unsigned long *regs) -{ - if (ptrace(PTRACE_GETFPREGS, pid, 0, regs) < 0) - return -errno; - return 0; -} - -int ptrace_setfpregs(long pid, unsigned long *regs) -{ - if (ptrace(PTRACE_SETFPREGS, pid, 0, regs) < 0) - return -errno; - return 0; -} diff --git a/arch/um/sys-i386/signal.c b/arch/um/sys-i386/signal.c index 19053d4..fd0c25a 100644 --- a/arch/um/sys-i386/signal.c +++ 
b/arch/um/sys-i386/signal.c @@ -168,12 +168,13 @@ static int copy_sc_from_user(struct pt_regs *regs, struct sigcontext __user *from) { struct sigcontext sc; - int err; + int err, pid; err = copy_from_user(&sc, from, sizeof(sc)); if (err) return err; + pid = userspace_pid[current_thread_info()->cpu]; copy_sc(&regs->regs, &sc); if (have_fpx_regs) { struct user_fxsr_struct fpx; @@ -187,8 +188,7 @@ static int copy_sc_from_user(struct pt_regs *regs, if (err) return 1; - err = restore_fpx_registers(userspace_pid[current_thread->cpu], - (unsigned long *) &fpx); + err = restore_fpx_registers(pid, (unsigned long *) &fpx); if (err < 0) { printk(KERN_ERR "copy_sc_from_user - " "restore_fpx_registers failed, errno = %d\n", @@ -204,8 +204,7 @@ static int copy_sc_from_user(struct pt_regs *regs, if (err) return 1; - err = restore_fp_registers(userspace_pid[current_thread->cpu], - (unsigned long *) &fp); + err = restore_fp_registers(pid, (unsigned long *) &fp); if (err < 0) { printk(KERN_ERR "copy_sc_from_user - " "restore_fp_registers failed, errno = %d\n", @@ -223,7 +222,7 @@ static int copy_sc_to_user(struct sigcontext __user *to, { struct sigcontext sc; struct faultinfo * fi = &current->thread.arch.faultinfo; - int err; + int err, pid; sc.gs = REGS_GS(regs->regs.gp); sc.fs = REGS_FS(regs->regs.gp); @@ -249,11 +248,11 @@ static int copy_sc_to_user(struct sigcontext __user *to, to_fp = (to_fp ? 
to_fp : (struct _fpstate __user *) (to + 1)); sc.fpstate = to_fp; + pid = userspace_pid[current_thread_info()->cpu]; if (have_fpx_regs) { struct user_fxsr_struct fpx; - err = save_fpx_registers(userspace_pid[current_thread->cpu], - (unsigned long *) &fpx); + err = save_fpx_registers(pid, (unsigned long *) &fpx); if (err < 0){ printk(KERN_ERR "copy_sc_to_user - save_fpx_registers " "failed, errno = %d\n", err); @@ -276,8 +275,7 @@ static int copy_sc_to_user(struct sigcontext __user *to, else { struct user_i387_struct fp; - err = save_fp_registers(userspace_pid[current_thread->cpu], - (unsigned long *) &fp); + err = save_fp_registers(pid, (unsigned long *) &fp); if (copy_to_user(to_fp, &fp, sizeof(struct user_i387_struct))) return 1; } diff --git a/arch/um/sys-i386/stub.S b/arch/um/sys-i386/stub.S index e730772..7699e89 100644 --- a/arch/um/sys-i386/stub.S +++ b/arch/um/sys-i386/stub.S @@ -7,7 +7,7 @@ .globl batch_syscall_stub batch_syscall_stub: /* load pointer to first operation */ - mov $(ASM_STUB_DATA+8), %esp + mov $(STUB_DATA+8), %esp again: /* load length of additional data */ @@ -15,12 +15,12 @@ again: /* if(length == 0) : end of list */ /* write possible 0 to header */ - mov %eax, ASM_STUB_DATA+4 + mov %eax, STUB_DATA+4 cmpl $0, %eax jz done /* save current pointer */ - mov %esp, ASM_STUB_DATA+4 + mov %esp, STUB_DATA+4 /* skip additional data */ add %eax, %esp @@ -46,7 +46,7 @@ again: done: /* save return value */ - mov %eax, ASM_STUB_DATA + mov %eax, STUB_DATA /* stop */ int3 diff --git a/arch/um/sys-i386/stub_segv.c b/arch/um/sys-i386/stub_segv.c index b3999cb..28ccf73 100644 --- a/arch/um/sys-i386/stub_segv.c +++ b/arch/um/sys-i386/stub_segv.c @@ -1,32 +1,17 @@ /* - * Copyright (C) 2004 Jeff Dike (jdike@addtoit.com) + * Copyright (C) 2004 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) * Licensed under the GPL */ -#include <signal.h> -#include <sys/select.h> /* The only way I can see to get sigset_t */ -#include <asm/unistd.h> -#include "as-layout.h" 
-#include "uml-config.h" #include "sysdep/stub.h" #include "sysdep/sigcontext.h" -#include "sysdep/faultinfo.h" void __attribute__ ((__section__ (".__syscall_stub"))) stub_segv_handler(int sig) { struct sigcontext *sc = (struct sigcontext *) (&sig + 1); - int pid; GET_FAULTINFO_FROM_SC(*((struct faultinfo *) STUB_DATA), sc); - pid = stub_syscall0(__NR_getpid); - stub_syscall2(__NR_kill, pid, SIGUSR1); - - /* Load pointer to sigcontext into esp, since we need to leave - * the stack in its original form when we do the sigreturn here, by - * hand. - */ - __asm__ __volatile__("mov %0,%%esp ; movl %1, %%eax ; " - "int $0x80" : : "a" (sc), "g" (__NR_sigreturn)); + trap_myself(); } diff --git a/arch/um/sys-i386/sys_call_table.S b/arch/um/sys-i386/sys_call_table.S index 12d4148..00e5f520 100644 --- a/arch/um/sys-i386/sys_call_table.S +++ b/arch/um/sys-i386/sys_call_table.S @@ -9,4 +9,9 @@ #define old_mmap old_mmap_i386 +.section .rodata,"a" + #include "../../x86/kernel/syscall_table_32.S" + +ENTRY(syscall_table_size) +.long .-sys_call_table diff --git a/arch/um/sys-i386/tls.c b/arch/um/sys-i386/tls.c index fcaff86..c6c7131 100644 --- a/arch/um/sys-i386/tls.c +++ b/arch/um/sys-i386/tls.c @@ -26,6 +26,11 @@ int do_set_thread_area(struct user_desc *info) cpu = get_cpu(); ret = os_set_thread_area(info, userspace_pid[cpu]); put_cpu(); + + if (ret) + printk(KERN_ERR "PTRACE_SET_THREAD_AREA failed, err = %d, " + "index = %d\n", ret, info->entry_number); + return ret; } @@ -37,6 +42,11 @@ int do_get_thread_area(struct user_desc *info) cpu = get_cpu(); ret = os_get_thread_area(info, userspace_pid[cpu]); put_cpu(); + + if (ret) + printk(KERN_ERR "PTRACE_GET_THREAD_AREA failed, err = %d, " + "index = %d\n", ret, info->entry_number); + return ret; } @@ -172,7 +182,7 @@ void clear_flushed_tls(struct task_struct *task) * SKAS patch. 
*/ -int arch_switch_tls(struct task_struct *from, struct task_struct *to) +int arch_switch_tls(struct task_struct *to) { if (!host_supports_tls) return 0; @@ -225,7 +235,8 @@ out: } /* XXX: use do_get_thread_area to read the host value? I'm not at all sure! */ -static int get_tls_entry(struct task_struct* task, struct user_desc *info, int idx) +static int get_tls_entry(struct task_struct *task, struct user_desc *info, + int idx) { struct thread_struct *t = &task->thread; @@ -263,7 +274,7 @@ clear: goto out; } -asmlinkage int sys_set_thread_area(struct user_desc __user *user_desc) +int sys_set_thread_area(struct user_desc __user *user_desc) { struct user_desc info; int idx, ret; @@ -298,7 +309,7 @@ asmlinkage int sys_set_thread_area(struct user_desc __user *user_desc) * i386. However the only possible error are caused by bugs. */ int ptrace_set_thread_area(struct task_struct *child, int idx, - struct user_desc __user *user_desc) + struct user_desc __user *user_desc) { struct user_desc info; @@ -311,7 +322,7 @@ int ptrace_set_thread_area(struct task_struct *child, int idx, return set_tls_entry(child, &info, idx, 0); } -asmlinkage int sys_get_thread_area(struct user_desc __user *user_desc) +int sys_get_thread_area(struct user_desc __user *user_desc) { struct user_desc info; int idx, ret; @@ -355,10 +366,9 @@ out: return ret; } - /* - * XXX: This part is probably common to i386 and x86-64. Don't create a common - * file for now, do that when implementing x86-64 support. + * This code is really i386-only, but it detects and logs x86_64 GDT indexes + * if a 32-bit UML is running on a 64-bit host. 
*/ static int __init __setup_host_supports_tls(void) { @@ -367,13 +377,16 @@ static int __init __setup_host_supports_tls(void) printk(KERN_INFO "Host TLS support detected\n"); printk(KERN_INFO "Detected host type: "); switch (host_gdt_entry_tls_min) { - case GDT_ENTRY_TLS_MIN_I386: - printk("i386\n"); - break; - case GDT_ENTRY_TLS_MIN_X86_64: - printk("x86_64\n"); - break; + case GDT_ENTRY_TLS_MIN_I386: + printk(KERN_CONT "i386"); + break; + case GDT_ENTRY_TLS_MIN_X86_64: + printk(KERN_CONT "x86_64"); + break; } + printk(KERN_CONT " (GDT indexes %d to %d)\n", + host_gdt_entry_tls_min, + host_gdt_entry_tls_min + GDT_ENTRY_TLS_ENTRIES); } else printk(KERN_ERR " Host TLS support NOT detected! " "TLS support inside UML will not work\n"); diff --git a/arch/um/sys-ppc/Makefile b/arch/um/sys-ppc/Makefile index a9814a7..0890152 100644 --- a/arch/um/sys-ppc/Makefile +++ b/arch/um/sys-ppc/Makefile @@ -6,7 +6,7 @@ OBJ = built-in.o OBJS = ptrace.o sigcontext.o semaphore.o checksum.o miscthings.o misc.o \ ptrace_user.o sysrq.o -EXTRA_AFLAGS := -DCONFIG_PPC32 -I. -I$(TOPDIR)/arch/ppc/kernel +EXTRA_AFLAGS := -DCONFIG_PPC32 -I. 
-I$(srctree)/arch/ppc/kernel all: $(OBJ) @@ -22,25 +22,25 @@ sigcontext.o: sigcontext.c semaphore.c: rm -f $@ - ln -s $(TOPDIR)/arch/ppc/kernel/$@ $@ + ln -s $(srctree)/arch/ppc/kernel/$@ $@ checksum.S: rm -f $@ - ln -s $(TOPDIR)/arch/ppc/lib/$@ $@ + ln -s $(srctree)/arch/ppc/lib/$@ $@ mk_defs.c: rm -f $@ - ln -s $(TOPDIR)/arch/ppc/kernel/$@ $@ + ln -s $(srctree)/arch/ppc/kernel/$@ $@ ppc_defs.head: rm -f $@ - ln -s $(TOPDIR)/arch/ppc/kernel/$@ $@ + ln -s $(srctree)/arch/ppc/kernel/$@ $@ ppc_defs.h: mk_defs.c ppc_defs.head \ - $(TOPDIR)/include/asm-ppc/mmu.h \ - $(TOPDIR)/include/asm-ppc/processor.h \ - $(TOPDIR)/include/asm-ppc/pgtable.h \ - $(TOPDIR)/include/asm-ppc/ptrace.h + $(srctree)/include/asm-ppc/mmu.h \ + $(srctree)/include/asm-ppc/processor.h \ + $(srctree)/include/asm-ppc/pgtable.h \ + $(srctree)/include/asm-ppc/ptrace.h # $(CC) $(CFLAGS) -S mk_defs.c cp ppc_defs.head ppc_defs.h # for bk, this way we can write to the file even if it's not checked out @@ -56,13 +56,13 @@ ppc_defs.h: mk_defs.c ppc_defs.head \ checksum.o: checksum.S rm -f asm - ln -s $(TOPDIR)/include/asm-ppc asm + ln -s $(srctree)/include/asm-ppc asm $(CC) $(EXTRA_AFLAGS) $(KBUILD_AFLAGS) -D__ASSEMBLY__ -D__UM_PPC__ -c $< -o $*.o rm -f asm misc.o: misc.S ppc_defs.h rm -f asm - ln -s $(TOPDIR)/include/asm-ppc asm + ln -s $(srctree)/include/asm-ppc asm $(CC) $(EXTRA_AFLAGS) $(KBUILD_AFLAGS) -D__ASSEMBLY__ -D__UM_PPC__ -c $< -o $*.o rm -f asm diff --git a/arch/um/sys-x86_64/bug.c b/arch/um/sys-x86_64/bug.c index a4360b5..e8034e3 100644 --- a/arch/um/sys-x86_64/bug.c +++ b/arch/um/sys-x86_64/bug.c @@ -5,7 +5,8 @@ #include <linux/uaccess.h> -/* Mostly copied from i386/x86_86 - eliminated the eip < PAGE_OFFSET because +/* + * Mostly copied from i386/x86_86 - eliminated the eip < PAGE_OFFSET because * that's not relevant in skas mode. 
*/ diff --git a/arch/um/sys-x86_64/bugs.c b/arch/um/sys-x86_64/bugs.c index 506b676..44e02ba 100644 --- a/arch/um/sys-x86_64/bugs.c +++ b/arch/um/sys-x86_64/bugs.c @@ -6,15 +6,10 @@ #include "sysdep/ptrace.h" -void arch_init_thread(void) -{ -} - void arch_check_bugs(void) { } -int arch_handle_signal(int sig, struct uml_pt_regs *regs) +void arch_examine_signal(int sig, struct uml_pt_regs *regs) { - return 0; } diff --git a/arch/um/sys-x86_64/ptrace.c b/arch/um/sys-x86_64/ptrace.c index b7631b0..f3458d7 100644 --- a/arch/um/sys-x86_64/ptrace.c +++ b/arch/um/sys-x86_64/ptrace.c @@ -5,13 +5,12 @@ * Licensed under the GPL */ -#define __FRAME_OFFSETS -#include <asm/ptrace.h> +#include <linux/mm.h> #include <linux/sched.h> #include <linux/errno.h> -#include <linux/mm.h> +#define __FRAME_OFFSETS +#include <asm/ptrace.h> #include <asm/uaccess.h> -#include <asm/elf.h> /* * determines which flags the user has access to. @@ -24,12 +23,14 @@ int putreg(struct task_struct *child, int regno, unsigned long value) unsigned long tmp; #ifdef TIF_IA32 - /* Some code in the 64bit emulation may not be 64bit clean. - Don't take any chances. */ + /* + * Some code in the 64bit emulation may not be 64bit clean. + * Don't take any chances. 
+ */ if (test_tsk_thread_flag(child, TIF_IA32)) value &= 0xffffffff; #endif - switch (regno){ + switch (regno) { case FS: case GS: case DS: @@ -66,7 +67,7 @@ int poke_user(struct task_struct *child, long addr, long data) if (addr < MAX_REG_OFFSET) return putreg(child, addr, data); else if ((addr >= offsetof(struct user, u_debugreg[0])) && - (addr <= offsetof(struct user, u_debugreg[7]))){ + (addr <= offsetof(struct user, u_debugreg[7]))) { addr -= offsetof(struct user, u_debugreg[0]); addr = addr >> 2; if ((addr == 4) || (addr == 5)) @@ -108,11 +109,10 @@ int peek_user(struct task_struct *child, long addr, long data) return -EIO; tmp = 0; /* Default return condition */ - if (addr < MAX_REG_OFFSET){ + if (addr < MAX_REG_OFFSET) tmp = getreg(child, addr); - } else if ((addr >= offsetof(struct user, u_debugreg[0])) && - (addr <= offsetof(struct user, u_debugreg[7]))){ + (addr <= offsetof(struct user, u_debugreg[7]))) { addr -= offsetof(struct user, u_debugreg[0]); addr = addr >> 2; tmp = child->thread.arch.debugregs[addr]; @@ -127,8 +127,9 @@ int is_syscall(unsigned long addr) int n; n = copy_from_user(&instr, (void __user *) addr, sizeof(instr)); - if (n){ - /* access_process_vm() grants access to vsyscall and stub, + if (n) { + /* + * access_process_vm() grants access to vsyscall and stub, * while copy_from_user doesn't. Maybe access_process_vm is * slow, but that doesn't matter, since it will be called only * in case of singlestepping, if copy_from_user failed. 
@@ -155,7 +156,7 @@ int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *child) return err; n = copy_to_user(buf, fpregs, sizeof(fpregs)); - if(n > 0) + if (n > 0) return -EFAULT; return n; diff --git a/arch/um/sys-x86_64/ptrace_user.c b/arch/um/sys-x86_64/ptrace_user.c index b5f9c33..c57a496 100644 --- a/arch/um/sys-x86_64/ptrace_user.c +++ b/arch/um/sys-x86_64/ptrace_user.c @@ -4,55 +4,19 @@ * Licensed under the GPL */ -#include <stddef.h> #include <errno.h> #include "ptrace_user.h" -#include "user.h" -#include "kern_constants.h" int ptrace_getregs(long pid, unsigned long *regs_out) { - if(ptrace(PTRACE_GETREGS, pid, 0, regs_out) < 0) - return(-errno); - return(0); -} - -int ptrace_setregs(long pid, unsigned long *regs) -{ - if(ptrace(PTRACE_SETREGS, pid, 0, regs) < 0) - return(-errno); + if (ptrace(PTRACE_GETREGS, pid, 0, regs_out) < 0) + return -errno; return(0); } -int ptrace_setfpregs(long pid, unsigned long *regs) +int ptrace_setregs(long pid, unsigned long *regs_out) { - if (ptrace(PTRACE_SETFPREGS, pid, 0, regs) < 0) + if (ptrace(PTRACE_SETREGS, pid, 0, regs_out) < 0) return -errno; - return 0; -} - -void ptrace_pokeuser(unsigned long addr, unsigned long data) -{ - panic("ptrace_pokeuser"); -} - -#define DS 184 -#define ES 192 -#define __USER_DS 0x2b - -void arch_enter_kernel(void *task, int pid) -{ -} - -void arch_leave_kernel(void *task, int pid) -{ -#ifdef UM_USER_CS - if(ptrace(PTRACE_POKEUSR, pid, CS, UM_USER_CS) < 0) - printk("POKEUSR CS failed"); -#endif - - if(ptrace(PTRACE_POKEUSR, pid, DS, __USER_DS) < 0) - printk("POKEUSR DS failed"); - if(ptrace(PTRACE_POKEUSR, pid, ES, __USER_DS) < 0) - printk("POKEUSR ES failed"); + return(0); } diff --git a/arch/um/sys-x86_64/signal.c b/arch/um/sys-x86_64/signal.c index 1407018..1a899a7 100644 --- a/arch/um/sys-x86_64/signal.c +++ b/arch/um/sys-x86_64/signal.c @@ -81,7 +81,7 @@ static int copy_sc_from_user(struct pt_regs *regs, if (err) return 1; - err = 
restore_fp_registers(userspace_pid[current_thread->cpu], + err = restore_fp_registers(userspace_pid[current_thread_info()->cpu], (unsigned long *) &fp); if (err < 0) { printk(KERN_ERR "copy_sc_from_user - " @@ -143,7 +143,7 @@ static int copy_sc_to_user(struct sigcontext __user *to, if (err) return 1; - err = save_fp_registers(userspace_pid[current_thread->cpu], + err = save_fp_registers(userspace_pid[current_thread_info()->cpu], (unsigned long *) &fp); if (err < 0) { printk(KERN_ERR "copy_sc_from_user - restore_fp_registers " diff --git a/arch/um/sys-x86_64/stub.S b/arch/um/sys-x86_64/stub.S index 4afe204..5687687 100644 --- a/arch/um/sys-x86_64/stub.S +++ b/arch/um/sys-x86_64/stub.S @@ -8,18 +8,18 @@ syscall_stub: /* We don't have 64-bit constants, so this constructs the address * we need. */ - movq $(ASM_STUB_DATA >> 32), %rbx + movq $(STUB_DATA >> 32), %rbx salq $32, %rbx - movq $(ASM_STUB_DATA & 0xffffffff), %rcx + movq $(STUB_DATA & 0xffffffff), %rcx or %rcx, %rbx movq %rax, (%rbx) int3 .globl batch_syscall_stub batch_syscall_stub: - mov $(ASM_STUB_DATA >> 32), %rbx + mov $(STUB_DATA >> 32), %rbx sal $32, %rbx - mov $(ASM_STUB_DATA & 0xffffffff), %rax + mov $(STUB_DATA & 0xffffffff), %rax or %rax, %rbx /* load pointer to first operation */ mov %rbx, %rsp diff --git a/arch/um/sys-x86_64/stub_segv.c b/arch/um/sys-x86_64/stub_segv.c index 3afb590..ced051a 100644 --- a/arch/um/sys-x86_64/stub_segv.c +++ b/arch/um/sys-x86_64/stub_segv.c @@ -1,51 +1,22 @@ /* - * Copyright (C) 2004 Jeff Dike (jdike@addtoit.com) + * Copyright (C) 2004 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) * Licensed under the GPL */ -#include <stddef.h> #include <signal.h> -#include <asm/unistd.h> #include "as-layout.h" -#include "uml-config.h" -#include "sysdep/sigcontext.h" -#include "sysdep/faultinfo.h" #include "sysdep/stub.h" - -/* Copied from sys-x86_64/signal.c - Can't find an equivalent definition - * in the libc headers anywhere. 
- */ -struct rt_sigframe -{ - char *pretcode; - struct ucontext uc; - struct siginfo info; -}; - -/* Copied here from <linux/kernel.h> - we're userspace. */ -#define container_of(ptr, type, member) ({ \ - const typeof( ((type *)0)->member ) *__mptr = (ptr); \ - (type *)( (char *)__mptr - offsetof(type,member) );}) +#include "sysdep/faultinfo.h" +#include "sysdep/sigcontext.h" void __attribute__ ((__section__ (".__syscall_stub"))) stub_segv_handler(int sig) { struct ucontext *uc; - int pid; __asm__ __volatile__("movq %%rdx, %0" : "=g" (uc) :); GET_FAULTINFO_FROM_SC(*((struct faultinfo *) STUB_DATA), &uc->uc_mcontext); - - pid = stub_syscall0(__NR_getpid); - stub_syscall2(__NR_kill, pid, SIGUSR1); - - /* sys_sigreturn expects that the stack pointer will be 8 bytes into - * the signal frame. So, we use the ucontext pointer, which we know - * already, to get the signal frame pointer, and add 8 to that. - */ - __asm__ __volatile__("movq %0, %%rsp; movq %1, %%rax ; syscall": : - "g" ((unsigned long) - container_of(uc, struct rt_sigframe, uc) + 8), - "g" (__NR_rt_sigreturn)); + trap_myself(); } + diff --git a/arch/um/sys-x86_64/syscall_table.c b/arch/um/sys-x86_64/syscall_table.c index 71b2ae4..c128eb8 100644 --- a/arch/um/sys-x86_64/syscall_table.c +++ b/arch/um/sys-x86_64/syscall_table.c @@ -1,5 +1,7 @@ -/* System call table for UML/x86-64, copied from arch/x86_64/kernel/syscall.c - * with some changes for UML. */ +/* + * System call table for UML/x86-64, copied from arch/x86_64/kernel/syscall.c + * with some changes for UML. + */ #include <linux/linkage.h> #include <linux/sys.h> @@ -8,22 +10,26 @@ #define __NO_STUBS -/* Below you can see, in terms of #define's, the differences between the x86-64 - * and the UML syscall table. */ +/* + * Below you can see, in terms of #define's, the differences between the x86-64 + * and the UML syscall table. + */ /* Not going to be implemented by UML, since we have no hardware. 
*/ #define stub_iopl sys_ni_syscall #define sys_ioperm sys_ni_syscall -/* The UML TLS problem. Note that x86_64 does not implement this, so the below - * is needed only for the ia32 compatibility. */ -/*#define sys_set_thread_area sys_ni_syscall -#define sys_get_thread_area sys_ni_syscall*/ +/* + * The UML TLS problem. Note that x86_64 does not implement this, so the below + * is needed only for the ia32 compatibility. + */ /* On UML we call it this way ("old" means it's not mmap2) */ #define sys_mmap old_mmap -/* On x86-64 sys_uname is actually sys_newuname plus a compatibility trick. - * See arch/x86_64/kernel/sys_x86_64.c */ +/* + * On x86-64 sys_uname is actually sys_newuname plus a compatibility trick. + * See arch/x86_64/kernel/sys_x86_64.c + */ #define sys_uname sys_uname64 #define stub_clone sys_clone @@ -46,8 +52,19 @@ typedef void (*sys_call_ptr_t)(void); extern void sys_ni_syscall(void); -sys_call_ptr_t sys_call_table[UM_NR_syscall_max+1] __cacheline_aligned = { - /* Smells like a like a compiler bug -- it doesn't work when the & below is removed. */ - [0 ... UM_NR_syscall_max] = &sys_ni_syscall, +/* + * We used to have a trick here which made sure that holes in the + * x86_64 table were filled in with sys_ni_syscall, but a comment in + * unistd_64.h says that holes aren't allowed, so the trick was + * removed. + * The trick looked like this + * [0 ... UM_NR_syscall_max] = &sys_ni_syscall + * before including unistd_64.h - the later initializations overwrote + * the sys_ni_syscall filler. 
+ */ + +sys_call_ptr_t sys_call_table[] __cacheline_aligned = { #include <asm-x86/unistd_64.h> }; + +int syscall_table_size = sizeof(sys_call_table); diff --git a/arch/um/sys-x86_64/syscalls.c b/arch/um/sys-x86_64/syscalls.c index 86f6b18..f1199fd 100644 --- a/arch/um/sys-x86_64/syscalls.c +++ b/arch/um/sys-x86_64/syscalls.c @@ -48,7 +48,9 @@ long arch_prctl(struct task_struct *task, int code, unsigned long __user *addr) switch (code) { case ARCH_SET_FS: case ARCH_SET_GS: - restore_registers(pid, &current->thread.regs.regs); + ret = restore_registers(pid, &current->thread.regs.regs); + if (ret) + return ret; break; case ARCH_GET_FS: case ARCH_GET_GS: @@ -70,10 +72,10 @@ long arch_prctl(struct task_struct *task, int code, unsigned long __user *addr) switch (code) { case ARCH_SET_FS: current->thread.arch.fs = (unsigned long) ptr; - save_registers(pid, &current->thread.regs.regs); + ret = save_registers(pid, &current->thread.regs.regs); break; case ARCH_SET_GS: - save_registers(pid, &current->thread.regs.regs); + ret = save_registers(pid, &current->thread.regs.regs); break; case ARCH_GET_FS: ret = put_user(tmp, addr); @@ -105,7 +107,7 @@ long sys_clone(unsigned long clone_flags, unsigned long newsp, return ret; } -void arch_switch_to(struct task_struct *from, struct task_struct *to) +void arch_switch_to(struct task_struct *to) { if ((to->thread.arch.fs == 0) || (to->mm == NULL)) return; diff --git a/arch/um/sys-x86_64/sysrq.c b/arch/um/sys-x86_64/sysrq.c index 7654440..f4f82be 100644 --- a/arch/um/sys-x86_64/sysrq.c +++ b/arch/um/sys-x86_64/sysrq.c @@ -4,32 +4,33 @@ * Licensed under the GPL */ -#include "linux/kernel.h" -#include "linux/utsname.h" -#include "linux/module.h" -#include "asm/current.h" -#include "asm/ptrace.h" +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/utsname.h> +#include <asm/current.h> +#include <asm/ptrace.h> #include "sysrq.h" -void __show_regs(struct pt_regs * regs) +void __show_regs(struct pt_regs *regs) { printk("\n");
print_modules(); - printk("Pid: %d, comm: %.20s %s %s\n", task_pid_nr(current), + printk(KERN_INFO "Pid: %d, comm: %.20s %s %s\n", task_pid_nr(current), current->comm, print_tainted(), init_utsname()->release); - printk("RIP: %04lx:[<%016lx>] ", PT_REGS_CS(regs) & 0xffff, + printk(KERN_INFO "RIP: %04lx:[<%016lx>]\n", PT_REGS_CS(regs) & 0xffff, PT_REGS_RIP(regs)); - printk("\nRSP: %016lx EFLAGS: %08lx\n", PT_REGS_RSP(regs), + printk(KERN_INFO "RSP: %016lx EFLAGS: %08lx\n", PT_REGS_RSP(regs), PT_REGS_EFLAGS(regs)); - printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", + printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n", PT_REGS_RAX(regs), PT_REGS_RBX(regs), PT_REGS_RCX(regs)); - printk("RDX: %016lx RSI: %016lx RDI: %016lx\n", + printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n", PT_REGS_RDX(regs), PT_REGS_RSI(regs), PT_REGS_RDI(regs)); - printk("RBP: %016lx R08: %016lx R09: %016lx\n", + printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n", PT_REGS_RBP(regs), PT_REGS_R8(regs), PT_REGS_R9(regs)); - printk("R10: %016lx R11: %016lx R12: %016lx\n", + printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n", PT_REGS_R10(regs), PT_REGS_R11(regs), PT_REGS_R12(regs)); - printk("R13: %016lx R14: %016lx R15: %016lx\n", + printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n", PT_REGS_R13(regs), PT_REGS_R14(regs), PT_REGS_R15(regs)); } diff --git a/arch/um/sys-x86_64/um_module.c b/arch/um/sys-x86_64/um_module.c index 8b8eff1..3dead39 100644 --- a/arch/um/sys-x86_64/um_module.c +++ b/arch/um/sys-x86_64/um_module.c @@ -1,7 +1,7 @@ #include <linux/vmalloc.h> #include <linux/moduleloader.h> -/*Copied from i386 arch/i386/kernel/module.c */ +/* Copied from i386 arch/i386/kernel/module.c */ void *module_alloc(unsigned long size) { if (size == 0) @@ -13,7 +13,9 @@ void *module_alloc(unsigned long size) void module_free(struct module *mod, void *module_region) { vfree(module_region); - /* FIXME: If module_region == mod->init_region, trim exception - table entries. 
*/ + /* + * FIXME: If module_region == mod->init_region, trim exception + * table entries. + */ } diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 59eef1c..4348211 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -465,6 +465,9 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT Calgary anyway, pass 'iommu=calgary' on the kernel command line. If unsure, say Y. +config IOMMU_HELPER + def_bool (CALGARY_IOMMU || GART_IOMMU) + # need this always selected by IOMMU for the VIA workaround config SWIOTLB bool diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 0db0a62..8022d3c 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -722,7 +722,9 @@ ia32_sys_call_table: .quad sys_epoll_pwait .quad compat_sys_utimensat /* 320 */ .quad compat_sys_signalfd - .quad compat_sys_timerfd + .quad sys_timerfd_create .quad sys_eventfd .quad sys32_fallocate + .quad compat_sys_timerfd_settime /* 325 */ + .quad compat_sys_timerfd_gettime ia32_syscall_end: diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 1fe7f04..1b5464c 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -35,6 +35,7 @@ #include <linux/pci.h> #include <linux/delay.h> #include <linux/scatterlist.h> +#include <linux/iommu-helper.h> #include <asm/gart.h> #include <asm/calgary.h> #include <asm/tce.h> @@ -260,22 +261,28 @@ static void iommu_range_reserve(struct iommu_table *tbl, spin_unlock_irqrestore(&tbl->it_lock, flags); } -static unsigned long iommu_range_alloc(struct iommu_table *tbl, - unsigned int npages) +static unsigned long iommu_range_alloc(struct device *dev, + struct iommu_table *tbl, + unsigned int npages) { unsigned long flags; unsigned long offset; + unsigned long boundary_size; + + boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, + PAGE_SIZE) >> PAGE_SHIFT; BUG_ON(npages == 0); spin_lock_irqsave(&tbl->it_lock, flags); - offset = find_next_zero_string(tbl->it_map, tbl->it_hint, - tbl->it_size, 
npages); + offset = iommu_area_alloc(tbl->it_map, tbl->it_size, tbl->it_hint, + npages, 0, boundary_size, 0); if (offset == ~0UL) { tbl->chip_ops->tce_cache_blast(tbl); - offset = find_next_zero_string(tbl->it_map, 0, - tbl->it_size, npages); + + offset = iommu_area_alloc(tbl->it_map, tbl->it_size, 0, + npages, 0, boundary_size, 0); if (offset == ~0UL) { printk(KERN_WARNING "Calgary: IOMMU full.\n"); spin_unlock_irqrestore(&tbl->it_lock, flags); @@ -286,7 +293,6 @@ static unsigned long iommu_range_alloc(struct iommu_table *tbl, } } - set_bit_string(tbl->it_map, offset, npages); tbl->it_hint = offset + npages; BUG_ON(tbl->it_hint > tbl->it_size); @@ -295,13 +301,13 @@ static unsigned long iommu_range_alloc(struct iommu_table *tbl, return offset; } -static dma_addr_t iommu_alloc(struct iommu_table *tbl, void *vaddr, - unsigned int npages, int direction) +static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, + void *vaddr, unsigned int npages, int direction) { unsigned long entry; dma_addr_t ret = bad_dma_address; - entry = iommu_range_alloc(tbl, npages); + entry = iommu_range_alloc(dev, tbl, npages); if (unlikely(entry == bad_dma_address)) goto error; @@ -354,7 +360,7 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, badbit, tbl, dma_addr, entry, npages); } - __clear_bit_string(tbl->it_map, entry, npages); + iommu_area_free(tbl->it_map, entry, npages); spin_unlock_irqrestore(&tbl->it_lock, flags); } @@ -438,7 +444,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg, vaddr = (unsigned long) sg_virt(s); npages = num_dma_pages(vaddr, s->length); - entry = iommu_range_alloc(tbl, npages); + entry = iommu_range_alloc(dev, tbl, npages); if (entry == bad_dma_address) { /* makes sure unmap knows to stop */ s->dma_length = 0; @@ -476,7 +482,7 @@ static dma_addr_t calgary_map_single(struct device *dev, void *vaddr, npages = num_dma_pages(uaddr, size); if (translation_enabled(tbl)) - dma_handle = iommu_alloc(tbl, 
vaddr, npages, direction); + dma_handle = iommu_alloc(dev, tbl, vaddr, npages, direction); else dma_handle = virt_to_bus(vaddr); @@ -516,7 +522,7 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size, if (translation_enabled(tbl)) { /* set up tces to cover the allocated range */ - mapping = iommu_alloc(tbl, ret, npages, DMA_BIDIRECTIONAL); + mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL); if (mapping == bad_dma_address) goto free; diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 845cbec..65f6acb 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -25,6 +25,7 @@ #include <linux/bitops.h> #include <linux/kdebug.h> #include <linux/scatterlist.h> +#include <linux/iommu-helper.h> #include <asm/atomic.h> #include <asm/io.h> #include <asm/mtrr.h> @@ -82,17 +83,24 @@ AGPEXTERN __u32 *agp_gatt_table; static unsigned long next_bit; /* protected by iommu_bitmap_lock */ static int need_flush; /* global flush state. 
set for each gart wrap */ -static unsigned long alloc_iommu(int size) +static unsigned long alloc_iommu(struct device *dev, int size) { unsigned long offset, flags; + unsigned long boundary_size; + unsigned long base_index; + + base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev), + PAGE_SIZE) >> PAGE_SHIFT; + boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, + PAGE_SIZE) >> PAGE_SHIFT; spin_lock_irqsave(&iommu_bitmap_lock, flags); - offset = find_next_zero_string(iommu_gart_bitmap, next_bit, - iommu_pages, size); + offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit, + size, base_index, boundary_size, 0); if (offset == -1) { need_flush = 1; - offset = find_next_zero_string(iommu_gart_bitmap, 0, - iommu_pages, size); + offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, 0, + size, base_index, boundary_size, 0); } if (offset != -1) { set_bit_string(iommu_gart_bitmap, offset, size); @@ -114,7 +122,7 @@ static void free_iommu(unsigned long offset, int size) unsigned long flags; spin_lock_irqsave(&iommu_bitmap_lock, flags); - __clear_bit_string(iommu_gart_bitmap, offset, size); + iommu_area_free(iommu_gart_bitmap, offset, size); spin_unlock_irqrestore(&iommu_bitmap_lock, flags); } @@ -235,7 +243,7 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, size_t size, int dir) { unsigned long npages = to_pages(phys_mem, size); - unsigned long iommu_page = alloc_iommu(npages); + unsigned long iommu_page = alloc_iommu(dev, npages); int i; if (iommu_page == -1) { @@ -355,10 +363,11 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, } /* Map multiple scatterlist entries continuous into the first. 
*/ -static int __dma_map_cont(struct scatterlist *start, int nelems, - struct scatterlist *sout, unsigned long pages) +static int __dma_map_cont(struct device *dev, struct scatterlist *start, + int nelems, struct scatterlist *sout, + unsigned long pages) { - unsigned long iommu_start = alloc_iommu(pages); + unsigned long iommu_start = alloc_iommu(dev, pages); unsigned long iommu_page = iommu_start; struct scatterlist *s; int i; @@ -394,8 +403,8 @@ static int __dma_map_cont(struct scatterlist *start, int nelems, } static inline int -dma_map_cont(struct scatterlist *start, int nelems, struct scatterlist *sout, - unsigned long pages, int need) +dma_map_cont(struct device *dev, struct scatterlist *start, int nelems, + struct scatterlist *sout, unsigned long pages, int need) { if (!need) { BUG_ON(nelems != 1); @@ -403,7 +412,7 @@ dma_map_cont(struct scatterlist *start, int nelems, struct scatterlist *sout, sout->dma_length = start->length; return 0; } - return __dma_map_cont(start, nelems, sout, pages); + return __dma_map_cont(dev, start, nelems, sout, pages); } /* @@ -416,6 +425,8 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) struct scatterlist *s, *ps, *start_sg, *sgmap; int need = 0, nextneed, i, out, start; unsigned long pages = 0; + unsigned int seg_size; + unsigned int max_seg_size; if (nents == 0) return 0; @@ -426,6 +437,8 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) out = 0; start = 0; start_sg = sgmap = sg; + seg_size = 0; + max_seg_size = dma_get_max_seg_size(dev); ps = NULL; /* shut up gcc */ for_each_sg(sg, s, nents, i) { dma_addr_t addr = sg_phys(s); @@ -443,11 +456,13 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) * offset. 
*/ if (!iommu_merge || !nextneed || !need || s->offset || + (s->length + seg_size > max_seg_size) || (ps->offset + ps->length) % PAGE_SIZE) { - if (dma_map_cont(start_sg, i - start, sgmap, - pages, need) < 0) + if (dma_map_cont(dev, start_sg, i - start, + sgmap, pages, need) < 0) goto error; out++; + seg_size = 0; sgmap = sg_next(sgmap); pages = 0; start = i; @@ -455,11 +470,12 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) } } + seg_size += s->length; need = nextneed; pages += to_pages(s->offset, s->length); ps = s; } - if (dma_map_cont(start_sg, i - start, sgmap, pages, need) < 0) + if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0) goto error; out++; flush_gart(); diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index 8344c70..adff556 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -321,6 +321,8 @@ ENTRY(sys_call_table) .long sys_epoll_pwait .long sys_utimensat /* 320 */ .long sys_signalfd - .long sys_timerfd + .long sys_timerfd_create .long sys_eventfd .long sys_fallocate + .long sys_timerfd_settime /* 325 */ + .long sys_timerfd_gettime diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8f94a0b..cf53081 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1739,7 +1739,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr, if (bytes == 8) { gpa_t gpa; struct page *page; - char *addr; + char *kaddr; u64 val; down_read(&current->mm->mmap_sem); @@ -1754,9 +1754,9 @@ static int emulator_cmpxchg_emulated(unsigned long addr, val = *(u64 *)new; page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); - addr = kmap_atomic(page, KM_USER0); - set_64bit((u64 *)(addr + offset_in_page(gpa)), val); - kunmap_atomic(addr, KM_USER0); + kaddr = kmap_atomic(page, KM_USER0); + set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val); + kunmap_atomic(kaddr, KM_USER0); kvm_release_page_dirty(page); emul_write: up_read(&current->mm->mmap_sem); diff --git
a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 4876182..25df1c1 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -21,7 +21,7 @@ else lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o lib-y += thunk_64.o clear_page_64.o copy_page_64.o - lib-y += bitstr_64.o bitops_64.o + lib-y += bitops_64.o lib-y += memmove_64.o memset_64.o lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o endif diff --git a/arch/x86/lib/bitstr_64.c b/arch/x86/lib/bitstr_64.c deleted file mode 100644 index 7445caf..0000000 --- a/arch/x86/lib/bitstr_64.c +++ /dev/null @@ -1,28 +0,0 @@ -#include <linux/module.h> -#include <linux/bitops.h> - -/* Find string of zero bits in a bitmap */ -unsigned long -find_next_zero_string(unsigned long *bitmap, long start, long nbits, int len) -{ - unsigned long n, end, i; - - again: - n = find_next_zero_bit(bitmap, nbits, start); - if (n == -1) - return -1; - - /* could test bitsliced, but it's hardly worth it */ - end = n+len; - if (end > nbits) - return -1; - for (i = n+1; i < end; i++) { - if (test_bit(i, bitmap)) { - start = i+1; - goto again; - } - } - return n; -} - -EXPORT_SYMBOL(find_next_zero_string); diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index c7db504..6c19146 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -272,7 +272,7 @@ static void pgd_dtor(void *pgd) * preallocate which never got a corresponding vma will need to be * freed manually. 
*/ -static void pgd_mop_up_pmds(pgd_t *pgdp) +static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) { int i; @@ -285,7 +285,7 @@ static void pgd_mop_up_pmds(pgd_t *pgdp) pgdp[i] = native_make_pgd(0); paravirt_release_pd(pgd_val(pgd) >> PAGE_SHIFT); - pmd_free(pmd); + pmd_free(mm, pmd); } } } @@ -313,7 +313,7 @@ static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) pmd_t *pmd = pmd_alloc_one(mm, addr); if (!pmd) { - pgd_mop_up_pmds(pgd); + pgd_mop_up_pmds(mm, pgd); return 0; } @@ -333,7 +333,7 @@ static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) return 1; } -static void pgd_mop_up_pmds(pgd_t *pgd) +static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) { } #endif /* CONFIG_X86_PAE */ @@ -352,9 +352,9 @@ pgd_t *pgd_alloc(struct mm_struct *mm) return pgd; } -void pgd_free(pgd_t *pgd) +void pgd_free(struct mm_struct *mm, pgd_t *pgd) { - pgd_mop_up_pmds(pgd); + pgd_mop_up_pmds(mm, pgd); quicklist_free(0, pgd_dtor, pgd); } diff --git a/drivers/Kconfig b/drivers/Kconfig index 3f8a231..d74d9fb 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -52,6 +52,8 @@ source "drivers/i2c/Kconfig" source "drivers/spi/Kconfig" +source "drivers/gpio/Kconfig" + source "drivers/w1/Kconfig" source "drivers/power/Kconfig" diff --git a/drivers/Makefile b/drivers/Makefile index 0ee9a8a..f1c11db 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -5,6 +5,7 @@ # Rewritten to use lists instead of if-statements. 
# +obj-$(CONFIG_HAVE_GPIO_LIB) += gpio/ obj-$(CONFIG_PCI) += pci/ obj-$(CONFIG_PARISC) += parisc/ obj-$(CONFIG_RAPIDIO) += rapidio/ diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index eb1f82f..199ea21 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -38,7 +38,7 @@ #include <linux/dmi.h> #include <linux/moduleparam.h> #include <linux/sched.h> /* need_resched() */ -#include <linux/latency.h> +#include <linux/pm_qos_params.h> #include <linux/clockchips.h> #include <linux/cpuidle.h> @@ -648,7 +648,8 @@ static void acpi_processor_idle(void) if (cx->promotion.state && ((cx->promotion.state - pr->power.states) <= max_cstate)) { if (sleep_ticks > cx->promotion.threshold.ticks && - cx->promotion.state->latency <= system_latency_constraint()) { + cx->promotion.state->latency <= + pm_qos_requirement(PM_QOS_CPU_DMA_LATENCY)) { cx->promotion.count++; cx->demotion.count = 0; if (cx->promotion.count >= @@ -692,7 +693,8 @@ static void acpi_processor_idle(void) * or if the latency of the current state is unacceptable */ if ((pr->power.state - pr->power.states) > max_cstate || - pr->power.state->latency > system_latency_constraint()) { + pr->power.state->latency > + pm_qos_requirement(PM_QOS_CPU_DMA_LATENCY)) { if (cx->demotion.state) next_state = cx->demotion.state; } @@ -1200,7 +1202,7 @@ static int acpi_processor_power_seq_show(struct seq_file *seq, void *offset) "maximum allowed latency: %d usec\n", pr->power.state ? 
pr->power.state - pr->power.states : 0, max_cstate, (unsigned)pr->power.bm_activity, - system_latency_constraint()); + pm_qos_requirement(PM_QOS_CPU_DMA_LATENCY)); seq_puts(seq, "states:\n"); @@ -1718,8 +1720,9 @@ int __cpuinit acpi_processor_power_init(struct acpi_processor *pr, "ACPI: processor limited to max C-state %d\n", max_cstate); first_run++; -#if !defined (CONFIG_CPU_IDLE) && defined (CONFIG_SMP) - register_latency_notifier(&acpi_processor_latency_notifier); +#if !defined(CONFIG_CPU_IDLE) && defined(CONFIG_SMP) + pm_qos_add_notifier(PM_QOS_CPU_DMA_LATENCY, + &acpi_processor_latency_notifier); #endif } @@ -1806,7 +1809,8 @@ int acpi_processor_power_exit(struct acpi_processor *pr, */ cpu_idle_wait(); #ifdef CONFIG_SMP - unregister_latency_notifier(&acpi_processor_latency_notifier); + pm_qos_remove_notifier(PM_QOS_CPU_DMA_LATENCY, + &acpi_processor_latency_notifier); #endif } #endif diff --git a/drivers/ata/sata_inic162x.c b/drivers/ata/sata_inic162x.c index 96e614a..59e65ed 100644 --- a/drivers/ata/sata_inic162x.c +++ b/drivers/ata/sata_inic162x.c @@ -108,17 +108,6 @@ struct inic_port_priv { u8 cached_pirq_mask; }; -static int inic_slave_config(struct scsi_device *sdev) -{ - /* This controller is braindamaged. dma_boundary is 0xffff - * like others but it will lock up the whole machine HARD if - * 65536 byte PRD entry is fed. Reduce maximum segment size. 
- */ - blk_queue_max_segment_size(sdev->request_queue, 65536 - 512); - - return ata_scsi_slave_config(sdev); -} - static struct scsi_host_template inic_sht = { .module = THIS_MODULE, .name = DRV_NAME, @@ -132,7 +121,7 @@ static struct scsi_host_template inic_sht = { .use_clustering = ATA_SHT_USE_CLUSTERING, .proc_name = DRV_NAME, .dma_boundary = ATA_DMA_BOUNDARY, - .slave_configure = inic_slave_config, + .slave_configure = ata_scsi_slave_config, .slave_destroy = ata_scsi_slave_destroy, .bios_param = ata_std_bios_param, }; @@ -730,6 +719,18 @@ static int inic_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) return rc; } + /* + * This controller is braindamaged. dma_boundary is 0xffff + * like others but it will lock up the whole machine HARD if + * 65536 byte PRD entry is fed. Reduce maximum segment size. + */ + rc = pci_set_dma_max_seg_size(pdev, 65536 - 512); + if (rc) { + dev_printk(KERN_ERR, &pdev->dev, + "failed to set the maximum segment size.\n"); + return rc; + } + rc = init_controller(iomap[MMIO_BAR], hpriv->cached_hctl); if (rc) { dev_printk(KERN_ERR, &pdev->dev, diff --git a/drivers/bluetooth/bt3c_cs.c b/drivers/bluetooth/bt3c_cs.c index a18f9b8..7703d6e 100644 --- a/drivers/bluetooth/bt3c_cs.c +++ b/drivers/bluetooth/bt3c_cs.c @@ -704,7 +704,7 @@ static int next_tuple(struct pcmcia_device *handle, tuple_t *tuple, cisparse_t * static int bt3c_config(struct pcmcia_device *link) { - static kio_addr_t base[5] = { 0x3f8, 0x2f8, 0x3e8, 0x2e8, 0x0 }; + static unsigned int base[5] = { 0x3f8, 0x2f8, 0x3e8, 0x2e8, 0x0 }; bt3c_info_t *info = link->priv; tuple_t tuple; u_short buf[256]; diff --git a/drivers/bluetooth/btuart_cs.c b/drivers/bluetooth/btuart_cs.c index dade162..68d1d25 100644 --- a/drivers/bluetooth/btuart_cs.c +++ b/drivers/bluetooth/btuart_cs.c @@ -634,7 +634,7 @@ static int next_tuple(struct pcmcia_device *handle, tuple_t *tuple, cisparse_t * static int btuart_config(struct pcmcia_device *link) { - static kio_addr_t base[5] = { 0x3f8, 
0x2f8, 0x3e8, 0x2e8, 0x0 }; + static unsigned int base[5] = { 0x3f8, 0x2f8, 0x3e8, 0x2e8, 0x0 }; btuart_info_t *info = link->priv; tuple_t tuple; u_short buf[256]; diff --git a/drivers/char/agp/alpha-agp.c b/drivers/char/agp/alpha-agp.c index aa8f3a3..e77c178 100644 --- a/drivers/char/agp/alpha-agp.c +++ b/drivers/char/agp/alpha-agp.c @@ -11,29 +11,28 @@ #include "agp.h" -static struct page *alpha_core_agp_vm_nopage(struct vm_area_struct *vma, - unsigned long address, - int *type) +static int alpha_core_agp_vm_fault(struct vm_area_struct *vma, + struct vm_fault *vmf) { alpha_agp_info *agp = agp_bridge->dev_private_data; dma_addr_t dma_addr; unsigned long pa; struct page *page; - dma_addr = address - vma->vm_start + agp->aperture.bus_base; + dma_addr = (unsigned long)vmf->virtual_address - vma->vm_start + + agp->aperture.bus_base; pa = agp->ops->translate(agp, dma_addr); if (pa == (unsigned long)-EINVAL) - return NULL; /* no translation */ + return VM_FAULT_SIGBUS; /* no translation */ /* * Get the page, inc the use count, and return it */ page = virt_to_page(__va(pa)); get_page(page); - if (type) - *type = VM_FAULT_MINOR; - return page; + vmf->page = page; + return 0; } static struct aper_size_info_fixed alpha_core_agp_sizes[] = @@ -42,7 +41,7 @@ static struct aper_size_info_fixed alpha_core_agp_sizes[] = }; struct vm_operations_struct alpha_core_agp_vm_ops = { - .nopage = alpha_core_agp_vm_nopage, + .fault = alpha_core_agp_vm_fault, }; diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index 0118b98..84cdf90 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -234,11 +234,11 @@ static DEVICE_ATTR(rng_available, S_IRUGO, NULL); -static void unregister_miscdev(void) +static void unregister_miscdev(bool suspended) { device_remove_file(rng_miscdev.this_device, &dev_attr_rng_available); device_remove_file(rng_miscdev.this_device, &dev_attr_rng_current); - misc_deregister(&rng_miscdev); + 
__misc_deregister(&rng_miscdev, suspended); } static int register_miscdev(void) @@ -313,7 +313,7 @@ out: } EXPORT_SYMBOL_GPL(hwrng_register); -void hwrng_unregister(struct hwrng *rng) +void __hwrng_unregister(struct hwrng *rng, bool suspended) { int err; @@ -332,11 +332,11 @@ void hwrng_unregister(struct hwrng *rng) } } if (list_empty(&rng_list)) - unregister_miscdev(); + unregister_miscdev(suspended); mutex_unlock(&rng_mutex); } -EXPORT_SYMBOL_GPL(hwrng_unregister); +EXPORT_SYMBOL_GPL(__hwrng_unregister); MODULE_DESCRIPTION("H/W Random Number Generator (RNG) driver"); diff --git a/drivers/char/misc.c b/drivers/char/misc.c index 71c8cd7..a39101f 100644 --- a/drivers/char/misc.c +++ b/drivers/char/misc.c @@ -232,8 +232,9 @@ int misc_register(struct miscdevice * misc) } /** - * misc_deregister - unregister a miscellaneous device + * __misc_deregister - unregister a miscellaneous device * @misc: device to unregister + * @suspended: to be set if the function is used during suspend/resume * * Unregister a miscellaneous device that was previously * successfully registered with misc_register(). Success @@ -241,7 +242,7 @@ int misc_register(struct miscdevice * misc) * indicates an error. 
*/ -int misc_deregister(struct miscdevice * misc) +int __misc_deregister(struct miscdevice *misc, bool suspended) { int i = misc->minor; @@ -250,7 +251,11 @@ int misc_deregister(struct miscdevice * misc) mutex_lock(&misc_mtx); list_del(&misc->list); - device_destroy(misc_class, MKDEV(MISC_MAJOR, misc->minor)); + if (suspended) + destroy_suspended_device(misc_class, + MKDEV(MISC_MAJOR, misc->minor)); + else + device_destroy(misc_class, MKDEV(MISC_MAJOR, misc->minor)); if (i < DYNAMIC_MINORS && i>0) { misc_minors[i>>3] &= ~(1 << (misc->minor & 7)); } @@ -259,7 +264,7 @@ int misc_deregister(struct miscdevice * misc) } EXPORT_SYMBOL(misc_register); -EXPORT_SYMBOL(misc_deregister); +EXPORT_SYMBOL(__misc_deregister); static int __init misc_init(void) { diff --git a/drivers/char/pcmcia/cm4000_cs.c b/drivers/char/pcmcia/cm4000_cs.c index 02518da..454d732 100644 --- a/drivers/char/pcmcia/cm4000_cs.c +++ b/drivers/char/pcmcia/cm4000_cs.c @@ -308,7 +308,8 @@ static unsigned int calc_baudv(unsigned char fidi) return (wcrcf / wbrcf); } -static unsigned short io_read_num_rec_bytes(ioaddr_t iobase, unsigned short *s) +static unsigned short io_read_num_rec_bytes(unsigned int iobase, + unsigned short *s) { unsigned short tmp; @@ -426,7 +427,7 @@ static struct card_fixup card_fixups[] = { static void set_cardparameter(struct cm4000_dev *dev) { int i; - ioaddr_t iobase = dev->p_dev->io.BasePort1; + unsigned int iobase = dev->p_dev->io.BasePort1; u_int8_t stopbits = 0x02; /* ISO default */ DEBUGP(3, dev, "-> set_cardparameter\n"); @@ -459,7 +460,7 @@ static int set_protocol(struct cm4000_dev *dev, struct ptsreq *ptsreq) unsigned short num_bytes_read; unsigned char pts_reply[4]; ssize_t rc; - ioaddr_t iobase = dev->p_dev->io.BasePort1; + unsigned int iobase = dev->p_dev->io.BasePort1; rc = 0; @@ -610,7 +611,7 @@ exit_setprotocol: return rc; } -static int io_detect_cm4000(ioaddr_t iobase, struct cm4000_dev *dev) +static int io_detect_cm4000(unsigned int iobase, struct cm4000_dev *dev) { 
/* note: statemachine is assumed to be reset */ @@ -671,7 +672,7 @@ static void terminate_monitor(struct cm4000_dev *dev) static void monitor_card(unsigned long p) { struct cm4000_dev *dev = (struct cm4000_dev *) p; - ioaddr_t iobase = dev->p_dev->io.BasePort1; + unsigned int iobase = dev->p_dev->io.BasePort1; unsigned short s; struct ptsreq ptsreq; int i, atrc; @@ -933,7 +934,7 @@ static ssize_t cmm_read(struct file *filp, __user char *buf, size_t count, loff_t *ppos) { struct cm4000_dev *dev = filp->private_data; - ioaddr_t iobase = dev->p_dev->io.BasePort1; + unsigned int iobase = dev->p_dev->io.BasePort1; ssize_t rc; int i, j, k; @@ -1054,7 +1055,7 @@ static ssize_t cmm_write(struct file *filp, const char __user *buf, size_t count, loff_t *ppos) { struct cm4000_dev *dev = (struct cm4000_dev *) filp->private_data; - ioaddr_t iobase = dev->p_dev->io.BasePort1; + unsigned int iobase = dev->p_dev->io.BasePort1; unsigned short s; unsigned char tmp; unsigned char infolen; @@ -1408,7 +1409,7 @@ static int cmm_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg) { struct cm4000_dev *dev = filp->private_data; - ioaddr_t iobase = dev->p_dev->io.BasePort1; + unsigned int iobase = dev->p_dev->io.BasePort1; struct pcmcia_device *link; int size; int rc; diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index d2fabe7..2a98d99 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -12,7 +12,7 @@ #include <linux/mutex.h> #include <linux/sched.h> #include <linux/notifier.h> -#include <linux/latency.h> +#include <linux/pm_qos_params.h> #include <linux/cpu.h> #include <linux/cpuidle.h> @@ -265,7 +265,10 @@ static struct notifier_block cpuidle_latency_notifier = { .notifier_call = cpuidle_latency_notify, }; -#define latency_notifier_init(x) do { register_latency_notifier(x); } while (0) +static inline void latency_notifier_init(struct notifier_block *n) +{ + pm_qos_add_notifier(PM_QOS_CPU_DMA_LATENCY, n); +} #else /* 
CONFIG_SMP */ diff --git a/drivers/cpuidle/governors/ladder.c b/drivers/cpuidle/governors/ladder.c index eb666ec..ba7b9a6 100644 --- a/drivers/cpuidle/governors/ladder.c +++ b/drivers/cpuidle/governors/ladder.c @@ -14,7 +14,7 @@ #include <linux/kernel.h> #include <linux/cpuidle.h> -#include <linux/latency.h> +#include <linux/pm_qos_params.h> #include <linux/moduleparam.h> #include <linux/jiffies.h> @@ -81,7 +81,8 @@ static int ladder_select_state(struct cpuidle_device *dev) /* consider promotion */ if (last_idx < dev->state_count - 1 && last_residency > last_state->threshold.promotion_time && - dev->states[last_idx + 1].exit_latency <= system_latency_constraint()) { + dev->states[last_idx + 1].exit_latency <= + pm_qos_requirement(PM_QOS_CPU_DMA_LATENCY)) { last_state->stats.promotion_count++; last_state->stats.demotion_count = 0; if (last_state->stats.promotion_count >= last_state->threshold.promotion_count) { diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 299d45c..78d77c5 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -8,7 +8,7 @@ #include <linux/kernel.h> #include <linux/cpuidle.h> -#include <linux/latency.h> +#include <linux/pm_qos_params.h> #include <linux/time.h> #include <linux/ktime.h> #include <linux/hrtimer.h> @@ -48,7 +48,7 @@ static int menu_select(struct cpuidle_device *dev) break; if (s->target_residency > data->predicted_us) break; - if (s->exit_latency > system_latency_constraint()) + if (s->exit_latency > pm_qos_requirement(PM_QOS_CPU_DMA_LATENCY)) break; } diff --git a/drivers/dio/dio.c b/drivers/dio/dio.c index 17502d6..07f274f 100644 --- a/drivers/dio/dio.c +++ b/drivers/dio/dio.c @@ -88,8 +88,6 @@ static struct dioname names[] = #undef DIONAME #undef DIOFBNAME -#define NUMNAMES (sizeof(names) / sizeof(struct dioname)) - static const char *unknowndioname = "unknown DIO board -- please email <linux-m68k@lists.linux-m68k.org>!"; @@ -97,7 +95,7 @@ static const char 
*dio_getname(int id) { /* return pointer to a constant string describing the board with given ID */ unsigned int i; - for (i = 0; i < NUMNAMES; i++) + for (i = 0; i < ARRAY_SIZE(names); i++) if (names[i].id == id) return names[i].name; diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig new file mode 100644 index 0000000..74fac0f --- /dev/null +++ b/drivers/gpio/Kconfig @@ -0,0 +1,72 @@ +# +# GPIO infrastructure and expanders +# + +config HAVE_GPIO_LIB + bool + help + Platforms select gpiolib if they use this infrastructure + for all their GPIOs, usually starting with ones integrated + into SOC processors. + +menu "GPIO Support" + depends on HAVE_GPIO_LIB + +config DEBUG_GPIO + bool "Debug GPIO calls" + depends on DEBUG_KERNEL + help + Say Y here to add some extra checks and diagnostics to GPIO calls. + The checks help ensure that GPIOs have been properly initialized + before they are used and that sleeping calls aren not made from + nonsleeping contexts. They can make bitbanged serial protocols + slower. The diagnostics help catch the type of setup errors + that are most common when setting up new platforms or boards. + +# put expanders in the right section, in alphabetical order + +comment "I2C GPIO expanders:" + +config GPIO_PCA9539 + tristate "PCA9539 16-bit I/O port" + depends on I2C + help + Say yes here to support the PCA9539 16-bit I/O port. These + parts are made by NXP and TI. + + This driver can also be built as a module. If so, the module + will be called pca9539. + +config GPIO_PCF857X + tristate "PCF857x, PCA857x, and PCA967x I2C GPIO expanders" + depends on I2C + help + Say yes here to provide access to most "quasi-bidirectional" I2C + GPIO expanders used for additional digital outputs or inputs. + Most of these parts are from NXP, though TI is a second source for + some of them. 
Compatible models include: + + 8 bits: pcf8574, pcf8574a, pca8574, pca8574a, + pca9670, pca9672, pca9674, pca9674a + + 16 bits: pcf8575, pcf8575c, pca8575, + pca9671, pca9673, pca9675 + + Your board setup code will need to declare the expanders in + use, and assign numbers to the GPIOs they expose. Those GPIOs + can then be used from drivers and other kernel code, just like + other GPIOs, but only accessible from task contexts. + + This driver provides an in-kernel interface to those GPIOs using + platform-neutral GPIO calls. + +comment "SPI GPIO expanders:" + +config GPIO_MCP23S08 + tristate "Microchip MCP23S08 I/O expander" + depends on SPI_MASTER + help + SPI driver for Microchip MCP23S08 I/O expander. This provides + a GPIO interface supporting inputs and outputs. + +endmenu diff --git a/drivers/gpio/Makefile b/drivers/gpio/Makefile new file mode 100644 index 0000000..470ecd6 --- /dev/null +++ b/drivers/gpio/Makefile @@ -0,0 +1,9 @@ +# gpio support: dedicated expander chips, etc + +ccflags-$(CONFIG_DEBUG_GPIO) += -DDEBUG + +obj-$(CONFIG_HAVE_GPIO_LIB) += gpiolib.o + +obj-$(CONFIG_GPIO_MCP23S08) += mcp23s08.o +obj-$(CONFIG_GPIO_PCA9539) += pca9539.o +obj-$(CONFIG_GPIO_PCF857X) += pcf857x.o diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c new file mode 100644 index 0000000..d8db2f8e --- /dev/null +++ b/drivers/gpio/gpiolib.c @@ -0,0 +1,567 @@ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/irq.h> +#include <linux/spinlock.h> + +#include <asm/gpio.h> + + +/* Optional implementation infrastructure for GPIO interfaces. + * + * Platforms may want to use this if they tend to use very many GPIOs + * that aren't part of a System-On-Chip core; or across I2C/SPI/etc. + * + * When kernel footprint or instruction count is an issue, simpler + * implementations may be preferred. 
The GPIO programming interface
+ * allows for inlining speed-critical get/set operations for common
+ * cases, so that access to SOC-integrated GPIOs can sometimes cost
+ * only an instruction or two per bit.
+ */
+
+
+/* When debugging, extend minimal trust to callers and platform code.
+ * Also emit diagnostic messages that may help initial bringup, when
+ * board setup or driver bugs are most common.
+ *
+ * Otherwise, minimize overhead in what may be bitbanging codepaths.
+ */
+#ifdef DEBUG
+#define extra_checks 1
+#else
+#define extra_checks 0
+#endif
+
+/* gpio_lock prevents conflicts during gpio_desc[] table updates.
+ * While any GPIO is requested, its gpio_chip is not removable;
+ * each GPIO's "requested" flag serves as a lock and refcount.
+ */
+static DEFINE_SPINLOCK(gpio_lock);
+
+/* Per-GPIO bookkeeping; the gpio_desc[] table below has one entry for
+ * each GPIO number in 0..(ARCH_NR_GPIOS - 1).
+ */
+struct gpio_desc {
+	/* controller managing this GPIO number, or NULL if none */
+	struct gpio_chip *chip;
+	/* bitmask manipulated with the atomic bitops, using FLAG_* below */
+	unsigned long flags;
+/* flag symbols are bit numbers */
+#define FLAG_REQUESTED 0
+#define FLAG_IS_OUT 1
+
+#ifdef CONFIG_DEBUG_FS
+	/* diagnostic name; the pointer is stored, the string is not copied */
+	const char *label;
+#endif
+};
+static struct gpio_desc gpio_desc[ARCH_NR_GPIOS];
+
+/* Remember a diagnostic label for a GPIO. This compiles to nothing
+ * unless CONFIG_DEBUG_FS is set, since the field only exists then.
+ */
+static inline void desc_set_label(struct gpio_desc *d, const char *label)
+{
+#ifdef CONFIG_DEBUG_FS
+	d->label = label;
+#endif
+}
+
+/* Warn when drivers omit gpio_request() calls -- legal but ill-advised
+ * when setting direction, and otherwise illegal. Until board setup code
+ * and drivers use explicit requests everywhere (which won't happen when
+ * those calls have no teeth) we can't avoid autorequesting. This nag
+ * message should motivate switching to explicit requests...
+ */
+static void gpio_ensure_requested(struct gpio_desc *desc)
+{
+	if (test_and_set_bit(FLAG_REQUESTED, &desc->flags) == 0) {
+		pr_warning("GPIO-%d autorequested\n", (int)(desc - gpio_desc));
+		desc_set_label(desc, "[auto]");
+	}
+}
+
+/* caller holds gpio_lock *OR* gpio is marked as requested */
+static inline struct gpio_chip *gpio_to_chip(unsigned gpio)
+{
+	return gpio_desc[gpio].chip;
+}
+
+/**
+ * gpiochip_add() - register a gpio_chip
+ * @chip: the chip to register, with chip->base initialized
+ * Context: potentially before irqs or kmalloc will work
+ *
+ * Returns a negative errno if the chip can't be registered, such as
+ * because the chip->base is invalid or already associated with a
+ * different chip. Otherwise it returns zero as a success code.
+ *
+ * The chip claims GPIO numbers chip->base .. chip->base + chip->ngpio - 1,
+ * all of which must be below ARCH_NR_GPIOS: -EINVAL is returned if the
+ * range does not fit, -EBUSY if any number in it already has a chip.
+ */
+int gpiochip_add(struct gpio_chip *chip)
+{
+	unsigned long flags;
+	int status = 0;
+	unsigned id;
+
+	/* NOTE chip->base negative is reserved to mean a request for
+	 * dynamic allocation. We don't currently support that.
+	 */
+
+	/* The last GPIO is base + ngpio - 1, so a range ending exactly at
+	 * ARCH_NR_GPIOS is still valid; only reject ranges that go past it.
+	 */
+	if (chip->base < 0 || (chip->base + chip->ngpio) > ARCH_NR_GPIOS) {
+		status = -EINVAL;
+		goto fail;
+	}
+
+	spin_lock_irqsave(&gpio_lock, flags);
+
+	/* these GPIO numbers must not be managed by another gpio_chip */
+	for (id = chip->base; id < chip->base + chip->ngpio; id++) {
+		if (gpio_desc[id].chip != NULL) {
+			status = -EBUSY;
+			break;
+		}
+	}
+	if (status == 0) {
+		for (id = chip->base; id < chip->base + chip->ngpio; id++) {
+			gpio_desc[id].chip = chip;
+			gpio_desc[id].flags = 0;
+		}
+	}
+
+	spin_unlock_irqrestore(&gpio_lock, flags);
+fail:
+	/* failures here can mean systems won't boot... */
+	if (status)
+		pr_err("gpiochip_add: gpios %d..%d (%s) not registered\n",
+			chip->base, chip->base + chip->ngpio - 1,
+			chip->label ? : "generic");
+	return status;
+}
+EXPORT_SYMBOL_GPL(gpiochip_add);
+
+/**
+ * gpiochip_remove() - unregister a gpio_chip
+ * @chip: the chip to unregister
+ *
+ * A gpio_chip with any GPIOs still requested may not be removed.
+ */
+int gpiochip_remove(struct gpio_chip *chip)
+{
+	unsigned long flags;
+	int status = 0;
+	unsigned id;
+
+	spin_lock_irqsave(&gpio_lock, flags);
+
+	/* first pass: refuse with -EBUSY if any GPIO is still requested */
+	for (id = chip->base; id < chip->base + chip->ngpio; id++) {
+		if (test_bit(FLAG_REQUESTED, &gpio_desc[id].flags)) {
+			status = -EBUSY;
+			break;
+		}
+	}
+	/* second pass: only when nothing is busy, detach every entry */
+	if (status == 0) {
+		for (id = chip->base; id < chip->base + chip->ngpio; id++)
+			gpio_desc[id].chip = NULL;
+	}
+
+	spin_unlock_irqrestore(&gpio_lock, flags);
+	return status;
+}
+EXPORT_SYMBOL_GPL(gpiochip_remove);
+
+
+/* These "optional" allocation calls help prevent drivers from stomping
+ * on each other, and help provide better diagnostics in debugfs.
+ * They're called even less than the "set direction" calls.
+ *
+ * gpio_request() returns zero on success, -EINVAL when the number is
+ * out of range or has no registered chip, and -EBUSY when the GPIO is
+ * already marked as requested.
+ */
+int gpio_request(unsigned gpio, const char *label)
+{
+	struct gpio_desc *desc;
+	int status = -EINVAL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&gpio_lock, flags);
+
+	if (gpio >= ARCH_NR_GPIOS)
+		goto done;
+	desc = &gpio_desc[gpio];
+	if (desc->chip == NULL)
+		goto done;
+
+	/* NOTE: gpio_request() can be called in early boot,
+	 * before IRQs are enabled.
+	 */
+
+	/* the atomic test-and-set makes exactly one requester win */
+	if (test_and_set_bit(FLAG_REQUESTED, &desc->flags) == 0) {
+		desc_set_label(desc, label ? : "?");
+		status = 0;
+	} else
+		status = -EBUSY;
+
+done:
+	if (status)
+		pr_debug("gpio_request: gpio-%d (%s) status %d\n",
+			gpio, label ? : "?", status);
+	spin_unlock_irqrestore(&gpio_lock, flags);
+	return status;
+}
+EXPORT_SYMBOL_GPL(gpio_request);
+
+/* Release a GPIO: clears its "requested" flag and its label. With
+ * extra_checks enabled, warns about numbers that are out of range, have
+ * no chip, or were not requested.
+ */
+void gpio_free(unsigned gpio)
+{
+	unsigned long flags;
+	struct gpio_desc *desc;
+
+	if (gpio >= ARCH_NR_GPIOS) {
+		WARN_ON(extra_checks);
+		return;
+	}
+
+	spin_lock_irqsave(&gpio_lock, flags);
+
+	desc = &gpio_desc[gpio];
+	if (desc->chip && test_and_clear_bit(FLAG_REQUESTED, &desc->flags))
+		desc_set_label(desc, NULL);
+	else
+		WARN_ON(extra_checks);
+
+	spin_unlock_irqrestore(&gpio_lock, flags);
+}
+EXPORT_SYMBOL_GPL(gpio_free);
+
+
+/**
+ * gpiochip_is_requested - return string iff signal was requested
+ * @chip: controller managing the signal
+ * @offset: of signal within controller's 0..(ngpio - 1) range
+ *
+ * Returns NULL if the GPIO is not currently requested, else a string.
+ * If debugfs support is enabled, the string returned is the label passed
+ * to gpio_request(); otherwise it is a meaningless constant.
+ *
+ * This function is for use by GPIO controller drivers. The label can
+ * help with diagnostics, and knowing that the signal is used as a GPIO
+ * can help avoid accidentally multiplexing it to another controller.
+ */
+const char *gpiochip_is_requested(struct gpio_chip *chip, unsigned offset)
+{
+	unsigned gpio = chip->base + offset;
+
+	/* also NULL when the number is out of range or owned by another chip */
+	if (gpio >= ARCH_NR_GPIOS || gpio_desc[gpio].chip != chip)
+		return NULL;
+	if (test_bit(FLAG_REQUESTED, &gpio_desc[gpio].flags) == 0)
+		return NULL;
+#ifdef CONFIG_DEBUG_FS
+	return gpio_desc[gpio].label;
+#else
+	return "?";
+#endif
+}
+EXPORT_SYMBOL_GPL(gpiochip_is_requested);
+
+
+/* Drivers MUST set GPIO direction before making get/set calls. In
+ * some cases this is done in early boot, before IRQs are enabled.
+ *
+ * As a rule these aren't called more than once (except for drivers
+ * using the open-drain emulation idiom) so these are natural places
+ * to accumulate extra debugging checks. Note that we can't (yet)
+ * rely on gpio_request() having been called beforehand.
+ */
+
+/* Configure @gpio as an input, autorequesting it if needed.
+ *
+ * Returns the status of the chip's direction_input() method, or -EINVAL
+ * when the number is out of range, has no chip, or the chip lacks the
+ * get()/direction_input() methods. On success FLAG_IS_OUT is cleared.
+ */
+int gpio_direction_input(unsigned gpio)
+{
+	unsigned long flags;
+	struct gpio_chip *chip;
+	struct gpio_desc *desc;
+	unsigned offset;
+	int status = -EINVAL;
+
+	spin_lock_irqsave(&gpio_lock, flags);
+
+	if (gpio >= ARCH_NR_GPIOS)
+		goto fail;
+	/* only index the table once the number is known to be in range */
+	desc = &gpio_desc[gpio];
+	chip = desc->chip;
+	if (!chip || !chip->get || !chip->direction_input)
+		goto fail;
+	/* keep "gpio" intact so the diagnostic below names the right GPIO */
+	offset = gpio - chip->base;
+	if (offset >= chip->ngpio)
+		goto fail;
+	gpio_ensure_requested(desc);
+
+	/* now we know the gpio is valid and chip won't vanish */
+
+	spin_unlock_irqrestore(&gpio_lock, flags);
+
+	might_sleep_if(extra_checks && chip->can_sleep);
+
+	status = chip->direction_input(chip, offset);
+	if (status == 0)
+		clear_bit(FLAG_IS_OUT, &desc->flags);
+	return status;
+fail:
+	spin_unlock_irqrestore(&gpio_lock, flags);
+	if (status)
+		pr_debug("%s: gpio-%d status %d\n",
+			__FUNCTION__, gpio, status);
+	return status;
+}
+EXPORT_SYMBOL_GPL(gpio_direction_input);
+
+/* Configure @gpio as an output driving @value, autorequesting it if
+ * needed.
+ *
+ * Returns the status of the chip's direction_output() method, or -EINVAL
+ * when the number is out of range, has no chip, or the chip lacks the
+ * set()/direction_output() methods. On success FLAG_IS_OUT is set.
+ */
+int gpio_direction_output(unsigned gpio, int value)
+{
+	unsigned long flags;
+	struct gpio_chip *chip;
+	struct gpio_desc *desc;
+	unsigned offset;
+	int status = -EINVAL;
+
+	spin_lock_irqsave(&gpio_lock, flags);
+
+	if (gpio >= ARCH_NR_GPIOS)
+		goto fail;
+	/* only index the table once the number is known to be in range */
+	desc = &gpio_desc[gpio];
+	chip = desc->chip;
+	if (!chip || !chip->set || !chip->direction_output)
+		goto fail;
+	/* keep "gpio" intact so the diagnostic below names the right GPIO */
+	offset = gpio - chip->base;
+	if (offset >= chip->ngpio)
+		goto fail;
+	gpio_ensure_requested(desc);
+
+	/* now we know the gpio is valid and chip won't vanish */
+
+	spin_unlock_irqrestore(&gpio_lock, flags);
+
+	might_sleep_if(extra_checks && chip->can_sleep);
+
+	status = chip->direction_output(chip, offset, value);
+	if (status == 0)
+		set_bit(FLAG_IS_OUT, &desc->flags);
+	return status;
+fail:
+	spin_unlock_irqrestore(&gpio_lock, flags);
+	if (status)
+		pr_debug("%s: gpio-%d status %d\n",
+			__FUNCTION__, gpio, status);
+	return status;
+}
+EXPORT_SYMBOL_GPL(gpio_direction_output);
+
+
+/* I/O calls are only valid after configuration completed; the relevant
+ * "is this a valid GPIO" error checks should already have been done.
+ *
+ * "Get" operations are often inlinable as reading a pin value register,
+ * and masking the relevant bit in that register.
+ *
+ * When "set" operations are inlinable, they involve writing that mask to
+ * one register to set a low value, or a different register to set it high.
+ * Otherwise locking is needed, so there may be little value to inlining.
+ *
+ *------------------------------------------------------------------------
+ *
+ * IMPORTANT!!! The hot paths -- get/set value -- assume that callers
+ * have requested the GPIO. That can include implicit requesting by
+ * a direction setting call. Marking a gpio as requested locks its chip
+ * in memory, guaranteeing that these table lookups need no more locking
+ * and that gpiochip_remove() will fail.
+ *
+ * REVISIT when debugging, consider adding some instrumentation to ensure
+ * that the GPIO was actually requested.
+ */
+
+/**
+ * __gpio_get_value() - return a gpio's value
+ * @gpio: gpio whose value will be returned
+ * Context: any
+ *
+ * Backend for gpio_get_value(), used directly or indirectly. The result
+ * is whatever the owning gpio_chip's get() method reports (zero or
+ * nonzero); a chip without a get() method always reads as zero.
+ */
+int __gpio_get_value(unsigned gpio)
+{
+	struct gpio_chip *gc = gpio_to_chip(gpio);
+
+	WARN_ON(extra_checks && gc->can_sleep);
+
+	/* no reader method: report zero */
+	if (!gc->get)
+		return 0;
+
+	/* the chip method takes a chip-relative offset */
+	return gc->get(gc, gpio - gc->base);
+}
+EXPORT_SYMBOL_GPL(__gpio_get_value);
+
+/**
+ * __gpio_set_value() - assign a gpio's value
+ * @gpio: gpio whose value will be assigned
+ * @value: value to assign
+ * Context: any
+ *
+ * This is used directly or indirectly to implement gpio_set_value().
+ * It invokes the associated gpio_chip.set() method.
+ */
+void __gpio_set_value(unsigned gpio, int value)
+{
+	struct gpio_chip *chip;
+
+	chip = gpio_to_chip(gpio);
+	WARN_ON(extra_checks && chip->can_sleep);
+	chip->set(chip, gpio - chip->base, value);
+}
+EXPORT_SYMBOL_GPL(__gpio_set_value);
+
+/**
+ * __gpio_cansleep() - report whether gpio value access will sleep
+ * @gpio: gpio in question
+ * Context: any
+ *
+ * This is used directly or indirectly to implement gpio_cansleep(). It
+ * returns nonzero if access reading or writing the GPIO value can sleep.
+ */
+int __gpio_cansleep(unsigned gpio)
+{
+	struct gpio_chip *chip;
+
+	/* only call this on GPIOs that are valid! */
+	chip = gpio_to_chip(gpio);
+
+	return chip->can_sleep;
+}
+EXPORT_SYMBOL_GPL(__gpio_cansleep);
+
+
+
+/* There's no value in making it easy to inline GPIO calls that may sleep.
+ * Common examples include ones connected to I2C or SPI chips.
+ */
+
+/* Sleeping-context read of a GPIO value. Like __gpio_get_value(), this
+ * returns zero when the chip provides no get() method; a GPIO that was
+ * only ever set up through gpio_direction_output() is not required to
+ * have one.
+ */
+int gpio_get_value_cansleep(unsigned gpio)
+{
+	struct gpio_chip *chip;
+
+	might_sleep_if(extra_checks);
+	chip = gpio_to_chip(gpio);
+	return chip->get ? chip->get(chip, gpio - chip->base) : 0;
+}
+EXPORT_SYMBOL_GPL(gpio_get_value_cansleep);
+
+/* Sleeping-context write of a GPIO value through the chip's set() method. */
+void gpio_set_value_cansleep(unsigned gpio, int value)
+{
+	struct gpio_chip *chip;
+
+	might_sleep_if(extra_checks);
+	chip = gpio_to_chip(gpio);
+	chip->set(chip, gpio - chip->base, value);
+}
+EXPORT_SYMBOL_GPL(gpio_set_value_cansleep);
+
+
+#ifdef CONFIG_DEBUG_FS
+
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+
+/* Default per-chip dump: one line for each requested GPIO, giving its
+ * label, direction and current value, plus IRQ trigger information for
+ * inputs whose IRQ has a handler installed.
+ */
+static void gpiolib_dbg_show(struct seq_file *s, struct gpio_chip *chip)
+{
+	unsigned i;
+	unsigned gpio = chip->base;
+	struct gpio_desc *gdesc = &gpio_desc[gpio];
+	int is_out;
+
+	for (i = 0; i < chip->ngpio; i++, gpio++, gdesc++) {
+		if (!test_bit(FLAG_REQUESTED, &gdesc->flags))
+			continue;
+
+		is_out = test_bit(FLAG_IS_OUT, &gdesc->flags);
+		seq_printf(s, " gpio-%-3d (%-12s) %s %s",
+			gpio, gdesc->label,
+			is_out ? "out" : "in ",
+			chip->get
+				? (chip->get(chip, i) ? "hi" : "lo")
+				: "?  ");
+
+		if (!is_out) {
+			int irq = gpio_to_irq(gpio);
+			/* gpio_to_irq() may fail with a negative value; only
+			 * form a pointer into irq_desc[] for a real IRQ.
+			 */
+			struct irq_desc *desc = (irq >= 0)
+					? irq_desc + irq : NULL;
+
+			/* This races with request_irq(), set_irq_type(),
+			 * and set_irq_wake() ... but those are "rare".
+			 *
+			 * More significantly, trigger type flags aren't
+			 * currently maintained by genirq.
+			 */
+			if (desc && desc->action) {
+				char *trigger;
+
+				switch (desc->status & IRQ_TYPE_SENSE_MASK) {
+				case IRQ_TYPE_NONE:
+					trigger = "(default)";
+					break;
+				case IRQ_TYPE_EDGE_FALLING:
+					trigger = "edge-falling";
+					break;
+				case IRQ_TYPE_EDGE_RISING:
+					trigger = "edge-rising";
+					break;
+				case IRQ_TYPE_EDGE_BOTH:
+					trigger = "edge-both";
+					break;
+				case IRQ_TYPE_LEVEL_HIGH:
+					trigger = "level-high";
+					break;
+				case IRQ_TYPE_LEVEL_LOW:
+					trigger = "level-low";
+					break;
+				default:
+					trigger = "?trigger?";
+					break;
+				}
+
+				seq_printf(s, " irq-%d %s%s",
+					irq, trigger,
+					(desc->status & IRQ_WAKEUP)
+						? " wakeup" : "");
+			}
+		}
+
+		seq_printf(s, "\n");
+	}
+}
+
+/* seq_file show method: walk the whole table and emit one section per
+ * distinct chip, using the chip's own dbg_show() method when it has one.
+ */
+static int gpiolib_show(struct seq_file *s, void *unused)
+{
+	struct gpio_chip *chip = NULL;
+	unsigned gpio;
+	int started = 0;
+
+	/* REVISIT this isn't locked against gpio_chip removal ... */
+
+	for (gpio = 0; gpio < ARCH_NR_GPIOS; gpio++) {
+		/* same chip as the previous entry: already reported */
+		if (chip == gpio_desc[gpio].chip)
+			continue;
+		chip = gpio_desc[gpio].chip;
+		if (!chip)
+			continue;
+
+		seq_printf(s, "%sGPIOs %d-%d, %s%s:\n",
+			started ? "\n" : "",
+			chip->base, chip->base + chip->ngpio - 1,
+			chip->label ? : "generic",
+			chip->can_sleep ? ", can sleep" : "");
+		started = 1;
+		if (chip->dbg_show)
+			chip->dbg_show(s, chip);
+		else
+			gpiolib_dbg_show(s, chip);
+	}
+	return 0;
+}
+
+static int gpiolib_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, gpiolib_show, NULL);
+}
+
+static struct file_operations gpiolib_operations = {
+	.open = gpiolib_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static int __init gpiolib_debugfs_init(void)
+{
+	/* /sys/kernel/debug/gpio */
+	(void) debugfs_create_file("gpio", S_IFREG | S_IRUGO,
+				NULL, NULL, &gpiolib_operations);
+	return 0;
+}
+subsys_initcall(gpiolib_debugfs_init);
+
+#endif /* DEBUG_FS */
diff --git a/drivers/gpio/mcp23s08.c b/drivers/gpio/mcp23s08.c
new file mode 100644
index 0000000..bb60e8c
--- /dev/null
+++ b/drivers/gpio/mcp23s08.c
@@ -0,0 +1,357 @@
+/*
+ * mcp23s08.c - SPI gpio expander driver
+ */
+
+#include <linux/kernel.h>
+#include <linux/device.h>
+#include <linux/workqueue.h>
+#include <linux/mutex.h>
+
+#include <linux/spi/spi.h>
+#include <linux/spi/mcp23s08.h>
+
+#include <asm/gpio.h>
+
+
+/* Registers are all 8 bits wide.
+ *
+ * The mcp23s17 has twice as many bits, and can be configured to work
+ * with either 16 bit registers or with two adjacent 8 bit banks.
+ *
+ * Also, there are I2C versions of both chips.
+ */
+#define MCP_IODIR	0x00		/* init/reset: all ones */
+#define MCP_IPOL	0x01
+#define MCP_GPINTEN	0x02
+#define MCP_DEFVAL	0x03
+#define MCP_INTCON	0x04
+#define MCP_IOCON	0x05
+#	define IOCON_SEQOP	(1 << 5)
+#	define IOCON_HAEN	(1 << 3)
+#	define IOCON_ODR	(1 << 2)
+#	define IOCON_INTPOL	(1 << 1)
+#define MCP_GPPU	0x06
+#define MCP_INTF	0x07
+#define MCP_INTCAP	0x08
+#define MCP_GPIO	0x09
+#define MCP_OLAT	0x0a
+
+struct mcp23s08 {
+	struct spi_device *spi;
+	u8 addr;		/* SPI opcode byte; bit 0 is OR-ed in for reads */
+
+	/* lock protects the cached values */
+	struct mutex lock;
+	u8 cache[11];		/* register shadow, indexed by MCP_* number */
+
+	struct gpio_chip chip;
+
+	struct work_struct work;
+};
+
+/* Read one register; returns its value (0..255) or a negative errno. */
+static int mcp23s08_read(struct mcp23s08 *mcp, unsigned reg)
+{
+	u8 tx[2], rx[1];
+	int status;
+
+	tx[0] = mcp->addr | 0x01;
+	tx[1] = reg;
+	status = spi_write_then_read(mcp->spi, tx, sizeof tx, rx, sizeof rx);
+	return (status < 0) ? status : rx[0];
+}
+
+/* Write one register; returns the spi_write_then_read() status. */
+static int mcp23s08_write(struct mcp23s08 *mcp, unsigned reg, u8 val)
+{
+	u8 tx[3];
+
+	tx[0] = mcp->addr;
+	tx[1] = reg;
+	tx[2] = val;
+	return spi_write_then_read(mcp->spi, tx, sizeof tx, NULL, 0);
+}
+
+/* Read @n consecutive registers starting at @reg into @vals. The range
+ * is bounded by the size of the register cache; -EINVAL if it overruns.
+ */
+static int
+mcp23s08_read_regs(struct mcp23s08 *mcp, unsigned reg, u8 *vals, unsigned n)
+{
+	u8 tx[2];
+
+	if ((n + reg) > sizeof mcp->cache)
+		return -EINVAL;
+	tx[0] = mcp->addr | 0x01;
+	tx[1] = reg;
+	return spi_write_then_read(mcp->spi, tx, sizeof tx, vals, n);
+}
+
+/*----------------------------------------------------------------------*/
+
+static int mcp23s08_direction_input(struct gpio_chip *chip, unsigned offset)
+{
+	struct mcp23s08 *mcp = container_of(chip, struct mcp23s08, chip);
+	int status;
+
+	mutex_lock(&mcp->lock);
+	mcp->cache[MCP_IODIR] |= (1 << offset);
+	status = mcp23s08_write(mcp, MCP_IODIR, mcp->cache[MCP_IODIR]);
+	mutex_unlock(&mcp->lock);
+	return status;
+}
+
+static int mcp23s08_get(struct gpio_chip *chip, unsigned offset)
+{
+	struct mcp23s08 *mcp = container_of(chip, struct mcp23s08, chip);
+	int status;
+
+	mutex_lock(&mcp->lock);
+
+	/* REVISIT reading this clears any IRQ ... */
+	status = mcp23s08_read(mcp, MCP_GPIO);
+	if (status < 0)
+		status = 0;	/* read failure is reported as "low" */
+	else {
+		mcp->cache[MCP_GPIO] = status;
+		status = !!(status & (1 << offset));
+	}
+	mutex_unlock(&mcp->lock);
+	return status;
+}
+
+/* caller holds mcp->lock; updates the cached output latch and the chip */
+static int __mcp23s08_set(struct mcp23s08 *mcp, unsigned mask, int value)
+{
+	u8 olat = mcp->cache[MCP_OLAT];
+
+	if (value)
+		olat |= mask;
+	else
+		olat &= ~mask;
+	mcp->cache[MCP_OLAT] = olat;
+	return mcp23s08_write(mcp, MCP_OLAT, olat);
+}
+
+static void mcp23s08_set(struct gpio_chip *chip, unsigned offset, int value)
+{
+	struct mcp23s08 *mcp = container_of(chip, struct mcp23s08, chip);
+	u8 mask = 1 << offset;
+
+	mutex_lock(&mcp->lock);
+	__mcp23s08_set(mcp, mask, value);
+	mutex_unlock(&mcp->lock);
+}
+
+static int
+mcp23s08_direction_output(struct gpio_chip *chip, unsigned offset, int value)
+{
+	struct mcp23s08 *mcp = container_of(chip, struct mcp23s08, chip);
+	u8 mask = 1 << offset;
+	int status;
+
+	mutex_lock(&mcp->lock);
+	/* latch the level first, then switch the pin to output */
+	status = __mcp23s08_set(mcp, mask, value);
+	if (status == 0) {
+		mcp->cache[MCP_IODIR] &= ~mask;
+		status = mcp23s08_write(mcp, MCP_IODIR, mcp->cache[MCP_IODIR]);
+	}
+	mutex_unlock(&mcp->lock);
+	return status;
+}
+
+/*----------------------------------------------------------------------*/
+
+#ifdef CONFIG_DEBUG_FS
+
+#include <linux/seq_file.h>
+
+/*
+ * This shows more info than the generic gpio dump code:
+ * pullups, deglitching, open drain drive.
+ */
+static void mcp23s08_dbg_show(struct seq_file *s, struct gpio_chip *chip)
+{
+	struct mcp23s08 *mcp;
+	char bank;
+	int t;		/* signed: it first holds a negative-errno status */
+	unsigned mask;
+
+	mcp = container_of(chip, struct mcp23s08, chip);
+
+	/* NOTE: we only handle one bank for now ... */
+	bank = '0' + ((mcp->addr >> 1) & 0x3);
+
+	mutex_lock(&mcp->lock);
+	t = mcp23s08_read_regs(mcp, 0, mcp->cache, sizeof mcp->cache);
+	if (t < 0) {
+		seq_printf(s, " I/O ERROR %d\n", t);
+		goto done;
+	}
+
+	for (t = 0, mask = 1; t < 8; t++, mask <<= 1) {
+		const char *label;
+
+		label = gpiochip_is_requested(chip, t);
+		if (!label)
+			continue;
+
+		seq_printf(s, " gpio-%-3d P%c.%d (%-12s) %s %s %s",
+			chip->base + t, bank, t, label,
+			(mcp->cache[MCP_IODIR] & mask) ? "in " : "out",
+			(mcp->cache[MCP_GPIO] & mask) ? "hi" : "lo",
+			(mcp->cache[MCP_GPPU] & mask) ? " " : "up");
+		/* NOTE: ignoring the irq-related registers */
+		seq_printf(s, "\n");
+	}
+done:
+	mutex_unlock(&mcp->lock);
+}
+
+#else
+#define mcp23s08_dbg_show NULL
+#endif
+
+/*----------------------------------------------------------------------*/
+
+/* Bind to one expander: put the chip into a known state (sequential
+ * register access, platform pullups, no input inversion, IRQs off), then
+ * register it as a gpio_chip. Any SPI or registration failure aborts the
+ * probe and frees the driver state; only a failure of the optional
+ * platform setup() callback is tolerated.
+ */
+static int mcp23s08_probe(struct spi_device *spi)
+{
+	struct mcp23s08 *mcp;
+	struct mcp23s08_platform_data *pdata;
+	int status;
+	int do_update = 0;
+
+	pdata = spi->dev.platform_data;
+	if (!pdata || pdata->slave > 3 || !pdata->base)
+		return -ENODEV;
+
+	mcp = kzalloc(sizeof *mcp, GFP_KERNEL);
+	if (!mcp)
+		return -ENOMEM;
+
+	mutex_init(&mcp->lock);
+
+	mcp->spi = spi;
+	mcp->addr = 0x40 | (pdata->slave << 1);
+
+	mcp->chip.label = "mcp23s08";
+
+	mcp->chip.direction_input = mcp23s08_direction_input;
+	mcp->chip.get = mcp23s08_get;
+	mcp->chip.direction_output = mcp23s08_direction_output;
+	mcp->chip.set = mcp23s08_set;
+	mcp->chip.dbg_show = mcp23s08_dbg_show;
+
+	mcp->chip.base = pdata->base;
+	mcp->chip.ngpio = 8;
+	mcp->chip.can_sleep = 1;
+
+	spi_set_drvdata(spi, mcp);
+
+	/* verify MCP_IOCON.SEQOP = 0, so sequential reads work */
+	status = mcp23s08_read(mcp, MCP_IOCON);
+	if (status < 0)
+		goto fail;
+	if (status & IOCON_SEQOP) {
+		status &= ~IOCON_SEQOP;
+		status = mcp23s08_write(mcp, MCP_IOCON, (u8) status);
+		if (status < 0)
+			goto fail;
+	}
+
+	/* configure ~100K pullups */
+	status = mcp23s08_write(mcp, MCP_GPPU, pdata->pullups);
+	if (status < 0)
+		goto fail;
+
+	status = mcp23s08_read_regs(mcp, 0, mcp->cache, sizeof mcp->cache);
+	if (status < 0)
+		goto fail;
+
+	/* disable inverter on input */
+	if (mcp->cache[MCP_IPOL] != 0) {
+		mcp->cache[MCP_IPOL] = 0;
+		do_update = 1;
+	}
+
+	/* disable irqs */
+	if (mcp->cache[MCP_GPINTEN] != 0) {
+		mcp->cache[MCP_GPINTEN] = 0;
+		do_update = 1;
+	}
+
+	if (do_update) {
+		u8 tx[4];
+
+		/* one sequential write covering IPOL and GPINTEN */
+		tx[0] = mcp->addr;
+		tx[1] = MCP_IPOL;
+		memcpy(&tx[2], &mcp->cache[MCP_IPOL], sizeof(tx) - 2);
+		status = spi_write_then_read(mcp->spi, tx, sizeof tx, NULL, 0);
+		if (status < 0)
+			goto fail;
+	}
+
+	/* don't claim success (and leak mcp) if the GPIO range is refused */
+	status = gpiochip_add(&mcp->chip);
+	if (status < 0)
+		goto fail;
+
+	/* NOTE: these chips have a relatively sane IRQ framework, with
+	 * per-signal masking and level/edge triggering. It's not yet
+	 * handled here...
+	 */
+
+	if (pdata->setup) {
+		status = pdata->setup(spi, mcp->chip.base,
+				mcp->chip.ngpio, pdata->context);
+		if (status < 0)
+			dev_dbg(&spi->dev, "setup --> %d\n", status);
+	}
+
+	return 0;
+
+fail:
+	kfree(mcp);
+	return status;
+}
+
+/* Undo the probe: run the platform teardown() hook, then unregister the
+ * gpio_chip. The driver state is freed only if unregistration succeeds.
+ */
+static int mcp23s08_remove(struct spi_device *spi)
+{
+	struct mcp23s08 *mcp = spi_get_drvdata(spi);
+	struct mcp23s08_platform_data *pdata = spi->dev.platform_data;
+	int status = 0;
+
+	if (pdata->teardown) {
+		status = pdata->teardown(spi,
+				mcp->chip.base, mcp->chip.ngpio,
+				pdata->context);
+		if (status < 0) {
+			dev_err(&spi->dev, "%s --> %d\n", "teardown", status);
+			return status;
+		}
+	}
+
+	status = gpiochip_remove(&mcp->chip);
+	if (status == 0)
+		kfree(mcp);
+	else
+		dev_err(&spi->dev, "%s --> %d\n", "remove", status);
+	return status;
+}
+
+static struct spi_driver mcp23s08_driver = {
+	.probe = mcp23s08_probe,
+	.remove = mcp23s08_remove,
+	.driver = {
+		.name = "mcp23s08",
+		.owner = THIS_MODULE,
+	},
+};
+
+/*----------------------------------------------------------------------*/
+
+static int __init mcp23s08_init(void)
+{
+	return spi_register_driver(&mcp23s08_driver);
+}
+module_init(mcp23s08_init);
+
+static void __exit mcp23s08_exit(void)
+{
+	spi_unregister_driver(&mcp23s08_driver);
+}
+module_exit(mcp23s08_exit);
+
+MODULE_LICENSE("GPL");
+
diff --git a/drivers/gpio/pca9539.c b/drivers/gpio/pca9539.c
new file mode 100644
index 0000000..3e85c92
--- /dev/null
+++ b/drivers/gpio/pca9539.c
@@ -0,0 +1,271 @@
+/*
+ * pca9539.c - 16-bit I/O port with interrupt and reset
+ *
+ * Copyright (C) 2005 Ben Gardner <bgardner@wabtec.com>
+ * Copyright (C) 2007 Marvell International Ltd.
+ *
+ * Derived from drivers/i2c/chips/pca9539.c
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/i2c.h>
+#include <linux/i2c/pca9539.h>
+
+#include <asm/gpio.h>
+
+
+#define NR_PCA9539_GPIOS	16
+
+#define PCA9539_INPUT		0
+#define PCA9539_OUTPUT		2
+#define PCA9539_INVERT		4
+#define PCA9539_DIRECTION	6
+
+struct pca9539_chip {
+	unsigned gpio_start;
+	uint16_t reg_output;
+	uint16_t reg_direction;
+
+	struct i2c_client *client;
+	struct gpio_chip gpio_chip;
+};
+
+/* NOTE: we can't currently rely on fault codes to come from SMBus
+ * calls, so we map all errors to EIO here and return zero otherwise.
+ */
+static int pca9539_write_reg(struct pca9539_chip *chip, int reg, uint16_t val)
+{
+	if (i2c_smbus_write_word_data(chip->client, reg, val) < 0)
+		return -EIO;
+	else
+		return 0;
+}
+
+/* Read a 16-bit register into *val; returns 0, or -EIO (after logging)
+ * when the SMBus transfer fails, in which case *val is left untouched.
+ */
+static int pca9539_read_reg(struct pca9539_chip *chip, int reg, uint16_t *val)
+{
+	int ret;
+
+	ret = i2c_smbus_read_word_data(chip->client, reg);
+	if (ret < 0) {
+		dev_err(&chip->client->dev, "failed reading register\n");
+		return -EIO;
+	}
+
+	*val = (uint16_t)ret;
+	return 0;
+}
+
+static int pca9539_gpio_direction_input(struct gpio_chip *gc, unsigned off)
+{
+	struct pca9539_chip *chip;
+	uint16_t reg_val;
+	int ret;
+
+	chip = container_of(gc, struct pca9539_chip, gpio_chip);
+
+	/* a set bit in the direction register selects input */
+	reg_val = chip->reg_direction | (1u << off);
+	ret = pca9539_write_reg(chip, PCA9539_DIRECTION, reg_val);
+	if (ret)
+		return ret;
+
+	/* update the cached copy only once the write went through */
+	chip->reg_direction = reg_val;
+	return 0;
+}
+
+static int pca9539_gpio_direction_output(struct gpio_chip *gc,
+		unsigned off, int val)
+{
+	struct pca9539_chip *chip;
+	uint16_t reg_val;
+	int ret;
+
+	chip = container_of(gc, struct pca9539_chip, gpio_chip);
+
+	/* set output level */
+	if (val)
+		reg_val = chip->reg_output | (1u << off);
+	else
+		reg_val = chip->reg_output & ~(1u << off);
+
+	ret = pca9539_write_reg(chip, PCA9539_OUTPUT, reg_val);
+	if (ret)
+		return ret;
+
+	chip->reg_output = reg_val;
+
+	/* then direction */
+	reg_val = chip->reg_direction & ~(1u << off);
+	ret = pca9539_write_reg(chip, PCA9539_DIRECTION, reg_val);
+	if (ret)
+		return ret;
+
+	chip->reg_direction = reg_val;
+	return 0;
+}
+
+static int pca9539_gpio_get_value(struct gpio_chip *gc, unsigned off)
+{
+	struct pca9539_chip *chip;
+	uint16_t reg_val;
+	int ret;
+
+	chip = container_of(gc, struct pca9539_chip, gpio_chip);
+
+	ret = pca9539_read_reg(chip, PCA9539_INPUT, &reg_val);
+	if (ret < 0) {
+		/* NOTE: diagnostic already emitted; that's all we should
+		 * do unless gpio_*_value_cansleep() calls become different
+		 * from their nonsleeping siblings (and report faults).
+		 */
+		return 0;
+	}
+
+	return (reg_val & (1u << off)) ? 1 : 0;
+}
+
+static void pca9539_gpio_set_value(struct gpio_chip *gc, unsigned off, int val)
+{
+	struct pca9539_chip *chip;
+	uint16_t reg_val;
+	int ret;
+
+	chip = container_of(gc, struct pca9539_chip, gpio_chip);
+
+	if (val)
+		reg_val = chip->reg_output | (1u << off);
+	else
+		reg_val = chip->reg_output & ~(1u << off);
+
+	/* on a failed write the cached output value is left unchanged */
+	ret = pca9539_write_reg(chip, PCA9539_OUTPUT, reg_val);
+	if (ret)
+		return;
+
+	chip->reg_output = reg_val;
+}
+
+/* Fill in the gpio_chip methods and range, then register it. */
+static int pca9539_init_gpio(struct pca9539_chip *chip)
+{
+	struct gpio_chip *gc;
+
+	gc = &chip->gpio_chip;
+
+	gc->direction_input = pca9539_gpio_direction_input;
+	gc->direction_output = pca9539_gpio_direction_output;
+	gc->get = pca9539_gpio_get_value;
+	gc->set = pca9539_gpio_set_value;
+
+	gc->base = chip->gpio_start;
+	gc->ngpio = NR_PCA9539_GPIOS;
+	gc->label = "pca9539";
+
+	return gpiochip_add(gc);
+}
+
+static int __devinit pca9539_probe(struct i2c_client *client)
+{
+	struct pca9539_platform_data *pdata;
+	struct pca9539_chip *chip;
+	int ret;
+
+	pdata = client->dev.platform_data;
+	if (pdata == NULL)
+		return -ENODEV;
+
+	chip = kzalloc(sizeof(struct pca9539_chip), GFP_KERNEL);
+	if (chip == NULL)
+		return -ENOMEM;
+
+	chip->client = client;
+
+	chip->gpio_start = pdata->gpio_base;
+
+	/* initialize cached registers from their original values.
+	 * we can't share this chip with another i2c master.
+	 */
+	ret = pca9539_read_reg(chip, PCA9539_OUTPUT, &chip->reg_output);
+	if (ret)
+		goto out_failed;
+
+	ret = pca9539_read_reg(chip, PCA9539_DIRECTION, &chip->reg_direction);
+	if (ret)
+		goto out_failed;
+
+	/* set platform specific polarity inversion */
+	ret = pca9539_write_reg(chip, PCA9539_INVERT, pdata->invert);
+	if (ret)
+		goto out_failed;
+
+	ret = pca9539_init_gpio(chip);
+	if (ret)
+		goto out_failed;
+
+	/* a failing platform setup() hook is only warned about */
+	if (pdata->setup) {
+		ret = pdata->setup(client, chip->gpio_chip.base,
+				chip->gpio_chip.ngpio, pdata->context);
+		if (ret < 0)
+			dev_warn(&client->dev, "setup failed, %d\n", ret);
+	}
+
+	i2c_set_clientdata(client, chip);
+	return 0;
+
+out_failed:
+	kfree(chip);
+	return ret;
+}
+
+static int pca9539_remove(struct i2c_client *client)
+{
+	struct pca9539_platform_data *pdata = client->dev.platform_data;
+	struct pca9539_chip *chip = i2c_get_clientdata(client);
+	int ret = 0;
+
+	if (pdata->teardown) {
+		ret = pdata->teardown(client, chip->gpio_chip.base,
+				chip->gpio_chip.ngpio, pdata->context);
+		if (ret < 0) {
+			dev_err(&client->dev, "%s failed, %d\n",
+					"teardown", ret);
+			return ret;
+		}
+	}
+
+	/* keep the state allocated if the chip could not be unregistered */
+	ret = gpiochip_remove(&chip->gpio_chip);
+	if (ret) {
+		dev_err(&client->dev, "%s failed, %d\n",
+				"gpiochip_remove()", ret);
+		return ret;
+	}
+
+	kfree(chip);
+	return 0;
+}
+
+static struct i2c_driver pca9539_driver = {
+	.driver = {
+		.name = "pca9539",
+	},
+	.probe = pca9539_probe,
+	.remove = pca9539_remove,
+};
+
+static int __init pca9539_init(void)
+{
+	return i2c_add_driver(&pca9539_driver);
+}
+module_init(pca9539_init);
+
+static void __exit pca9539_exit(void)
+{
+	i2c_del_driver(&pca9539_driver);
+}
+module_exit(pca9539_exit);
+
+MODULE_AUTHOR("eric miao <eric.miao@marvell.com>");
+MODULE_DESCRIPTION("GPIO expander driver for PCA9539");
+MODULE_LICENSE("GPL");
diff --git a/drivers/gpio/pcf857x.c b/drivers/gpio/pcf857x.c
new file mode 100644
index 0000000..c6b3b53
--- /dev/null
+++ b/drivers/gpio/pcf857x.c
@@ -0,0 +1,330 @@
+/*
+ * pcf857x - 
driver for pcf857x, pca857x, and pca967x I2C GPIO expanders + * + * Copyright (C) 2007 David Brownell + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/i2c.h> +#include <linux/i2c/pcf857x.h> + +#include <asm/gpio.h> + + +/* + * The pcf857x, pca857x, and pca967x chips only expose one read and one + * write register. Writing a "one" bit (to match the reset state) lets + * that pin be used as an input; it's not an open-drain model, but acts + * a bit like one. This is described as "quasi-bidirectional"; read the + * chip documentation for details. + * + * Many other I2C GPIO expander chips (like the pca953x models) have + * more complex register models and more conventional circuitry using + * push/pull drivers. They often use the same 0x20..0x27 addresses as + * pcf857x parts, making the "legacy" I2C driver model problematic. 
+ */ +struct pcf857x { + struct gpio_chip chip; + struct i2c_client *client; + unsigned out; /* software latch */ +}; + +/*-------------------------------------------------------------------------*/ + +/* Talk to 8-bit I/O expander */ + +static int pcf857x_input8(struct gpio_chip *chip, unsigned offset) +{ + struct pcf857x *gpio = container_of(chip, struct pcf857x, chip); + + gpio->out |= (1 << offset); + return i2c_smbus_write_byte(gpio->client, gpio->out); +} + +static int pcf857x_get8(struct gpio_chip *chip, unsigned offset) +{ + struct pcf857x *gpio = container_of(chip, struct pcf857x, chip); + s32 value; + + value = i2c_smbus_read_byte(gpio->client); + return (value < 0) ? 0 : (value & (1 << offset)); +} + +static int pcf857x_output8(struct gpio_chip *chip, unsigned offset, int value) +{ + struct pcf857x *gpio = container_of(chip, struct pcf857x, chip); + unsigned bit = 1 << offset; + + if (value) + gpio->out |= bit; + else + gpio->out &= ~bit; + return i2c_smbus_write_byte(gpio->client, gpio->out); +} + +static void pcf857x_set8(struct gpio_chip *chip, unsigned offset, int value) +{ + pcf857x_output8(chip, offset, value); +} + +/*-------------------------------------------------------------------------*/ + +/* Talk to 16-bit I/O expander */ + +static int i2c_write_le16(struct i2c_client *client, u16 word) +{ + u8 buf[2] = { word & 0xff, word >> 8, }; + int status; + + status = i2c_master_send(client, buf, 2); + return (status < 0) ? 
status : 0; +} + +static int i2c_read_le16(struct i2c_client *client) +{ + u8 buf[2]; + int status; + + status = i2c_master_recv(client, buf, 2); + if (status < 0) + return status; + return (buf[1] << 8) | buf[0]; +} + +static int pcf857x_input16(struct gpio_chip *chip, unsigned offset) +{ + struct pcf857x *gpio = container_of(chip, struct pcf857x, chip); + + gpio->out |= (1 << offset); + return i2c_write_le16(gpio->client, gpio->out); +} + +static int pcf857x_get16(struct gpio_chip *chip, unsigned offset) +{ + struct pcf857x *gpio = container_of(chip, struct pcf857x, chip); + int value; + + value = i2c_read_le16(gpio->client); + return (value < 0) ? 0 : (value & (1 << offset)); +} + +static int pcf857x_output16(struct gpio_chip *chip, unsigned offset, int value) +{ + struct pcf857x *gpio = container_of(chip, struct pcf857x, chip); + unsigned bit = 1 << offset; + + if (value) + gpio->out |= bit; + else + gpio->out &= ~bit; + return i2c_write_le16(gpio->client, gpio->out); +} + +static void pcf857x_set16(struct gpio_chip *chip, unsigned offset, int value) +{ + pcf857x_output16(chip, offset, value); +} + +/*-------------------------------------------------------------------------*/ + +static int pcf857x_probe(struct i2c_client *client) +{ + struct pcf857x_platform_data *pdata; + struct pcf857x *gpio; + int status; + + pdata = client->dev.platform_data; + if (!pdata) + return -ENODEV; + + /* Allocate, initialize, and register this gpio_chip. */ + gpio = kzalloc(sizeof *gpio, GFP_KERNEL); + if (!gpio) + return -ENOMEM; + + gpio->chip.base = pdata->gpio_base; + gpio->chip.can_sleep = 1; + + /* NOTE: the OnSemi jlc1562b is also largely compatible with + * these parts, notably for output. It has a low-resolution + * DAC instead of pin change IRQs; and its inputs can be the + * result of comparators. + */ + + /* 8574 addresses are 0x20..0x27; 8574a uses 0x38..0x3f; + * 9670, 9672, 9674, and 9674a use quite a variety. 
+ * + * NOTE: we don't distinguish here between *4 and *4a parts. + */ + if (strcmp(client->name, "pcf8574") == 0 + || strcmp(client->name, "pca8574") == 0 + || strcmp(client->name, "pca9670") == 0 + || strcmp(client->name, "pca9672") == 0 + || strcmp(client->name, "pca9674") == 0 + ) { + gpio->chip.ngpio = 8; + gpio->chip.direction_input = pcf857x_input8; + gpio->chip.get = pcf857x_get8; + gpio->chip.direction_output = pcf857x_output8; + gpio->chip.set = pcf857x_set8; + + if (!i2c_check_functionality(client->adapter, + I2C_FUNC_SMBUS_BYTE)) + status = -EIO; + + /* fail if there's no chip present */ + else + status = i2c_smbus_read_byte(client); + + /* '75/'75c addresses are 0x20..0x27, just like the '74; + * the '75c doesn't have a current source pulling high. + * 9671, 9673, and 9675 use quite a variety of addresses. + * + * NOTE: we don't distinguish here between '75 and '75c parts. + */ + } else if (strcmp(client->name, "pcf8575") == 0 + || strcmp(client->name, "pca8575") == 0 + || strcmp(client->name, "pca9671") == 0 + || strcmp(client->name, "pca9673") == 0 + || strcmp(client->name, "pca9675") == 0 + ) { + gpio->chip.ngpio = 16; + gpio->chip.direction_input = pcf857x_input16; + gpio->chip.get = pcf857x_get16; + gpio->chip.direction_output = pcf857x_output16; + gpio->chip.set = pcf857x_set16; + + if (!i2c_check_functionality(client->adapter, I2C_FUNC_I2C)) + status = -EIO; + + /* fail if there's no chip present */ + else + status = i2c_read_le16(client); + + } else + status = -ENODEV; + + if (status < 0) + goto fail; + + gpio->chip.label = client->name; + + gpio->client = client; + i2c_set_clientdata(client, gpio); + + /* NOTE: these chips have strange "quasi-bidirectional" I/O pins. + * We can't actually know whether a pin is configured (a) as output + * and driving the signal low, or (b) as input and reporting a low + * value ... without knowing the last value written since the chip + * came out of reset (if any). We can't read the latched output. 
+ * + * In short, the only reliable solution for setting up pin direction + * is to do it explicitly. The setup() method can do that, but it + * may cause transient glitching since it can't know the last value + * written (some pins may need to be driven low). + * + * Using pdata->n_latch avoids that trouble. When left initialized + * to zero, our software copy of the "latch" then matches the chip's + * all-ones reset state. Otherwise it flags pins to be driven low. + */ + gpio->out = ~pdata->n_latch; + + status = gpiochip_add(&gpio->chip); + if (status < 0) + goto fail; + + /* NOTE: these chips can issue "some pin-changed" IRQs, which we + * don't yet even try to use. Among other issues, the relevant + * genirq state isn't available to modular drivers; and most irq + * methods can't be called from sleeping contexts. + */ + + dev_info(&client->dev, "gpios %d..%d on a %s%s\n", + gpio->chip.base, + gpio->chip.base + gpio->chip.ngpio - 1, + client->name, + client->irq ? " (irq ignored)" : ""); + + /* Let platform code set up the GPIOs and their users. + * Now is the first time anyone could use them. 
+ */ + if (pdata->setup) { + status = pdata->setup(client, + gpio->chip.base, gpio->chip.ngpio, + pdata->context); + if (status < 0) + dev_warn(&client->dev, "setup --> %d\n", status); + } + + return 0; + +fail: + dev_dbg(&client->dev, "probe error %d for '%s'\n", + status, client->name); + kfree(gpio); + return status; +} + +static int pcf857x_remove(struct i2c_client *client) +{ + struct pcf857x_platform_data *pdata = client->dev.platform_data; + struct pcf857x *gpio = i2c_get_clientdata(client); + int status = 0; + + if (pdata->teardown) { + status = pdata->teardown(client, + gpio->chip.base, gpio->chip.ngpio, + pdata->context); + if (status < 0) { + dev_err(&client->dev, "%s --> %d\n", + "teardown", status); + return status; + } + } + + status = gpiochip_remove(&gpio->chip); + if (status == 0) + kfree(gpio); + else + dev_err(&client->dev, "%s --> %d\n", "remove", status); + return status; +} + +static struct i2c_driver pcf857x_driver = { + .driver = { + .name = "pcf857x", + .owner = THIS_MODULE, + }, + .probe = pcf857x_probe, + .remove = pcf857x_remove, +}; + +static int __init pcf857x_init(void) +{ + return i2c_add_driver(&pcf857x_driver); +} +module_init(pcf857x_init); + +static void __exit pcf857x_exit(void) +{ + i2c_del_driver(&pcf857x_driver); +} +module_exit(pcf857x_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("David Brownell"); diff --git a/drivers/i2c/chips/Kconfig b/drivers/i2c/chips/Kconfig index bd7082c..b21593f 100644 --- a/drivers/i2c/chips/Kconfig +++ b/drivers/i2c/chips/Kconfig @@ -54,8 +54,8 @@ config PCF8575 hardware. If unsure, say N. config SENSORS_PCA9539 - tristate "Philips PCA9539 16-bit I/O port" - depends on EXPERIMENTAL + tristate "Philips PCA9539 16-bit I/O port (DEPRECATED)" + depends on EXPERIMENTAL && GPIO_PCA9539 = "n" help If you say yes here you get support for the Philips PCA9539 16-bit I/O port. @@ -63,6 +63,9 @@ config SENSORS_PCA9539 This driver can also be built as a module. If so, the module will be called pca9539. 
+ This driver is deprecated and will be dropped soon. Use + drivers/gpio/pca9539.c instead. + config SENSORS_PCF8591 tristate "Philips PCF8591" depends on EXPERIMENTAL diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index a193dfb..a5dc78a 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -44,8 +44,8 @@ source "drivers/infiniband/hw/ipath/Kconfig" source "drivers/infiniband/hw/ehca/Kconfig" source "drivers/infiniband/hw/amso1100/Kconfig" source "drivers/infiniband/hw/cxgb3/Kconfig" - source "drivers/infiniband/hw/mlx4/Kconfig" +source "drivers/infiniband/hw/nes/Kconfig" source "drivers/infiniband/ulp/ipoib/Kconfig" diff --git a/drivers/infiniband/Makefile b/drivers/infiniband/Makefile index 75f325e4..ed35e44 100644 --- a/drivers/infiniband/Makefile +++ b/drivers/infiniband/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_INFINIBAND_EHCA) += hw/ehca/ obj-$(CONFIG_INFINIBAND_AMSO1100) += hw/amso1100/ obj-$(CONFIG_INFINIBAND_CXGB3) += hw/cxgb3/ obj-$(CONFIG_MLX4_INFINIBAND) += hw/mlx4/ +obj-$(CONFIG_INFINIBAND_NES) += hw/nes/ obj-$(CONFIG_INFINIBAND_IPOIB) += ulp/ipoib/ obj-$(CONFIG_INFINIBAND_SRP) += ulp/srp/ obj-$(CONFIG_INFINIBAND_ISER) += ulp/iser/ diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index c015014..638b727 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -974,6 +974,9 @@ static void cm_format_req(struct cm_req_msg *req_msg, struct cm_id_private *cm_id_priv, struct ib_cm_req_param *param) { + struct ib_sa_path_rec *pri_path = param->primary_path; + struct ib_sa_path_rec *alt_path = param->alternate_path; + cm_format_mad_hdr(&req_msg->hdr, CM_REQ_ATTR_ID, cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_REQ)); @@ -997,35 +1000,46 @@ static void cm_format_req(struct cm_req_msg *req_msg, cm_req_set_max_cm_retries(req_msg, param->max_cm_retries); cm_req_set_srq(req_msg, param->srq); - req_msg->primary_local_lid = param->primary_path->slid; - req_msg->primary_remote_lid = 
param->primary_path->dlid; - req_msg->primary_local_gid = param->primary_path->sgid; - req_msg->primary_remote_gid = param->primary_path->dgid; - cm_req_set_primary_flow_label(req_msg, param->primary_path->flow_label); - cm_req_set_primary_packet_rate(req_msg, param->primary_path->rate); - req_msg->primary_traffic_class = param->primary_path->traffic_class; - req_msg->primary_hop_limit = param->primary_path->hop_limit; - cm_req_set_primary_sl(req_msg, param->primary_path->sl); - cm_req_set_primary_subnet_local(req_msg, 1); /* local only... */ + if (pri_path->hop_limit <= 1) { + req_msg->primary_local_lid = pri_path->slid; + req_msg->primary_remote_lid = pri_path->dlid; + } else { + /* Work-around until there's a way to obtain remote LID info */ + req_msg->primary_local_lid = IB_LID_PERMISSIVE; + req_msg->primary_remote_lid = IB_LID_PERMISSIVE; + } + req_msg->primary_local_gid = pri_path->sgid; + req_msg->primary_remote_gid = pri_path->dgid; + cm_req_set_primary_flow_label(req_msg, pri_path->flow_label); + cm_req_set_primary_packet_rate(req_msg, pri_path->rate); + req_msg->primary_traffic_class = pri_path->traffic_class; + req_msg->primary_hop_limit = pri_path->hop_limit; + cm_req_set_primary_sl(req_msg, pri_path->sl); + cm_req_set_primary_subnet_local(req_msg, (pri_path->hop_limit <= 1)); cm_req_set_primary_local_ack_timeout(req_msg, cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay, - param->primary_path->packet_life_time)); + pri_path->packet_life_time)); - if (param->alternate_path) { - req_msg->alt_local_lid = param->alternate_path->slid; - req_msg->alt_remote_lid = param->alternate_path->dlid; - req_msg->alt_local_gid = param->alternate_path->sgid; - req_msg->alt_remote_gid = param->alternate_path->dgid; + if (alt_path) { + if (alt_path->hop_limit <= 1) { + req_msg->alt_local_lid = alt_path->slid; + req_msg->alt_remote_lid = alt_path->dlid; + } else { + req_msg->alt_local_lid = IB_LID_PERMISSIVE; + req_msg->alt_remote_lid = IB_LID_PERMISSIVE; + } + 
req_msg->alt_local_gid = alt_path->sgid; + req_msg->alt_remote_gid = alt_path->dgid; cm_req_set_alt_flow_label(req_msg, - param->alternate_path->flow_label); - cm_req_set_alt_packet_rate(req_msg, param->alternate_path->rate); - req_msg->alt_traffic_class = param->alternate_path->traffic_class; - req_msg->alt_hop_limit = param->alternate_path->hop_limit; - cm_req_set_alt_sl(req_msg, param->alternate_path->sl); - cm_req_set_alt_subnet_local(req_msg, 1); /* local only... */ + alt_path->flow_label); + cm_req_set_alt_packet_rate(req_msg, alt_path->rate); + req_msg->alt_traffic_class = alt_path->traffic_class; + req_msg->alt_hop_limit = alt_path->hop_limit; + cm_req_set_alt_sl(req_msg, alt_path->sl); + cm_req_set_alt_subnet_local(req_msg, (alt_path->hop_limit <= 1)); cm_req_set_alt_local_ack_timeout(req_msg, cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay, - param->alternate_path->packet_life_time)); + alt_path->packet_life_time)); } if (param->private_data && param->private_data_len) @@ -1441,6 +1455,34 @@ out: return listen_cm_id_priv; } +/* + * Work-around for inter-subnet connections. If the LIDs are permissive, + * we need to override the LID/SL data in the REQ with the LID information + * in the work completion. 
+ */ +static void cm_process_routed_req(struct cm_req_msg *req_msg, struct ib_wc *wc) +{ + if (!cm_req_get_primary_subnet_local(req_msg)) { + if (req_msg->primary_local_lid == IB_LID_PERMISSIVE) { + req_msg->primary_local_lid = cpu_to_be16(wc->slid); + cm_req_set_primary_sl(req_msg, wc->sl); + } + + if (req_msg->primary_remote_lid == IB_LID_PERMISSIVE) + req_msg->primary_remote_lid = cpu_to_be16(wc->dlid_path_bits); + } + + if (!cm_req_get_alt_subnet_local(req_msg)) { + if (req_msg->alt_local_lid == IB_LID_PERMISSIVE) { + req_msg->alt_local_lid = cpu_to_be16(wc->slid); + cm_req_set_alt_sl(req_msg, wc->sl); + } + + if (req_msg->alt_remote_lid == IB_LID_PERMISSIVE) + req_msg->alt_remote_lid = cpu_to_be16(wc->dlid_path_bits); + } +} + static int cm_req_handler(struct cm_work *work) { struct ib_cm_id *cm_id; @@ -1481,6 +1523,7 @@ static int cm_req_handler(struct cm_work *work) cm_id_priv->id.service_id = req_msg->service_id; cm_id_priv->id.service_mask = __constant_cpu_to_be64(~0ULL); + cm_process_routed_req(req_msg, work->mad_recv_wc->wc); cm_format_paths_from_req(req_msg, &work->path[0], &work->path[1]); ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av); if (ret) { diff --git a/drivers/infiniband/core/fmr_pool.c b/drivers/infiniband/core/fmr_pool.c index 6c7aa59..7f00347 100644 --- a/drivers/infiniband/core/fmr_pool.c +++ b/drivers/infiniband/core/fmr_pool.c @@ -320,10 +320,13 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, .max_maps = pool->max_remaps, .page_shift = params->page_shift }; + int bytes_per_fmr = sizeof *fmr; + + if (pool->cache_bucket) + bytes_per_fmr += params->max_pages_per_fmr * sizeof (u64); for (i = 0; i < params->pool_size; ++i) { - fmr = kmalloc(sizeof *fmr + params->max_pages_per_fmr * sizeof (u64), - GFP_KERNEL); + fmr = kmalloc(bytes_per_fmr, GFP_KERNEL); if (!fmr) { printk(KERN_WARNING PFX "failed to allocate fmr " "struct for FMR %d\n", i); diff --git a/drivers/infiniband/hw/ehca/ehca_classes.h 
b/drivers/infiniband/hw/ehca/ehca_classes.h index f281d16..92cce8a 100644 --- a/drivers/infiniband/hw/ehca/ehca_classes.h +++ b/drivers/infiniband/hw/ehca/ehca_classes.h @@ -101,6 +101,7 @@ struct ehca_sport { spinlock_t mod_sqp_lock; enum ib_port_state port_state; struct ehca_sma_attr saved_attr; + u32 pma_qp_nr; }; #define HCA_CAP_MR_PGSIZE_4K 0x80000000 diff --git a/drivers/infiniband/hw/ehca/ehca_irq.c b/drivers/infiniband/hw/ehca/ehca_irq.c index 863b34f..b5ca94c 100644 --- a/drivers/infiniband/hw/ehca/ehca_irq.c +++ b/drivers/infiniband/hw/ehca/ehca_irq.c @@ -403,6 +403,8 @@ static void parse_ec(struct ehca_shca *shca, u64 eqe) sport->port_state = IB_PORT_ACTIVE; dispatch_port_event(shca, port, IB_EVENT_PORT_ACTIVE, "is active"); + ehca_query_sma_attr(shca, port, + &sport->saved_attr); } else notify_port_conf_change(shca, port); break; diff --git a/drivers/infiniband/hw/ehca/ehca_iverbs.h b/drivers/infiniband/hw/ehca/ehca_iverbs.h index c469bfd..a8a2ea5 100644 --- a/drivers/infiniband/hw/ehca/ehca_iverbs.h +++ b/drivers/infiniband/hw/ehca/ehca_iverbs.h @@ -187,6 +187,11 @@ int ehca_dealloc_ucontext(struct ib_ucontext *context); int ehca_mmap(struct ib_ucontext *context, struct vm_area_struct *vma); +int ehca_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + struct ib_wc *in_wc, struct ib_grh *in_grh, + struct ib_mad *in_mad, + struct ib_mad *out_mad); + void ehca_poll_eqs(unsigned long data); int ehca_calc_ipd(struct ehca_shca *shca, int port, diff --git a/drivers/infiniband/hw/ehca/ehca_main.c b/drivers/infiniband/hw/ehca/ehca_main.c index 84c9b7b..a86ebcc 100644 --- a/drivers/infiniband/hw/ehca/ehca_main.c +++ b/drivers/infiniband/hw/ehca/ehca_main.c @@ -472,7 +472,7 @@ int ehca_init_device(struct ehca_shca *shca) shca->ib_device.dealloc_fmr = ehca_dealloc_fmr; shca->ib_device.attach_mcast = ehca_attach_mcast; shca->ib_device.detach_mcast = ehca_detach_mcast; - /* shca->ib_device.process_mad = ehca_process_mad; */ + 
shca->ib_device.process_mad = ehca_process_mad; shca->ib_device.mmap = ehca_mmap; if (EHCA_BMASK_GET(HCA_CAP_SRQ, shca->hca_cap)) { diff --git a/drivers/infiniband/hw/ehca/ehca_reqs.c b/drivers/infiniband/hw/ehca/ehca_reqs.c index 3aacc8c..2ce8cff 100644 --- a/drivers/infiniband/hw/ehca/ehca_reqs.c +++ b/drivers/infiniband/hw/ehca/ehca_reqs.c @@ -209,6 +209,10 @@ static inline int ehca_write_swqe(struct ehca_qp *qp, ehca_gen_err("wr.ud.ah is NULL. qp=%p", qp); return -EINVAL; } + if (unlikely(send_wr->wr.ud.remote_qpn == 0)) { + ehca_gen_err("dest QP# is 0. qp=%x", qp->real_qp_num); + return -EINVAL; + } my_av = container_of(send_wr->wr.ud.ah, struct ehca_av, ib_ah); wqe_p->u.ud_av.ud_av = my_av->av; diff --git a/drivers/infiniband/hw/ehca/ehca_sqp.c b/drivers/infiniband/hw/ehca/ehca_sqp.c index 79e72b2..706d97a 100644 --- a/drivers/infiniband/hw/ehca/ehca_sqp.c +++ b/drivers/infiniband/hw/ehca/ehca_sqp.c @@ -39,12 +39,18 @@ * POSSIBILITY OF SUCH DAMAGE. */ +#include <rdma/ib_mad.h> #include "ehca_classes.h" #include "ehca_tools.h" #include "ehca_iverbs.h" #include "hcp_if.h" +#define IB_MAD_STATUS_REDIRECT __constant_htons(0x0002) +#define IB_MAD_STATUS_UNSUP_VERSION __constant_htons(0x0004) +#define IB_MAD_STATUS_UNSUP_METHOD __constant_htons(0x0008) + +#define IB_PMA_CLASS_PORT_INFO __constant_htons(0x0001) /** * ehca_define_sqp - Defines special queue pair 1 (GSI QP). 
When special queue @@ -83,6 +89,9 @@ u64 ehca_define_sqp(struct ehca_shca *shca, port, ret); return ret; } + shca->sport[port - 1].pma_qp_nr = pma_qp_nr; + ehca_dbg(&shca->ib_device, "port=%x pma_qp_nr=%x", + port, pma_qp_nr); break; default: ehca_err(&shca->ib_device, "invalid qp_type=%x", @@ -109,3 +118,85 @@ u64 ehca_define_sqp(struct ehca_shca *shca, return H_SUCCESS; } + +struct ib_perf { + struct ib_mad_hdr mad_hdr; + u8 reserved[40]; + u8 data[192]; +} __attribute__ ((packed)); + + +static int ehca_process_perf(struct ib_device *ibdev, u8 port_num, + struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + struct ib_perf *in_perf = (struct ib_perf *)in_mad; + struct ib_perf *out_perf = (struct ib_perf *)out_mad; + struct ib_class_port_info *poi = + (struct ib_class_port_info *)out_perf->data; + struct ehca_shca *shca = + container_of(ibdev, struct ehca_shca, ib_device); + struct ehca_sport *sport = &shca->sport[port_num - 1]; + + ehca_dbg(ibdev, "method=%x", in_perf->mad_hdr.method); + + *out_mad = *in_mad; + + if (in_perf->mad_hdr.class_version != 1) { + ehca_warn(ibdev, "Unsupported class_version=%x", + in_perf->mad_hdr.class_version); + out_perf->mad_hdr.status = IB_MAD_STATUS_UNSUP_VERSION; + goto perf_reply; + } + + switch (in_perf->mad_hdr.method) { + case IB_MGMT_METHOD_GET: + case IB_MGMT_METHOD_SET: + /* set class port info for redirection */ + out_perf->mad_hdr.attr_id = IB_PMA_CLASS_PORT_INFO; + out_perf->mad_hdr.status = IB_MAD_STATUS_REDIRECT; + memset(poi, 0, sizeof(*poi)); + poi->base_version = 1; + poi->class_version = 1; + poi->resp_time_value = 18; + poi->redirect_lid = sport->saved_attr.lid; + poi->redirect_qp = sport->pma_qp_nr; + poi->redirect_qkey = IB_QP1_QKEY; + poi->redirect_pkey = IB_DEFAULT_PKEY_FULL; + + ehca_dbg(ibdev, "ehca_pma_lid=%x ehca_pma_qp=%x", + sport->saved_attr.lid, sport->pma_qp_nr); + break; + + case IB_MGMT_METHOD_GET_RESP: + return IB_MAD_RESULT_FAILURE; + + default: + out_perf->mad_hdr.status = 
IB_MAD_STATUS_UNSUP_METHOD; + break; + } + +perf_reply: + out_perf->mad_hdr.method = IB_MGMT_METHOD_GET_RESP; + + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; +} + +int ehca_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + struct ib_wc *in_wc, struct ib_grh *in_grh, + struct ib_mad *in_mad, + struct ib_mad *out_mad) +{ + int ret; + + if (!port_num || port_num > ibdev->phys_port_cnt) + return IB_MAD_RESULT_FAILURE; + + /* accept only pma request */ + if (in_mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_PERF_MGMT) + return IB_MAD_RESULT_SUCCESS; + + ehca_dbg(ibdev, "port_num=%x src_qp=%x", port_num, in_wc->src_qp); + ret = ehca_process_perf(ibdev, port_num, in_mad, out_mad); + + return ret; +} diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index d8287d9..96a39b5 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -52,7 +52,7 @@ MODULE_DESCRIPTION("Mellanox ConnectX HCA InfiniBand driver"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_VERSION(DRV_VERSION); -static const char mlx4_ib_version[] __devinitdata = +static const char mlx4_ib_version[] = DRV_NAME ": Mellanox ConnectX InfiniBand driver v" DRV_VERSION " (" DRV_RELDATE ")\n"; @@ -468,6 +468,7 @@ static int init_node_data(struct mlx4_ib_dev *dev) if (err) goto out; + dev->dev->rev_id = be32_to_cpup((__be32 *) (out_mad->data + 32)); memcpy(&dev->ib_dev.node_guid, out_mad->data + 12, 8); out: @@ -516,9 +517,16 @@ static struct class_device_attribute *mlx4_class_attributes[] = { static void *mlx4_ib_add(struct mlx4_dev *dev) { + static int mlx4_ib_version_printed; struct mlx4_ib_dev *ibdev; int i; + + if (!mlx4_ib_version_printed) { + printk(KERN_INFO "%s", mlx4_ib_version); + ++mlx4_ib_version_printed; + } + ibdev = (struct mlx4_ib_dev *) ib_alloc_device(sizeof *ibdev); if (!ibdev) { dev_err(&dev->pdev->dev, "Device struct alloc failed\n"); diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.c 
b/drivers/infiniband/hw/mthca/mthca_cmd.c index 6966f94..09a30dd 100644 --- a/drivers/infiniband/hw/mthca/mthca_cmd.c +++ b/drivers/infiniband/hw/mthca/mthca_cmd.c @@ -1255,9 +1255,14 @@ int mthca_QUERY_ADAPTER(struct mthca_dev *dev, if (err) goto out; - MTHCA_GET(adapter->vendor_id, outbox, QUERY_ADAPTER_VENDOR_ID_OFFSET); - MTHCA_GET(adapter->device_id, outbox, QUERY_ADAPTER_DEVICE_ID_OFFSET); - MTHCA_GET(adapter->revision_id, outbox, QUERY_ADAPTER_REVISION_ID_OFFSET); + if (!mthca_is_memfree(dev)) { + MTHCA_GET(adapter->vendor_id, outbox, + QUERY_ADAPTER_VENDOR_ID_OFFSET); + MTHCA_GET(adapter->device_id, outbox, + QUERY_ADAPTER_DEVICE_ID_OFFSET); + MTHCA_GET(adapter->revision_id, outbox, + QUERY_ADAPTER_REVISION_ID_OFFSET); + } MTHCA_GET(adapter->inta_pin, outbox, QUERY_ADAPTER_INTA_PIN_OFFSET); get_board_id(outbox + QUERY_ADAPTER_VSD_OFFSET / 4, diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c index 5cf8250..cd3d8ad 100644 --- a/drivers/infiniband/hw/mthca/mthca_main.c +++ b/drivers/infiniband/hw/mthca/mthca_main.c @@ -126,7 +126,7 @@ module_param_named(fmr_reserved_mtts, hca_profile.fmr_reserved_mtts, int, 0444); MODULE_PARM_DESC(fmr_reserved_mtts, "number of memory translation table segments reserved for FMR"); -static const char mthca_version[] __devinitdata = +static char mthca_version[] __devinitdata = DRV_NAME ": Mellanox InfiniBand HCA driver v" DRV_VERSION " (" DRV_RELDATE ")\n"; @@ -735,7 +735,8 @@ static int mthca_init_hca(struct mthca_dev *mdev) } mdev->eq_table.inta_pin = adapter.inta_pin; - mdev->rev_id = adapter.revision_id; + if (!mthca_is_memfree(mdev)) + mdev->rev_id = adapter.revision_id; memcpy(mdev->board_id, adapter.board_id, sizeof mdev->board_id); return 0; diff --git a/drivers/infiniband/hw/mthca/mthca_mr.c b/drivers/infiniband/hw/mthca/mthca_mr.c index aa6c70a..3b69855 100644 --- a/drivers/infiniband/hw/mthca/mthca_mr.c +++ b/drivers/infiniband/hw/mthca/mthca_mr.c @@ -613,8 +613,10 @@ int 
mthca_fmr_alloc(struct mthca_dev *dev, u32 pd, sizeof *(mr->mem.tavor.mpt) * idx; mr->mtt = __mthca_alloc_mtt(dev, list_len, dev->mr_table.fmr_mtt_buddy); - if (IS_ERR(mr->mtt)) + if (IS_ERR(mr->mtt)) { + err = PTR_ERR(mr->mtt); goto err_out_table; + } mtt_seg = mr->mtt->first_seg * MTHCA_MTT_SEG_SIZE; @@ -627,8 +629,10 @@ int mthca_fmr_alloc(struct mthca_dev *dev, u32 pd, mr->mem.tavor.mtts = dev->mr_table.tavor_fmr.mtt_base + mtt_seg; mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); - if (IS_ERR(mailbox)) + if (IS_ERR(mailbox)) { + err = PTR_ERR(mailbox); goto err_out_free_mtt; + } mpt_entry = mailbox->buf; diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index 6bcde1c..9e491df 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -923,17 +923,13 @@ static struct ib_mr *mthca_reg_phys_mr(struct ib_pd *pd, struct mthca_mr *mr; u64 *page_list; u64 total_size; - u64 mask; + unsigned long mask; int shift; int npages; int err; int i, j, n; - /* First check that we have enough alignment */ - if ((*iova_start & ~PAGE_MASK) != (buffer_list[0].addr & ~PAGE_MASK)) - return ERR_PTR(-EINVAL); - - mask = 0; + mask = buffer_list[0].addr ^ *iova_start; total_size = 0; for (i = 0; i < num_phys_buf; ++i) { if (i != 0) @@ -947,17 +943,7 @@ static struct ib_mr *mthca_reg_phys_mr(struct ib_pd *pd, if (mask & ~PAGE_MASK) return ERR_PTR(-EINVAL); - /* Find largest page shift we can use to cover buffers */ - for (shift = PAGE_SHIFT; shift < 31; ++shift) - if (num_phys_buf > 1) { - if ((1ULL << shift) & mask) - break; - } else { - if (1ULL << shift >= - buffer_list[0].size + - (buffer_list[0].addr & ((1ULL << shift) - 1))) - break; - } + shift = __ffs(mask | 1 << 31); buffer_list[0].size += buffer_list[0].addr & ((1ULL << shift) - 1); buffer_list[0].addr &= ~0ull << shift; @@ -1270,6 +1256,8 @@ static int mthca_init_node_data(struct mthca_dev *dev) goto out; } + if 
(mthca_is_memfree(dev)) + dev->rev_id = be32_to_cpup((__be32 *) (out_mad->data + 32)); memcpy(&dev->ib_dev.node_guid, out_mad->data + 12, 8); out: diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c index 0e5461c..db5595b 100644 --- a/drivers/infiniband/hw/mthca/mthca_qp.c +++ b/drivers/infiniband/hw/mthca/mthca_qp.c @@ -1175,6 +1175,7 @@ static int mthca_alloc_qp_common(struct mthca_dev *dev, { int ret; int i; + struct mthca_next_seg *next; qp->refcount = 1; init_waitqueue_head(&qp->wait); @@ -1217,7 +1218,6 @@ static int mthca_alloc_qp_common(struct mthca_dev *dev, } if (mthca_is_memfree(dev)) { - struct mthca_next_seg *next; struct mthca_data_seg *scatter; int size = (sizeof (struct mthca_next_seg) + qp->rq.max_gs * sizeof (struct mthca_data_seg)) / 16; @@ -1240,6 +1240,13 @@ static int mthca_alloc_qp_common(struct mthca_dev *dev, qp->sq.wqe_shift) + qp->send_wqe_offset); } + } else { + for (i = 0; i < qp->rq.max; ++i) { + next = get_recv_wqe(qp, i); + next->nda_op = htonl((((i + 1) % qp->rq.max) << + qp->rq.wqe_shift) | 1); + } + } qp->sq.last = get_send_wqe(qp, qp->sq.max - 1); @@ -1863,7 +1870,6 @@ int mthca_tavor_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, prev_wqe = qp->rq.last; qp->rq.last = wqe; - ((struct mthca_next_seg *) wqe)->nda_op = 0; ((struct mthca_next_seg *) wqe)->ee_nds = cpu_to_be32(MTHCA_NEXT_DBD); ((struct mthca_next_seg *) wqe)->flags = 0; @@ -1885,9 +1891,6 @@ int mthca_tavor_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, qp->wrid[ind] = wr->wr_id; - ((struct mthca_next_seg *) prev_wqe)->nda_op = - cpu_to_be32((ind << qp->rq.wqe_shift) | 1); - wmb(); ((struct mthca_next_seg *) prev_wqe)->ee_nds = cpu_to_be32(MTHCA_NEXT_DBD | size); diff --git a/drivers/infiniband/hw/mthca/mthca_srq.c b/drivers/infiniband/hw/mthca/mthca_srq.c index 553d681..a5ffff6 100644 --- a/drivers/infiniband/hw/mthca/mthca_srq.c +++ b/drivers/infiniband/hw/mthca/mthca_srq.c @@ -175,9 +175,17 @@ static int 
mthca_alloc_srq_buf(struct mthca_dev *dev, struct mthca_pd *pd, * scatter list L_Keys to the sentry value of 0x100. */ for (i = 0; i < srq->max; ++i) { - wqe = get_wqe(srq, i); + struct mthca_next_seg *next; - *wqe_to_link(wqe) = i < srq->max - 1 ? i + 1 : -1; + next = wqe = get_wqe(srq, i); + + if (i < srq->max - 1) { + *wqe_to_link(wqe) = i + 1; + next->nda_op = htonl(((i + 1) << srq->wqe_shift) | 1); + } else { + *wqe_to_link(wqe) = -1; + next->nda_op = 0; + } for (scatter = wqe + sizeof (struct mthca_next_seg); (void *) scatter < wqe + (1 << srq->wqe_shift); @@ -470,16 +478,15 @@ out: void mthca_free_srq_wqe(struct mthca_srq *srq, u32 wqe_addr) { int ind; + struct mthca_next_seg *last_free; ind = wqe_addr >> srq->wqe_shift; spin_lock(&srq->lock); - if (likely(srq->first_free >= 0)) - *wqe_to_link(get_wqe(srq, srq->last_free)) = ind; - else - srq->first_free = ind; - + last_free = get_wqe(srq, srq->last_free); + *wqe_to_link(last_free) = ind; + last_free->nda_op = htonl((ind << srq->wqe_shift) | 1); *wqe_to_link(get_wqe(srq, ind)) = -1; srq->last_free = ind; @@ -506,15 +513,7 @@ int mthca_tavor_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, first_ind = srq->first_free; for (nreq = 0; wr; wr = wr->next) { - ind = srq->first_free; - - if (unlikely(ind < 0)) { - mthca_err(dev, "SRQ %06x full\n", srq->srqn); - err = -ENOMEM; - *bad_wr = wr; - break; - } - + ind = srq->first_free; wqe = get_wqe(srq, ind); next_ind = *wqe_to_link(wqe); @@ -528,7 +527,6 @@ int mthca_tavor_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, prev_wqe = srq->last; srq->last = wqe; - ((struct mthca_next_seg *) wqe)->nda_op = 0; ((struct mthca_next_seg *) wqe)->ee_nds = 0; /* flags field will always remain 0 */ @@ -549,9 +547,6 @@ int mthca_tavor_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, if (i < srq->max_gs) mthca_set_data_seg_inval(wqe); - ((struct mthca_next_seg *) prev_wqe)->nda_op = - cpu_to_be32((ind << srq->wqe_shift) | 1); - wmb(); ((struct 
mthca_next_seg *) prev_wqe)->ee_nds = cpu_to_be32(MTHCA_NEXT_DBD); @@ -614,15 +609,7 @@ int mthca_arbel_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, spin_lock_irqsave(&srq->lock, flags); for (nreq = 0; wr; ++nreq, wr = wr->next) { - ind = srq->first_free; - - if (unlikely(ind < 0)) { - mthca_err(dev, "SRQ %06x full\n", srq->srqn); - err = -ENOMEM; - *bad_wr = wr; - break; - } - + ind = srq->first_free; wqe = get_wqe(srq, ind); next_ind = *wqe_to_link(wqe); @@ -633,8 +620,6 @@ int mthca_arbel_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, break; } - ((struct mthca_next_seg *) wqe)->nda_op = - cpu_to_be32((next_ind << srq->wqe_shift) | 1); ((struct mthca_next_seg *) wqe)->ee_nds = 0; /* flags field will always remain 0 */ diff --git a/drivers/infiniband/hw/nes/Kconfig b/drivers/infiniband/hw/nes/Kconfig new file mode 100644 index 0000000..2aeb7ac --- /dev/null +++ b/drivers/infiniband/hw/nes/Kconfig @@ -0,0 +1,16 @@ +config INFINIBAND_NES + tristate "NetEffect RNIC Driver" + depends on PCI && INET && INFINIBAND + select LIBCRC32C + ---help--- + This is a low-level driver for NetEffect RDMA enabled + Network Interface Cards (RNIC). + +config INFINIBAND_NES_DEBUG + bool "Verbose debugging output" + depends on INFINIBAND_NES + default n + ---help--- + This option causes the NetEffect RNIC driver to produce debug + messages. Select this if you are developing the driver + or trying to diagnose a problem. 
diff --git a/drivers/infiniband/hw/nes/Makefile b/drivers/infiniband/hw/nes/Makefile new file mode 100644 index 0000000..3514851 --- /dev/null +++ b/drivers/infiniband/hw/nes/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_INFINIBAND_NES) += iw_nes.o + +iw_nes-objs := nes.o nes_hw.o nes_nic.o nes_utils.o nes_verbs.o nes_cm.o diff --git a/drivers/infiniband/hw/nes/nes.c b/drivers/infiniband/hw/nes/nes.c new file mode 100644 index 0000000..7f8853b --- /dev/null +++ b/drivers/infiniband/hw/nes/nes.c @@ -0,0 +1,1152 @@ +/* + * Copyright (c) 2006 - 2008 NetEffect, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/ethtool.h> +#include <linux/mii.h> +#include <linux/if_vlan.h> +#include <linux/crc32.h> +#include <linux/in.h> +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/if_arp.h> +#include <linux/highmem.h> +#include <asm/io.h> +#include <asm/irq.h> +#include <asm/byteorder.h> +#include <rdma/ib_smi.h> +#include <rdma/ib_verbs.h> +#include <rdma/ib_pack.h> +#include <rdma/iw_cm.h> + +#include "nes.h" + +#include <net/netevent.h> +#include <net/neighbour.h> +#include <linux/route.h> +#include <net/ip_fib.h> + +MODULE_AUTHOR("NetEffect"); +MODULE_DESCRIPTION("NetEffect RNIC Low-level iWARP Driver"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION(DRV_VERSION); + +int max_mtu = 9000; +int nics_per_function = 1; +int interrupt_mod_interval = 0; + + +/* Interoperability */ +int mpa_version = 1; +module_param(mpa_version, int, 0); +MODULE_PARM_DESC(mpa_version, "MPA version to be used int MPA Req/Resp (0 or 1)"); + +/* Interoperability */ +int disable_mpa_crc = 0; +module_param(disable_mpa_crc, int, 0); +MODULE_PARM_DESC(disable_mpa_crc, "Disable checking of MPA CRC"); + +unsigned int send_first = 0; +module_param(send_first, int, 0); +MODULE_PARM_DESC(send_first, "Send RDMA Message First on Active Connection"); + + +unsigned int nes_drv_opt = 0; +module_param(nes_drv_opt, int, 0); +MODULE_PARM_DESC(nes_drv_opt, "Driver option parameters"); + +unsigned int nes_debug_level = 0; +module_param_named(debug_level, nes_debug_level, uint, 0644); +MODULE_PARM_DESC(debug_level, "Enable debug output level"); + +LIST_HEAD(nes_adapter_list); +LIST_HEAD(nes_dev_list); + +atomic_t qps_destroyed; +atomic_t cqp_reqs_allocated; +atomic_t cqp_reqs_freed; +atomic_t cqp_reqs_dynallocated; +atomic_t cqp_reqs_dynfreed; +atomic_t cqp_reqs_queued; +atomic_t cqp_reqs_redriven; + +static void nes_print_macaddr(struct net_device *netdev); +static 
irqreturn_t nes_interrupt(int, void *); +static int __devinit nes_probe(struct pci_dev *, const struct pci_device_id *); +static void __devexit nes_remove(struct pci_dev *); +static int __init nes_init_module(void); +static void __exit nes_exit_module(void); +static unsigned int ee_flsh_adapter; +static unsigned int sysfs_nonidx_addr; +static unsigned int sysfs_idx_addr; + +static struct pci_device_id nes_pci_table[] = { + {PCI_VENDOR_ID_NETEFFECT, PCI_DEVICE_ID_NETEFFECT_NE020, PCI_ANY_ID, PCI_ANY_ID}, + {0} +}; + +MODULE_DEVICE_TABLE(pci, nes_pci_table); + +static int nes_inetaddr_event(struct notifier_block *, unsigned long, void *); +static int nes_net_event(struct notifier_block *, unsigned long, void *); +static int nes_notifiers_registered; + + +static struct notifier_block nes_inetaddr_notifier = { + .notifier_call = nes_inetaddr_event +}; + +static struct notifier_block nes_net_notifier = { + .notifier_call = nes_net_event +}; + + + + +/** + * nes_inetaddr_event + */ +static int nes_inetaddr_event(struct notifier_block *notifier, + unsigned long event, void *ptr) +{ + struct in_ifaddr *ifa = ptr; + struct net_device *event_netdev = ifa->ifa_dev->dev; + struct nes_device *nesdev; + struct net_device *netdev; + struct nes_vnic *nesvnic; + unsigned int addr; + unsigned int mask; + + addr = ntohl(ifa->ifa_address); + mask = ntohl(ifa->ifa_mask); + nes_debug(NES_DBG_NETDEV, "nes_inetaddr_event: ip address %08X, netmask %08X.\n", + addr, mask); + list_for_each_entry(nesdev, &nes_dev_list, list) { + nes_debug(NES_DBG_NETDEV, "Nesdev list entry = 0x%p. 
(%s)\n", + nesdev, nesdev->netdev[0]->name); + netdev = nesdev->netdev[0]; + nesvnic = netdev_priv(netdev); + if (netdev == event_netdev) { + if (nesvnic->rdma_enabled == 0) { + nes_debug(NES_DBG_NETDEV, "Returning without processing event for %s since" + " RDMA is not enabled.\n", + netdev->name); + return NOTIFY_OK; + } + /* we have ifa->ifa_address/mask here if we need it */ + switch (event) { + case NETDEV_DOWN: + nes_debug(NES_DBG_NETDEV, "event:DOWN\n"); + nes_write_indexed(nesdev, + NES_IDX_DST_IP_ADDR+(0x10*PCI_FUNC(nesdev->pcidev->devfn)), 0); + + nes_manage_arp_cache(netdev, netdev->dev_addr, + ntohl(nesvnic->local_ipaddr), NES_ARP_DELETE); + nesvnic->local_ipaddr = 0; + return NOTIFY_OK; + break; + case NETDEV_UP: + nes_debug(NES_DBG_NETDEV, "event:UP\n"); + + if (nesvnic->local_ipaddr != 0) { + nes_debug(NES_DBG_NETDEV, "Interface already has local_ipaddr\n"); + return NOTIFY_OK; + } + /* Add the address to the IP table */ + nesvnic->local_ipaddr = ifa->ifa_address; + + nes_write_indexed(nesdev, + NES_IDX_DST_IP_ADDR+(0x10*PCI_FUNC(nesdev->pcidev->devfn)), + ntohl(ifa->ifa_address)); + nes_manage_arp_cache(netdev, netdev->dev_addr, + ntohl(nesvnic->local_ipaddr), NES_ARP_ADD); + return NOTIFY_OK; + break; + default: + break; + } + } + } + + return NOTIFY_DONE; +} + + +/** + * nes_net_event + */ +static int nes_net_event(struct notifier_block *notifier, + unsigned long event, void *ptr) +{ + struct neighbour *neigh = ptr; + struct nes_device *nesdev; + struct net_device *netdev; + struct nes_vnic *nesvnic; + + switch (event) { + case NETEVENT_NEIGH_UPDATE: + list_for_each_entry(nesdev, &nes_dev_list, list) { + /* nes_debug(NES_DBG_NETDEV, "Nesdev list entry = 0x%p.\n", nesdev); */ + netdev = nesdev->netdev[0]; + nesvnic = netdev_priv(netdev); + if (netdev == neigh->dev) { + if (nesvnic->rdma_enabled == 0) { + nes_debug(NES_DBG_NETDEV, "Skipping device %s since no RDMA\n", + netdev->name); + } else { + if (neigh->nud_state & NUD_VALID) { + 
nes_manage_arp_cache(neigh->dev, neigh->ha, + ntohl(*(__be32 *)neigh->primary_key), NES_ARP_ADD); + } else { + nes_manage_arp_cache(neigh->dev, neigh->ha, + ntohl(*(__be32 *)neigh->primary_key), NES_ARP_DELETE); + } + } + return NOTIFY_OK; + } + } + break; + default: + nes_debug(NES_DBG_NETDEV, "NETEVENT_ %lu undefined\n", event); + break; + } + + return NOTIFY_DONE; +} + + +/** + * nes_add_ref + */ +void nes_add_ref(struct ib_qp *ibqp) +{ + struct nes_qp *nesqp; + + nesqp = to_nesqp(ibqp); + nes_debug(NES_DBG_QP, "Bumping refcount for QP%u. Pre-inc value = %u\n", + ibqp->qp_num, atomic_read(&nesqp->refcount)); + atomic_inc(&nesqp->refcount); +} + +static void nes_cqp_rem_ref_callback(struct nes_device *nesdev, struct nes_cqp_request *cqp_request) +{ + unsigned long flags; + struct nes_qp *nesqp = cqp_request->cqp_callback_pointer; + struct nes_adapter *nesadapter = nesdev->nesadapter; + u32 qp_id; + + atomic_inc(&qps_destroyed); + + /* Free the control structures */ + + qp_id = nesqp->hwqp.qp_id; + if (nesqp->pbl_vbase) { + pci_free_consistent(nesdev->pcidev, nesqp->qp_mem_size, + nesqp->hwqp.q2_vbase, nesqp->hwqp.q2_pbase); + spin_lock_irqsave(&nesadapter->pbl_lock, flags); + nesadapter->free_256pbl++; + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + pci_free_consistent(nesdev->pcidev, 256, nesqp->pbl_vbase, nesqp->pbl_pbase); + nesqp->pbl_vbase = NULL; + + } else { + pci_free_consistent(nesdev->pcidev, nesqp->qp_mem_size, + nesqp->hwqp.sq_vbase, nesqp->hwqp.sq_pbase); + } + nes_free_resource(nesadapter, nesadapter->allocated_qps, nesqp->hwqp.qp_id); + + kfree(nesqp->allocated_buffer); + +} + +/** + * nes_rem_ref + */ +void nes_rem_ref(struct ib_qp *ibqp) +{ + u64 u64temp; + struct nes_qp *nesqp; + struct nes_vnic *nesvnic = to_nesvnic(ibqp->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_cqp_request *cqp_request; + u32 opcode; + + nesqp = 
to_nesqp(ibqp); + + if (atomic_read(&nesqp->refcount) == 0) { + printk(KERN_INFO PFX "%s: Reference count already 0 for QP%d, last aeq = 0x%04X.\n", + __FUNCTION__, ibqp->qp_num, nesqp->last_aeq); + BUG(); + } + + if (atomic_dec_and_test(&nesqp->refcount)) { + nesadapter->qp_table[nesqp->hwqp.qp_id-NES_FIRST_QPN] = NULL; + + /* Destroy the QP */ + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + nes_debug(NES_DBG_QP, "Failed to get a cqp_request.\n"); + return; + } + cqp_request->waiting = 0; + cqp_request->callback = 1; + cqp_request->cqp_callback = nes_cqp_rem_ref_callback; + cqp_request->cqp_callback_pointer = nesqp; + cqp_wqe = &cqp_request->cqp_wqe; + + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + opcode = NES_CQP_DESTROY_QP | NES_CQP_QP_TYPE_IWARP; + + if (nesqp->hte_added) { + opcode |= NES_CQP_QP_DEL_HTE; + nesqp->hte_added = 0; + } + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, nesqp->hwqp.qp_id); + u64temp = (u64)nesqp->nesqp_context_pbase; + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp); + nes_post_cqp_request(nesdev, cqp_request, NES_CQP_REQUEST_RING_DOORBELL); + } +} + + +/** + * nes_get_qp + */ +struct ib_qp *nes_get_qp(struct ib_device *device, int qpn) +{ + struct nes_vnic *nesvnic = to_nesvnic(device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + + if ((qpn < NES_FIRST_QPN) || (qpn >= (NES_FIRST_QPN + nesadapter->max_qp))) + return NULL; + + return &nesadapter->qp_table[qpn - NES_FIRST_QPN]->ibqp; +} + + +/** + * nes_print_macaddr + */ +static void nes_print_macaddr(struct net_device *netdev) +{ + nes_debug(NES_DBG_INIT, "%s: MAC %02X:%02X:%02X:%02X:%02X:%02X, IRQ %u\n", + netdev->name, + netdev->dev_addr[0], netdev->dev_addr[1], netdev->dev_addr[2], + netdev->dev_addr[3], netdev->dev_addr[4], netdev->dev_addr[5], + netdev->irq); +} + + +/** + * 
nes_interrupt - handle interrupts + */ +static irqreturn_t nes_interrupt(int irq, void *dev_id) +{ + struct nes_device *nesdev = (struct nes_device *)dev_id; + int handled = 0; + u32 int_mask; + u32 int_req; + u32 int_stat; + u32 intf_int_stat; + u32 timer_stat; + + if (nesdev->msi_enabled) { + /* No need to read the interrupt pending register if msi is enabled */ + handled = 1; + } else { + if (unlikely(nesdev->nesadapter->hw_rev == NE020_REV)) { + /* Master interrupt enable provides synchronization for kicking off bottom half + when interrupt sharing is going on */ + int_mask = nes_read32(nesdev->regs + NES_INT_MASK); + if (int_mask & 0x80000000) { + /* Check interrupt status to see if this might be ours */ + int_stat = nes_read32(nesdev->regs + NES_INT_STAT); + int_req = nesdev->int_req; + if (int_stat&int_req) { + /* if interesting CEQ or AEQ is pending, claim the interrupt */ + if ((int_stat&int_req) & (~(NES_INT_TIMER|NES_INT_INTF))) { + handled = 1; + } else { + if (((int_stat & int_req) & NES_INT_TIMER) == NES_INT_TIMER) { + /* Timer might be running but might be for another function */ + timer_stat = nes_read32(nesdev->regs + NES_TIMER_STAT); + if ((timer_stat & nesdev->timer_int_req) != 0) { + handled = 1; + } + } + if ((((int_stat & int_req) & NES_INT_INTF) == NES_INT_INTF) && + (handled == 0)) { + intf_int_stat = nes_read32(nesdev->regs+NES_INTF_INT_STAT); + if ((intf_int_stat & nesdev->intf_int_req) != 0) { + handled = 1; + } + } + } + if (handled) { + nes_write32(nesdev->regs+NES_INT_MASK, int_mask & (~0x80000000)); + int_mask = nes_read32(nesdev->regs+NES_INT_MASK); + /* Save off the status to save an additional read */ + nesdev->int_stat = int_stat; + nesdev->napi_isr_ran = 1; + } + } + } + } else { + handled = nes_read32(nesdev->regs+NES_INT_PENDING); + } + } + + if (handled) { + + if (nes_napi_isr(nesdev) == 0) { + tasklet_schedule(&nesdev->dpc_tasklet); + + } + return IRQ_HANDLED; + } else { + return IRQ_NONE; + } +} + + +/** + * nes_probe - 
Device initialization + */ +static int __devinit nes_probe(struct pci_dev *pcidev, const struct pci_device_id *ent) +{ + struct net_device *netdev = NULL; + struct nes_device *nesdev = NULL; + int ret = 0; + struct nes_vnic *nesvnic = NULL; + void __iomem *mmio_regs = NULL; + u8 hw_rev; + + assert(pcidev != NULL); + assert(ent != NULL); + + printk(KERN_INFO PFX "NetEffect RNIC driver v%s loading. (%s)\n", + DRV_VERSION, pci_name(pcidev)); + + ret = pci_enable_device(pcidev); + if (ret) { + printk(KERN_ERR PFX "Unable to enable PCI device. (%s)\n", pci_name(pcidev)); + goto bail0; + } + + nes_debug(NES_DBG_INIT, "BAR0 (@0x%08lX) size = 0x%lX bytes\n", + (long unsigned int)pci_resource_start(pcidev, BAR_0), + (long unsigned int)pci_resource_len(pcidev, BAR_0)); + nes_debug(NES_DBG_INIT, "BAR1 (@0x%08lX) size = 0x%lX bytes\n", + (long unsigned int)pci_resource_start(pcidev, BAR_1), + (long unsigned int)pci_resource_len(pcidev, BAR_1)); + + /* Make sure PCI base addr are MMIO */ + if (!(pci_resource_flags(pcidev, BAR_0) & IORESOURCE_MEM) || + !(pci_resource_flags(pcidev, BAR_1) & IORESOURCE_MEM)) { + printk(KERN_ERR PFX "PCI regions not an MMIO resource\n"); + ret = -ENODEV; + goto bail1; + } + + /* Reserve PCI I/O and memory resources */ + ret = pci_request_regions(pcidev, DRV_NAME); + if (ret) { + printk(KERN_ERR PFX "Unable to request regions. 
(%s)\n", pci_name(pcidev)); + goto bail1; + } + + if ((sizeof(dma_addr_t) > 4)) { + ret = pci_set_dma_mask(pcidev, DMA_64BIT_MASK); + if (ret < 0) { + printk(KERN_ERR PFX "64b DMA mask configuration failed\n"); + goto bail2; + } + ret = pci_set_consistent_dma_mask(pcidev, DMA_64BIT_MASK); + if (ret) { + printk(KERN_ERR PFX "64b DMA consistent mask configuration failed\n"); + goto bail2; + } + } else { + ret = pci_set_dma_mask(pcidev, DMA_32BIT_MASK); + if (ret < 0) { + printk(KERN_ERR PFX "32b DMA mask configuration failed\n"); + goto bail2; + } + ret = pci_set_consistent_dma_mask(pcidev, DMA_32BIT_MASK); + if (ret) { + printk(KERN_ERR PFX "32b DMA consistent mask configuration failed\n"); + goto bail2; + } + } + + pci_set_master(pcidev); + + /* Allocate hardware structure */ + nesdev = kzalloc(sizeof(struct nes_device), GFP_KERNEL); + if (!nesdev) { + printk(KERN_ERR PFX "%s: Unable to alloc hardware struct\n", pci_name(pcidev)); + ret = -ENOMEM; + goto bail2; + } + + nes_debug(NES_DBG_INIT, "Allocated nes device at %p\n", nesdev); + nesdev->pcidev = pcidev; + pci_set_drvdata(pcidev, nesdev); + + pci_read_config_byte(pcidev, 0x0008, &hw_rev); + nes_debug(NES_DBG_INIT, "hw_rev=%u\n", hw_rev); + + spin_lock_init(&nesdev->indexed_regs_lock); + + /* Remap the PCI registers in adapter BAR0 to kernel VA space */ + mmio_regs = ioremap_nocache(pci_resource_start(pcidev, BAR_0), sizeof(mmio_regs)); + if (mmio_regs == NULL) { + printk(KERN_ERR PFX "Unable to remap BAR0\n"); + ret = -EIO; + goto bail3; + } + nesdev->regs = mmio_regs; + nesdev->index_reg = 0x50 + (PCI_FUNC(pcidev->devfn)*8) + mmio_regs; + + /* Ensure interrupts are disabled */ + nes_write32(nesdev->regs+NES_INT_MASK, 0x7fffffff); + + if (nes_drv_opt & NES_DRV_OPT_ENABLE_MSI) { + if (!pci_enable_msi(nesdev->pcidev)) { + nesdev->msi_enabled = 1; + nes_debug(NES_DBG_INIT, "MSI is enabled for device %s\n", + pci_name(pcidev)); + } else { + nes_debug(NES_DBG_INIT, "MSI is disabled by linux for device %s\n", + 
pci_name(pcidev)); + } + } else { + nes_debug(NES_DBG_INIT, "MSI not requested due to driver options for device %s\n", + pci_name(pcidev)); + } + + nesdev->csr_start = pci_resource_start(nesdev->pcidev, BAR_0); + nesdev->doorbell_region = pci_resource_start(nesdev->pcidev, BAR_1); + + /* Init the adapter */ + nesdev->nesadapter = nes_init_adapter(nesdev, hw_rev); + nesdev->nesadapter->et_rx_coalesce_usecs_irq = interrupt_mod_interval; + if (!nesdev->nesadapter) { + printk(KERN_ERR PFX "Unable to initialize adapter.\n"); + ret = -ENOMEM; + goto bail5; + } + + /* nesdev->base_doorbell_index = + nesdev->nesadapter->pd_config_base[PCI_FUNC(nesdev->pcidev->devfn)]; */ + nesdev->base_doorbell_index = 1; + nesdev->doorbell_start = nesdev->nesadapter->doorbell_start; + nesdev->mac_index = PCI_FUNC(nesdev->pcidev->devfn) % nesdev->nesadapter->port_count; + + tasklet_init(&nesdev->dpc_tasklet, nes_dpc, (unsigned long)nesdev); + + /* bring up the Control QP */ + if (nes_init_cqp(nesdev)) { + ret = -ENODEV; + goto bail6; + } + + /* Arm the CCQ */ + nes_write32(nesdev->regs+NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT | + PCI_FUNC(nesdev->pcidev->devfn)); + nes_read32(nesdev->regs+NES_CQE_ALLOC); + + /* Enable the interrupts */ + nesdev->int_req = (0x101 << PCI_FUNC(nesdev->pcidev->devfn)) | + (1 << (PCI_FUNC(nesdev->pcidev->devfn)+16)); + if (PCI_FUNC(nesdev->pcidev->devfn) < 4) { + nesdev->int_req |= (1 << (PCI_FUNC(nesdev->pcidev->devfn)+24)); + } + + /* TODO: This really should be the first driver to load, not function 0 */ + if (PCI_FUNC(nesdev->pcidev->devfn) == 0) { + /* pick up PCI and critical errors if the first driver to load */ + nesdev->intf_int_req = NES_INTF_INT_PCIERR | NES_INTF_INT_CRITERR; + nesdev->int_req |= NES_INT_INTF; + } else { + nesdev->intf_int_req = 0; + } + nesdev->intf_int_req |= (1 << (PCI_FUNC(nesdev->pcidev->devfn)+16)); + nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS0, 0); + nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS1, 0); + 
nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS2, 0x00001265); + nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS4, 0x18021804); + + nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS3, 0x17801790); + + /* deal with both periodic and one_shot */ + nesdev->timer_int_req = 0x101 << PCI_FUNC(nesdev->pcidev->devfn); + nesdev->nesadapter->timer_int_req |= nesdev->timer_int_req; + nes_debug(NES_DBG_INIT, "setting int_req for function %u, nesdev = 0x%04X, adapter = 0x%04X\n", + PCI_FUNC(nesdev->pcidev->devfn), + nesdev->timer_int_req, nesdev->nesadapter->timer_int_req); + + nes_write32(nesdev->regs+NES_INTF_INT_MASK, ~(nesdev->intf_int_req)); + + list_add_tail(&nesdev->list, &nes_dev_list); + + /* Request an interrupt line for the driver */ + ret = request_irq(pcidev->irq, nes_interrupt, IRQF_SHARED, DRV_NAME, nesdev); + if (ret) { + printk(KERN_ERR PFX "%s: requested IRQ %u is busy\n", + pci_name(pcidev), pcidev->irq); + goto bail65; + } + + nes_write32(nesdev->regs+NES_INT_MASK, ~nesdev->int_req); + + if (nes_notifiers_registered == 0) { + register_inetaddr_notifier(&nes_inetaddr_notifier); + register_netevent_notifier(&nes_net_notifier); + } + nes_notifiers_registered++; + + /* Initialize network devices */ + if ((netdev = nes_netdev_init(nesdev, mmio_regs)) == NULL) { + goto bail7; + } + + /* Register network device */ + ret = register_netdev(netdev); + if (ret) { + printk(KERN_ERR PFX "Unable to register netdev, ret = %d\n", ret); + nes_netdev_destroy(netdev); + goto bail7; + } + + nes_print_macaddr(netdev); + /* create a CM core for this netdev */ + nesvnic = netdev_priv(netdev); + + nesdev->netdev_count++; + nesdev->nesadapter->netdev_count++; + + + printk(KERN_ERR PFX "%s: NetEffect RNIC driver successfully loaded.\n", + pci_name(pcidev)); + return 0; + + bail7: + printk(KERN_ERR PFX "bail7\n"); + while (nesdev->netdev_count > 0) { + nesdev->netdev_count--; + nesdev->nesadapter->netdev_count--; + + unregister_netdev(nesdev->netdev[nesdev->netdev_count]); + 
nes_netdev_destroy(nesdev->netdev[nesdev->netdev_count]); + } + + nes_debug(NES_DBG_INIT, "netdev_count=%d, nesadapter->netdev_count=%d\n", + nesdev->netdev_count, nesdev->nesadapter->netdev_count); + + nes_notifiers_registered--; + if (nes_notifiers_registered == 0) { + unregister_netevent_notifier(&nes_net_notifier); + unregister_inetaddr_notifier(&nes_inetaddr_notifier); + } + + list_del(&nesdev->list); + nes_destroy_cqp(nesdev); + + bail65: + printk(KERN_ERR PFX "bail65\n"); + free_irq(pcidev->irq, nesdev); + if (nesdev->msi_enabled) { + pci_disable_msi(pcidev); + } + bail6: + printk(KERN_ERR PFX "bail6\n"); + tasklet_kill(&nesdev->dpc_tasklet); + /* Deallocate the Adapter Structure */ + nes_destroy_adapter(nesdev->nesadapter); + + bail5: + printk(KERN_ERR PFX "bail5\n"); + iounmap(nesdev->regs); + + bail3: + printk(KERN_ERR PFX "bail3\n"); + kfree(nesdev); + + bail2: + pci_release_regions(pcidev); + + bail1: + pci_disable_device(pcidev); + + bail0: + return ret; +} + + +/** + * nes_remove - unload from kernel + */ +static void __devexit nes_remove(struct pci_dev *pcidev) +{ + struct nes_device *nesdev = pci_get_drvdata(pcidev); + struct net_device *netdev; + int netdev_index = 0; + + if (nesdev->netdev_count) { + netdev = nesdev->netdev[netdev_index]; + if (netdev) { + netif_stop_queue(netdev); + unregister_netdev(netdev); + nes_netdev_destroy(netdev); + + nesdev->netdev[netdev_index] = NULL; + nesdev->netdev_count--; + nesdev->nesadapter->netdev_count--; + } + } + + nes_notifiers_registered--; + if (nes_notifiers_registered == 0) { + unregister_netevent_notifier(&nes_net_notifier); + unregister_inetaddr_notifier(&nes_inetaddr_notifier); + } + + list_del(&nesdev->list); + nes_destroy_cqp(nesdev); + tasklet_kill(&nesdev->dpc_tasklet); + + /* Deallocate the Adapter Structure */ + nes_destroy_adapter(nesdev->nesadapter); + + free_irq(pcidev->irq, nesdev); + + if (nesdev->msi_enabled) { + pci_disable_msi(pcidev); + } + + iounmap(nesdev->regs); + kfree(nesdev); + + 
/* nes_debug(NES_DBG_SHUTDOWN, "calling pci_release_regions.\n"); */ + pci_release_regions(pcidev); + pci_disable_device(pcidev); + pci_set_drvdata(pcidev, NULL); +} + + +static struct pci_driver nes_pci_driver = { + .name = DRV_NAME, + .id_table = nes_pci_table, + .probe = nes_probe, + .remove = __devexit_p(nes_remove), +}; + +static ssize_t nes_show_adapter(struct device_driver *ddp, char *buf) +{ + unsigned int devfn = 0xffffffff; + unsigned char bus_number = 0xff; + unsigned int i = 0; + struct nes_device *nesdev; + + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + devfn = nesdev->nesadapter->devfn; + bus_number = nesdev->nesadapter->bus_number; + break; + } + i++; + } + + return snprintf(buf, PAGE_SIZE, "%x:%x", bus_number, devfn); +} + +static ssize_t nes_store_adapter(struct device_driver *ddp, + const char *buf, size_t count) +{ + char *p = (char *)buf; + + ee_flsh_adapter = simple_strtoul(p, &p, 10); + return strnlen(buf, count); +} + +static ssize_t nes_show_ee_cmd(struct device_driver *ddp, char *buf) +{ + u32 eeprom_cmd = 0xdead; + u32 i = 0; + struct nes_device *nesdev; + + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + eeprom_cmd = nes_read32(nesdev->regs + NES_EEPROM_COMMAND); + break; + } + i++; + } + return snprintf(buf, PAGE_SIZE, "0x%x\n", eeprom_cmd); +} + +static ssize_t nes_store_ee_cmd(struct device_driver *ddp, + const char *buf, size_t count) +{ + char *p = (char *)buf; + u32 val; + u32 i = 0; + struct nes_device *nesdev; + + if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') { + val = simple_strtoul(p, &p, 16); + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + nes_write32(nesdev->regs + NES_EEPROM_COMMAND, val); + break; + } + i++; + } + } + return strnlen(buf, count); +} + +static ssize_t nes_show_ee_data(struct device_driver *ddp, char *buf) +{ + u32 eeprom_data = 0xdead; + u32 i = 0; + struct nes_device *nesdev; + + 
list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + eeprom_data = nes_read32(nesdev->regs + NES_EEPROM_DATA); + break; + } + i++; + } + + return snprintf(buf, PAGE_SIZE, "0x%x\n", eeprom_data); +} + +static ssize_t nes_store_ee_data(struct device_driver *ddp, + const char *buf, size_t count) +{ + char *p = (char *)buf; + u32 val; + u32 i = 0; + struct nes_device *nesdev; + + if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') { + val = simple_strtoul(p, &p, 16); + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + nes_write32(nesdev->regs + NES_EEPROM_DATA, val); + break; + } + i++; + } + } + return strnlen(buf, count); +} + +static ssize_t nes_show_flash_cmd(struct device_driver *ddp, char *buf) +{ + u32 flash_cmd = 0xdead; + u32 i = 0; + struct nes_device *nesdev; + + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + flash_cmd = nes_read32(nesdev->regs + NES_FLASH_COMMAND); + break; + } + i++; + } + + return snprintf(buf, PAGE_SIZE, "0x%x\n", flash_cmd); +} + +static ssize_t nes_store_flash_cmd(struct device_driver *ddp, + const char *buf, size_t count) +{ + char *p = (char *)buf; + u32 val; + u32 i = 0; + struct nes_device *nesdev; + + if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') { + val = simple_strtoul(p, &p, 16); + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + nes_write32(nesdev->regs + NES_FLASH_COMMAND, val); + break; + } + i++; + } + } + return strnlen(buf, count); +} + +static ssize_t nes_show_flash_data(struct device_driver *ddp, char *buf) +{ + u32 flash_data = 0xdead; + u32 i = 0; + struct nes_device *nesdev; + + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + flash_data = nes_read32(nesdev->regs + NES_FLASH_DATA); + break; + } + i++; + } + + return snprintf(buf, PAGE_SIZE, "0x%x\n", flash_data); +} + +static ssize_t nes_store_flash_data(struct device_driver 
*ddp, + const char *buf, size_t count) +{ + char *p = (char *)buf; + u32 val; + u32 i = 0; + struct nes_device *nesdev; + + if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') { + val = simple_strtoul(p, &p, 16); + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + nes_write32(nesdev->regs + NES_FLASH_DATA, val); + break; + } + i++; + } + } + return strnlen(buf, count); +} + +static ssize_t nes_show_nonidx_addr(struct device_driver *ddp, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "0x%x\n", sysfs_nonidx_addr); +} + +static ssize_t nes_store_nonidx_addr(struct device_driver *ddp, + const char *buf, size_t count) +{ + char *p = (char *)buf; + + if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') + sysfs_nonidx_addr = simple_strtoul(p, &p, 16); + + return strnlen(buf, count); +} + +static ssize_t nes_show_nonidx_data(struct device_driver *ddp, char *buf) +{ + u32 nonidx_data = 0xdead; + u32 i = 0; + struct nes_device *nesdev; + + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + nonidx_data = nes_read32(nesdev->regs + sysfs_nonidx_addr); + break; + } + i++; + } + + return snprintf(buf, PAGE_SIZE, "0x%x\n", nonidx_data); +} + +static ssize_t nes_store_nonidx_data(struct device_driver *ddp, + const char *buf, size_t count) +{ + char *p = (char *)buf; + u32 val; + u32 i = 0; + struct nes_device *nesdev; + + if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') { + val = simple_strtoul(p, &p, 16); + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + nes_write32(nesdev->regs + sysfs_nonidx_addr, val); + break; + } + i++; + } + } + return strnlen(buf, count); +} + +static ssize_t nes_show_idx_addr(struct device_driver *ddp, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "0x%x\n", sysfs_idx_addr); +} + +static ssize_t nes_store_idx_addr(struct device_driver *ddp, + const char *buf, size_t count) +{ + char *p = (char *)buf; + + if (p[1] == 'x' || 
p[1] == 'X' || p[0] == 'x' || p[0] == 'X') + sysfs_idx_addr = simple_strtoul(p, &p, 16); + + return strnlen(buf, count); +} + +static ssize_t nes_show_idx_data(struct device_driver *ddp, char *buf) +{ + u32 idx_data = 0xdead; + u32 i = 0; + struct nes_device *nesdev; + + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + idx_data = nes_read_indexed(nesdev, sysfs_idx_addr); + break; + } + i++; + } + + return snprintf(buf, PAGE_SIZE, "0x%x\n", idx_data); +} + +static ssize_t nes_store_idx_data(struct device_driver *ddp, + const char *buf, size_t count) +{ + char *p = (char *)buf; + u32 val; + u32 i = 0; + struct nes_device *nesdev; + + if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') { + val = simple_strtoul(p, &p, 16); + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + nes_write_indexed(nesdev, sysfs_idx_addr, val); + break; + } + i++; + } + } + return strnlen(buf, count); +} + +static DRIVER_ATTR(adapter, S_IRUSR | S_IWUSR, + nes_show_adapter, nes_store_adapter); +static DRIVER_ATTR(eeprom_cmd, S_IRUSR | S_IWUSR, + nes_show_ee_cmd, nes_store_ee_cmd); +static DRIVER_ATTR(eeprom_data, S_IRUSR | S_IWUSR, + nes_show_ee_data, nes_store_ee_data); +static DRIVER_ATTR(flash_cmd, S_IRUSR | S_IWUSR, + nes_show_flash_cmd, nes_store_flash_cmd); +static DRIVER_ATTR(flash_data, S_IRUSR | S_IWUSR, + nes_show_flash_data, nes_store_flash_data); +static DRIVER_ATTR(nonidx_addr, S_IRUSR | S_IWUSR, + nes_show_nonidx_addr, nes_store_nonidx_addr); +static DRIVER_ATTR(nonidx_data, S_IRUSR | S_IWUSR, + nes_show_nonidx_data, nes_store_nonidx_data); +static DRIVER_ATTR(idx_addr, S_IRUSR | S_IWUSR, + nes_show_idx_addr, nes_store_idx_addr); +static DRIVER_ATTR(idx_data, S_IRUSR | S_IWUSR, + nes_show_idx_data, nes_store_idx_data); + +static int nes_create_driver_sysfs(struct pci_driver *drv) +{ + int error; + error = driver_create_file(&drv->driver, &driver_attr_adapter); + error |= driver_create_file(&drv->driver, 
&driver_attr_eeprom_cmd); + error |= driver_create_file(&drv->driver, &driver_attr_eeprom_data); + error |= driver_create_file(&drv->driver, &driver_attr_flash_cmd); + error |= driver_create_file(&drv->driver, &driver_attr_flash_data); + error |= driver_create_file(&drv->driver, &driver_attr_nonidx_addr); + error |= driver_create_file(&drv->driver, &driver_attr_nonidx_data); + error |= driver_create_file(&drv->driver, &driver_attr_idx_addr); + error |= driver_create_file(&drv->driver, &driver_attr_idx_data); + return error; +} + +static void nes_remove_driver_sysfs(struct pci_driver *drv) +{ + driver_remove_file(&drv->driver, &driver_attr_adapter); + driver_remove_file(&drv->driver, &driver_attr_eeprom_cmd); + driver_remove_file(&drv->driver, &driver_attr_eeprom_data); + driver_remove_file(&drv->driver, &driver_attr_flash_cmd); + driver_remove_file(&drv->driver, &driver_attr_flash_data); + driver_remove_file(&drv->driver, &driver_attr_nonidx_addr); + driver_remove_file(&drv->driver, &driver_attr_nonidx_data); + driver_remove_file(&drv->driver, &driver_attr_idx_addr); + driver_remove_file(&drv->driver, &driver_attr_idx_data); +} + +/** + * nes_init_module - module initialization entry point + */ +static int __init nes_init_module(void) +{ + int retval; + int retval1; + + retval = nes_cm_start(); + if (retval) { + printk(KERN_ERR PFX "Unable to start NetEffect iWARP CM.\n"); + return retval; + } + retval = pci_register_driver(&nes_pci_driver); + if (retval >= 0) { + retval1 = nes_create_driver_sysfs(&nes_pci_driver); + if (retval1 < 0) + printk(KERN_ERR PFX "Unable to create NetEffect sys files.\n"); + } + return retval; +} + + +/** + * nes_exit_module - module unload entry point + */ +static void __exit nes_exit_module(void) +{ + nes_cm_stop(); + nes_remove_driver_sysfs(&nes_pci_driver); + + pci_unregister_driver(&nes_pci_driver); +} + + +module_init(nes_init_module); +module_exit(nes_exit_module); diff --git a/drivers/infiniband/hw/nes/nes.h 
b/drivers/infiniband/hw/nes/nes.h new file mode 100644 index 0000000..fd57e8a --- /dev/null +++ b/drivers/infiniband/hw/nes/nes.h @@ -0,0 +1,560 @@ +/* + * Copyright (c) 2006 - 2008 NetEffect, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef __NES_H +#define __NES_H + +#include <linux/netdevice.h> +#include <linux/inetdevice.h> +#include <linux/spinlock.h> +#include <linux/kernel.h> +#include <linux/delay.h> +#include <linux/pci.h> +#include <linux/dma-mapping.h> +#include <linux/workqueue.h> +#include <linux/slab.h> +#include <asm/semaphore.h> +#include <linux/version.h> +#include <asm/io.h> +#include <linux/crc32c.h> + +#include <rdma/ib_smi.h> +#include <rdma/ib_verbs.h> +#include <rdma/ib_pack.h> +#include <rdma/rdma_cm.h> +#include <rdma/iw_cm.h> + +#define NES_SEND_FIRST_WRITE + +#define QUEUE_DISCONNECTS + +#define DRV_BUILD "1" + +#define DRV_NAME "iw_nes" +#define DRV_VERSION "1.0 KO Build " DRV_BUILD +#define PFX DRV_NAME ": " + +/* + * NetEffect PCI vendor id and NE010 PCI device id. + */ +#ifndef PCI_VENDOR_ID_NETEFFECT /* not in pci.ids yet */ +#define PCI_VENDOR_ID_NETEFFECT 0x1678 +#define PCI_DEVICE_ID_NETEFFECT_NE020 0x0100 +#endif + +#define NE020_REV 4 +#define NE020_REV1 5 + +#define BAR_0 0 +#define BAR_1 2 + +#define RX_BUF_SIZE (1536 + 8) +#define NES_REG0_SIZE (4 * 1024) +#define NES_TX_TIMEOUT (6*HZ) +#define NES_FIRST_QPN 64 +#define NES_SW_CONTEXT_ALIGN 1024 + +#define NES_NIC_MAX_NICS 16 +#define NES_MAX_ARP_TABLE_SIZE 4096 + +#define NES_NIC_CEQ_SIZE 8 +/* NICs will be on a separate CQ */ +#define NES_CCEQ_SIZE ((nesadapter->max_cq / nesadapter->port_count) - 32) + +#define NES_MAX_PORT_COUNT 4 + +#define MAX_DPC_ITERATIONS 128 + +#define NES_CQP_REQUEST_NO_DOORBELL_RING 0 +#define NES_CQP_REQUEST_RING_DOORBELL 1 + +#define NES_DRV_OPT_ENABLE_MPA_VER_0 0x00000001 +#define NES_DRV_OPT_DISABLE_MPA_CRC 0x00000002 +#define NES_DRV_OPT_DISABLE_FIRST_WRITE 0x00000004 +#define NES_DRV_OPT_DISABLE_INTF 0x00000008 +#define NES_DRV_OPT_ENABLE_MSI 0x00000010 +#define NES_DRV_OPT_DUAL_LOGICAL_PORT 0x00000020 +#define NES_DRV_OPT_SUPRESS_OPTION_BC 0x00000040 +#define NES_DRV_OPT_NO_INLINE_DATA 0x00000080 +#define NES_DRV_OPT_DISABLE_INT_MOD 0x00000100 +#define 
NES_DRV_OPT_DISABLE_VIRT_WQ 0x00000200 + +#define NES_AEQ_EVENT_TIMEOUT 2500 +#define NES_DISCONNECT_EVENT_TIMEOUT 2000 + +/* debug levels */ +/* must match userspace */ +#define NES_DBG_HW 0x00000001 +#define NES_DBG_INIT 0x00000002 +#define NES_DBG_ISR 0x00000004 +#define NES_DBG_PHY 0x00000008 +#define NES_DBG_NETDEV 0x00000010 +#define NES_DBG_CM 0x00000020 +#define NES_DBG_CM1 0x00000040 +#define NES_DBG_NIC_RX 0x00000080 +#define NES_DBG_NIC_TX 0x00000100 +#define NES_DBG_CQP 0x00000200 +#define NES_DBG_MMAP 0x00000400 +#define NES_DBG_MR 0x00000800 +#define NES_DBG_PD 0x00001000 +#define NES_DBG_CQ 0x00002000 +#define NES_DBG_QP 0x00004000 +#define NES_DBG_MOD_QP 0x00008000 +#define NES_DBG_AEQ 0x00010000 +#define NES_DBG_IW_RX 0x00020000 +#define NES_DBG_IW_TX 0x00040000 +#define NES_DBG_SHUTDOWN 0x00080000 +#define NES_DBG_RSVD1 0x10000000 +#define NES_DBG_RSVD2 0x20000000 +#define NES_DBG_RSVD3 0x40000000 +#define NES_DBG_RSVD4 0x80000000 +#define NES_DBG_ALL 0xffffffff + +#ifdef CONFIG_INFINIBAND_NES_DEBUG +#define nes_debug(level, fmt, args...) \ + if (level & nes_debug_level) \ + printk(KERN_ERR PFX "%s[%u]: " fmt, __FUNCTION__, __LINE__, ##args) + +#define assert(expr) \ +if (!(expr)) { \ + printk(KERN_ERR PFX "Assertion failed! %s, %s, %s, line %d\n", \ + #expr, __FILE__, __FUNCTION__, __LINE__); \ +} + +#define NES_EVENT_TIMEOUT 1200000 +#else +#define nes_debug(level, fmt, args...) 
+#define assert(expr) do {} while (0) + +#define NES_EVENT_TIMEOUT 100000 +#endif + +#include "nes_hw.h" +#include "nes_verbs.h" +#include "nes_context.h" +#include "nes_user.h" +#include "nes_cm.h" + +extern int max_mtu; +extern int nics_per_function; +#define max_frame_len (max_mtu+ETH_HLEN) +extern int interrupt_mod_interval; +extern int nes_if_count; +extern int mpa_version; +extern int disable_mpa_crc; +extern unsigned int send_first; +extern unsigned int nes_drv_opt; +extern unsigned int nes_debug_level; + +extern struct list_head nes_adapter_list; +extern struct list_head nes_dev_list; + +extern struct nes_cm_core *g_cm_core; + +extern atomic_t cm_connects; +extern atomic_t cm_accepts; +extern atomic_t cm_disconnects; +extern atomic_t cm_closes; +extern atomic_t cm_connecteds; +extern atomic_t cm_connect_reqs; +extern atomic_t cm_rejects; +extern atomic_t mod_qp_timouts; +extern atomic_t qps_created; +extern atomic_t qps_destroyed; +extern atomic_t sw_qps_destroyed; +extern u32 mh_detected; +extern u32 mh_pauses_sent; +extern u32 cm_packets_sent; +extern u32 cm_packets_bounced; +extern u32 cm_packets_created; +extern u32 cm_packets_received; +extern u32 cm_packets_dropped; +extern u32 cm_packets_retrans; +extern u32 cm_listens_created; +extern u32 cm_listens_destroyed; +extern u32 cm_backlog_drops; +extern atomic_t cm_loopbacks; +extern atomic_t cm_nodes_created; +extern atomic_t cm_nodes_destroyed; +extern atomic_t cm_accel_dropped_pkts; +extern atomic_t cm_resets_recvd; + +extern u32 crit_err_count; +extern u32 int_mod_timer_init; +extern u32 int_mod_cq_depth_256; +extern u32 int_mod_cq_depth_128; +extern u32 int_mod_cq_depth_32; +extern u32 int_mod_cq_depth_24; +extern u32 int_mod_cq_depth_16; +extern u32 int_mod_cq_depth_4; +extern u32 int_mod_cq_depth_1; + +extern atomic_t cqp_reqs_allocated; +extern atomic_t cqp_reqs_freed; +extern atomic_t cqp_reqs_dynallocated; +extern atomic_t cqp_reqs_dynfreed; +extern atomic_t cqp_reqs_queued; +extern atomic_t 
cqp_reqs_redriven; + + +struct nes_device { + struct nes_adapter *nesadapter; + void __iomem *regs; + void __iomem *index_reg; + struct pci_dev *pcidev; + struct net_device *netdev[NES_NIC_MAX_NICS]; + u64 link_status_interrupts; + struct tasklet_struct dpc_tasklet; + spinlock_t indexed_regs_lock; + unsigned long csr_start; + unsigned long doorbell_region; + unsigned long doorbell_start; + unsigned long mac_tx_errors; + unsigned long mac_pause_frames_sent; + unsigned long mac_pause_frames_received; + unsigned long mac_rx_errors; + unsigned long mac_rx_crc_errors; + unsigned long mac_rx_symbol_err_frames; + unsigned long mac_rx_jabber_frames; + unsigned long mac_rx_oversized_frames; + unsigned long mac_rx_short_frames; + unsigned long port_rx_discards; + unsigned long port_tx_discards; + unsigned int mac_index; + unsigned int nes_stack_start; + + /* Control Structures */ + void *cqp_vbase; + dma_addr_t cqp_pbase; + u32 cqp_mem_size; + u8 ceq_index; + u8 nic_ceq_index; + struct nes_hw_cqp cqp; + struct nes_hw_cq ccq; + struct list_head cqp_avail_reqs; + struct list_head cqp_pending_reqs; + struct nes_cqp_request *nes_cqp_requests; + + u32 int_req; + u32 int_stat; + u32 timer_int_req; + u32 timer_only_int_count; + u32 intf_int_req; + u32 last_mac_tx_pauses; + u32 last_used_chunks_tx; + struct list_head list; + + u16 base_doorbell_index; + u16 currcq_count; + u16 deepcq_count; + u8 msi_enabled; + u8 netdev_count; + u8 napi_isr_ran; + u8 disable_rx_flow_control; + u8 disable_tx_flow_control; +}; + + +static inline void +set_wqe_64bit_value(__le32 *wqe_words, u32 index, u64 value) +{ + wqe_words[index] = cpu_to_le32((u32) ((unsigned long)value)); + wqe_words[index + 1] = cpu_to_le32((u32)(upper_32_bits((unsigned long)value))); +} + +static inline void +set_wqe_32bit_value(__le32 *wqe_words, u32 index, u32 value) +{ + wqe_words[index] = cpu_to_le32(value); +} + +static inline void +nes_fill_init_cqp_wqe(struct nes_hw_cqp_wqe *cqp_wqe, struct nes_device *nesdev) +{ + 
set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_COMP_CTX_LOW_IDX, + (u64)((unsigned long) &nesdev->cqp)); + cqp_wqe->wqe_words[NES_CQP_WQE_COMP_SCRATCH_LOW_IDX] = 0; + cqp_wqe->wqe_words[NES_CQP_WQE_COMP_SCRATCH_HIGH_IDX] = 0; + cqp_wqe->wqe_words[NES_CQP_STAG_WQE_PBL_BLK_COUNT_IDX] = 0; + cqp_wqe->wqe_words[NES_CQP_STAG_WQE_PBL_LEN_IDX] = 0; + cqp_wqe->wqe_words[NES_CQP_STAG_WQE_LEN_LOW_IDX] = 0; + cqp_wqe->wqe_words[NES_CQP_STAG_WQE_PA_LOW_IDX] = 0; + cqp_wqe->wqe_words[NES_CQP_STAG_WQE_PA_HIGH_IDX] = 0; +} + +static inline void +nes_fill_init_qp_wqe(struct nes_hw_qp_wqe *wqe, struct nes_qp *nesqp, u32 head) +{ + u32 value; + value = ((u32)((unsigned long) nesqp)) | head; + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_CTX_HIGH_IDX, + (u32)(upper_32_bits((unsigned long)(nesqp)))); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_CTX_LOW_IDX, value); +} + +/* Read from memory-mapped device */ +static inline u32 nes_read_indexed(struct nes_device *nesdev, u32 reg_index) +{ + unsigned long flags; + void __iomem *addr = nesdev->index_reg; + u32 value; + + spin_lock_irqsave(&nesdev->indexed_regs_lock, flags); + + writel(reg_index, addr); + value = readl((void __iomem *)addr + 4); + + spin_unlock_irqrestore(&nesdev->indexed_regs_lock, flags); + return value; +} + +static inline u32 nes_read32(const void __iomem *addr) +{ + return readl(addr); +} + +static inline u16 nes_read16(const void __iomem *addr) +{ + return readw(addr); +} + +static inline u8 nes_read8(const void __iomem *addr) +{ + return readb(addr); +} + +/* Write to memory-mapped device */ +static inline void nes_write_indexed(struct nes_device *nesdev, u32 reg_index, u32 val) +{ + unsigned long flags; + void __iomem *addr = nesdev->index_reg; + + spin_lock_irqsave(&nesdev->indexed_regs_lock, flags); + + writel(reg_index, addr); + writel(val, (void __iomem *)addr + 4); + + spin_unlock_irqrestore(&nesdev->indexed_regs_lock, flags); +} + +static inline void nes_write32(void __iomem 
*addr, u32 val) +{ + writel(val, addr); +} + +static inline void nes_write16(void __iomem *addr, u16 val) +{ + writew(val, addr); +} + +static inline void nes_write8(void __iomem *addr, u8 val) +{ + writeb(val, addr); +} + + + +static inline int nes_alloc_resource(struct nes_adapter *nesadapter, + unsigned long *resource_array, u32 max_resources, + u32 *req_resource_num, u32 *next) +{ + unsigned long flags; + u32 resource_num; + + spin_lock_irqsave(&nesadapter->resource_lock, flags); + + resource_num = find_next_zero_bit(resource_array, max_resources, *next); + if (resource_num >= max_resources) { + resource_num = find_first_zero_bit(resource_array, max_resources); + if (resource_num >= max_resources) { + printk(KERN_ERR PFX "%s: No available resourcess.\n", __FUNCTION__); + spin_unlock_irqrestore(&nesadapter->resource_lock, flags); + return -EMFILE; + } + } + set_bit(resource_num, resource_array); + *next = resource_num+1; + if (*next == max_resources) { + *next = 0; + } + spin_unlock_irqrestore(&nesadapter->resource_lock, flags); + *req_resource_num = resource_num; + + return 0; +} + +static inline int nes_is_resource_allocated(struct nes_adapter *nesadapter, + unsigned long *resource_array, u32 resource_num) +{ + unsigned long flags; + int bit_is_set; + + spin_lock_irqsave(&nesadapter->resource_lock, flags); + + bit_is_set = test_bit(resource_num, resource_array); + nes_debug(NES_DBG_HW, "resource_num %u is%s allocated.\n", + resource_num, (bit_is_set ? 
"": " not")); + spin_unlock_irqrestore(&nesadapter->resource_lock, flags); + + return bit_is_set; +} + +static inline void nes_free_resource(struct nes_adapter *nesadapter, + unsigned long *resource_array, u32 resource_num) +{ + unsigned long flags; + + spin_lock_irqsave(&nesadapter->resource_lock, flags); + clear_bit(resource_num, resource_array); + spin_unlock_irqrestore(&nesadapter->resource_lock, flags); +} + +static inline struct nes_vnic *to_nesvnic(struct ib_device *ibdev) +{ + return container_of(ibdev, struct nes_ib_device, ibdev)->nesvnic; +} + +static inline struct nes_pd *to_nespd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct nes_pd, ibpd); +} + +static inline struct nes_ucontext *to_nesucontext(struct ib_ucontext *ibucontext) +{ + return container_of(ibucontext, struct nes_ucontext, ibucontext); +} + +static inline struct nes_mr *to_nesmr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct nes_mr, ibmr); +} + +static inline struct nes_mr *to_nesmr_from_ibfmr(struct ib_fmr *ibfmr) +{ + return container_of(ibfmr, struct nes_mr, ibfmr); +} + +static inline struct nes_mr *to_nesmw(struct ib_mw *ibmw) +{ + return container_of(ibmw, struct nes_mr, ibmw); +} + +static inline struct nes_fmr *to_nesfmr(struct nes_mr *nesmr) +{ + return container_of(nesmr, struct nes_fmr, nesmr); +} + +static inline struct nes_cq *to_nescq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct nes_cq, ibcq); +} + +static inline struct nes_qp *to_nesqp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct nes_qp, ibqp); +} + + + +/* nes.c */ +void nes_add_ref(struct ib_qp *); +void nes_rem_ref(struct ib_qp *); +struct ib_qp *nes_get_qp(struct ib_device *, int); + + +/* nes_hw.c */ +struct nes_adapter *nes_init_adapter(struct nes_device *, u8); +void nes_nic_init_timer_defaults(struct nes_device *, u8); +unsigned int nes_reset_adapter_ne020(struct nes_device *, u8 *); +int nes_init_serdes(struct nes_device *, u8, u8, u8); +void nes_init_csr_ne020(struct 
nes_device *, u8, u8); +void nes_destroy_adapter(struct nes_adapter *); +int nes_init_cqp(struct nes_device *); +int nes_init_phy(struct nes_device *); +int nes_init_nic_qp(struct nes_device *, struct net_device *); +void nes_destroy_nic_qp(struct nes_vnic *); +int nes_napi_isr(struct nes_device *); +void nes_dpc(unsigned long); +void nes_process_ceq(struct nes_device *, struct nes_hw_ceq *); +void nes_process_aeq(struct nes_device *, struct nes_hw_aeq *); +void nes_process_mac_intr(struct nes_device *, u32); +void nes_nic_napi_ce_handler(struct nes_device *, struct nes_hw_nic_cq *); +void nes_nic_ce_handler(struct nes_device *, struct nes_hw_nic_cq *); +void nes_cqp_ce_handler(struct nes_device *, struct nes_hw_cq *); +void nes_process_iwarp_aeqe(struct nes_device *, struct nes_hw_aeqe *); +void nes_iwarp_ce_handler(struct nes_device *, struct nes_hw_cq *); +int nes_destroy_cqp(struct nes_device *); +int nes_nic_cm_xmit(struct sk_buff *, struct net_device *); + +/* nes_nic.c */ +void nes_netdev_set_multicast_list(struct net_device *); +void nes_netdev_exit(struct nes_vnic *); +struct net_device *nes_netdev_init(struct nes_device *, void __iomem *); +void nes_netdev_destroy(struct net_device *); +int nes_nic_cm_xmit(struct sk_buff *, struct net_device *); + +/* nes_cm.c */ +void *nes_cm_create(struct net_device *); +int nes_cm_recv(struct sk_buff *, struct net_device *); +void nes_update_arp(unsigned char *, u32, u32, u16, u16); +void nes_manage_arp_cache(struct net_device *, unsigned char *, u32, u32); +void nes_sock_release(struct nes_qp *, unsigned long *); +struct nes_cm_core *nes_cm_alloc_core(void); +void flush_wqes(struct nes_device *nesdev, struct nes_qp *, u32, u32); +int nes_manage_apbvt(struct nes_vnic *, u32, u32, u32); +int nes_cm_disconn(struct nes_qp *); +void nes_cm_disconn_worker(void *); + +/* nes_verbs.c */ +int nes_hw_modify_qp(struct nes_device *, struct nes_qp *, u32, u32); +int nes_modify_qp(struct ib_qp *, struct ib_qp_attr *, int, struct 
ib_udata *); +struct nes_ib_device *nes_init_ofa_device(struct net_device *); +void nes_destroy_ofa_device(struct nes_ib_device *); +int nes_register_ofa_device(struct nes_ib_device *); +void nes_unregister_ofa_device(struct nes_ib_device *); + +/* nes_util.c */ +int nes_read_eeprom_values(struct nes_device *, struct nes_adapter *); +void nes_write_1G_phy_reg(struct nes_device *, u8, u8, u16); +void nes_read_1G_phy_reg(struct nes_device *, u8, u8, u16 *); +void nes_write_10G_phy_reg(struct nes_device *, u16, u8, u16); +void nes_read_10G_phy_reg(struct nes_device *, u16, u8); +struct nes_cqp_request *nes_get_cqp_request(struct nes_device *); +void nes_post_cqp_request(struct nes_device *, struct nes_cqp_request *, int); +int nes_arp_table(struct nes_device *, u32, u8 *, u32); +void nes_mh_fix(unsigned long); +void nes_clc(unsigned long); +void nes_dump_mem(unsigned int, void *, int); +u32 nes_crc32(u32, u32, u32, u32, u8 *, u32, u32, u32); + +#endif /* __NES_H */ diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c new file mode 100644 index 0000000..bd5cfea --- /dev/null +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -0,0 +1,3088 @@ +/* + * Copyright (c) 2006 - 2008 NetEffect, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + + +#define TCPOPT_TIMESTAMP 8 + +#include <asm/atomic.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/tcp.h> +#include <linux/init.h> +#include <linux/if_arp.h> +#include <linux/notifier.h> +#include <linux/net.h> +#include <linux/types.h> +#include <linux/timer.h> +#include <linux/time.h> +#include <linux/delay.h> +#include <linux/etherdevice.h> +#include <linux/netdevice.h> +#include <linux/random.h> +#include <linux/list.h> +#include <linux/threads.h> + +#include <net/neighbour.h> +#include <net/route.h> +#include <net/ip_fib.h> + +#include "nes.h" + +u32 cm_packets_sent; +u32 cm_packets_bounced; +u32 cm_packets_dropped; +u32 cm_packets_retrans; +u32 cm_packets_created; +u32 cm_packets_received; +u32 cm_listens_created; +u32 cm_listens_destroyed; +u32 cm_backlog_drops; +atomic_t cm_loopbacks; +atomic_t cm_nodes_created; +atomic_t cm_nodes_destroyed; +atomic_t cm_accel_dropped_pkts; +atomic_t cm_resets_recvd; + +static inline int mini_cm_accelerated(struct nes_cm_core *, struct nes_cm_node *); +static struct nes_cm_listener *mini_cm_listen(struct nes_cm_core *, + struct nes_vnic *, struct nes_cm_info *); +static int add_ref_cm_node(struct nes_cm_node *); +static int rem_ref_cm_node(struct nes_cm_core *, struct nes_cm_node 
*); +static int mini_cm_del_listen(struct nes_cm_core *, struct nes_cm_listener *); + + +/* External CM API Interface */ +/* instance of function pointers for client API */ +/* set address of this instance to cm_core->cm_ops at cm_core alloc */ +static struct nes_cm_ops nes_cm_api = { + mini_cm_accelerated, + mini_cm_listen, + mini_cm_del_listen, + mini_cm_connect, + mini_cm_close, + mini_cm_accept, + mini_cm_reject, + mini_cm_recv_pkt, + mini_cm_dealloc_core, + mini_cm_get, + mini_cm_set +}; + +struct nes_cm_core *g_cm_core; + +atomic_t cm_connects; +atomic_t cm_accepts; +atomic_t cm_disconnects; +atomic_t cm_closes; +atomic_t cm_connecteds; +atomic_t cm_connect_reqs; +atomic_t cm_rejects; + + +/** + * create_event + */ +static struct nes_cm_event *create_event(struct nes_cm_node *cm_node, + enum nes_cm_event_type type) +{ + struct nes_cm_event *event; + + if (!cm_node->cm_id) + return NULL; + + /* allocate an empty event */ + event = kzalloc(sizeof(*event), GFP_ATOMIC); + + if (!event) + return NULL; + + event->type = type; + event->cm_node = cm_node; + event->cm_info.rem_addr = cm_node->rem_addr; + event->cm_info.loc_addr = cm_node->loc_addr; + event->cm_info.rem_port = cm_node->rem_port; + event->cm_info.loc_port = cm_node->loc_port; + event->cm_info.cm_id = cm_node->cm_id; + + nes_debug(NES_DBG_CM, "Created event=%p, type=%u, dst_addr=%08x[%x]," + " src_addr=%08x[%x]\n", + event, type, + event->cm_info.loc_addr, event->cm_info.loc_port, + event->cm_info.rem_addr, event->cm_info.rem_port); + + nes_cm_post_event(event); + return event; +} + + +/** + * send_mpa_request + */ +int send_mpa_request(struct nes_cm_node *cm_node) +{ + struct sk_buff *skb; + int ret; + + skb = get_free_pkt(cm_node); + if (!skb) { + nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n"); + return -1; + } + + /* send an MPA Request frame */ + form_cm_frame(skb, cm_node, NULL, 0, &cm_node->mpa_frame, + cm_node->mpa_frame_size, SET_ACK); + + ret = schedule_nes_timer(cm_node, skb, 
NES_TIMER_TYPE_SEND, 1, 0); + if (ret < 0) { + return ret; + } + + return 0; +} + + +/** + * recv_mpa - process a received TCP pkt, we are expecting an + * IETF MPA frame + */ +static int parse_mpa(struct nes_cm_node *cm_node, u8 *buffer, u32 len) +{ + struct ietf_mpa_frame *mpa_frame; + + /* assume req frame is in tcp data payload */ + if (len < sizeof(struct ietf_mpa_frame)) { + nes_debug(NES_DBG_CM, "The received ietf buffer was too small (%x)\n", len); + return -1; + } + + mpa_frame = (struct ietf_mpa_frame *)buffer; + cm_node->mpa_frame_size = ntohs(mpa_frame->priv_data_len); + + if (cm_node->mpa_frame_size + sizeof(struct ietf_mpa_frame) != len) { + nes_debug(NES_DBG_CM, "The received ietf buffer was not right" + " complete (%x + %x != %x)\n", + cm_node->mpa_frame_size, (u32)sizeof(struct ietf_mpa_frame), len); + return -1; + } + + /* copy entire MPA frame to our cm_node's frame */ + memcpy(cm_node->mpa_frame_buf, buffer + sizeof(struct ietf_mpa_frame), + cm_node->mpa_frame_size); + + return 0; +} + + +/** + * handle_exception_pkt - process an exception packet. 
+ * We have been in a TSA state, and we have now received SW + * TCP/IP traffic should be a FIN request or IP pkt with options + */ +static int handle_exception_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb) +{ + int ret = 0; + struct tcphdr *tcph = tcp_hdr(skb); + + /* first check to see if this a FIN pkt */ + if (tcph->fin) { + /* we need to ACK the FIN request */ + send_ack(cm_node); + + /* check which side we are (client/server) and set next state accordingly */ + if (cm_node->tcp_cntxt.client) + cm_node->state = NES_CM_STATE_CLOSING; + else { + /* we are the server side */ + cm_node->state = NES_CM_STATE_CLOSE_WAIT; + /* since this is a self contained CM we don't wait for */ + /* an APP to close us, just send final FIN immediately */ + ret = send_fin(cm_node, NULL); + cm_node->state = NES_CM_STATE_LAST_ACK; + } + } else { + ret = -EINVAL; + } + + return ret; +} + + +/** + * form_cm_frame - get a free packet and build empty frame Use + * node info to build. + */ +struct sk_buff *form_cm_frame(struct sk_buff *skb, struct nes_cm_node *cm_node, + void *options, u32 optionsize, void *data, u32 datasize, u8 flags) +{ + struct tcphdr *tcph; + struct iphdr *iph; + struct ethhdr *ethh; + u8 *buf; + u16 packetsize = sizeof(*iph); + + packetsize += sizeof(*tcph); + packetsize += optionsize + datasize; + + memset(skb->data, 0x00, ETH_HLEN + sizeof(*iph) + sizeof(*tcph)); + + skb->len = 0; + buf = skb_put(skb, packetsize + ETH_HLEN); + + ethh = (struct ethhdr *) buf; + buf += ETH_HLEN; + + iph = (struct iphdr *)buf; + buf += sizeof(*iph); + tcph = (struct tcphdr *)buf; + skb_reset_mac_header(skb); + skb_set_network_header(skb, ETH_HLEN); + skb_set_transport_header(skb, ETH_HLEN+sizeof(*iph)); + buf += sizeof(*tcph); + + skb->ip_summed = CHECKSUM_PARTIAL; + skb->protocol = htons(0x800); + skb->data_len = 0; + skb->mac_len = ETH_HLEN; + + memcpy(ethh->h_dest, cm_node->rem_mac, ETH_ALEN); + memcpy(ethh->h_source, cm_node->loc_mac, ETH_ALEN); + ethh->h_proto = 
htons(0x0800); + + iph->version = IPVERSION; + iph->ihl = 5; /* 5 * 4Byte words, IP headr len */ + iph->tos = 0; + iph->tot_len = htons(packetsize); + iph->id = htons(++cm_node->tcp_cntxt.loc_id); + + iph->frag_off = htons(0x4000); + iph->ttl = 0x40; + iph->protocol = 0x06; /* IPPROTO_TCP */ + + iph->saddr = htonl(cm_node->loc_addr); + iph->daddr = htonl(cm_node->rem_addr); + + tcph->source = htons(cm_node->loc_port); + tcph->dest = htons(cm_node->rem_port); + tcph->seq = htonl(cm_node->tcp_cntxt.loc_seq_num); + + if (flags & SET_ACK) { + cm_node->tcp_cntxt.loc_ack_num = cm_node->tcp_cntxt.rcv_nxt; + tcph->ack_seq = htonl(cm_node->tcp_cntxt.loc_ack_num); + tcph->ack = 1; + } else + tcph->ack_seq = 0; + + if (flags & SET_SYN) { + cm_node->tcp_cntxt.loc_seq_num++; + tcph->syn = 1; + } else + cm_node->tcp_cntxt.loc_seq_num += datasize; /* data (no headers) */ + + if (flags & SET_FIN) + tcph->fin = 1; + + if (flags & SET_RST) + tcph->rst = 1; + + tcph->doff = (u16)((sizeof(*tcph) + optionsize + 3) >> 2); + tcph->window = htons(cm_node->tcp_cntxt.rcv_wnd); + tcph->urg_ptr = 0; + if (optionsize) + memcpy(buf, options, optionsize); + buf += optionsize; + if (datasize) + memcpy(buf, data, datasize); + + skb_shinfo(skb)->nr_frags = 0; + cm_packets_created++; + + return skb; +} + + +/** + * print_core - dump a cm core + */ +static void print_core(struct nes_cm_core *core) +{ + nes_debug(NES_DBG_CM, "---------------------------------------------\n"); + nes_debug(NES_DBG_CM, "CM Core -- (core = %p )\n", core); + if (!core) + return; + nes_debug(NES_DBG_CM, "---------------------------------------------\n"); + nes_debug(NES_DBG_CM, "Session ID : %u \n", atomic_read(&core->session_id)); + + nes_debug(NES_DBG_CM, "State : %u \n", core->state); + + nes_debug(NES_DBG_CM, "Tx Free cnt : %u \n", skb_queue_len(&core->tx_free_list)); + nes_debug(NES_DBG_CM, "Listen Nodes : %u \n", atomic_read(&core->listen_node_cnt)); + nes_debug(NES_DBG_CM, "Active Nodes : %u \n", 
atomic_read(&core->node_cnt)); + + nes_debug(NES_DBG_CM, "core : %p \n", core); + + nes_debug(NES_DBG_CM, "-------------- end core ---------------\n"); +} + + +/** + * schedule_nes_timer + * note - cm_node needs to be protected before calling this. Encase in: + * rem_ref_cm_node(cm_core, cm_node);add_ref_cm_node(cm_node); + */ +int schedule_nes_timer(struct nes_cm_node *cm_node, struct sk_buff *skb, + enum nes_timer_type type, int send_retrans, + int close_when_complete) +{ + unsigned long flags; + struct nes_cm_core *cm_core; + struct nes_timer_entry *new_send; + int ret = 0; + u32 was_timer_set; + + new_send = kzalloc(sizeof(*new_send), GFP_ATOMIC); + if (!new_send) + return -1; + if (!cm_node) + return -EINVAL; + + /* new_send->timetosend = currenttime */ + new_send->retrycount = NES_DEFAULT_RETRYS; + new_send->retranscount = NES_DEFAULT_RETRANS; + new_send->skb = skb; + new_send->timetosend = jiffies; + new_send->type = type; + new_send->netdev = cm_node->netdev; + new_send->send_retrans = send_retrans; + new_send->close_when_complete = close_when_complete; + + if (type == NES_TIMER_TYPE_CLOSE) { + new_send->timetosend += (HZ/2); /* TODO: decide on the correct value here */ + spin_lock_irqsave(&cm_node->recv_list_lock, flags); + list_add_tail(&new_send->list, &cm_node->recv_list); + spin_unlock_irqrestore(&cm_node->recv_list_lock, flags); + } + + if (type == NES_TIMER_TYPE_SEND) { + new_send->seq_num = htonl(tcp_hdr(skb)->seq); + atomic_inc(&new_send->skb->users); + + ret = nes_nic_cm_xmit(new_send->skb, cm_node->netdev); + if (ret != NETDEV_TX_OK) { + nes_debug(NES_DBG_CM, "Error sending packet %p (jiffies = %lu)\n", + new_send, jiffies); + atomic_dec(&new_send->skb->users); + new_send->timetosend = jiffies; + } else { + cm_packets_sent++; + if (!send_retrans) { + if (close_when_complete) + rem_ref_cm_node(cm_node->cm_core, cm_node); + dev_kfree_skb_any(new_send->skb); + kfree(new_send); + return ret; + } + new_send->timetosend = jiffies + NES_RETRY_TIMEOUT; + 
} + spin_lock_irqsave(&cm_node->retrans_list_lock, flags); + list_add_tail(&new_send->list, &cm_node->retrans_list); + spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags); + } + if (type == NES_TIMER_TYPE_RECV) { + new_send->seq_num = htonl(tcp_hdr(skb)->seq); + new_send->timetosend = jiffies; + spin_lock_irqsave(&cm_node->recv_list_lock, flags); + list_add_tail(&new_send->list, &cm_node->recv_list); + spin_unlock_irqrestore(&cm_node->recv_list_lock, flags); + } + cm_core = cm_node->cm_core; + + was_timer_set = timer_pending(&cm_core->tcp_timer); + + if (!was_timer_set) { + cm_core->tcp_timer.expires = new_send->timetosend; + add_timer(&cm_core->tcp_timer); + } + + return ret; +} + + +/** + * nes_cm_timer_tick + */ +void nes_cm_timer_tick(unsigned long pass) +{ + unsigned long flags, qplockflags; + unsigned long nexttimeout = jiffies + NES_LONG_TIME; + struct iw_cm_id *cm_id; + struct nes_cm_node *cm_node; + struct nes_timer_entry *send_entry, *recv_entry; + struct list_head *list_core, *list_core_temp; + struct list_head *list_node, *list_node_temp; + struct nes_cm_core *cm_core = g_cm_core; + struct nes_qp *nesqp; + struct sk_buff *skb; + u32 settimer = 0; + int ret = NETDEV_TX_OK; + int node_done; + + spin_lock_irqsave(&cm_core->ht_lock, flags); + + list_for_each_safe(list_node, list_core_temp, &cm_core->connected_nodes) { + cm_node = container_of(list_node, struct nes_cm_node, list); + add_ref_cm_node(cm_node); + spin_unlock_irqrestore(&cm_core->ht_lock, flags); + spin_lock_irqsave(&cm_node->recv_list_lock, flags); + list_for_each_safe(list_core, list_node_temp, &cm_node->recv_list) { + recv_entry = container_of(list_core, struct nes_timer_entry, list); + if ((time_after(recv_entry->timetosend, jiffies)) && + (recv_entry->type == NES_TIMER_TYPE_CLOSE)) { + if (nexttimeout > recv_entry->timetosend || !settimer) { + nexttimeout = recv_entry->timetosend; + settimer = 1; + } + continue; + } + list_del(&recv_entry->list); + cm_id = cm_node->cm_id; + 
spin_unlock_irqrestore(&cm_node->recv_list_lock, flags); + if (recv_entry->type == NES_TIMER_TYPE_CLOSE) { + nesqp = (struct nes_qp *)recv_entry->skb; + spin_lock_irqsave(&nesqp->lock, qplockflags); + if (nesqp->cm_id) { + nes_debug(NES_DBG_CM, "QP%u: cm_id = %p, refcount = %d: " + "****** HIT A NES_TIMER_TYPE_CLOSE" + " with something to do!!! ******\n", + nesqp->hwqp.qp_id, cm_id, + atomic_read(&nesqp->refcount)); + nesqp->hw_tcp_state = NES_AEQE_TCP_STATE_CLOSED; + nesqp->last_aeq = NES_AEQE_AEID_RESET_SENT; + nesqp->ibqp_state = IB_QPS_ERR; + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + nes_cm_disconn(nesqp); + } else { + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + nes_debug(NES_DBG_CM, "QP%u: cm_id = %p, refcount = %d:" + " ****** HIT A NES_TIMER_TYPE_CLOSE" + " with nothing to do!!! ******\n", + nesqp->hwqp.qp_id, cm_id, + atomic_read(&nesqp->refcount)); + nes_rem_ref(&nesqp->ibqp); + } + if (cm_id) + cm_id->rem_ref(cm_id); + } + kfree(recv_entry); + spin_lock_irqsave(&cm_node->recv_list_lock, flags); + } + spin_unlock_irqrestore(&cm_node->recv_list_lock, flags); + + spin_lock_irqsave(&cm_node->retrans_list_lock, flags); + node_done = 0; + list_for_each_safe(list_core, list_node_temp, &cm_node->retrans_list) { + if (node_done) { + break; + } + send_entry = container_of(list_core, struct nes_timer_entry, list); + if (time_after(send_entry->timetosend, jiffies)) { + if (cm_node->state != NES_CM_STATE_TSA) { + if ((nexttimeout > send_entry->timetosend) || !settimer) { + nexttimeout = send_entry->timetosend; + settimer = 1; + } + node_done = 1; + continue; + } else { + list_del(&send_entry->list); + skb = send_entry->skb; + spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags); + dev_kfree_skb_any(skb); + kfree(send_entry); + spin_lock_irqsave(&cm_node->retrans_list_lock, flags); + continue; + } + } + if (send_entry->type == NES_TIMER_NODE_CLEANUP) { + list_del(&send_entry->list); + spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags); 
+ kfree(send_entry); + spin_lock_irqsave(&cm_node->retrans_list_lock, flags); + continue; + } + if ((send_entry->seq_num < cm_node->tcp_cntxt.rem_ack_num) || + (cm_node->state == NES_CM_STATE_TSA) || + (cm_node->state == NES_CM_STATE_CLOSED)) { + skb = send_entry->skb; + list_del(&send_entry->list); + spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags); + kfree(send_entry); + dev_kfree_skb_any(skb); + spin_lock_irqsave(&cm_node->retrans_list_lock, flags); + continue; + } + + if (!send_entry->retranscount || !send_entry->retrycount) { + cm_packets_dropped++; + skb = send_entry->skb; + list_del(&send_entry->list); + spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags); + dev_kfree_skb_any(skb); + kfree(send_entry); + if (cm_node->state == NES_CM_STATE_SYN_RCVD) { + /* this node never even generated an indication up to the cm */ + rem_ref_cm_node(cm_core, cm_node); + } else { + cm_node->state = NES_CM_STATE_CLOSED; + create_event(cm_node, NES_CM_EVENT_ABORTED); + } + spin_lock_irqsave(&cm_node->retrans_list_lock, flags); + continue; + } + /* this seems like the correct place, but leave send entry unprotected */ + // spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags); + atomic_inc(&send_entry->skb->users); + cm_packets_retrans++; + nes_debug(NES_DBG_CM, "Retransmitting send_entry %p for node %p," + " jiffies = %lu, time to send = %lu, retranscount = %u, " + "send_entry->seq_num = 0x%08X, cm_node->tcp_cntxt.rem_ack_num = 0x%08X\n", + send_entry, cm_node, jiffies, send_entry->timetosend, send_entry->retranscount, + send_entry->seq_num, cm_node->tcp_cntxt.rem_ack_num); + + spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags); + ret = nes_nic_cm_xmit(send_entry->skb, cm_node->netdev); + if (ret != NETDEV_TX_OK) { + cm_packets_bounced++; + atomic_dec(&send_entry->skb->users); + send_entry->retrycount--; + nexttimeout = jiffies + NES_SHORT_TIME; + settimer = 1; + node_done = 1; + spin_lock_irqsave(&cm_node->retrans_list_lock, flags); + continue; + 
} else { + cm_packets_sent++; + } + spin_lock_irqsave(&cm_node->retrans_list_lock, flags); + list_del(&send_entry->list); + nes_debug(NES_DBG_CM, "Packet Sent: retrans count = %u, retry count = %u.\n", + send_entry->retranscount, send_entry->retrycount); + if (send_entry->send_retrans) { + send_entry->retranscount--; + send_entry->timetosend = jiffies + NES_RETRY_TIMEOUT; + if (nexttimeout > send_entry->timetosend || !settimer) { + nexttimeout = send_entry->timetosend; + settimer = 1; + } + list_add(&send_entry->list, &cm_node->retrans_list); + continue; + } else { + int close_when_complete; + skb = send_entry->skb; + close_when_complete = send_entry->close_when_complete; + spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags); + if (close_when_complete) { + BUG_ON(atomic_read(&cm_node->ref_count) == 1); + rem_ref_cm_node(cm_core, cm_node); + } + dev_kfree_skb_any(skb); + kfree(send_entry); + spin_lock_irqsave(&cm_node->retrans_list_lock, flags); + continue; + } + } + spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags); + + rem_ref_cm_node(cm_core, cm_node); + + spin_lock_irqsave(&cm_core->ht_lock, flags); + if (ret != NETDEV_TX_OK) + break; + } + spin_unlock_irqrestore(&cm_core->ht_lock, flags); + + if (settimer) { + if (!timer_pending(&cm_core->tcp_timer)) { + cm_core->tcp_timer.expires = nexttimeout; + add_timer(&cm_core->tcp_timer); + } + } +} + +
/**
 * send_syn - build a SYN (or SYN/ACK) frame and queue it for transmission
 * @cm_node: connection node the frame is sent for
 * @sendack: non-zero to also set the ACK flag; unless
 *           NES_DRV_OPT_SUPRESS_OPTION_BC is set in nes_drv_opt this also
 *           appends the WRITE0 option followed by two pad bytes
 *
 * Returns -EINVAL when @cm_node is NULL, -1 when no tx packet could be
 * obtained, otherwise the result of schedule_nes_timer().
 */
int send_syn(struct nes_cm_node *cm_node, u32 sendack)
{
	int ret;
	int flags = SET_SYN;
	struct sk_buff *skb;
	/*
	 * Worst case (sendack path): MSS + window scale + WRITE0 options,
	 * two single-byte pad options and the end-of-options byte, i.e.
	 * three bytes on top of the option structures.  Sizing this with
	 * "+ 1" overflowed the buffer by two bytes on that path.
	 */
	char optionsbuffer[sizeof(struct option_mss) +
			sizeof(struct option_windowscale) +
			sizeof(struct option_base) + 3];
	int optionssize = 0;
	union all_known_options *options;

	if (!cm_node)
		return -EINVAL;

	/* Sending MSS option */
	options = (union all_known_options *)&optionsbuffer[optionssize];
	options->as_mss.optionnum = OPTION_NUMBER_MSS;
	options->as_mss.length = sizeof(struct option_mss);
	options->as_mss.mss = htons(cm_node->tcp_cntxt.mss);
	optionssize += sizeof(struct option_mss);

	/* window scale option */
	options = (union all_known_options *)&optionsbuffer[optionssize];
	options->as_windowscale.optionnum = OPTION_NUMBER_WINDOW_SCALE;
	options->as_windowscale.length = sizeof(struct option_windowscale);
	options->as_windowscale.shiftcount = cm_node->tcp_cntxt.rcv_wscale;
	optionssize += sizeof(struct option_windowscale);

	if (sendack && !(NES_DRV_OPT_SUPRESS_OPTION_BC & nes_drv_opt)) {
		options = (union all_known_options *)&optionsbuffer[optionssize];
		options->as_base.optionnum = OPTION_NUMBER_WRITE0;
		options->as_base.length = sizeof(struct option_base);
		optionssize += sizeof(struct option_base);
		/* we need the size to be a multiple of 4 */
		options = (union all_known_options *)&optionsbuffer[optionssize];
		options->as_end = 1;
		optionssize += 1;
		options = (union all_known_options *)&optionsbuffer[optionssize];
		options->as_end = 1;
		optionssize += 1;
	}

	/* terminate the option list */
	options = (union all_known_options *)&optionsbuffer[optionssize];
	options->as_end = OPTION_NUMBER_END;
	optionssize += 1;

	skb = get_free_pkt(cm_node);
	if (!skb) {
		nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n");
		return -1;
	}

	if (sendack)
		flags |= SET_ACK;

	form_cm_frame(skb, cm_node, optionsbuffer, optionssize, NULL, 0, flags);
	ret = schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 1, 0);

	return ret;
}


/**
 * send_reset - queue a RST/ACK frame for this connection
 * @cm_node: connection node the frame is sent for
 *
 * Takes an extra reference on @cm_node before the frame is scheduled.
 * Returns -1 when no tx packet could be obtained, otherwise the result of
 * schedule_nes_timer().
 */
int send_reset(struct nes_cm_node *cm_node)
{
	int ret;
	struct sk_buff *skb = get_free_pkt(cm_node);
	int flags = SET_RST | SET_ACK;

	if (!skb) {
		nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n");
		return -1;
	}

	add_ref_cm_node(cm_node);
	form_cm_frame(skb, cm_node, NULL, 0, NULL, 0, flags);
	ret = schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 0, 1);

	return ret;
}


+/** + * send_ack + */ +int send_ack(struct nes_cm_node *cm_node) +{ + int ret; + struct sk_buff *skb = get_free_pkt(cm_node); + + if (!skb) { + nes_debug(NES_DBG_CM, "Failed 
to get a Free pkt\n"); + return -1; + } + + form_cm_frame(skb, cm_node, NULL, 0, NULL, 0, SET_ACK); + ret = schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 0, 0); + + return ret; +} + + +/** + * send_fin + */ +int send_fin(struct nes_cm_node *cm_node, struct sk_buff *skb) +{ + int ret; + + /* if we didn't get a frame get one */ + if (!skb) + skb = get_free_pkt(cm_node); + + if (!skb) { + nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n"); + return -1; + } + + form_cm_frame(skb, cm_node, NULL, 0, NULL, 0, SET_ACK | SET_FIN); + ret = schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 1, 0); + + return ret; +} + + +/** + * get_free_pkt + */ +struct sk_buff *get_free_pkt(struct nes_cm_node *cm_node) +{ + struct sk_buff *skb, *new_skb; + + /* check to see if we need to repopulate the free tx pkt queue */ + if (skb_queue_len(&cm_node->cm_core->tx_free_list) < NES_CM_FREE_PKT_LO_WATERMARK) { + while (skb_queue_len(&cm_node->cm_core->tx_free_list) < + cm_node->cm_core->free_tx_pkt_max) { + /* replace the frame we took, we won't get it back */ + new_skb = dev_alloc_skb(cm_node->cm_core->mtu); + BUG_ON(!new_skb); + /* add a replacement frame to the free tx list head */ + skb_queue_head(&cm_node->cm_core->tx_free_list, new_skb); + } + } + + skb = skb_dequeue(&cm_node->cm_core->tx_free_list); + + return skb; +} + + +/** + * make_hashkey - generate hash key from node tuple + */ +static inline int make_hashkey(u16 loc_port, nes_addr_t loc_addr, u16 rem_port, + nes_addr_t rem_addr) +{ + u32 hashkey = 0; + + hashkey = loc_addr + rem_addr + loc_port + rem_port; + hashkey = (hashkey % NES_CM_HASHTABLE_SIZE); + + return hashkey; +} + + +/** + * find_node - find a cm node that matches the reference cm node + */ +static struct nes_cm_node *find_node(struct nes_cm_core *cm_core, + u16 rem_port, nes_addr_t rem_addr, u16 loc_port, nes_addr_t loc_addr) +{ + unsigned long flags; + u32 hashkey; + struct list_head *list_pos; + struct list_head *hte; + struct nes_cm_node *cm_node; + 
+ /* make a hash index key for this packet */ + hashkey = make_hashkey(loc_port, loc_addr, rem_port, rem_addr); + + /* get a handle on the hte */ + hte = &cm_core->connected_nodes; + + nes_debug(NES_DBG_CM, "Searching for an owner node:%x:%x from core %p->%p\n", + loc_addr, loc_port, cm_core, hte); + + /* walk list and find cm_node associated with this session ID */ + spin_lock_irqsave(&cm_core->ht_lock, flags); + list_for_each(list_pos, hte) { + cm_node = container_of(list_pos, struct nes_cm_node, list); + /* compare quad, return node handle if a match */ + nes_debug(NES_DBG_CM, "finding node %x:%x =? %x:%x ^ %x:%x =? %x:%x\n", + cm_node->loc_addr, cm_node->loc_port, + loc_addr, loc_port, + cm_node->rem_addr, cm_node->rem_port, + rem_addr, rem_port); + if ((cm_node->loc_addr == loc_addr) && (cm_node->loc_port == loc_port) && + (cm_node->rem_addr == rem_addr) && (cm_node->rem_port == rem_port)) { + add_ref_cm_node(cm_node); + spin_unlock_irqrestore(&cm_core->ht_lock, flags); + return cm_node; + } + } + spin_unlock_irqrestore(&cm_core->ht_lock, flags); + + /* no owner node */ + return NULL; +} + + +/** + * find_listener - find a cm node listening on this addr-port pair + */ +static struct nes_cm_listener *find_listener(struct nes_cm_core *cm_core, + nes_addr_t dst_addr, u16 dst_port, enum nes_cm_listener_state listener_state) +{ + unsigned long flags; + struct list_head *listen_list; + struct nes_cm_listener *listen_node; + + /* walk list and find cm_node associated with this session ID */ + spin_lock_irqsave(&cm_core->listen_list_lock, flags); + list_for_each(listen_list, &cm_core->listen_list.list) { + listen_node = container_of(listen_list, struct nes_cm_listener, list); + /* compare node pair, return node handle if a match */ + if (((listen_node->loc_addr == dst_addr) || + listen_node->loc_addr == 0x00000000) && + (listen_node->loc_port == dst_port) && + (listener_state & listen_node->listener_state)) { + atomic_inc(&listen_node->ref_count); + 
spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); + return listen_node; + } + } + spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); + + nes_debug(NES_DBG_CM, "Unable to find listener- %x:%x\n", + dst_addr, dst_port); + + /* no listener */ + return NULL; +} + + +/** + * add_hte_node - add a cm node to the hash table + */ +static int add_hte_node(struct nes_cm_core *cm_core, struct nes_cm_node *cm_node) +{ + unsigned long flags; + u32 hashkey; + struct list_head *hte; + + if (!cm_node || !cm_core) + return -EINVAL; + + nes_debug(NES_DBG_CM, "Adding Node to Active Connection HT\n"); + + /* first, make an index into our hash table */ + hashkey = make_hashkey(cm_node->loc_port, cm_node->loc_addr, + cm_node->rem_port, cm_node->rem_addr); + cm_node->hashkey = hashkey; + + spin_lock_irqsave(&cm_core->ht_lock, flags); + + /* get a handle on the hash table element (list head for this slot) */ + hte = &cm_core->connected_nodes; + list_add_tail(&cm_node->list, hte); + atomic_inc(&cm_core->ht_node_cnt); + + spin_unlock_irqrestore(&cm_core->ht_lock, flags); + + return 0; +} + + +/** + * mini_cm_dec_refcnt_listen + */ +static int mini_cm_dec_refcnt_listen(struct nes_cm_core *cm_core, + struct nes_cm_listener *listener, int free_hanging_nodes) +{ + int ret = 1; + unsigned long flags; + spin_lock_irqsave(&cm_core->listen_list_lock, flags); + if (!atomic_dec_return(&listener->ref_count)) { + list_del(&listener->list); + + /* decrement our listen node count */ + atomic_dec(&cm_core->listen_node_cnt); + + spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); + + if (listener->nesvnic) { + nes_manage_apbvt(listener->nesvnic, listener->loc_port, + PCI_FUNC(listener->nesvnic->nesdev->pcidev->devfn), NES_MANAGE_APBVT_DEL); + } + + nes_debug(NES_DBG_CM, "destroying listener (%p)\n", listener); + + kfree(listener); + ret = 0; + cm_listens_destroyed++; + } else { + spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); + } + if (listener) { + if 
(atomic_read(&listener->pend_accepts_cnt) > 0) + nes_debug(NES_DBG_CM, "destroying listener (%p)" + " with non-zero pending accepts=%u\n", + listener, atomic_read(&listener->pend_accepts_cnt)); + } + + return ret; +} + + +/** + * mini_cm_del_listen + */ +static int mini_cm_del_listen(struct nes_cm_core *cm_core, + struct nes_cm_listener *listener) +{ + listener->listener_state = NES_CM_LISTENER_PASSIVE_STATE; + listener->cm_id = NULL; /* going to be destroyed pretty soon */ + return mini_cm_dec_refcnt_listen(cm_core, listener, 1); +} + + +/** + * mini_cm_accelerated + */ +static inline int mini_cm_accelerated(struct nes_cm_core *cm_core, + struct nes_cm_node *cm_node) +{ + u32 was_timer_set; + cm_node->accelerated = 1; + + if (cm_node->accept_pend) { + BUG_ON(!cm_node->listener); + atomic_dec(&cm_node->listener->pend_accepts_cnt); + BUG_ON(atomic_read(&cm_node->listener->pend_accepts_cnt) < 0); + } + + was_timer_set = timer_pending(&cm_core->tcp_timer); + if (!was_timer_set) { + cm_core->tcp_timer.expires = jiffies + NES_SHORT_TIME; + add_timer(&cm_core->tcp_timer); + } + + return 0; +} + + +/** + * nes_addr_send_arp + */ +static void nes_addr_send_arp(u32 dst_ip) +{ + struct rtable *rt; + struct flowi fl; + + memset(&fl, 0, sizeof fl); + fl.nl_u.ip4_u.daddr = htonl(dst_ip); + if (ip_route_output_key(&init_net, &rt, &fl)) { + printk("%s: ip_route_output_key failed for 0x%08X\n", + __FUNCTION__, dst_ip); + return; + } + + neigh_event_send(rt->u.dst.neighbour, NULL); + ip_rt_put(rt); +} + + +/** + * make_cm_node - create a new instance of a cm node + */ +static struct nes_cm_node *make_cm_node(struct nes_cm_core *cm_core, + struct nes_vnic *nesvnic, struct nes_cm_info *cm_info, + struct nes_cm_listener *listener) +{ + struct nes_cm_node *cm_node; + struct timespec ts; + int arpindex = 0; + struct nes_device *nesdev; + struct nes_adapter *nesadapter; + + /* create an hte and cm_node for this instance */ + cm_node = kzalloc(sizeof(*cm_node), GFP_ATOMIC); + if 
(!cm_node) + return NULL; + + /* set our node specific transport info */ + cm_node->loc_addr = cm_info->loc_addr; + cm_node->rem_addr = cm_info->rem_addr; + cm_node->loc_port = cm_info->loc_port; + cm_node->rem_port = cm_info->rem_port; + cm_node->send_write0 = send_first; + nes_debug(NES_DBG_CM, "Make node addresses : loc = %x:%x, rem = %x:%x\n", + cm_node->loc_addr, cm_node->loc_port, cm_node->rem_addr, cm_node->rem_port); + cm_node->listener = listener; + cm_node->netdev = nesvnic->netdev; + cm_node->cm_id = cm_info->cm_id; + memcpy(cm_node->loc_mac, nesvnic->netdev->dev_addr, ETH_ALEN); + + nes_debug(NES_DBG_CM, "listener=%p, cm_id=%p\n", + cm_node->listener, cm_node->cm_id); + + INIT_LIST_HEAD(&cm_node->retrans_list); + spin_lock_init(&cm_node->retrans_list_lock); + INIT_LIST_HEAD(&cm_node->recv_list); + spin_lock_init(&cm_node->recv_list_lock); + + cm_node->loopbackpartner = NULL; + atomic_set(&cm_node->ref_count, 1); + /* associate our parent CM core */ + cm_node->cm_core = cm_core; + cm_node->tcp_cntxt.loc_id = NES_CM_DEF_LOCAL_ID; + cm_node->tcp_cntxt.rcv_wscale = NES_CM_DEFAULT_RCV_WND_SCALE; + cm_node->tcp_cntxt.rcv_wnd = NES_CM_DEFAULT_RCV_WND_SCALED >> + NES_CM_DEFAULT_RCV_WND_SCALE; + ts = current_kernel_time(); + cm_node->tcp_cntxt.loc_seq_num = htonl(ts.tv_nsec); + cm_node->tcp_cntxt.mss = nesvnic->max_frame_size - sizeof(struct iphdr) - + sizeof(struct tcphdr) - ETH_HLEN; + cm_node->tcp_cntxt.rcv_nxt = 0; + /* get a unique session ID , add thread_id to an upcounter to handle race */ + atomic_inc(&cm_core->node_cnt); + atomic_inc(&cm_core->session_id); + cm_node->session_id = (u32)(atomic_read(&cm_core->session_id) + current->tgid); + cm_node->conn_type = cm_info->conn_type; + cm_node->apbvt_set = 0; + cm_node->accept_pend = 0; + + cm_node->nesvnic = nesvnic; + /* get some device handles, for arp lookup */ + nesdev = nesvnic->nesdev; + nesadapter = nesdev->nesadapter; + + cm_node->loopbackpartner = NULL; + /* get the mac addr for the remote node */ 
+ arpindex = nes_arp_table(nesdev, cm_node->rem_addr, NULL, NES_ARP_RESOLVE); + if (arpindex < 0) { + kfree(cm_node); + nes_addr_send_arp(cm_info->rem_addr); + return NULL; + } + + /* copy the mac addr to node context */ + memcpy(cm_node->rem_mac, nesadapter->arp_table[arpindex].mac_addr, ETH_ALEN); + nes_debug(NES_DBG_CM, "Remote mac addr from arp table:%02x," + " %02x, %02x, %02x, %02x, %02x\n", + cm_node->rem_mac[0], cm_node->rem_mac[1], + cm_node->rem_mac[2], cm_node->rem_mac[3], + cm_node->rem_mac[4], cm_node->rem_mac[5]); + + add_hte_node(cm_core, cm_node); + atomic_inc(&cm_nodes_created); + + return cm_node; +} + + +/** + * add_ref_cm_node - destroy an instance of a cm node + */ +static int add_ref_cm_node(struct nes_cm_node *cm_node) +{ + atomic_inc(&cm_node->ref_count); + return 0; +} + + +/** + * rem_ref_cm_node - destroy an instance of a cm node + */ +static int rem_ref_cm_node(struct nes_cm_core *cm_core, + struct nes_cm_node *cm_node) +{ + unsigned long flags, qplockflags; + struct nes_timer_entry *send_entry; + struct nes_timer_entry *recv_entry; + struct iw_cm_id *cm_id; + struct list_head *list_core, *list_node_temp; + struct nes_qp *nesqp; + + if (!cm_node) + return -EINVAL; + + spin_lock_irqsave(&cm_node->cm_core->ht_lock, flags); + if (atomic_dec_return(&cm_node->ref_count)) { + spin_unlock_irqrestore(&cm_node->cm_core->ht_lock, flags); + return 0; + } + list_del(&cm_node->list); + atomic_dec(&cm_core->ht_node_cnt); + spin_unlock_irqrestore(&cm_node->cm_core->ht_lock, flags); + + /* if the node is destroyed before connection was accelerated */ + if (!cm_node->accelerated && cm_node->accept_pend) { + BUG_ON(!cm_node->listener); + atomic_dec(&cm_node->listener->pend_accepts_cnt); + BUG_ON(atomic_read(&cm_node->listener->pend_accepts_cnt) < 0); + } + + spin_lock_irqsave(&cm_node->retrans_list_lock, flags); + list_for_each_safe(list_core, list_node_temp, &cm_node->retrans_list) { + send_entry = container_of(list_core, struct nes_timer_entry, list); + 
list_del(&send_entry->list); + spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags); + dev_kfree_skb_any(send_entry->skb); + kfree(send_entry); + spin_lock_irqsave(&cm_node->retrans_list_lock, flags); + continue; + } + spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags); + + spin_lock_irqsave(&cm_node->recv_list_lock, flags); + list_for_each_safe(list_core, list_node_temp, &cm_node->recv_list) { + recv_entry = container_of(list_core, struct nes_timer_entry, list); + list_del(&recv_entry->list); + cm_id = cm_node->cm_id; + spin_unlock_irqrestore(&cm_node->recv_list_lock, flags); + if (recv_entry->type == NES_TIMER_TYPE_CLOSE) { + nesqp = (struct nes_qp *)recv_entry->skb; + spin_lock_irqsave(&nesqp->lock, qplockflags); + if (nesqp->cm_id) { + nes_debug(NES_DBG_CM, "QP%u: cm_id = %p: ****** HIT A NES_TIMER_TYPE_CLOSE" + " with something to do!!! ******\n", + nesqp->hwqp.qp_id, cm_id); + nesqp->hw_tcp_state = NES_AEQE_TCP_STATE_CLOSED; + nesqp->last_aeq = NES_AEQE_AEID_RESET_SENT; + nesqp->ibqp_state = IB_QPS_ERR; + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + nes_cm_disconn(nesqp); + } else { + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + nes_debug(NES_DBG_CM, "QP%u: cm_id = %p: ****** HIT A NES_TIMER_TYPE_CLOSE" + " with nothing to do!!! 
******\n", + nesqp->hwqp.qp_id, cm_id); + nes_rem_ref(&nesqp->ibqp); + } + cm_id->rem_ref(cm_id); + } else if (recv_entry->type == NES_TIMER_TYPE_RECV) { + dev_kfree_skb_any(recv_entry->skb); + } + kfree(recv_entry); + spin_lock_irqsave(&cm_node->recv_list_lock, flags); + } + spin_unlock_irqrestore(&cm_node->recv_list_lock, flags); + + if (cm_node->listener) { + mini_cm_dec_refcnt_listen(cm_core, cm_node->listener, 0); + } else { + if (cm_node->apbvt_set && cm_node->nesvnic) { + nes_manage_apbvt(cm_node->nesvnic, cm_node->loc_port, + PCI_FUNC(cm_node->nesvnic->nesdev->pcidev->devfn), + NES_MANAGE_APBVT_DEL); + } + } + + kfree(cm_node); + atomic_dec(&cm_core->node_cnt); + atomic_inc(&cm_nodes_destroyed); + + return 0; +} + + +/** + * process_options + */ +static int process_options(struct nes_cm_node *cm_node, u8 *optionsloc, u32 optionsize, u32 syn_packet) +{ + u32 tmp; + u32 offset = 0; + union all_known_options *all_options; + char got_mss_option = 0; + + while (offset < optionsize) { + all_options = (union all_known_options *)(optionsloc + offset); + switch (all_options->as_base.optionnum) { + case OPTION_NUMBER_END: + offset = optionsize; + break; + case OPTION_NUMBER_NONE: + offset += 1; + continue; + case OPTION_NUMBER_MSS: + nes_debug(NES_DBG_CM, "%s: MSS Length: %d Offset: %d Size: %d\n", + __FUNCTION__, + all_options->as_mss.length, offset, optionsize); + got_mss_option = 1; + if (all_options->as_mss.length != 4) { + return 1; + } else { + tmp = ntohs(all_options->as_mss.mss); + if (tmp > 0 && tmp < cm_node->tcp_cntxt.mss) + cm_node->tcp_cntxt.mss = tmp; + } + break; + case OPTION_NUMBER_WINDOW_SCALE: + cm_node->tcp_cntxt.snd_wscale = all_options->as_windowscale.shiftcount; + break; + case OPTION_NUMBER_WRITE0: + cm_node->send_write0 = 1; + break; + default: + nes_debug(NES_DBG_CM, "TCP Option not understood: %x\n", + all_options->as_base.optionnum); + break; + } + offset += all_options->as_base.length; + } + if ((!got_mss_option) && (syn_packet)) + 
cm_node->tcp_cntxt.mss = NES_CM_DEFAULT_MSS; + return 0; +} + + +/** + * process_packet + */ +int process_packet(struct nes_cm_node *cm_node, struct sk_buff *skb, + struct nes_cm_core *cm_core) +{ + int optionsize; + int datasize; + int ret = 0; + struct tcphdr *tcph = tcp_hdr(skb); + u32 inc_sequence; + if (cm_node->state == NES_CM_STATE_SYN_SENT && tcph->syn) { + inc_sequence = ntohl(tcph->seq); + cm_node->tcp_cntxt.rcv_nxt = inc_sequence; + } + + if ((!tcph) || (cm_node->state == NES_CM_STATE_TSA)) { + BUG_ON(!tcph); + atomic_inc(&cm_accel_dropped_pkts); + return -1; + } + + if (tcph->rst) { + atomic_inc(&cm_resets_recvd); + nes_debug(NES_DBG_CM, "Received Reset, cm_node = %p, state = %u. refcnt=%d\n", + cm_node, cm_node->state, atomic_read(&cm_node->ref_count)); + switch (cm_node->state) { + case NES_CM_STATE_LISTENING: + rem_ref_cm_node(cm_core, cm_node); + break; + case NES_CM_STATE_TSA: + case NES_CM_STATE_CLOSED: + break; + case NES_CM_STATE_SYN_RCVD: + nes_debug(NES_DBG_CM, "Received a reset for local 0x%08X:%04X," + " remote 0x%08X:%04X, node state = %u\n", + cm_node->loc_addr, cm_node->loc_port, + cm_node->rem_addr, cm_node->rem_port, + cm_node->state); + rem_ref_cm_node(cm_core, cm_node); + break; + case NES_CM_STATE_ONE_SIDE_ESTABLISHED: + case NES_CM_STATE_ESTABLISHED: + case NES_CM_STATE_MPAREQ_SENT: + default: + nes_debug(NES_DBG_CM, "Received a reset for local 0x%08X:%04X," + " remote 0x%08X:%04X, node state = %u refcnt=%d\n", + cm_node->loc_addr, cm_node->loc_port, + cm_node->rem_addr, cm_node->rem_port, + cm_node->state, atomic_read(&cm_node->ref_count)); + // create event + cm_node->state = NES_CM_STATE_CLOSED; + + create_event(cm_node, NES_CM_EVENT_ABORTED); + break; + + } + return -1; + } + + optionsize = (tcph->doff << 2) - sizeof(struct tcphdr); + + skb_pull(skb, ip_hdr(skb)->ihl << 2); + skb_pull(skb, tcph->doff << 2); + + datasize = skb->len; + inc_sequence = ntohl(tcph->seq); + nes_debug(NES_DBG_CM, "datasize = %u, sequence = 0x%08X, 
ack_seq = 0x%08X," + " rcv_nxt = 0x%08X Flags: %s %s.\n", + datasize, inc_sequence, ntohl(tcph->ack_seq), + cm_node->tcp_cntxt.rcv_nxt, (tcph->syn ? "SYN":""), + (tcph->ack ? "ACK":"")); + + if (!tcph->syn && (inc_sequence != cm_node->tcp_cntxt.rcv_nxt) + ) { + nes_debug(NES_DBG_CM, "dropping packet, datasize = %u, sequence = 0x%08X," + " ack_seq = 0x%08X, rcv_nxt = 0x%08X Flags: %s.\n", + datasize, inc_sequence, ntohl(tcph->ack_seq), + cm_node->tcp_cntxt.rcv_nxt, (tcph->ack ? "ACK":"")); + if (cm_node->state == NES_CM_STATE_LISTENING) { + rem_ref_cm_node(cm_core, cm_node); + } + return -1; + } + + cm_node->tcp_cntxt.rcv_nxt = inc_sequence + datasize; + + + if (optionsize) { + u8 *optionsloc = (u8 *)&tcph[1]; + if (process_options(cm_node, optionsloc, optionsize, (u32)tcph->syn)) { + nes_debug(NES_DBG_CM, "%s: Node %p, Sending RESET\n", __FUNCTION__, cm_node); + send_reset(cm_node); + if (cm_node->state != NES_CM_STATE_SYN_SENT) + rem_ref_cm_node(cm_core, cm_node); + return 0; + } + } else if (tcph->syn) + cm_node->tcp_cntxt.mss = NES_CM_DEFAULT_MSS; + + cm_node->tcp_cntxt.snd_wnd = ntohs(tcph->window) << + cm_node->tcp_cntxt.snd_wscale; + + if (cm_node->tcp_cntxt.snd_wnd > cm_node->tcp_cntxt.max_snd_wnd) { + cm_node->tcp_cntxt.max_snd_wnd = cm_node->tcp_cntxt.snd_wnd; + } + + if (tcph->ack) { + cm_node->tcp_cntxt.rem_ack_num = ntohl(tcph->ack_seq); + switch (cm_node->state) { + case NES_CM_STATE_SYN_RCVD: + case NES_CM_STATE_SYN_SENT: + /* read and stash current sequence number */ + if (cm_node->tcp_cntxt.rem_ack_num != cm_node->tcp_cntxt.loc_seq_num) { + nes_debug(NES_DBG_CM, "ERROR - cm_node->tcp_cntxt.rem_ack_num !=" + " cm_node->tcp_cntxt.loc_seq_num\n"); + send_reset(cm_node); + return 0; + } + if (cm_node->state == NES_CM_STATE_SYN_SENT) + cm_node->state = NES_CM_STATE_ONE_SIDE_ESTABLISHED; + else { + cm_node->state = NES_CM_STATE_ESTABLISHED; + } + break; + case NES_CM_STATE_LAST_ACK: + cm_node->state = NES_CM_STATE_CLOSED; + break; + case 
NES_CM_STATE_FIN_WAIT1: + cm_node->state = NES_CM_STATE_FIN_WAIT2; + break; + case NES_CM_STATE_CLOSING: + cm_node->state = NES_CM_STATE_TIME_WAIT; + /* need to schedule this to happen in 2MSL timeouts */ + cm_node->state = NES_CM_STATE_CLOSED; + break; + case NES_CM_STATE_ONE_SIDE_ESTABLISHED: + case NES_CM_STATE_ESTABLISHED: + case NES_CM_STATE_MPAREQ_SENT: + case NES_CM_STATE_CLOSE_WAIT: + case NES_CM_STATE_TIME_WAIT: + case NES_CM_STATE_CLOSED: + break; + case NES_CM_STATE_LISTENING: + nes_debug(NES_DBG_CM, "Received an ACK on a listening port (SYN %d)\n", tcph->syn); + cm_node->tcp_cntxt.loc_seq_num = ntohl(tcph->ack_seq); + send_reset(cm_node); + /* send_reset bumps refcount, this should have been a new node */ + rem_ref_cm_node(cm_core, cm_node); + return -1; + break; + case NES_CM_STATE_TSA: + nes_debug(NES_DBG_CM, "Received a packet with the ack bit set while in TSA state\n"); + break; + case NES_CM_STATE_UNKNOWN: + case NES_CM_STATE_INITED: + case NES_CM_STATE_ACCEPTING: + case NES_CM_STATE_FIN_WAIT2: + default: + nes_debug(NES_DBG_CM, "Received ack from unknown state: %x\n", + cm_node->state); + send_reset(cm_node); + break; + } + } + + if (tcph->syn) { + if (cm_node->state == NES_CM_STATE_LISTENING) { + /* do not exceed backlog */ + atomic_inc(&cm_node->listener->pend_accepts_cnt); + if (atomic_read(&cm_node->listener->pend_accepts_cnt) > + cm_node->listener->backlog) { + nes_debug(NES_DBG_CM, "drop syn due to backlog pressure \n"); + cm_backlog_drops++; + atomic_dec(&cm_node->listener->pend_accepts_cnt); + rem_ref_cm_node(cm_core, cm_node); + return 0; + } + cm_node->accept_pend = 1; + + } + if (datasize == 0) + cm_node->tcp_cntxt.rcv_nxt ++; + + if (cm_node->state == NES_CM_STATE_LISTENING) { + cm_node->state = NES_CM_STATE_SYN_RCVD; + send_syn(cm_node, 1); + } + if (cm_node->state == NES_CM_STATE_ONE_SIDE_ESTABLISHED) { + cm_node->state = NES_CM_STATE_ESTABLISHED; + /* send final handshake ACK */ + ret = send_ack(cm_node); + if (ret < 0) + return 
ret; + + cm_node->state = NES_CM_STATE_MPAREQ_SENT; + ret = send_mpa_request(cm_node); + if (ret < 0) + return ret; + } + } + + if (tcph->fin) { + cm_node->tcp_cntxt.rcv_nxt++; + switch (cm_node->state) { + case NES_CM_STATE_SYN_RCVD: + case NES_CM_STATE_SYN_SENT: + case NES_CM_STATE_ONE_SIDE_ESTABLISHED: + case NES_CM_STATE_ESTABLISHED: + case NES_CM_STATE_ACCEPTING: + case NES_CM_STATE_MPAREQ_SENT: + cm_node->state = NES_CM_STATE_CLOSE_WAIT; + cm_node->state = NES_CM_STATE_LAST_ACK; + ret = send_fin(cm_node, NULL); + break; + case NES_CM_STATE_FIN_WAIT1: + cm_node->state = NES_CM_STATE_CLOSING; + ret = send_ack(cm_node); + break; + case NES_CM_STATE_FIN_WAIT2: + cm_node->state = NES_CM_STATE_TIME_WAIT; + cm_node->tcp_cntxt.loc_seq_num ++; + ret = send_ack(cm_node); + /* need to schedule this to happen in 2MSL timeouts */ + cm_node->state = NES_CM_STATE_CLOSED; + break; + case NES_CM_STATE_CLOSE_WAIT: + case NES_CM_STATE_LAST_ACK: + case NES_CM_STATE_CLOSING: + case NES_CM_STATE_TSA: + default: + nes_debug(NES_DBG_CM, "Received a fin while in %x state\n", + cm_node->state); + ret = -EINVAL; + break; + } + } + + if (datasize) { + u8 *dataloc = skb->data; + /* figure out what state we are in and handle transition to next state */ + switch (cm_node->state) { + case NES_CM_STATE_LISTENING: + case NES_CM_STATE_SYN_RCVD: + case NES_CM_STATE_SYN_SENT: + case NES_CM_STATE_FIN_WAIT1: + case NES_CM_STATE_FIN_WAIT2: + case NES_CM_STATE_CLOSE_WAIT: + case NES_CM_STATE_LAST_ACK: + case NES_CM_STATE_CLOSING: + break; + case NES_CM_STATE_MPAREQ_SENT: + /* recv the mpa res frame, ret=frame len (incl priv data) */ + ret = parse_mpa(cm_node, dataloc, datasize); + if (ret < 0) + break; + /* set the req frame payload len in skb */ + /* we are done handling this state, set node to a TSA state */ + cm_node->state = NES_CM_STATE_TSA; + send_ack(cm_node); + create_event(cm_node, NES_CM_EVENT_CONNECTED); + break; + + case NES_CM_STATE_ESTABLISHED: + /* we are expecting an MPA req frame */ 
+ ret = parse_mpa(cm_node, dataloc, datasize); + if (ret < 0) { + break; + } + cm_node->state = NES_CM_STATE_TSA; + send_ack(cm_node); + /* we got a valid MPA request, create an event */ + create_event(cm_node, NES_CM_EVENT_MPA_REQ); + break; + case NES_CM_STATE_TSA: + handle_exception_pkt(cm_node, skb); + break; + case NES_CM_STATE_UNKNOWN: + case NES_CM_STATE_INITED: + default: + ret = -1; + } + } + + return ret; +} + +
/**
 * mini_cm_listen - create a listen node with params
 * @cm_core: core the listener is registered with
 * @nesvnic: vnic the listener is bound to
 * @cm_info: local address/port (converted with htonl()/htons() here), cm_id,
 *           connection type and backlog
 *
 * A listener that already exists in the passive state is reused and made
 * active again; otherwise a new one is allocated and added to the listen
 * list.  Returns NULL when an active listener already exists for the
 * address/port or when the allocation fails.
 */
static struct nes_cm_listener *mini_cm_listen(struct nes_cm_core *cm_core,
		struct nes_vnic *nesvnic, struct nes_cm_info *cm_info)
{
	struct nes_cm_listener *listener;
	unsigned long flags;

	nes_debug(NES_DBG_CM, "Search for 0x%08x : 0x%04x\n",
			cm_info->loc_addr, cm_info->loc_port);

	/* cannot have multiple matching listeners */
	listener = find_listener(cm_core, htonl(cm_info->loc_addr),
			htons(cm_info->loc_port), NES_CM_LISTENER_EITHER_STATE);
	if (listener && listener->listener_state == NES_CM_LISTENER_ACTIVE_STATE) {
		/* find_listener() took a reference on the match; give it back */
		atomic_dec(&listener->ref_count);
		nes_debug(NES_DBG_CM, "Not creating listener since it already exists\n");
		return NULL;
	}

	if (!listener) {
		/* create a CM listen node (1/2 node to compare incoming traffic to) */
		listener = kzalloc(sizeof(*listener), GFP_ATOMIC);
		if (!listener) {
			nes_debug(NES_DBG_CM, "Not creating listener memory allocation failed\n");
			return NULL;
		}

		/* kzalloc() already zeroed the structure */
		listener->loc_addr = htonl(cm_info->loc_addr);
		listener->loc_port = htons(cm_info->loc_port);
		listener->reused_node = 0;

		atomic_set(&listener->ref_count, 1);
	} else {
		/* passive case: find_listener() already took the reference */
		listener->reused_node = 1;
	}

	listener->cm_id = cm_info->cm_id;
	atomic_set(&listener->pend_accepts_cnt, 0);
	listener->cm_core = cm_core;
	listener->nesvnic = nesvnic;
	atomic_inc(&cm_core->node_cnt);
	atomic_inc(&cm_core->session_id);

	listener->session_id = (u32)(atomic_read(&cm_core->session_id) + current->tgid);
	listener->conn_type = cm_info->conn_type;
	listener->backlog = cm_info->backlog;
	listener->listener_state = NES_CM_LISTENER_ACTIVE_STATE;

	if (!listener->reused_node) {
		spin_lock_irqsave(&cm_core->listen_list_lock, flags);
		list_add(&listener->list, &cm_core->listen_list.list);
		spin_unlock_irqrestore(&cm_core->listen_list_lock, flags);
		atomic_inc(&cm_core->listen_node_cnt);
	}

	nes_debug(NES_DBG_CM, "Api - listen(): addr=0x%08X, port=0x%04x,"
			" listener = %p, backlog = %d, cm_id = %p.\n",
			cm_info->loc_addr, cm_info->loc_port,
			listener, listener->backlog, listener->cm_id);

	return listener;
}


/**
 * mini_cm_connect - make a connection node with params
 * @cm_core: core the new node is created in
 * @nesvnic: vnic the connection uses
 * @mpa_frame: MPA frame header followed by priv_data_len bytes of private data
 * @cm_info: connection tuple; its addresses and ports are converted in place
 *           with htonl()/htons()
 *
 * When local and remote address are equal the connection is short-circuited
 * to a local active listener: a partner node is created, both nodes are
 * cross-linked and an MPA request event is raised on the partner.  If no
 * such listener exists an ABORTED event is raised on the new node instead.
 * Otherwise the MPA frame is stored on the node and a SYN is sent.
 *
 * Returns the new node, or NULL when a node could not be created.
 */
struct nes_cm_node *mini_cm_connect(struct nes_cm_core *cm_core,
		struct nes_vnic *nesvnic, struct ietf_mpa_frame *mpa_frame,
		struct nes_cm_info *cm_info)
{
	int ret = 0;
	struct nes_cm_node *cm_node;
	struct nes_cm_listener *loopbackremotelistener;
	struct nes_cm_node *loopbackremotenode;
	struct nes_cm_info loopback_cm_info;
	u16 priv_data_len = ntohs(mpa_frame->priv_data_len);
	u16 mpa_frame_size = sizeof(struct ietf_mpa_frame) + priv_data_len;

	cm_info->loc_addr = htonl(cm_info->loc_addr);
	cm_info->rem_addr = htonl(cm_info->rem_addr);
	cm_info->loc_port = htons(cm_info->loc_port);
	cm_info->rem_port = htons(cm_info->rem_port);

	/* create a CM connection node */
	cm_node = make_cm_node(cm_core, nesvnic, cm_info, NULL);
	if (!cm_node)
		return NULL;

	/* set our node side to client (active) side */
	cm_node->tcp_cntxt.client = 1;
	cm_node->tcp_cntxt.rcv_wscale = NES_CM_DEFAULT_RCV_WND_SCALE;

	if (cm_info->loc_addr == cm_info->rem_addr) {
		loopbackremotelistener = find_listener(cm_core, cm_node->rem_addr,
				cm_node->rem_port, NES_CM_LISTENER_ACTIVE_STATE);
		if (loopbackremotelistener == NULL) {
			create_event(cm_node, NES_CM_EVENT_ABORTED);
		} else {
			atomic_inc(&cm_loopbacks);
			loopback_cm_info = *cm_info;
			loopback_cm_info.loc_port = cm_info->rem_port;
			loopback_cm_info.rem_port = cm_info->loc_port;
			loopback_cm_info.cm_id = loopbackremotelistener->cm_id;
			loopbackremotenode = make_cm_node(cm_core, nesvnic, &loopback_cm_info,
					loopbackremotelistener);
			if (!loopbackremotenode) {
				/*
				 * No partner node: give back the listener
				 * reference find_listener() took and release
				 * the half-built connection.
				 */
				mini_cm_dec_refcnt_listen(cm_core, loopbackremotelistener, 0);
				rem_ref_cm_node(cm_core, cm_node);
				return NULL;
			}
			loopbackremotenode->loopbackpartner = cm_node;
			loopbackremotenode->tcp_cntxt.rcv_wscale = NES_CM_DEFAULT_RCV_WND_SCALE;
			cm_node->loopbackpartner = loopbackremotenode;
			/* only the private data follows the header; copy exactly that */
			memcpy(loopbackremotenode->mpa_frame_buf, &mpa_frame->priv_data,
					priv_data_len);
			loopbackremotenode->mpa_frame_size = priv_data_len;

			/* we are done handling this state, set node to a TSA state */
			cm_node->state = NES_CM_STATE_TSA;
			cm_node->tcp_cntxt.rcv_nxt = loopbackremotenode->tcp_cntxt.loc_seq_num;
			loopbackremotenode->tcp_cntxt.rcv_nxt = cm_node->tcp_cntxt.loc_seq_num;
			cm_node->tcp_cntxt.max_snd_wnd = loopbackremotenode->tcp_cntxt.rcv_wnd;
			loopbackremotenode->tcp_cntxt.max_snd_wnd = cm_node->tcp_cntxt.rcv_wnd;
			cm_node->tcp_cntxt.snd_wnd = loopbackremotenode->tcp_cntxt.rcv_wnd;
			loopbackremotenode->tcp_cntxt.snd_wnd = cm_node->tcp_cntxt.rcv_wnd;
			cm_node->tcp_cntxt.snd_wscale = loopbackremotenode->tcp_cntxt.rcv_wscale;
			loopbackremotenode->tcp_cntxt.snd_wscale = cm_node->tcp_cntxt.rcv_wscale;

			create_event(loopbackremotenode, NES_CM_EVENT_MPA_REQ);
		}
		return cm_node;
	}

	/* set our node side to client (active) side */
	cm_node->tcp_cntxt.client = 1;
	/* init our MPA frame ptr */
	memcpy(&cm_node->mpa_frame, mpa_frame, mpa_frame_size);
	cm_node->mpa_frame_size = mpa_frame_size;

	/* send a syn and goto syn sent state */
	cm_node->state = NES_CM_STATE_SYN_SENT;
	ret = send_syn(cm_node, 0);

	nes_debug(NES_DBG_CM, "Api - connect(): dest addr=0x%08X, port=0x%04x,"
			" cm_node=%p, cm_id = %p.\n",
			cm_node->rem_addr, cm_node->rem_port, cm_node, cm_node->cm_id);

	return cm_node;
}


/**
 * mini_cm_accept - accept a connection
 * This function is never called
 */
int mini_cm_accept(struct nes_cm_core *cm_core, struct ietf_mpa_frame *mpa_frame,
		struct nes_cm_node *cm_node)
{
	return 0;
}


/**
 * mini_cm_reject - reject and teardown a connection
 * @cm_core: not used by this function
 * @mpa_frame: MPA frame (header plus priv_data_len bytes) sent as the reply
 * @cm_node: connection being rejected
 *
 * Queues the MPA frame with FIN/ACK set, moves the node to the CLOSED state
 * and queues a further FIN.  Returns -1 when no tx packet is available for
 * the reply, otherwise the result of send_fin().
 */
int mini_cm_reject(struct nes_cm_core *cm_core,
		struct ietf_mpa_frame *mpa_frame,
		struct nes_cm_node *cm_node)
{
	int ret = 0;
	struct sk_buff *skb;
	u16 mpa_frame_size = sizeof(struct ietf_mpa_frame) +
			ntohs(mpa_frame->priv_data_len);

	skb = get_free_pkt(cm_node);
	if (!skb) {
		nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n");
		return -1;
	}

	/* send the MPA reply (reject) frame */
	form_cm_frame(skb, cm_node, NULL, 0, mpa_frame, mpa_frame_size, SET_ACK | SET_FIN);
	ret = schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 1, 0);

	cm_node->state = NES_CM_STATE_CLOSED;
	/* the result of the FIN below is what gets reported to the caller */
	ret = send_fin(cm_node, NULL);

	if (ret < 0) {
		printk(KERN_INFO PFX "failed to send MPA Reply (reject)\n");
		return ret;
	}

	return ret;
}


+/** + * mini_cm_close + */ +int mini_cm_close(struct nes_cm_core *cm_core, 
struct nes_cm_node *cm_node) +{ + int ret = 0; + + if (!cm_core || !cm_node) + return -EINVAL; + + switch (cm_node->state) { + /* if passed in node is null, create a reference key node for node search */ + /* check if we found an owner node for this pkt */ + case NES_CM_STATE_SYN_RCVD: + case NES_CM_STATE_SYN_SENT: + case NES_CM_STATE_ONE_SIDE_ESTABLISHED: + case NES_CM_STATE_ESTABLISHED: + case NES_CM_STATE_ACCEPTING: + case NES_CM_STATE_MPAREQ_SENT: + cm_node->state = NES_CM_STATE_FIN_WAIT1; + send_fin(cm_node, NULL); + break; + case NES_CM_STATE_CLOSE_WAIT: + cm_node->state = NES_CM_STATE_LAST_ACK; + send_fin(cm_node, NULL); + break; + case NES_CM_STATE_FIN_WAIT1: + case NES_CM_STATE_FIN_WAIT2: + case NES_CM_STATE_LAST_ACK: + case NES_CM_STATE_TIME_WAIT: + case NES_CM_STATE_CLOSING: + ret = -1; + break; + case NES_CM_STATE_LISTENING: + case NES_CM_STATE_UNKNOWN: + case NES_CM_STATE_INITED: + case NES_CM_STATE_CLOSED: + case NES_CM_STATE_TSA: + ret = rem_ref_cm_node(cm_core, cm_node); + break; + } + cm_node->cm_id = NULL; + return ret; +} + + +/** + * recv_pkt - recv an ETHERNET packet, and process it through CM + * node state machine + */ +int mini_cm_recv_pkt(struct nes_cm_core *cm_core, struct nes_vnic *nesvnic, + struct sk_buff *skb) +{ + struct nes_cm_node *cm_node = NULL; + struct nes_cm_listener *listener = NULL; + struct iphdr *iph; + struct tcphdr *tcph; + struct nes_cm_info nfo; + int ret = 0; + + if (!skb || skb->len < sizeof(struct iphdr) + sizeof(struct tcphdr)) { + ret = -EINVAL; + goto out; + } + + iph = (struct iphdr *)skb->data; + tcph = (struct tcphdr *)(skb->data + sizeof(struct iphdr)); + skb_reset_network_header(skb); + skb_set_transport_header(skb, sizeof(*tcph)); + skb->len = ntohs(iph->tot_len); + + nfo.loc_addr = ntohl(iph->daddr); + nfo.loc_port = ntohs(tcph->dest); + nfo.rem_addr = ntohl(iph->saddr); + nfo.rem_port = ntohs(tcph->source); + + nes_debug(NES_DBG_CM, "Received packet: dest=0x%08X:0x%04X src=0x%08X:0x%04X\n", + iph->daddr, 
tcph->dest, iph->saddr, tcph->source); + + /* note: this call is going to increment cm_node ref count */ + cm_node = find_node(cm_core, + nfo.rem_port, nfo.rem_addr, + nfo.loc_port, nfo.loc_addr); + + if (!cm_node) { + listener = find_listener(cm_core, nfo.loc_addr, nfo.loc_port, + NES_CM_LISTENER_ACTIVE_STATE); + if (listener) { + nfo.cm_id = listener->cm_id; + nfo.conn_type = listener->conn_type; + } else { + nfo.cm_id = NULL; + nfo.conn_type = 0; + } + + cm_node = make_cm_node(cm_core, nesvnic, &nfo, listener); + if (!cm_node) { + nes_debug(NES_DBG_CM, "Unable to allocate node\n"); + if (listener) { + nes_debug(NES_DBG_CM, "unable to allocate node and decrementing listener refcount\n"); + atomic_dec(&listener->ref_count); + } + ret = -1; + goto out; + } + if (!listener) { + nes_debug(NES_DBG_CM, "Packet found for unknown port %x refcnt=%d\n", + nfo.loc_port, atomic_read(&cm_node->ref_count)); + if (!tcph->rst) { + nes_debug(NES_DBG_CM, "Packet found for unknown port=%d" + " rem_port=%d refcnt=%d\n", + nfo.loc_port, nfo.rem_port, atomic_read(&cm_node->ref_count)); + + cm_node->tcp_cntxt.rcv_nxt = ntohl(tcph->seq); + cm_node->tcp_cntxt.loc_seq_num = ntohl(tcph->ack_seq); + send_reset(cm_node); + } + rem_ref_cm_node(cm_core, cm_node); + ret = -1; + goto out; + } + add_ref_cm_node(cm_node); + cm_node->state = NES_CM_STATE_LISTENING; + } + + nes_debug(NES_DBG_CM, "Processing Packet for node %p, data = (%p):\n", + cm_node, skb->data); + process_packet(cm_node, skb, cm_core); + + rem_ref_cm_node(cm_core, cm_node); + out: + if (skb) + dev_kfree_skb_any(skb); + return ret; +} + + +/** + * nes_cm_alloc_core - allocate a top level instance of a cm core + */ +struct nes_cm_core *nes_cm_alloc_core(void) +{ + int i; + + struct nes_cm_core *cm_core; + struct sk_buff *skb = NULL; + + /* setup the CM core */ + /* alloc top level core control structure */ + cm_core = kzalloc(sizeof(*cm_core), GFP_KERNEL); + if (!cm_core) + return NULL; + + 
INIT_LIST_HEAD(&cm_core->connected_nodes); + init_timer(&cm_core->tcp_timer); + cm_core->tcp_timer.function = nes_cm_timer_tick; + + cm_core->mtu = NES_CM_DEFAULT_MTU; + cm_core->state = NES_CM_STATE_INITED; + cm_core->free_tx_pkt_max = NES_CM_DEFAULT_FREE_PKTS; + + atomic_set(&cm_core->session_id, 0); + atomic_set(&cm_core->events_posted, 0); + + /* init the packet lists */ + skb_queue_head_init(&cm_core->tx_free_list); + + for (i = 0; i < NES_CM_DEFAULT_FRAME_CNT; i++) { + skb = dev_alloc_skb(cm_core->mtu); + if (!skb) { + kfree(cm_core); + return NULL; + } + /* add 'raw' skb to free frame list */ + skb_queue_head(&cm_core->tx_free_list, skb); + } + + cm_core->api = &nes_cm_api; + + spin_lock_init(&cm_core->ht_lock); + spin_lock_init(&cm_core->listen_list_lock); + + INIT_LIST_HEAD(&cm_core->listen_list.list); + + nes_debug(NES_DBG_CM, "Init CM Core completed -- cm_core=%p\n", cm_core); + + nes_debug(NES_DBG_CM, "Enable QUEUE EVENTS\n"); + cm_core->event_wq = create_singlethread_workqueue("nesewq"); + cm_core->post_event = nes_cm_post_event; + nes_debug(NES_DBG_CM, "Enable QUEUE DISCONNECTS\n"); + cm_core->disconn_wq = create_singlethread_workqueue("nesdwq"); + + print_core(cm_core); + return cm_core; +} + + +/** + * mini_cm_dealloc_core - deallocate a top level instance of a cm core + */ +int mini_cm_dealloc_core(struct nes_cm_core *cm_core) +{ + nes_debug(NES_DBG_CM, "De-Alloc CM Core (%p)\n", cm_core); + + if (!cm_core) + return -EINVAL; + + barrier(); + + if (timer_pending(&cm_core->tcp_timer)) { + del_timer(&cm_core->tcp_timer); + } + + destroy_workqueue(cm_core->event_wq); + destroy_workqueue(cm_core->disconn_wq); + nes_debug(NES_DBG_CM, "\n"); + kfree(cm_core); + + return 0; +} + + +/** + * mini_cm_get + */ +int mini_cm_get(struct nes_cm_core *cm_core) +{ + return cm_core->state; +} + + +/** + * mini_cm_set + */ +int mini_cm_set(struct nes_cm_core *cm_core, u32 type, u32 value) +{ + int ret = 0; + + switch (type) { + case NES_CM_SET_PKT_SIZE: + cm_core->mtu 
= value; + break; + case NES_CM_SET_FREE_PKT_Q_SIZE: + cm_core->free_tx_pkt_max = value; + break; + default: + /* unknown set option */ + ret = -EINVAL; + } + + return ret; +} + + +/** + * nes_cm_init_tsa_conn setup HW; MPA frames must be + * successfully exchanged when this is called + */ +static int nes_cm_init_tsa_conn(struct nes_qp *nesqp, struct nes_cm_node *cm_node) +{ + int ret = 0; + + if (!nesqp) + return -EINVAL; + + nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_IPV4 | + NES_QPCONTEXT_MISC_NO_NAGLE | NES_QPCONTEXT_MISC_DO_NOT_FRAG | + NES_QPCONTEXT_MISC_DROS); + + if (cm_node->tcp_cntxt.snd_wscale || cm_node->tcp_cntxt.rcv_wscale) + nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_WSCALE); + + nesqp->nesqp_context->misc2 |= cpu_to_le32(64 << NES_QPCONTEXT_MISC2_TTL_SHIFT); + + nesqp->nesqp_context->mss |= cpu_to_le32(((u32)cm_node->tcp_cntxt.mss) << 16); + + nesqp->nesqp_context->tcp_state_flow_label |= cpu_to_le32( + (u32)NES_QPCONTEXT_TCPSTATE_EST << NES_QPCONTEXT_TCPFLOW_TCP_STATE_SHIFT); + + nesqp->nesqp_context->pd_index_wscale |= cpu_to_le32( + (cm_node->tcp_cntxt.snd_wscale << NES_QPCONTEXT_PDWSCALE_SND_WSCALE_SHIFT) & + NES_QPCONTEXT_PDWSCALE_SND_WSCALE_MASK); + + nesqp->nesqp_context->pd_index_wscale |= cpu_to_le32( + (cm_node->tcp_cntxt.rcv_wscale << NES_QPCONTEXT_PDWSCALE_RCV_WSCALE_SHIFT) & + NES_QPCONTEXT_PDWSCALE_RCV_WSCALE_MASK); + + nesqp->nesqp_context->keepalive = cpu_to_le32(0x80); + nesqp->nesqp_context->ts_recent = 0; + nesqp->nesqp_context->ts_age = 0; + nesqp->nesqp_context->snd_nxt = cpu_to_le32(cm_node->tcp_cntxt.loc_seq_num); + nesqp->nesqp_context->snd_wnd = cpu_to_le32(cm_node->tcp_cntxt.snd_wnd); + nesqp->nesqp_context->rcv_nxt = cpu_to_le32(cm_node->tcp_cntxt.rcv_nxt); + nesqp->nesqp_context->rcv_wnd = cpu_to_le32(cm_node->tcp_cntxt.rcv_wnd << + cm_node->tcp_cntxt.rcv_wscale); + nesqp->nesqp_context->snd_max = cpu_to_le32(cm_node->tcp_cntxt.loc_seq_num); + nesqp->nesqp_context->snd_una = 
cpu_to_le32(cm_node->tcp_cntxt.loc_seq_num); + nesqp->nesqp_context->srtt = 0; + nesqp->nesqp_context->rttvar = cpu_to_le32(0x6); + nesqp->nesqp_context->ssthresh = cpu_to_le32(0x3FFFC000); + nesqp->nesqp_context->cwnd = cpu_to_le32(2*cm_node->tcp_cntxt.mss); + nesqp->nesqp_context->snd_wl1 = cpu_to_le32(cm_node->tcp_cntxt.rcv_nxt); + nesqp->nesqp_context->snd_wl2 = cpu_to_le32(cm_node->tcp_cntxt.loc_seq_num); + nesqp->nesqp_context->max_snd_wnd = cpu_to_le32(cm_node->tcp_cntxt.max_snd_wnd); + + nes_debug(NES_DBG_CM, "QP%u: rcv_nxt = 0x%08X, snd_nxt = 0x%08X," + " Setting MSS to %u, PDWscale = 0x%08X, rcv_wnd = %u, context misc = 0x%08X.\n", + nesqp->hwqp.qp_id, le32_to_cpu(nesqp->nesqp_context->rcv_nxt), + le32_to_cpu(nesqp->nesqp_context->snd_nxt), + cm_node->tcp_cntxt.mss, le32_to_cpu(nesqp->nesqp_context->pd_index_wscale), + le32_to_cpu(nesqp->nesqp_context->rcv_wnd), + le32_to_cpu(nesqp->nesqp_context->misc)); + nes_debug(NES_DBG_CM, " snd_wnd = 0x%08X.\n", le32_to_cpu(nesqp->nesqp_context->snd_wnd)); + nes_debug(NES_DBG_CM, " snd_cwnd = 0x%08X.\n", le32_to_cpu(nesqp->nesqp_context->cwnd)); + nes_debug(NES_DBG_CM, " max_swnd = 0x%08X.\n", le32_to_cpu(nesqp->nesqp_context->max_snd_wnd)); + + nes_debug(NES_DBG_CM, "Change cm_node state to TSA\n"); + cm_node->state = NES_CM_STATE_TSA; + + return ret; +} + + +/** + * nes_cm_disconn + */ +int nes_cm_disconn(struct nes_qp *nesqp) +{ + unsigned long flags; + + spin_lock_irqsave(&nesqp->lock, flags); + if (nesqp->disconn_pending == 0) { + nesqp->disconn_pending++; + spin_unlock_irqrestore(&nesqp->lock, flags); + /* nes_add_ref(&nesqp->ibqp); */ + /* init our disconnect work element, to */ + INIT_WORK(&nesqp->disconn_work, nes_disconnect_worker); + + queue_work(g_cm_core->disconn_wq, &nesqp->disconn_work); + } else { + spin_unlock_irqrestore(&nesqp->lock, flags); + nes_rem_ref(&nesqp->ibqp); + } + + return 0; +} + + +/** + * nes_disconnect_worker + */ +void nes_disconnect_worker(struct work_struct *work) +{ + struct 
nes_qp *nesqp = container_of(work, struct nes_qp, disconn_work); + + nes_debug(NES_DBG_CM, "processing AEQE id 0x%04X for QP%u.\n", + nesqp->last_aeq, nesqp->hwqp.qp_id); + nes_cm_disconn_true(nesqp); +} + + +/** + * nes_cm_disconn_true + */ +int nes_cm_disconn_true(struct nes_qp *nesqp) +{ + unsigned long flags; + int ret = 0; + struct iw_cm_id *cm_id; + struct iw_cm_event cm_event; + struct nes_vnic *nesvnic; + u16 last_ae; + u8 original_hw_tcp_state; + u8 original_ibqp_state; + u8 issued_disconnect_reset = 0; + + if (!nesqp) { + nes_debug(NES_DBG_CM, "disconnect_worker nesqp is NULL\n"); + return -1; + } + + spin_lock_irqsave(&nesqp->lock, flags); + cm_id = nesqp->cm_id; + /* make sure we havent already closed this connection */ + if (!cm_id) { + nes_debug(NES_DBG_CM, "QP%u disconnect_worker cmid is NULL\n", + nesqp->hwqp.qp_id); + spin_unlock_irqrestore(&nesqp->lock, flags); + nes_rem_ref(&nesqp->ibqp); + return -1; + } + + nesvnic = to_nesvnic(nesqp->ibqp.device); + nes_debug(NES_DBG_CM, "Disconnecting QP%u\n", nesqp->hwqp.qp_id); + + original_hw_tcp_state = nesqp->hw_tcp_state; + original_ibqp_state = nesqp->ibqp_state; + last_ae = nesqp->last_aeq; + + + nes_debug(NES_DBG_CM, "set ibqp_state=%u\n", nesqp->ibqp_state); + + if ((nesqp->cm_id) && (cm_id->event_handler)) { + if ((original_hw_tcp_state == NES_AEQE_TCP_STATE_CLOSE_WAIT) || + ((original_ibqp_state == IB_QPS_RTS) && + (last_ae == NES_AEQE_AEID_LLP_CONNECTION_RESET))) { + atomic_inc(&cm_disconnects); + cm_event.event = IW_CM_EVENT_DISCONNECT; + if (last_ae == NES_AEQE_AEID_LLP_CONNECTION_RESET) { + issued_disconnect_reset = 1; + cm_event.status = IW_CM_EVENT_STATUS_RESET; + nes_debug(NES_DBG_CM, "Generating a CM Disconnect Event (status reset) for " + " QP%u, cm_id = %p. 
\n", + nesqp->hwqp.qp_id, cm_id); + } else { + cm_event.status = IW_CM_EVENT_STATUS_OK; + } + + cm_event.local_addr = cm_id->local_addr; + cm_event.remote_addr = cm_id->remote_addr; + cm_event.private_data = NULL; + cm_event.private_data_len = 0; + + nes_debug(NES_DBG_CM, "Generating a CM Disconnect Event for " + " QP%u, SQ Head = %u, SQ Tail = %u. cm_id = %p, refcount = %u.\n", + nesqp->hwqp.qp_id, + nesqp->hwqp.sq_head, nesqp->hwqp.sq_tail, cm_id, + atomic_read(&nesqp->refcount)); + + spin_unlock_irqrestore(&nesqp->lock, flags); + ret = cm_id->event_handler(cm_id, &cm_event); + if (ret) + nes_debug(NES_DBG_CM, "OFA CM event_handler returned, ret=%d\n", ret); + spin_lock_irqsave(&nesqp->lock, flags); + } + + nesqp->disconn_pending = 0; + /* There might have been another AE while the lock was released */ + original_hw_tcp_state = nesqp->hw_tcp_state; + original_ibqp_state = nesqp->ibqp_state; + last_ae = nesqp->last_aeq; + + if ((issued_disconnect_reset == 0) && (nesqp->cm_id) && + ((original_hw_tcp_state == NES_AEQE_TCP_STATE_CLOSED) || + (original_hw_tcp_state == NES_AEQE_TCP_STATE_TIME_WAIT) || + (last_ae == NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE) || + (last_ae == NES_AEQE_AEID_LLP_CONNECTION_RESET))) { + atomic_inc(&cm_closes); + nesqp->cm_id = NULL; + nesqp->in_disconnect = 0; + spin_unlock_irqrestore(&nesqp->lock, flags); + nes_disconnect(nesqp, 1); + + cm_id->provider_data = nesqp; + /* Send up the close complete event */ + cm_event.event = IW_CM_EVENT_CLOSE; + cm_event.status = IW_CM_EVENT_STATUS_OK; + cm_event.provider_data = cm_id->provider_data; + cm_event.local_addr = cm_id->local_addr; + cm_event.remote_addr = cm_id->remote_addr; + cm_event.private_data = NULL; + cm_event.private_data_len = 0; + + ret = cm_id->event_handler(cm_id, &cm_event); + if (ret) { + nes_debug(NES_DBG_CM, "OFA CM event_handler returned, ret=%d\n", ret); + } + + cm_id->rem_ref(cm_id); + + spin_lock_irqsave(&nesqp->lock, flags); + if (nesqp->flush_issued == 0) { + 
nesqp->flush_issued = 1; + spin_unlock_irqrestore(&nesqp->lock, flags); + flush_wqes(nesvnic->nesdev, nesqp, NES_CQP_FLUSH_RQ, 1); + } else { + spin_unlock_irqrestore(&nesqp->lock, flags); + } + + /* This reference is from either ModifyQP or the AE processing, + there is still a race here with modifyqp */ + nes_rem_ref(&nesqp->ibqp); + + } else { + cm_id = nesqp->cm_id; + spin_unlock_irqrestore(&nesqp->lock, flags); + /* check to see if the inbound reset beat the outbound reset */ + if ((!cm_id) && (last_ae==NES_AEQE_AEID_RESET_SENT)) { + nes_debug(NES_DBG_CM, "QP%u: Decing refcount due to inbound reset" + " beating the outbound reset.\n", + nesqp->hwqp.qp_id); + nes_rem_ref(&nesqp->ibqp); + } + } + } else { + nesqp->disconn_pending = 0; + spin_unlock_irqrestore(&nesqp->lock, flags); + } + nes_rem_ref(&nesqp->ibqp); + + return 0; +} + + +/** + * nes_disconnect + */ +int nes_disconnect(struct nes_qp *nesqp, int abrupt) +{ + int ret = 0; + struct nes_vnic *nesvnic; + struct nes_device *nesdev; + + nesvnic = to_nesvnic(nesqp->ibqp.device); + if (!nesvnic) + return -EINVAL; + + nesdev = nesvnic->nesdev; + + nes_debug(NES_DBG_CM, "netdev refcnt = %u.\n", + atomic_read(&nesvnic->netdev->refcnt)); + + if (nesqp->active_conn) { + + /* indicate this connection is NOT active */ + nesqp->active_conn = 0; + } else { + /* Need to free the Last Streaming Mode Message */ + if (nesqp->ietf_frame) { + pci_free_consistent(nesdev->pcidev, + nesqp->private_data_len+sizeof(struct ietf_mpa_frame), + nesqp->ietf_frame, nesqp->ietf_frame_pbase); + } + } + + /* close the CM node down if it is still active */ + if (nesqp->cm_node) { + nes_debug(NES_DBG_CM, "Call close API\n"); + + g_cm_core->api->close(g_cm_core, nesqp->cm_node); + nesqp->cm_node = NULL; + } + + return ret; +} + + +/** + * nes_accept + */ +int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) +{ + u64 u64temp; + struct ib_qp *ibqp; + struct nes_qp *nesqp; + struct nes_vnic *nesvnic; + struct nes_device 
*nesdev; + struct nes_cm_node *cm_node; + struct nes_adapter *adapter; + struct ib_qp_attr attr; + struct iw_cm_event cm_event; + struct nes_hw_qp_wqe *wqe; + struct nes_v4_quad nes_quad; + int ret; + + ibqp = nes_get_qp(cm_id->device, conn_param->qpn); + if (!ibqp) + return -EINVAL; + + /* get all our handles */ + nesqp = to_nesqp(ibqp); + nesvnic = to_nesvnic(nesqp->ibqp.device); + nesdev = nesvnic->nesdev; + adapter = nesdev->nesadapter; + + nes_debug(NES_DBG_CM, "nesvnic=%p, netdev=%p, %s\n", + nesvnic, nesvnic->netdev, nesvnic->netdev->name); + + /* since this is from a listen, we were able to put node handle into cm_id */ + cm_node = (struct nes_cm_node *)cm_id->provider_data; + + /* associate the node with the QP */ + nesqp->cm_node = (void *)cm_node; + + nes_debug(NES_DBG_CM, "QP%u, cm_node=%p, jiffies = %lu\n", + nesqp->hwqp.qp_id, cm_node, jiffies); + atomic_inc(&cm_accepts); + + nes_debug(NES_DBG_CM, "netdev refcnt = %u.\n", + atomic_read(&nesvnic->netdev->refcnt)); + + /* allocate the ietf frame and space for private data */ + nesqp->ietf_frame = pci_alloc_consistent(nesdev->pcidev, + sizeof(struct ietf_mpa_frame) + conn_param->private_data_len, + &nesqp->ietf_frame_pbase); + + if (!nesqp->ietf_frame) { + nes_debug(NES_DBG_CM, "Unable to allocate memory for private data\n"); + return -ENOMEM; + } + + + /* setup the MPA frame */ + nesqp->private_data_len = conn_param->private_data_len; + memcpy(nesqp->ietf_frame->key, IEFT_MPA_KEY_REP, IETF_MPA_KEY_SIZE); + + memcpy(nesqp->ietf_frame->priv_data, conn_param->private_data, + conn_param->private_data_len); + + nesqp->ietf_frame->priv_data_len = cpu_to_be16(conn_param->private_data_len); + nesqp->ietf_frame->rev = mpa_version; + nesqp->ietf_frame->flags = IETF_MPA_FLAGS_CRC; + + /* setup our first outgoing iWarp send WQE (the IETF frame response) */ + wqe = &nesqp->hwqp.sq_vbase[0]; + + if (cm_id->remote_addr.sin_addr.s_addr != cm_id->local_addr.sin_addr.s_addr) { + u64temp = (unsigned long)nesqp; + u64temp 
|= NES_SW_CONTEXT_ALIGN>>1; + set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_CTX_LOW_IDX, + u64temp); + wqe->wqe_words[NES_IWARP_SQ_WQE_MISC_IDX] = + cpu_to_le32(NES_IWARP_SQ_WQE_STREAMING | NES_IWARP_SQ_WQE_WRPDU); + wqe->wqe_words[NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX] = + cpu_to_le32(conn_param->private_data_len + sizeof(struct ietf_mpa_frame)); + wqe->wqe_words[NES_IWARP_SQ_WQE_FRAG0_LOW_IDX] = + cpu_to_le32((u32)nesqp->ietf_frame_pbase); + wqe->wqe_words[NES_IWARP_SQ_WQE_FRAG0_HIGH_IDX] = + cpu_to_le32((u32)((u64)nesqp->ietf_frame_pbase >> 32)); + wqe->wqe_words[NES_IWARP_SQ_WQE_LENGTH0_IDX] = + cpu_to_le32(conn_param->private_data_len + sizeof(struct ietf_mpa_frame)); + wqe->wqe_words[NES_IWARP_SQ_WQE_STAG0_IDX] = 0; + + nesqp->nesqp_context->ird_ord_sizes |= cpu_to_le32( + NES_QPCONTEXT_ORDIRD_LSMM_PRESENT | NES_QPCONTEXT_ORDIRD_WRPDU); + } else { + nesqp->nesqp_context->ird_ord_sizes |= cpu_to_le32((NES_QPCONTEXT_ORDIRD_LSMM_PRESENT | + NES_QPCONTEXT_ORDIRD_WRPDU | NES_QPCONTEXT_ORDIRD_ALSMM)); + } + nesqp->skip_lsmm = 1; + + + /* Cache the cm_id in the qp */ + nesqp->cm_id = cm_id; + cm_node->cm_id = cm_id; + + /* nesqp->cm_node = (void *)cm_id->provider_data; */ + cm_id->provider_data = nesqp; + nesqp->active_conn = 0; + + nes_cm_init_tsa_conn(nesqp, cm_node); + + nesqp->nesqp_context->tcpPorts[0] = cpu_to_le16(ntohs(cm_id->local_addr.sin_port)); + nesqp->nesqp_context->tcpPorts[1] = cpu_to_le16(ntohs(cm_id->remote_addr.sin_port)); + nesqp->nesqp_context->ip0 = cpu_to_le32(ntohl(cm_id->remote_addr.sin_addr.s_addr)); + + nesqp->nesqp_context->misc2 |= cpu_to_le32( + (u32)PCI_FUNC(nesdev->pcidev->devfn) << NES_QPCONTEXT_MISC2_SRC_IP_SHIFT); + + nesqp->nesqp_context->arp_index_vlan |= cpu_to_le32( + nes_arp_table(nesdev, le32_to_cpu(nesqp->nesqp_context->ip0), NULL, + NES_ARP_RESOLVE) << 16); + + nesqp->nesqp_context->ts_val_delta = cpu_to_le32( + jiffies - nes_read_indexed(nesdev, NES_IDX_TCP_NOW)); + + nesqp->nesqp_context->ird_index = 
cpu_to_le32(nesqp->hwqp.qp_id); + + nesqp->nesqp_context->ird_ord_sizes |= cpu_to_le32( + ((u32)1 << NES_QPCONTEXT_ORDIRD_IWARP_MODE_SHIFT)); + nesqp->nesqp_context->ird_ord_sizes |= cpu_to_le32((u32)conn_param->ord); + + memset(&nes_quad, 0, sizeof(nes_quad)); + nes_quad.DstIpAdrIndex = cpu_to_le32((u32)PCI_FUNC(nesdev->pcidev->devfn) << 24); + nes_quad.SrcIpadr = cm_id->remote_addr.sin_addr.s_addr; + nes_quad.TcpPorts[0] = cm_id->remote_addr.sin_port; + nes_quad.TcpPorts[1] = cm_id->local_addr.sin_port; + + /* Produce hash key */ + nesqp->hte_index = cpu_to_be32( + crc32c(~0, (void *)&nes_quad, sizeof(nes_quad)) ^ 0xffffffff); + nes_debug(NES_DBG_CM, "HTE Index = 0x%08X, CRC = 0x%08X\n", + nesqp->hte_index, nesqp->hte_index & adapter->hte_index_mask); + + nesqp->hte_index &= adapter->hte_index_mask; + nesqp->nesqp_context->hte_index = cpu_to_le32(nesqp->hte_index); + + cm_node->cm_core->api->accelerated(cm_node->cm_core, cm_node); + + nes_debug(NES_DBG_CM, "QP%u, Destination IP = 0x%08X:0x%04X, local = 0x%08X:0x%04X," + " rcv_nxt=0x%08X, snd_nxt=0x%08X, mpa + private data length=%zu.\n", + nesqp->hwqp.qp_id, + ntohl(cm_id->remote_addr.sin_addr.s_addr), + ntohs(cm_id->remote_addr.sin_port), + ntohl(cm_id->local_addr.sin_addr.s_addr), + ntohs(cm_id->local_addr.sin_port), + le32_to_cpu(nesqp->nesqp_context->rcv_nxt), + le32_to_cpu(nesqp->nesqp_context->snd_nxt), + conn_param->private_data_len+sizeof(struct ietf_mpa_frame)); + + attr.qp_state = IB_QPS_RTS; + nes_modify_qp(&nesqp->ibqp, &attr, IB_QP_STATE, NULL); + + /* notify OF layer that accept event was successfull */ + cm_id->add_ref(cm_id); + + cm_event.event = IW_CM_EVENT_ESTABLISHED; + cm_event.status = IW_CM_EVENT_STATUS_ACCEPTED; + cm_event.provider_data = (void *)nesqp; + cm_event.local_addr = cm_id->local_addr; + cm_event.remote_addr = cm_id->remote_addr; + cm_event.private_data = NULL; + cm_event.private_data_len = 0; + ret = cm_id->event_handler(cm_id, &cm_event); + if (cm_node->loopbackpartner) { + 
cm_node->loopbackpartner->mpa_frame_size = nesqp->private_data_len; + /* copy entire MPA frame to our cm_node's frame */ + memcpy(cm_node->loopbackpartner->mpa_frame_buf, nesqp->ietf_frame->priv_data, + nesqp->private_data_len); + create_event(cm_node->loopbackpartner, NES_CM_EVENT_CONNECTED); + } + if (ret) + printk("%s[%u] OFA CM event_handler returned, ret=%d\n", + __FUNCTION__, __LINE__, ret); + + return 0; +} + + +/** + * nes_reject + */ +int nes_reject(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len) +{ + struct nes_cm_node *cm_node; + struct nes_cm_core *cm_core; + + atomic_inc(&cm_rejects); + cm_node = (struct nes_cm_node *) cm_id->provider_data; + cm_core = cm_node->cm_core; + cm_node->mpa_frame_size = sizeof(struct ietf_mpa_frame) + pdata_len; + + strcpy(&cm_node->mpa_frame.key[0], IEFT_MPA_KEY_REP); + memcpy(&cm_node->mpa_frame.priv_data, pdata, pdata_len); + + cm_node->mpa_frame.priv_data_len = cpu_to_be16(pdata_len); + cm_node->mpa_frame.rev = mpa_version; + cm_node->mpa_frame.flags = IETF_MPA_FLAGS_CRC | IETF_MPA_FLAGS_REJECT; + + cm_core->api->reject(cm_core, &cm_node->mpa_frame, cm_node); + + return 0; +} + + +/** + * nes_connect + * setup and launch cm connect node + */ +int nes_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) +{ + struct ib_qp *ibqp; + struct nes_qp *nesqp; + struct nes_vnic *nesvnic; + struct nes_device *nesdev; + struct nes_cm_node *cm_node; + struct nes_cm_info cm_info; + + ibqp = nes_get_qp(cm_id->device, conn_param->qpn); + if (!ibqp) + return -EINVAL; + nesqp = to_nesqp(ibqp); + if (!nesqp) + return -EINVAL; + nesvnic = to_nesvnic(nesqp->ibqp.device); + if (!nesvnic) + return -EINVAL; + nesdev = nesvnic->nesdev; + if (!nesdev) + return -EINVAL; + + atomic_inc(&cm_connects); + + nesqp->ietf_frame = kzalloc(sizeof(struct ietf_mpa_frame) + + conn_param->private_data_len, GFP_KERNEL); + if (!nesqp->ietf_frame) + return -ENOMEM; + + /* set qp as having an active connection */ + nesqp->active_conn = 1; 
+ + nes_debug(NES_DBG_CM, "QP%u, Destination IP = 0x%08X:0x%04X, local = 0x%08X:0x%04X.\n", + nesqp->hwqp.qp_id, + ntohl(cm_id->remote_addr.sin_addr.s_addr), + ntohs(cm_id->remote_addr.sin_port), + ntohl(cm_id->local_addr.sin_addr.s_addr), + ntohs(cm_id->local_addr.sin_port)); + + /* cache the cm_id in the qp */ + nesqp->cm_id = cm_id; + + cm_id->provider_data = nesqp; + + /* copy the private data */ + if (conn_param->private_data_len) { + memcpy(nesqp->ietf_frame->priv_data, conn_param->private_data, + conn_param->private_data_len); + } + + nesqp->private_data_len = conn_param->private_data_len; + nesqp->nesqp_context->ird_ord_sizes |= cpu_to_le32((u32)conn_param->ord); + nes_debug(NES_DBG_CM, "requested ord = 0x%08X.\n", (u32)conn_param->ord); + nes_debug(NES_DBG_CM, "mpa private data len =%u\n", conn_param->private_data_len); + + strcpy(&nesqp->ietf_frame->key[0], IEFT_MPA_KEY_REQ); + nesqp->ietf_frame->flags = IETF_MPA_FLAGS_CRC; + nesqp->ietf_frame->rev = IETF_MPA_VERSION; + nesqp->ietf_frame->priv_data_len = htons(conn_param->private_data_len); + + if (cm_id->local_addr.sin_addr.s_addr != cm_id->remote_addr.sin_addr.s_addr) + nes_manage_apbvt(nesvnic, ntohs(cm_id->local_addr.sin_port), + PCI_FUNC(nesdev->pcidev->devfn), NES_MANAGE_APBVT_ADD); + + /* set up the connection params for the node */ + cm_info.loc_addr = (cm_id->local_addr.sin_addr.s_addr); + cm_info.loc_port = (cm_id->local_addr.sin_port); + cm_info.rem_addr = (cm_id->remote_addr.sin_addr.s_addr); + cm_info.rem_port = (cm_id->remote_addr.sin_port); + cm_info.cm_id = cm_id; + cm_info.conn_type = NES_CM_IWARP_CONN_TYPE; + + cm_id->add_ref(cm_id); + nes_add_ref(&nesqp->ibqp); + + /* create a connect CM node connection */ + cm_node = g_cm_core->api->connect(g_cm_core, nesvnic, nesqp->ietf_frame, &cm_info); + if (!cm_node) { + if (cm_id->local_addr.sin_addr.s_addr != cm_id->remote_addr.sin_addr.s_addr) + nes_manage_apbvt(nesvnic, ntohs(cm_id->local_addr.sin_port), + PCI_FUNC(nesdev->pcidev->devfn), 
NES_MANAGE_APBVT_DEL); + nes_rem_ref(&nesqp->ibqp); + kfree(nesqp->ietf_frame); + nesqp->ietf_frame = NULL; + cm_id->rem_ref(cm_id); + return -ENOMEM; + } + + cm_node->apbvt_set = 1; + nesqp->cm_node = cm_node; + + return 0; +} + + +/** + * nes_create_listen + */ +int nes_create_listen(struct iw_cm_id *cm_id, int backlog) +{ + struct nes_vnic *nesvnic; + struct nes_cm_listener *cm_node; + struct nes_cm_info cm_info; + struct nes_adapter *adapter; + int err; + + + nes_debug(NES_DBG_CM, "cm_id = %p, local port = 0x%04X.\n", + cm_id, ntohs(cm_id->local_addr.sin_port)); + + nesvnic = to_nesvnic(cm_id->device); + if (!nesvnic) + return -EINVAL; + adapter = nesvnic->nesdev->nesadapter; + nes_debug(NES_DBG_CM, "nesvnic=%p, netdev=%p, %s\n", + nesvnic, nesvnic->netdev, nesvnic->netdev->name); + + nes_debug(NES_DBG_CM, "nesvnic->local_ipaddr=0x%08x, sin_addr.s_addr=0x%08x\n", + nesvnic->local_ipaddr, cm_id->local_addr.sin_addr.s_addr); + + /* setup listen params in our api call struct */ + cm_info.loc_addr = nesvnic->local_ipaddr; + cm_info.loc_port = cm_id->local_addr.sin_port; + cm_info.backlog = backlog; + cm_info.cm_id = cm_id; + + cm_info.conn_type = NES_CM_IWARP_CONN_TYPE; + + + cm_node = g_cm_core->api->listen(g_cm_core, nesvnic, &cm_info); + if (!cm_node) { + printk("%s[%u] Error returned from listen API call\n", + __FUNCTION__, __LINE__); + return -ENOMEM; + } + + cm_id->provider_data = cm_node; + + if (!cm_node->reused_node) { + err = nes_manage_apbvt(nesvnic, ntohs(cm_id->local_addr.sin_port), + PCI_FUNC(nesvnic->nesdev->pcidev->devfn), NES_MANAGE_APBVT_ADD); + if (err) { + printk("nes_manage_apbvt call returned %d.\n", err); + g_cm_core->api->stop_listener(g_cm_core, (void *)cm_node); + return err; + } + cm_listens_created++; + } + + cm_id->add_ref(cm_id); + cm_id->provider_data = (void *)cm_node; + + + return 0; +} + + +/** + * nes_destroy_listen + */ +int nes_destroy_listen(struct iw_cm_id *cm_id) +{ + if (cm_id->provider_data) + 
g_cm_core->api->stop_listener(g_cm_core, cm_id->provider_data); + else + nes_debug(NES_DBG_CM, "cm_id->provider_data was NULL\n"); + + cm_id->rem_ref(cm_id); + + return 0; +} + + +/** + * nes_cm_recv + */ +int nes_cm_recv(struct sk_buff *skb, struct net_device *netdevice) +{ + cm_packets_received++; + if ((g_cm_core) && (g_cm_core->api)) { + g_cm_core->api->recv_pkt(g_cm_core, netdev_priv(netdevice), skb); + } else { + nes_debug(NES_DBG_CM, "Unable to process packet for CM," + " cm is not setup properly.\n"); + } + + return 0; +} + + +/** + * nes_cm_start + * Start and init a cm core module + */ +int nes_cm_start(void) +{ + nes_debug(NES_DBG_CM, "\n"); + /* create the primary CM core, pass this handle to subsequent core inits */ + g_cm_core = nes_cm_alloc_core(); + if (g_cm_core) { + return 0; + } else { + return -ENOMEM; + } +} + + +/** + * nes_cm_stop + * stop and dealloc all cm core instances + */ +int nes_cm_stop(void) +{ + g_cm_core->api->destroy_cm_core(g_cm_core); + return 0; +} + + +/** + * cm_event_connected + * handle a connected event, setup QPs and HW + */ +void cm_event_connected(struct nes_cm_event *event) +{ + u64 u64temp; + struct nes_qp *nesqp; + struct nes_vnic *nesvnic; + struct nes_device *nesdev; + struct nes_cm_node *cm_node; + struct nes_adapter *nesadapter; + struct ib_qp_attr attr; + struct iw_cm_id *cm_id; + struct iw_cm_event cm_event; + struct nes_hw_qp_wqe *wqe; + struct nes_v4_quad nes_quad; + int ret; + + /* get all our handles */ + cm_node = event->cm_node; + cm_id = cm_node->cm_id; + nes_debug(NES_DBG_CM, "cm_event_connected - %p - cm_id = %p\n", cm_node, cm_id); + nesqp = (struct nes_qp *)cm_id->provider_data; + nesvnic = to_nesvnic(nesqp->ibqp.device); + nesdev = nesvnic->nesdev; + nesadapter = nesdev->nesadapter; + + if (nesqp->destroyed) { + return; + } + atomic_inc(&cm_connecteds); + nes_debug(NES_DBG_CM, "QP%u attempting to connect to 0x%08X:0x%04X on" + " local port 0x%04X. 
jiffies = %lu.\n", + nesqp->hwqp.qp_id, + ntohl(cm_id->remote_addr.sin_addr.s_addr), + ntohs(cm_id->remote_addr.sin_port), + ntohs(cm_id->local_addr.sin_port), + jiffies); + + nes_cm_init_tsa_conn(nesqp, cm_node); + + /* set the QP tsa context */ + nesqp->nesqp_context->tcpPorts[0] = cpu_to_le16(ntohs(cm_id->local_addr.sin_port)); + nesqp->nesqp_context->tcpPorts[1] = cpu_to_le16(ntohs(cm_id->remote_addr.sin_port)); + nesqp->nesqp_context->ip0 = cpu_to_le32(ntohl(cm_id->remote_addr.sin_addr.s_addr)); + + nesqp->nesqp_context->misc2 |= cpu_to_le32( + (u32)PCI_FUNC(nesdev->pcidev->devfn) << NES_QPCONTEXT_MISC2_SRC_IP_SHIFT); + nesqp->nesqp_context->arp_index_vlan |= cpu_to_le32( + nes_arp_table(nesdev, le32_to_cpu(nesqp->nesqp_context->ip0), + NULL, NES_ARP_RESOLVE) << 16); + nesqp->nesqp_context->ts_val_delta = cpu_to_le32( + jiffies - nes_read_indexed(nesdev, NES_IDX_TCP_NOW)); + nesqp->nesqp_context->ird_index = cpu_to_le32(nesqp->hwqp.qp_id); + nesqp->nesqp_context->ird_ord_sizes |= + cpu_to_le32((u32)1 << NES_QPCONTEXT_ORDIRD_IWARP_MODE_SHIFT); + + /* Adjust tail for not having a LSMM */ + nesqp->hwqp.sq_tail = 1; + +#if defined(NES_SEND_FIRST_WRITE) + if (cm_node->send_write0) { + nes_debug(NES_DBG_CM, "Sending first write.\n"); + wqe = &nesqp->hwqp.sq_vbase[0]; + u64temp = (unsigned long)nesqp; + u64temp |= NES_SW_CONTEXT_ALIGN>>1; + set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_CTX_LOW_IDX, + u64temp); + wqe->wqe_words[NES_IWARP_SQ_WQE_MISC_IDX] = cpu_to_le32(NES_IWARP_SQ_OP_RDMAW); + wqe->wqe_words[NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX] = 0; + wqe->wqe_words[NES_IWARP_SQ_WQE_FRAG0_LOW_IDX] = 0; + wqe->wqe_words[NES_IWARP_SQ_WQE_FRAG0_HIGH_IDX] = 0; + wqe->wqe_words[NES_IWARP_SQ_WQE_LENGTH0_IDX] = 0; + wqe->wqe_words[NES_IWARP_SQ_WQE_STAG0_IDX] = 0; + + /* use the reserved spot on the WQ for the extra first WQE */ + nesqp->nesqp_context->ird_ord_sizes &= cpu_to_le32(~(NES_QPCONTEXT_ORDIRD_LSMM_PRESENT | + NES_QPCONTEXT_ORDIRD_WRPDU | 
NES_QPCONTEXT_ORDIRD_ALSMM)); + nesqp->skip_lsmm = 1; + nesqp->hwqp.sq_tail = 0; + nes_write32(nesdev->regs + NES_WQE_ALLOC, + (1 << 24) | 0x00800000 | nesqp->hwqp.qp_id); + } +#endif + + memset(&nes_quad, 0, sizeof(nes_quad)); + + nes_quad.DstIpAdrIndex = cpu_to_le32((u32)PCI_FUNC(nesdev->pcidev->devfn) << 24); + nes_quad.SrcIpadr = cm_id->remote_addr.sin_addr.s_addr; + nes_quad.TcpPorts[0] = cm_id->remote_addr.sin_port; + nes_quad.TcpPorts[1] = cm_id->local_addr.sin_port; + + /* Produce hash key */ + nesqp->hte_index = cpu_to_be32( + crc32c(~0, (void *)&nes_quad, sizeof(nes_quad)) ^ 0xffffffff); + nes_debug(NES_DBG_CM, "HTE Index = 0x%08X, After CRC = 0x%08X\n", + nesqp->hte_index, nesqp->hte_index & nesadapter->hte_index_mask); + + nesqp->hte_index &= nesadapter->hte_index_mask; + nesqp->nesqp_context->hte_index = cpu_to_le32(nesqp->hte_index); + + nesqp->ietf_frame = &cm_node->mpa_frame; + nesqp->private_data_len = (u8) cm_node->mpa_frame_size; + cm_node->cm_core->api->accelerated(cm_node->cm_core, cm_node); + + /* modify QP state to rts */ + attr.qp_state = IB_QPS_RTS; + nes_modify_qp(&nesqp->ibqp, &attr, IB_QP_STATE, NULL); + + /* notify OF layer we successfully created the requested connection */ + cm_event.event = IW_CM_EVENT_CONNECT_REPLY; + cm_event.status = IW_CM_EVENT_STATUS_ACCEPTED; + cm_event.provider_data = cm_id->provider_data; + cm_event.local_addr.sin_family = AF_INET; + cm_event.local_addr.sin_port = cm_id->local_addr.sin_port; + cm_event.remote_addr = cm_id->remote_addr; + + cm_event.private_data = (void *)event->cm_node->mpa_frame_buf; + cm_event.private_data_len = (u8) event->cm_node->mpa_frame_size; + + cm_event.local_addr.sin_addr.s_addr = event->cm_info.rem_addr; + ret = cm_id->event_handler(cm_id, &cm_event); + nes_debug(NES_DBG_CM, "OFA CM event_handler returned, ret=%d\n", ret); + + if (ret) + printk("%s[%u] OFA CM event_handler returned, ret=%d\n", + __FUNCTION__, __LINE__, ret); + nes_debug(NES_DBG_CM, "Exiting connect thread for 
QP%u. jiffies = %lu\n", + nesqp->hwqp.qp_id, jiffies ); + + nes_rem_ref(&nesqp->ibqp); + + return; +} + + +/** + * cm_event_connect_error + */ +void cm_event_connect_error(struct nes_cm_event *event) +{ + struct nes_qp *nesqp; + struct iw_cm_id *cm_id; + struct iw_cm_event cm_event; + /* struct nes_cm_info cm_info; */ + int ret; + + if (!event->cm_node) + return; + + cm_id = event->cm_node->cm_id; + if (!cm_id) { + return; + } + + nes_debug(NES_DBG_CM, "cm_node=%p, cm_id=%p\n", event->cm_node, cm_id); + nesqp = cm_id->provider_data; + + if (!nesqp) { + return; + } + + /* notify OF layer about this connection error event */ + /* cm_id->rem_ref(cm_id); */ + nesqp->cm_id = NULL; + cm_id->provider_data = NULL; + cm_event.event = IW_CM_EVENT_CONNECT_REPLY; + cm_event.status = IW_CM_EVENT_STATUS_REJECTED; + cm_event.provider_data = cm_id->provider_data; + cm_event.local_addr = cm_id->local_addr; + cm_event.remote_addr = cm_id->remote_addr; + cm_event.private_data = NULL; + cm_event.private_data_len = 0; + + nes_debug(NES_DBG_CM, "call CM_EVENT REJECTED, local_addr=%08x, remove_addr=%08x\n", + cm_event.local_addr.sin_addr.s_addr, cm_event.remote_addr.sin_addr.s_addr); + + ret = cm_id->event_handler(cm_id, &cm_event); + nes_debug(NES_DBG_CM, "OFA CM event_handler returned, ret=%d\n", ret); + if (ret) + printk("%s[%u] OFA CM event_handler returned, ret=%d\n", + __FUNCTION__, __LINE__, ret); + nes_rem_ref(&nesqp->ibqp); + cm_id->rem_ref(cm_id); + + return; +} + + +/** + * cm_event_reset + */ +void cm_event_reset(struct nes_cm_event *event) +{ + struct nes_qp *nesqp; + struct iw_cm_id *cm_id; + struct iw_cm_event cm_event; + /* struct nes_cm_info cm_info; */ + int ret; + + if (!event->cm_node) + return; + + if (!event->cm_node->cm_id) + return; + + cm_id = event->cm_node->cm_id; + + nes_debug(NES_DBG_CM, "%p - cm_id = %p\n", event->cm_node, cm_id); + nesqp = cm_id->provider_data; + + nesqp->cm_id = NULL; + /* cm_id->provider_data = NULL; */ + cm_event.event = 
IW_CM_EVENT_DISCONNECT; + cm_event.status = IW_CM_EVENT_STATUS_RESET; + cm_event.provider_data = cm_id->provider_data; + cm_event.local_addr = cm_id->local_addr; + cm_event.remote_addr = cm_id->remote_addr; + cm_event.private_data = NULL; + cm_event.private_data_len = 0; + + ret = cm_id->event_handler(cm_id, &cm_event); + nes_debug(NES_DBG_CM, "OFA CM event_handler returned, ret=%d\n", ret); + + + /* notify OF layer about this connection error event */ + cm_id->rem_ref(cm_id); + + return; +} + + +/** + * cm_event_mpa_req + */ +void cm_event_mpa_req(struct nes_cm_event *event) +{ + struct iw_cm_id *cm_id; + struct iw_cm_event cm_event; + int ret; + struct nes_cm_node *cm_node; + + cm_node = event->cm_node; + if (!cm_node) + return; + cm_id = cm_node->cm_id; + + atomic_inc(&cm_connect_reqs); + nes_debug(NES_DBG_CM, "cm_node = %p - cm_id = %p, jiffies = %lu\n", + cm_node, cm_id, jiffies); + + cm_event.event = IW_CM_EVENT_CONNECT_REQUEST; + cm_event.status = IW_CM_EVENT_STATUS_OK; + cm_event.provider_data = (void *)cm_node; + + cm_event.local_addr.sin_family = AF_INET; + cm_event.local_addr.sin_port = htons(event->cm_info.loc_port); + cm_event.local_addr.sin_addr.s_addr = htonl(event->cm_info.loc_addr); + + cm_event.remote_addr.sin_family = AF_INET; + cm_event.remote_addr.sin_port = htons(event->cm_info.rem_port); + cm_event.remote_addr.sin_addr.s_addr = htonl(event->cm_info.rem_addr); + + cm_event.private_data = cm_node->mpa_frame_buf; + cm_event.private_data_len = (u8) cm_node->mpa_frame_size; + + ret = cm_id->event_handler(cm_id, &cm_event); + if (ret) + printk("%s[%u] OFA CM event_handler returned, ret=%d\n", + __FUNCTION__, __LINE__, ret); + + return; +} + + +static void nes_cm_event_handler(struct work_struct *); + +/** + * nes_cm_post_event + * post an event to the cm event handler + */ +int nes_cm_post_event(struct nes_cm_event *event) +{ + atomic_inc(&event->cm_node->cm_core->events_posted); + add_ref_cm_node(event->cm_node); + 
event->cm_info.cm_id->add_ref(event->cm_info.cm_id); + INIT_WORK(&event->event_work, nes_cm_event_handler); + nes_debug(NES_DBG_CM, "queue_work, event=%p\n", event); + + queue_work(event->cm_node->cm_core->event_wq, &event->event_work); + + nes_debug(NES_DBG_CM, "Exit\n"); + return 0; +} + + +/** + * nes_cm_event_handler + * worker function to handle cm events + * will free instance of nes_cm_event + */ +static void nes_cm_event_handler(struct work_struct *work) +{ + struct nes_cm_event *event = container_of(work, struct nes_cm_event, event_work); + struct nes_cm_core *cm_core; + + if ((!event) || (!event->cm_node) || (!event->cm_node->cm_core)) { + return; + } + cm_core = event->cm_node->cm_core; + nes_debug(NES_DBG_CM, "event=%p, event->type=%u, events posted=%u\n", + event, event->type, atomic_read(&cm_core->events_posted)); + + switch (event->type) { + case NES_CM_EVENT_MPA_REQ: + cm_event_mpa_req(event); + nes_debug(NES_DBG_CM, "CM Event: MPA REQUEST\n"); + break; + case NES_CM_EVENT_RESET: + nes_debug(NES_DBG_CM, "CM Event: RESET\n"); + cm_event_reset(event); + break; + case NES_CM_EVENT_CONNECTED: + if ((!event->cm_node->cm_id) || + (event->cm_node->state != NES_CM_STATE_TSA)) { + break; + } + cm_event_connected(event); + nes_debug(NES_DBG_CM, "CM Event: CONNECTED\n"); + break; + case NES_CM_EVENT_ABORTED: + if ((!event->cm_node->cm_id) || (event->cm_node->state == NES_CM_STATE_TSA)) { + break; + } + cm_event_connect_error(event); + nes_debug(NES_DBG_CM, "CM Event: ABORTED\n"); + break; + case NES_CM_EVENT_DROPPED_PKT: + nes_debug(NES_DBG_CM, "CM Event: DROPPED PKT\n"); + break; + default: + nes_debug(NES_DBG_CM, "CM Event: UNKNOWN EVENT TYPE\n"); + break; + } + + atomic_dec(&cm_core->events_posted); + event->cm_info.cm_id->rem_ref(event->cm_info.cm_id); + rem_ref_cm_node(cm_core, event->cm_node); + kfree(event); + + return; +} diff --git a/drivers/infiniband/hw/nes/nes_cm.h b/drivers/infiniband/hw/nes/nes_cm.h new file mode 100644 index 0000000..a59f0a7 --- 
/dev/null +++ b/drivers/infiniband/hw/nes/nes_cm.h @@ -0,0 +1,433 @@ +/* + * Copyright (c) 2006 - 2008 NetEffect, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ *
+ */
+
+#ifndef NES_CM_H
+#define NES_CM_H
+
+#define QUEUE_EVENTS
+
+/* operation codes passed as the last argument of nes_manage_apbvt() */
+#define NES_MANAGE_APBVT_DEL 0
+#define NES_MANAGE_APBVT_ADD 1
+
+/* IETF MPA -- defines, enums, structs */
+#define IEFT_MPA_KEY_REQ "MPA ID Req Frame"
+#define IEFT_MPA_KEY_REP "MPA ID Rep Frame"
+#define IETF_MPA_KEY_SIZE 16
+#define IETF_MPA_VERSION 1
+
+enum ietf_mpa_flags {
+	IETF_MPA_FLAGS_MARKERS = 0x80, /* receive Markers */
+	IETF_MPA_FLAGS_CRC = 0x40, /* CRC ("C" bit of the MPA flags) */
+	IETF_MPA_FLAGS_REJECT = 0x20, /* Reject */
+};
+
+/* MPA frame header; priv_data is a trailing array sized by priv_data_len */
+struct ietf_mpa_frame {
+	u8 key[IETF_MPA_KEY_SIZE];
+	u8 flags;
+	u8 rev;
+	__be16 priv_data_len;
+	u8 priv_data[0];
+};
+
+#define ietf_mpa_req_resp_frame ietf_mpa_frame
+
+/* connection quad; cm_event_connected() runs crc32c over it to get the HTE index */
+struct nes_v4_quad {
+	u32 rsvd0;
+	__le32 DstIpAdrIndex; /* Only most significant 5 bits are valid */
+	__be32 SrcIpadr;
+	__be16 TcpPorts[2]; /* src is low, dest is high */
+};
+
+struct nes_cm_node;
+enum nes_timer_type {
+	NES_TIMER_TYPE_SEND,
+	NES_TIMER_TYPE_RECV,
+	NES_TIMER_NODE_CLEANUP,
+	NES_TIMER_TYPE_CLOSE,
+};
+
+#define MAX_NES_IFS 4
+
+/* bit flags, one per TCP control bit named in the macro */
+#define SET_ACK 1
+#define SET_SYN 2
+#define SET_FIN 4
+#define SET_RST 8
+
+/* common two-byte header shared by the option structs below */
+struct option_base {
+	u8 optionnum;
+	u8 length;
+};
+
+enum option_numbers {
+	OPTION_NUMBER_END,
+	OPTION_NUMBER_NONE,
+	OPTION_NUMBER_MSS,
+	OPTION_NUMBER_WINDOW_SCALE,
+	OPTION_NUMBER_SACK_PERM,
+	OPTION_NUMBER_SACK,
+	OPTION_NUMBER_WRITE0 = 0xbc
+};
+
+struct option_mss {
+	u8 optionnum;
+	u8 length;
+	__be16 mss;
+};
+
+struct option_windowscale {
+	u8 optionnum;
+	u8 length;
+	u8 shiftcount;
+};
+
+/* overlay of every option layout declared above */
+union all_known_options {
+	char as_end;
+	struct option_base as_base;
+	struct option_mss as_mss;
+	struct option_windowscale as_windowscale;
+};
+
+struct nes_timer_entry {
+	struct list_head list;
+	unsigned long timetosend; /* jiffies */
+	struct sk_buff *skb;
+	u32 type;
+	u32 retrycount;
+	u32 retranscount;
+	u32 context;
+	u32 seq_num;
+	u32 send_retrans;
+	int close_when_complete;
+	struct net_device *netdev;
+};
+
+#define NES_DEFAULT_RETRYS 64
+#define NES_DEFAULT_RETRANS 8
+/* the retry timeout is shorter when the driver is built with debug enabled */
+#ifdef CONFIG_INFINIBAND_NES_DEBUG
+#define NES_RETRY_TIMEOUT (1000*HZ/1000)
+#else
+#define NES_RETRY_TIMEOUT (3000*HZ/1000)
+#endif
+#define NES_SHORT_TIME (10)
+#define NES_LONG_TIME (2000*HZ/1000)
+
+#define NES_CM_HASHTABLE_SIZE 1024
+#define NES_CM_TCP_TIMER_INTERVAL 3000
+#define NES_CM_DEFAULT_MTU 1540
+#define NES_CM_DEFAULT_FRAME_CNT 10
+#define NES_CM_THREAD_STACK_SIZE 256
+#define NES_CM_DEFAULT_RCV_WND 64240 /* before we know that window scaling is allowed */
+#define NES_CM_DEFAULT_RCV_WND_SCALED 256960 /* after we know that window scaling is allowed */
+#define NES_CM_DEFAULT_RCV_WND_SCALE 2
+#define NES_CM_DEFAULT_FREE_PKTS 0x000A
+#define NES_CM_FREE_PKT_LO_WATERMARK 2
+
+#define NES_CM_DEFAULT_MSS 536
+
+#define NES_CM_DEF_SEQ 0x159bf75f
+#define NES_CM_DEF_LOCAL_ID 0x3b47
+
+#define NES_CM_DEF_SEQ2 0x18ed5740
+#define NES_CM_DEF_LOCAL_ID2 0xb807
+
+typedef u32 nes_addr_t;
+
+#define nes_cm_tsa_context nes_qp_context
+
+struct nes_qp;
+
+/* cm node transition states */
+enum nes_cm_node_state {
+	NES_CM_STATE_UNKNOWN,
+	NES_CM_STATE_INITED,
+	NES_CM_STATE_LISTENING,
+	NES_CM_STATE_SYN_RCVD,
+	NES_CM_STATE_SYN_SENT,
+	NES_CM_STATE_ONE_SIDE_ESTABLISHED,
+	NES_CM_STATE_ESTABLISHED,
+	NES_CM_STATE_ACCEPTING,
+	NES_CM_STATE_MPAREQ_SENT,
+	NES_CM_STATE_TSA,
+	NES_CM_STATE_FIN_WAIT1,
+	NES_CM_STATE_FIN_WAIT2,
+	NES_CM_STATE_CLOSE_WAIT,
+	NES_CM_STATE_TIME_WAIT,
+	NES_CM_STATE_LAST_ACK,
+	NES_CM_STATE_CLOSING,
+	NES_CM_STATE_CLOSED
+};
+
+/* type of nes connection */
+enum nes_cm_conn_type {
+	NES_CM_IWARP_CONN_TYPE,
+};
+
+/* CM context params */
+struct nes_cm_tcp_context {
+	u8 client;
+
+	u32 loc_seq_num;
+	u32 loc_ack_num;
+	u32 rem_ack_num;
+	u32 rcv_nxt;
+
+	u32 loc_id;
+	u32 rem_id;
+
+	u32 snd_wnd;
+	u32 max_snd_wnd;
+
+	u32 rcv_wnd;
+	u32 mss;
+	u8 snd_wscale;
+	u8 rcv_wscale;
+
+	struct nes_cm_tsa_context tsa_cntxt;
+	struct timeval sent_ts;
+};
+
+
+/* values are bit flags: EITHER (3) is PASSIVE (1) | ACTIVE (2) */
+enum nes_cm_listener_state {
+	NES_CM_LISTENER_PASSIVE_STATE=1,
+	NES_CM_LISTENER_ACTIVE_STATE=2,
+	NES_CM_LISTENER_EITHER_STATE=3
+};
+
+/* listener returned by nes_cm_ops.listen(); nes_create_listen() stores it in cm_id->provider_data */
+struct nes_cm_listener {
+	struct list_head list;
+	u64 session_id;
+	struct nes_cm_core *cm_core;
+	u8 loc_mac[ETH_ALEN];
+	nes_addr_t loc_addr;
+	u16 loc_port;
+	struct iw_cm_id *cm_id;
+	enum nes_cm_conn_type conn_type;
+	atomic_t ref_count;
+	struct nes_vnic *nesvnic;
+	atomic_t pend_accepts_cnt;
+	int backlog;
+	enum nes_cm_listener_state listener_state;
+	u32 reused_node; /* when set, nes_create_listen() skips the APBVT add */
+};
+
+/* per connection node and node state information */
+struct nes_cm_node {
+	u64 session_id;
+	u32 hashkey;
+
+	nes_addr_t loc_addr, rem_addr;
+	u16 loc_port, rem_port;
+
+	u8 loc_mac[ETH_ALEN];
+	u8 rem_mac[ETH_ALEN];
+
+	enum nes_cm_node_state state;
+	struct nes_cm_tcp_context tcp_cntxt;
+	struct nes_cm_core *cm_core;
+	struct sk_buff_head resend_list;
+	atomic_t ref_count;
+	struct net_device *netdev;
+
+	struct nes_cm_node *loopbackpartner;
+	struct list_head retrans_list;
+	spinlock_t retrans_list_lock;
+	struct list_head recv_list;
+	spinlock_t recv_list_lock;
+
+	int send_write0;
+	/* the MPA frame and its raw byte buffer share the same storage */
+	union {
+		struct ietf_mpa_frame mpa_frame;
+		u8 mpa_frame_buf[NES_CM_DEFAULT_MTU];
+	};
+	u16 mpa_frame_size;
+	struct iw_cm_id *cm_id;
+	struct list_head list;
+	int accelerated;
+	struct nes_cm_listener *listener;
+	enum nes_cm_conn_type conn_type;
+	struct nes_vnic *nesvnic;
+	int apbvt_set;
+	int accept_pend;
+};
+
+/* structure for client or CM to fill when making CM api calls. */
+/* - only need to set relevant data, based on op. */
+struct nes_cm_info {
+	/* cm_id and netdev overlap; only one of them is valid for a given call */
+	union {
+		struct iw_cm_id *cm_id;
+		struct net_device *netdev;
+	};
+
+	u16 loc_port;
+	u16 rem_port;
+	nes_addr_t loc_addr;
+	nes_addr_t rem_addr;
+
+	enum nes_cm_conn_type conn_type;
+	int backlog;
+};
+
+/* CM event codes */
+enum nes_cm_event_type {
+	NES_CM_EVENT_UNKNOWN,
+	NES_CM_EVENT_ESTABLISHED,
+	NES_CM_EVENT_MPA_REQ,
+	NES_CM_EVENT_MPA_CONNECT,
+	NES_CM_EVENT_MPA_ACCEPT,
+	NES_CM_EVENT_MPA_ESTABLISHED,
+	NES_CM_EVENT_CONNECTED,
+	NES_CM_EVENT_CLOSED,
+	NES_CM_EVENT_RESET,
+	NES_CM_EVENT_DROPPED_PKT,
+	NES_CM_EVENT_CLOSE_IMMED,
+	NES_CM_EVENT_CLOSE_HARD,
+	NES_CM_EVENT_CLOSE_CLEAN,
+	NES_CM_EVENT_ABORTED,
+	NES_CM_EVENT_SEND_FIRST
+};
+
+/* event to post to CM event handler */
+struct nes_cm_event {
+	enum nes_cm_event_type type;
+
+	struct nes_cm_info cm_info;
+	struct work_struct event_work; /* queued by nes_cm_post_event() */
+	struct nes_cm_node *cm_node;
+};
+
+struct nes_cm_core {
+	enum nes_cm_node_state state;
+	atomic_t session_id;
+
+	atomic_t listen_node_cnt;
+	struct nes_cm_node listen_list;
+	spinlock_t listen_list_lock;
+
+	u32 mtu;
+	u32 free_tx_pkt_max;
+	u32 rx_pkt_posted;
+	struct sk_buff_head tx_free_list;
+	atomic_t ht_node_cnt;
+	struct list_head connected_nodes;
+	/* struct list_head hashtable[NES_CM_HASHTABLE_SIZE]; */
+	spinlock_t ht_lock;
+
+	struct timer_list tcp_timer;
+
+	struct nes_cm_ops *api;
+
+	int (*post_event)(struct nes_cm_event *event);
+	atomic_t events_posted; /* inc in nes_cm_post_event(), dec in the event worker */
+	struct workqueue_struct *event_wq;
+	struct workqueue_struct *disconn_wq;
+
+	atomic_t node_cnt;
+	u64 aborted_connects;
+	u32 options;
+
+	struct nes_cm_node *current_listen_node;
+};
+
+
+#define NES_CM_SET_PKT_SIZE (1 << 1)
+#define NES_CM_SET_FREE_PKT_Q_SIZE (1 << 2)
+
+/* CM ops/API for client interface */
+struct nes_cm_ops {
+	int (*accelerated)(struct nes_cm_core *, struct nes_cm_node *);
+	struct nes_cm_listener * (*listen)(struct nes_cm_core *, struct nes_vnic *,
+			struct nes_cm_info *);
+	int (*stop_listener)(struct nes_cm_core *, struct nes_cm_listener *);
+	struct nes_cm_node * (*connect)(struct nes_cm_core *,
+			struct nes_vnic *, struct ietf_mpa_frame *,
+			struct nes_cm_info *);
+	int (*close)(struct nes_cm_core *, struct nes_cm_node *);
+	int (*accept)(struct nes_cm_core *, struct ietf_mpa_frame *,
+			struct nes_cm_node *);
+	int (*reject)(struct nes_cm_core *, struct ietf_mpa_frame *,
+			struct nes_cm_node *);
+	int (*recv_pkt)(struct nes_cm_core *, struct nes_vnic *,
+			struct sk_buff *);
+	int (*destroy_cm_core)(struct nes_cm_core *);
+	int (*get)(struct nes_cm_core *);
+	int (*set)(struct nes_cm_core *, u32, u32);
+};
+
+
+int send_mpa_request(struct nes_cm_node *);
+struct sk_buff *form_cm_frame(struct sk_buff *, struct nes_cm_node *,
+		void *, u32, void *, u32, u8);
+int schedule_nes_timer(struct nes_cm_node *, struct sk_buff *,
+		enum nes_timer_type, int, int);
+void nes_cm_timer_tick(unsigned long);
+int send_syn(struct nes_cm_node *, u32);
+int send_reset(struct nes_cm_node *);
+int send_ack(struct nes_cm_node *);
+int send_fin(struct nes_cm_node *, struct sk_buff *);
+struct sk_buff *get_free_pkt(struct nes_cm_node *);
+int process_packet(struct nes_cm_node *, struct sk_buff *, struct nes_cm_core *);
+
+struct nes_cm_node * mini_cm_connect(struct nes_cm_core *,
+		struct nes_vnic *, struct ietf_mpa_frame *, struct nes_cm_info *);
+int mini_cm_accept(struct nes_cm_core *, struct ietf_mpa_frame *, struct nes_cm_node *);
+int mini_cm_reject(struct nes_cm_core *, struct ietf_mpa_frame *, struct nes_cm_node *);
+int mini_cm_close(struct nes_cm_core *, struct nes_cm_node *);
+int mini_cm_recv_pkt(struct nes_cm_core *, struct nes_vnic *, struct sk_buff *);
+struct nes_cm_core *mini_cm_alloc_core(struct nes_cm_info *);
+int mini_cm_dealloc_core(struct nes_cm_core *);
+int mini_cm_get(struct nes_cm_core *);
+int mini_cm_set(struct nes_cm_core *, u32, u32);
+
+int nes_cm_disconn(struct nes_qp *);
+void nes_disconnect_worker(struct work_struct *);
+int nes_cm_disconn_true(struct nes_qp *);
+int nes_disconnect(struct nes_qp *, int);
+
+int nes_accept(struct iw_cm_id *, struct iw_cm_conn_param *);
+int nes_reject(struct iw_cm_id *, const void *, u8);
+int nes_connect(struct iw_cm_id *, struct iw_cm_conn_param *);
+int nes_create_listen(struct iw_cm_id *, int);
+int nes_destroy_listen(struct iw_cm_id *);
+
+int nes_cm_recv(struct sk_buff *, struct net_device *);
+int nes_cm_start(void);
+int nes_cm_stop(void);
+
+/* CM event handler functions */
+void cm_event_connected(struct nes_cm_event *);
+void cm_event_connect_error(struct nes_cm_event *);
+void cm_event_reset(struct nes_cm_event *);
+void cm_event_mpa_req(struct nes_cm_event *);
+int nes_cm_post_event(struct nes_cm_event *);
+
+#endif /* NES_CM_H */
diff --git a/drivers/infiniband/hw/nes/nes_context.h b/drivers/infiniband/hw/nes/nes_context.h
new file mode 100644
index 0000000..da9daba
--- /dev/null
+++ b/drivers/infiniband/hw/nes/nes_context.h
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2006 - 2008 NetEffect, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef NES_CONTEXT_H
+#define NES_CONTEXT_H
+
+/*
+ * QP context block; also used as nes_cm_tsa_context via the #define in nes_cm.h.
+ * cm_event_connected() fills tcpPorts, ip0, misc2, arp_index_vlan,
+ * ts_val_delta, ird_index, ird_ord_sizes and hte_index.
+ */
+struct nes_qp_context {
+	__le32 misc;
+	__le32 cqs;
+	__le32 sq_addr_low;
+	__le32 sq_addr_high;
+	__le32 rq_addr_low;
+	__le32 rq_addr_high;
+	__le32 misc2;
+	__le16 tcpPorts[2];
+	__le32 ip0;
+	__le32 ip1;
+	__le32 ip2;
+	__le32 ip3;
+	__le32 mss;
+	__le32 arp_index_vlan;
+	__le32 tcp_state_flow_label;
+	__le32 pd_index_wscale;
+	__le32 keepalive;
+	u32 ts_recent;
+	u32 ts_age;
+	__le32 snd_nxt;
+	__le32 snd_wnd;
+	__le32 rcv_nxt;
+	__le32 rcv_wnd;
+	__le32 snd_max;
+	__le32 snd_una;
+	u32 srtt;
+	__le32 rttvar;
+	__le32 ssthresh;
+	__le32 cwnd;
+	__le32 snd_wl1;
+	__le32 snd_wl2;
+	__le32 max_snd_wnd;
+	__le32 ts_val_delta;
+	u32 retransmit;
+	u32 probe_cnt;
+	u32 hte_index; /* NOTE(review): declared u32 but cm_event_connected() stores cpu_to_le32() here */
+	__le32 q2_addr_low;
+	__le32 q2_addr_high;
+	__le32 ird_index;
+	u32 Rsvd3;
+	__le32 ird_ord_sizes;
+	u32 mrkr_offset;
+	__le32 aeq_token_low;
+	__le32 aeq_token_high;
+};
+
+/* QP Context Misc Field */
+
+#define NES_QPCONTEXT_MISC_IWARP_VER_MASK 0x00000003
+#define NES_QPCONTEXT_MISC_IWARP_VER_SHIFT 0
+#define NES_QPCONTEXT_MISC_EFB_SIZE_MASK 0x000000C0
+#define NES_QPCONTEXT_MISC_EFB_SIZE_SHIFT 6
+#define NES_QPCONTEXT_MISC_RQ_SIZE_MASK 0x00000300
+#define NES_QPCONTEXT_MISC_RQ_SIZE_SHIFT 8
+#define NES_QPCONTEXT_MISC_SQ_SIZE_MASK 0x00000c00
+#define NES_QPCONTEXT_MISC_SQ_SIZE_SHIFT 10
+#define NES_QPCONTEXT_MISC_PCI_FCN_MASK 0x00007000
+#define NES_QPCONTEXT_MISC_PCI_FCN_SHIFT 12
+#define NES_QPCONTEXT_MISC_DUP_ACKS_MASK 0x00070000
+#define NES_QPCONTEXT_MISC_DUP_ACKS_SHIFT 16
+
+/* single-bit flags; none of them overlaps a _MASK range defined above */
+enum nes_qp_context_misc_bits {
+	NES_QPCONTEXT_MISC_RX_WQE_SIZE = 0x00000004,
+	NES_QPCONTEXT_MISC_IPV4 = 0x00000008,
+	NES_QPCONTEXT_MISC_DO_NOT_FRAG = 0x00000010,
+	NES_QPCONTEXT_MISC_INSERT_VLAN = 0x00000020,
+	NES_QPCONTEXT_MISC_DROS = 0x00008000,
+	NES_QPCONTEXT_MISC_WSCALE = 0x00080000,
+	NES_QPCONTEXT_MISC_KEEPALIVE = 0x00100000,
+	NES_QPCONTEXT_MISC_TIMESTAMP = 0x00200000,
+	NES_QPCONTEXT_MISC_SACK = 0x00400000,
+	NES_QPCONTEXT_MISC_RDMA_WRITE_EN = 0x00800000,
+	NES_QPCONTEXT_MISC_RDMA_READ_EN = 0x01000000,
+	NES_QPCONTEXT_MISC_WBIND_EN = 0x10000000,
+	NES_QPCONTEXT_MISC_FAST_REGISTER_EN = 0x20000000,
+	NES_QPCONTEXT_MISC_PRIV_EN = 0x40000000,
+	NES_QPCONTEXT_MISC_NO_NAGLE = 0x80000000
+};
+
+enum nes_qp_acc_wq_sizes {
+	HCONTEXT_TSA_WQ_SIZE_4 = 0,
+	HCONTEXT_TSA_WQ_SIZE_32 = 1,
+	HCONTEXT_TSA_WQ_SIZE_128 = 2,
+	HCONTEXT_TSA_WQ_SIZE_512 = 3
+};
+
+/* QP Context Misc2 Fields */
+/* TTL/HOP_LIMIT and TOS/TRAFFIC_CLASS are two names for the same bit ranges */
+#define NES_QPCONTEXT_MISC2_TTL_MASK 0x000000ff
+#define NES_QPCONTEXT_MISC2_TTL_SHIFT 0
+#define NES_QPCONTEXT_MISC2_HOP_LIMIT_MASK 0x000000ff
+#define NES_QPCONTEXT_MISC2_HOP_LIMIT_SHIFT 0
+#define NES_QPCONTEXT_MISC2_LIMIT_MASK 0x00000300
+#define NES_QPCONTEXT_MISC2_LIMIT_SHIFT 8
+#define NES_QPCONTEXT_MISC2_NIC_INDEX_MASK 0x0000fc00
+#define NES_QPCONTEXT_MISC2_NIC_INDEX_SHIFT 10
+#define NES_QPCONTEXT_MISC2_SRC_IP_MASK 0x001f0000
+#define NES_QPCONTEXT_MISC2_SRC_IP_SHIFT 16
+#define NES_QPCONTEXT_MISC2_TOS_MASK 0xff000000
+#define NES_QPCONTEXT_MISC2_TOS_SHIFT 24
+#define NES_QPCONTEXT_MISC2_TRAFFIC_CLASS_MASK 0xff000000
+#define NES_QPCONTEXT_MISC2_TRAFFIC_CLASS_SHIFT 24
+
+/* QP Context Tcp State/Flow Label Fields */
+#define NES_QPCONTEXT_TCPFLOW_FLOW_LABEL_MASK 0x000fffff
+#define NES_QPCONTEXT_TCPFLOW_FLOW_LABEL_SHIFT 0
+#define NES_QPCONTEXT_TCPFLOW_TCP_STATE_MASK 0xf0000000
+#define NES_QPCONTEXT_TCPFLOW_TCP_STATE_SHIFT 28
+
+enum nes_qp_tcp_state {
+	NES_QPCONTEXT_TCPSTATE_CLOSED = 1,
+	NES_QPCONTEXT_TCPSTATE_EST = 5,
+	NES_QPCONTEXT_TCPSTATE_TIME_WAIT = 11,
+};
+
+/* QP Context PD Index/wscale Fields */
+#define NES_QPCONTEXT_PDWSCALE_RCV_WSCALE_MASK 0x0000000f
+#define NES_QPCONTEXT_PDWSCALE_RCV_WSCALE_SHIFT 0
+#define NES_QPCONTEXT_PDWSCALE_SND_WSCALE_MASK 0x00000f00
+#define NES_QPCONTEXT_PDWSCALE_SND_WSCALE_SHIFT 8
+#define NES_QPCONTEXT_PDWSCALE_PDINDEX_MASK 0xffff0000
+#define NES_QPCONTEXT_PDWSCALE_PDINDEX_SHIFT 16
+
+/* QP Context Keepalive Fields */
+#define NES_QPCONTEXT_KEEPALIVE_DELTA_MASK 0x0000ffff
+#define NES_QPCONTEXT_KEEPALIVE_DELTA_SHIFT 0
+#define NES_QPCONTEXT_KEEPALIVE_PROBE_CNT_MASK 0x00ff0000
+#define NES_QPCONTEXT_KEEPALIVE_PROBE_CNT_SHIFT 16
+#define NES_QPCONTEXT_KEEPALIVE_INTV_MASK 0xff000000
+#define NES_QPCONTEXT_KEEPALIVE_INTV_SHIFT 24
+
+/* QP Context ORD/IRD Fields */
+#define NES_QPCONTEXT_ORDIRD_ORDSIZE_MASK 0x0000007f
+#define NES_QPCONTEXT_ORDIRD_ORDSIZE_SHIFT 0
+#define NES_QPCONTEXT_ORDIRD_IRDSIZE_MASK 0x00030000
+#define NES_QPCONTEXT_ORDIRD_IRDSIZE_SHIFT 16
+#define NES_QPCONTEXT_ORDIRD_IWARP_MODE_MASK 0x30000000
+#define NES_QPCONTEXT_ORDIRD_IWARP_MODE_SHIFT 28
+
+enum nes_ord_ird_bits {
+	NES_QPCONTEXT_ORDIRD_WRPDU = 0x02000000,
+	NES_QPCONTEXT_ORDIRD_LSMM_PRESENT = 0x04000000,
+	NES_QPCONTEXT_ORDIRD_ALSMM = 0x08000000,
+	NES_QPCONTEXT_ORDIRD_AAH = 0x40000000,
+	NES_QPCONTEXT_ORDIRD_RNMC = 0x80000000
+};
+
+/* these values index nes_iwarp_state_str[] in nes_hw.c (4 and 7 are "RSVD" there) */
+enum nes_iwarp_qp_state {
+	NES_QPCONTEXT_IWARP_STATE_NONEXIST = 0,
+	NES_QPCONTEXT_IWARP_STATE_IDLE = 1,
+	NES_QPCONTEXT_IWARP_STATE_RTS = 2,
+	NES_QPCONTEXT_IWARP_STATE_CLOSING = 3,
+	NES_QPCONTEXT_IWARP_STATE_TERMINATE = 5,
+	NES_QPCONTEXT_IWARP_STATE_ERROR = 6
+};
+
+
+#endif /* NES_CONTEXT_H */
diff --git a/drivers/infiniband/hw/nes/nes_hw.c b/drivers/infiniband/hw/nes/nes_hw.c
new file mode 100644
index 0000000..7c4c0fb
--- /dev/null
+++ b/drivers/infiniband/hw/nes/nes_hw.c
@@ -0,0 +1,3080 @@
+/*
+ * Copyright (c) 2006 - 2008 NetEffect, Inc. All rights reserved.
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/if_vlan.h>
+
+#include "nes.h"
+
+u32 crit_err_count = 0;
+u32 int_mod_timer_init;
+u32 int_mod_cq_depth_256;
+u32 int_mod_cq_depth_128;
+u32 int_mod_cq_depth_32;
+u32 int_mod_cq_depth_24;
+u32 int_mod_cq_depth_16;
+u32 int_mod_cq_depth_4;
+u32 int_mod_cq_depth_1;
+
+#include "nes_cm.h"
+
+
+#ifdef CONFIG_INFINIBAND_NES_DEBUG
+/* debug-only name tables; the iWARP one lines up with enum nes_iwarp_qp_state */
+static unsigned char *nes_iwarp_state_str[] = {
+	"Non-Existant",
+	"Idle",
+	"RTS",
+	"Closing",
+	"RSVD1",
+	"Terminate",
+	"Error",
+	"RSVD2",
+};
+
+static unsigned char *nes_tcp_state_str[] = {
+	"Non-Existant",
+	"Closed",
+	"Listen",
+	"SYN Sent",
+	"SYN Rcvd",
+	"Established",
+	"Close Wait",
+	"FIN Wait 1",
+	"Closing",
+	"Last Ack",
+	"FIN Wait 2",
+	"Time Wait",
+	"RSVD1",
+	"RSVD2",
+	"RSVD3",
+	"RSVD4",
+};
+#endif
+
+
+/**
+ * nes_nic_init_timer_defaults
+ *
+ * Resets the adapter's shared tune-timer limits and queue-length
+ * thresholds, using the jumbo threshold set when @jumbomode is non-zero.
+ * Runs under periodic_timer_lock.
+ */
+void nes_nic_init_timer_defaults(struct nes_device *nesdev, u8 jumbomode)
+{
+	unsigned long flags;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	struct nes_hw_tune_timer *shared_timer = &nesadapter->tune_timer;
+
+	spin_lock_irqsave(&nesadapter->periodic_timer_lock, flags);
+
+	shared_timer->timer_in_use_min = NES_NIC_FAST_TIMER_LOW;
+	shared_timer->timer_in_use_max = NES_NIC_FAST_TIMER_HIGH;
+	if (jumbomode) {
+		shared_timer->threshold_low = DEFAULT_JUMBO_NES_QL_LOW;
+		shared_timer->threshold_target = DEFAULT_JUMBO_NES_QL_TARGET;
+		shared_timer->threshold_high = DEFAULT_JUMBO_NES_QL_HIGH;
+	} else {
+		shared_timer->threshold_low = DEFAULT_NES_QL_LOW;
+		shared_timer->threshold_target = DEFAULT_NES_QL_TARGET;
+		shared_timer->threshold_high = DEFAULT_NES_QL_HIGH;
+	}
+
+	/* todo use netdev->mtu to set thresholds */
+	spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags);
+}
+
+
+/**
+ * nes_nic_init_timer
+ *
+ * On first use (timer_in_use_old == 0) resets the tuning history and
+ * selects NES_NIC_FAST_TIMER; whenever the selected value differs from the
+ * last one written, programs it into NES_PERIODIC_CONTROL.
+ */
+static void nes_nic_init_timer(struct nes_device *nesdev)
+{
+	unsigned long flags;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	struct nes_hw_tune_timer *shared_timer = &nesadapter->tune_timer;
+
+	spin_lock_irqsave(&nesadapter->periodic_timer_lock, flags);
+
+	if (shared_timer->timer_in_use_old == 0) {
+		nesdev->deepcq_count = 0;
+		shared_timer->timer_direction_upward = 0;
+		shared_timer->timer_direction_downward = 0;
+		shared_timer->timer_in_use = NES_NIC_FAST_TIMER;
+		shared_timer->timer_in_use_old = 0; /* already 0 on this path */
+
+	}
+	if (shared_timer->timer_in_use != shared_timer->timer_in_use_old) {
+		shared_timer->timer_in_use_old = shared_timer->timer_in_use;
+		nes_write32(nesdev->regs+NES_PERIODIC_CONTROL,
+			0x80000000 | ((u32)(shared_timer->timer_in_use*8)));
+	}
+	/* todo use netdev->mtu to set thresholds */
+	spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags);
+}
+
+
+/**
+ * nes_nic_tune_timer
+ *
+ * Adjusts shared_timer->timer_in_use from the CQ count seen since the last
+ * call (nesdev->currcq_count, reset to 0 here), comparing it against the
+ * low/target/high thresholds, and clamps the result to
+ * [NES_NIC_FAST_TIMER_LOW, NES_NIC_FAST_TIMER_HIGH].  It only updates the
+ * software value; the register write is in nes_nic_init_timer().
+ */
+static void nes_nic_tune_timer(struct nes_device *nesdev)
+{
+	unsigned long flags;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	struct nes_hw_tune_timer *shared_timer = &nesadapter->tune_timer;
+	u16 cq_count = nesdev->currcq_count;
+
+	spin_lock_irqsave(&nesadapter->periodic_timer_lock, flags);
+
+	/* track how many consecutive samples were flat or falling */
+	if (shared_timer->cq_count_old < cq_count) {
+		if (cq_count > shared_timer->threshold_low)
+			shared_timer->cq_direction_downward=0;
+	}
+	if (shared_timer->cq_count_old >= cq_count)
+		shared_timer->cq_direction_downward++;
+	shared_timer->cq_count_old = cq_count;
+	/* sustained downward trend at or below the low threshold: halve that threshold and stop */
+	if (shared_timer->cq_direction_downward > NES_NIC_CQ_DOWNWARD_TREND) {
+		if (cq_count <= shared_timer->threshold_low) {
+			shared_timer->threshold_low = shared_timer->threshold_low/2;
+			shared_timer->cq_direction_downward=0;
+			nesdev->currcq_count = 0;
+			spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags);
+			return;
+		}
+	}
+
+	if (cq_count > 1) {
+		nesdev->deepcq_count += cq_count;
+		if (cq_count <= shared_timer->threshold_low) { /* increase timer gently */
+			shared_timer->timer_direction_upward++;
+			shared_timer->timer_direction_downward = 0;
+		} else if (cq_count <= shared_timer->threshold_target) { /* balanced */
+			shared_timer->timer_direction_upward = 0;
+			shared_timer->timer_direction_downward = 0;
+		} else if (cq_count <= shared_timer->threshold_high) { /* decrease timer gently */
+			shared_timer->timer_direction_downward++;
+			shared_timer->timer_direction_upward = 0;
+		} else if (cq_count <= (shared_timer->threshold_high) * 2) {
+			/* NOTE(review): if timer_in_use is unsigned, these subtractions can wrap before the clamp below - confirm its type */
+			shared_timer->timer_in_use -= 2;
+			shared_timer->timer_direction_upward = 0;
+			shared_timer->timer_direction_downward++;
+		} else {
+			shared_timer->timer_in_use -= 4;
+			shared_timer->timer_direction_upward = 0;
+			shared_timer->timer_direction_downward++;
+		}
+
+		if (shared_timer->timer_direction_upward > 3 ) { /* using history */
+			shared_timer->timer_in_use += 3;
+			shared_timer->timer_direction_upward = 0;
+			shared_timer->timer_direction_downward = 0;
+		}
+		if (shared_timer->timer_direction_downward > 5) { /* using history */
+			shared_timer->timer_in_use -= 4 ;
+			shared_timer->timer_direction_downward = 0;
+			shared_timer->timer_direction_upward = 0;
+		}
+	}
+
+	/* boundary checking */
+	if (shared_timer->timer_in_use > NES_NIC_FAST_TIMER_HIGH)
+		shared_timer->timer_in_use = NES_NIC_FAST_TIMER_HIGH;
+	else if (shared_timer->timer_in_use < NES_NIC_FAST_TIMER_LOW) {
+		shared_timer->timer_in_use = NES_NIC_FAST_TIMER_LOW;
+	}
+
+	nesdev->currcq_count = 0;
+
+	spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags);
+}
+
+
+/**
+ * nes_init_adapter - initialize adapter
+ */
+struct nes_adapter *nes_init_adapter(struct nes_device *nesdev, u8 hw_rev) {
+	struct nes_adapter *nesadapter = NULL;
+	unsigned long num_pds;
+	u32 u32temp;
+	u32 port_count;
+	u16 max_rq_wrs;
+	u16 max_sq_wrs;
+	u32 max_mr;
+	u32 max_256pbl;
+	u32 max_4kpbl;
+	u32 max_qp;
+	u32 max_irrq;
+	u32 max_cq;
+	u32 hte_index_mask;
+	u32 adapter_size;
+	u32 arp_table_size;
+	u16 vendor_id;
+	u8 OneG_Mode;
+	u8 func_index;
+
+	/* search the list of existing
adapters */ + list_for_each_entry(nesadapter, &nes_adapter_list, list) { + nes_debug(NES_DBG_INIT, "Searching Adapter list for PCI devfn = 0x%X," + " adapter PCI slot/bus = %u/%u, pci devices PCI slot/bus = %u/%u, .\n", + nesdev->pcidev->devfn, + PCI_SLOT(nesadapter->devfn), + nesadapter->bus_number, + PCI_SLOT(nesdev->pcidev->devfn), + nesdev->pcidev->bus->number ); + if ((PCI_SLOT(nesadapter->devfn) == PCI_SLOT(nesdev->pcidev->devfn)) && + (nesadapter->bus_number == nesdev->pcidev->bus->number)) { + nesadapter->ref_count++; + return nesadapter; + } + } + + /* no adapter found */ + num_pds = pci_resource_len(nesdev->pcidev, BAR_1) >> PAGE_SHIFT; + if ((hw_rev != NE020_REV) && (hw_rev != NE020_REV1)) { + nes_debug(NES_DBG_INIT, "NE020 driver detected unknown hardware revision 0x%x\n", + hw_rev); + return NULL; + } + + nes_debug(NES_DBG_INIT, "Determine Soft Reset, QP_control=0x%x, CPU0=0x%x, CPU1=0x%x, CPU2=0x%x\n", + nes_read_indexed(nesdev, NES_IDX_QP_CONTROL + PCI_FUNC(nesdev->pcidev->devfn) * 8), + nes_read_indexed(nesdev, NES_IDX_INT_CPU_STATUS), + nes_read_indexed(nesdev, NES_IDX_INT_CPU_STATUS + 4), + nes_read_indexed(nesdev, NES_IDX_INT_CPU_STATUS + 8)); + + nes_debug(NES_DBG_INIT, "Reset and init NE020\n"); + + + if ((port_count = nes_reset_adapter_ne020(nesdev, &OneG_Mode)) == 0) + return NULL; + if (nes_init_serdes(nesdev, hw_rev, port_count, OneG_Mode)) + return NULL; + nes_init_csr_ne020(nesdev, hw_rev, port_count); + + max_qp = nes_read_indexed(nesdev, NES_IDX_QP_CTX_SIZE); + nes_debug(NES_DBG_INIT, "QP_CTX_SIZE=%u\n", max_qp); + + u32temp = nes_read_indexed(nesdev, NES_IDX_QUAD_HASH_TABLE_SIZE); + if (max_qp > ((u32)1 << (u32temp & 0x001f))) { + nes_debug(NES_DBG_INIT, "Reducing Max QPs to %u due to hash table size = 0x%08X\n", + max_qp, u32temp); + max_qp = (u32)1 << (u32temp & 0x001f); + } + + hte_index_mask = ((u32)1 << ((u32temp & 0x001f)+1))-1; + nes_debug(NES_DBG_INIT, "Max QP = %u, hte_index_mask = 0x%08X.\n", + max_qp, hte_index_mask); + + 
u32temp = nes_read_indexed(nesdev, NES_IDX_IRRQ_COUNT); + + max_irrq = 1 << (u32temp & 0x001f); + + if (max_qp > max_irrq) { + max_qp = max_irrq; + nes_debug(NES_DBG_INIT, "Reducing Max QPs to %u due to Available Q1s.\n", + max_qp); + } + + /* there should be no reason to allocate more pds than qps */ + if (num_pds > max_qp) + num_pds = max_qp; + + u32temp = nes_read_indexed(nesdev, NES_IDX_MRT_SIZE); + max_mr = (u32)8192 << (u32temp & 0x7); + + u32temp = nes_read_indexed(nesdev, NES_IDX_PBL_REGION_SIZE); + max_256pbl = (u32)1 << (u32temp & 0x0000001f); + max_4kpbl = (u32)1 << ((u32temp >> 16) & 0x0000001f); + max_cq = nes_read_indexed(nesdev, NES_IDX_CQ_CTX_SIZE); + + u32temp = nes_read_indexed(nesdev, NES_IDX_ARP_CACHE_SIZE); + arp_table_size = 1 << u32temp; + + adapter_size = (sizeof(struct nes_adapter) + + (sizeof(unsigned long)-1)) & (~(sizeof(unsigned long)-1)); + adapter_size += sizeof(unsigned long) * BITS_TO_LONGS(max_qp); + adapter_size += sizeof(unsigned long) * BITS_TO_LONGS(max_mr); + adapter_size += sizeof(unsigned long) * BITS_TO_LONGS(max_cq); + adapter_size += sizeof(unsigned long) * BITS_TO_LONGS(num_pds); + adapter_size += sizeof(unsigned long) * BITS_TO_LONGS(arp_table_size); + adapter_size += sizeof(struct nes_qp **) * max_qp; + + /* allocate a new adapter struct */ + nesadapter = kzalloc(adapter_size, GFP_KERNEL); + if (nesadapter == NULL) { + return NULL; + } + + nes_debug(NES_DBG_INIT, "Allocating new nesadapter @ %p, size = %u (actual size = %u).\n", + nesadapter, (u32)sizeof(struct nes_adapter), adapter_size); + + /* populate the new nesadapter */ + nesadapter->devfn = nesdev->pcidev->devfn; + nesadapter->bus_number = nesdev->pcidev->bus->number; + nesadapter->ref_count = 1; + nesadapter->timer_int_req = 0xffff0000; + nesadapter->OneG_Mode = OneG_Mode; + nesadapter->doorbell_start = nesdev->doorbell_region; + + /* nesadapter->tick_delta = clk_divisor; */ + nesadapter->hw_rev = hw_rev; + nesadapter->port_count = port_count; + + 
nesadapter->max_qp = max_qp; + nesadapter->hte_index_mask = hte_index_mask; + nesadapter->max_irrq = max_irrq; + nesadapter->max_mr = max_mr; + nesadapter->max_256pbl = max_256pbl - 1; + nesadapter->max_4kpbl = max_4kpbl - 1; + nesadapter->max_cq = max_cq; + nesadapter->free_256pbl = max_256pbl - 1; + nesadapter->free_4kpbl = max_4kpbl - 1; + nesadapter->max_pd = num_pds; + nesadapter->arp_table_size = arp_table_size; + + nesadapter->et_pkt_rate_low = NES_TIMER_ENABLE_LIMIT; + if (nes_drv_opt & NES_DRV_OPT_DISABLE_INT_MOD) { + nesadapter->et_use_adaptive_rx_coalesce = 0; + nesadapter->timer_int_limit = NES_TIMER_INT_LIMIT; + nesadapter->et_rx_coalesce_usecs_irq = interrupt_mod_interval; + } else { + nesadapter->et_use_adaptive_rx_coalesce = 1; + nesadapter->timer_int_limit = NES_TIMER_INT_LIMIT_DYNAMIC; + nesadapter->et_rx_coalesce_usecs_irq = 0; + printk(PFX "%s: Using Adaptive Interrupt Moderation\n", __FUNCTION__); + } + /* Setup and enable the periodic timer */ + if (nesadapter->et_rx_coalesce_usecs_irq) + nes_write32(nesdev->regs+NES_PERIODIC_CONTROL, 0x80000000 | + ((u32)(nesadapter->et_rx_coalesce_usecs_irq * 8))); + else + nes_write32(nesdev->regs+NES_PERIODIC_CONTROL, 0x00000000); + + nesadapter->base_pd = 1; + + nesadapter->device_cap_flags = + IB_DEVICE_ZERO_STAG | IB_DEVICE_SEND_W_INV | IB_DEVICE_MEM_WINDOW; + + nesadapter->allocated_qps = (unsigned long *)&(((unsigned char *)nesadapter) + [(sizeof(struct nes_adapter)+(sizeof(unsigned long)-1))&(~(sizeof(unsigned long)-1))]); + nesadapter->allocated_cqs = &nesadapter->allocated_qps[BITS_TO_LONGS(max_qp)]; + nesadapter->allocated_mrs = &nesadapter->allocated_cqs[BITS_TO_LONGS(max_cq)]; + nesadapter->allocated_pds = &nesadapter->allocated_mrs[BITS_TO_LONGS(max_mr)]; + nesadapter->allocated_arps = &nesadapter->allocated_pds[BITS_TO_LONGS(num_pds)]; + nesadapter->qp_table = (struct nes_qp **)(&nesadapter->allocated_arps[BITS_TO_LONGS(arp_table_size)]); + + + /* mark the usual suspect QPs and CQs as in use 
*/ + for (u32temp = 0; u32temp < NES_FIRST_QPN; u32temp++) { + set_bit(u32temp, nesadapter->allocated_qps); + set_bit(u32temp, nesadapter->allocated_cqs); + } + + for (u32temp = 0; u32temp < 20; u32temp++) + set_bit(u32temp, nesadapter->allocated_pds); + u32temp = nes_read_indexed(nesdev, NES_IDX_QP_MAX_CFG_SIZES); + + max_rq_wrs = ((u32temp >> 8) & 3); + switch (max_rq_wrs) { + case 0: + max_rq_wrs = 4; + break; + case 1: + max_rq_wrs = 16; + break; + case 2: + max_rq_wrs = 32; + break; + case 3: + max_rq_wrs = 512; + break; + } + + max_sq_wrs = (u32temp & 3); + switch (max_sq_wrs) { + case 0: + max_sq_wrs = 4; + break; + case 1: + max_sq_wrs = 16; + break; + case 2: + max_sq_wrs = 32; + break; + case 3: + max_sq_wrs = 512; + break; + } + nesadapter->max_qp_wr = min(max_rq_wrs, max_sq_wrs); + nesadapter->max_irrq_wr = (u32temp >> 16) & 3; + + nesadapter->max_sge = 4; + nesadapter->max_cqe = 32767; + + if (nes_read_eeprom_values(nesdev, nesadapter)) { + printk(KERN_ERR PFX "Unable to read EEPROM data.\n"); + kfree(nesadapter); + return NULL; + } + + u32temp = nes_read_indexed(nesdev, NES_IDX_TCP_TIMER_CONFIG); + nes_write_indexed(nesdev, NES_IDX_TCP_TIMER_CONFIG, + (u32temp & 0xff000000) | (nesadapter->tcp_timer_core_clk_divisor & 0x00ffffff)); + + /* setup port configuration */ + if (nesadapter->port_count == 1) { + u32temp = 0x00000000; + if (nes_drv_opt & NES_DRV_OPT_DUAL_LOGICAL_PORT) + nes_write_indexed(nesdev, NES_IDX_TX_POOL_SIZE, 0x00000002); + else + nes_write_indexed(nesdev, NES_IDX_TX_POOL_SIZE, 0x00000003); + } else { + if (nesadapter->port_count == 2) + u32temp = 0x00000044; + else + u32temp = 0x000000e4; + nes_write_indexed(nesdev, NES_IDX_TX_POOL_SIZE, 0x00000003); + } + + nes_write_indexed(nesdev, NES_IDX_NIC_LOGPORT_TO_PHYPORT, u32temp); + nes_debug(NES_DBG_INIT, "Probe time, LOG2PHY=%u\n", + nes_read_indexed(nesdev, NES_IDX_NIC_LOGPORT_TO_PHYPORT)); + + spin_lock_init(&nesadapter->resource_lock); + spin_lock_init(&nesadapter->phy_lock); + 
spin_lock_init(&nesadapter->pbl_lock); + spin_lock_init(&nesadapter->periodic_timer_lock); + + INIT_LIST_HEAD(&nesadapter->nesvnic_list[0]); + INIT_LIST_HEAD(&nesadapter->nesvnic_list[1]); + INIT_LIST_HEAD(&nesadapter->nesvnic_list[2]); + INIT_LIST_HEAD(&nesadapter->nesvnic_list[3]); + + if ((!nesadapter->OneG_Mode) && (nesadapter->port_count == 2)) { + u32 pcs_control_status0, pcs_control_status1; + u32 reset_value; + u32 i = 0; + u32 int_cnt = 0; + u32 ext_cnt = 0; + unsigned long flags; + u32 j = 0; + + pcs_control_status0 = nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0); + pcs_control_status1 = nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200); + + for (i = 0; i < NES_MAX_LINK_CHECK; i++) { + pcs_control_status0 = nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0); + pcs_control_status1 = nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200); + if ((0x0F000100 == (pcs_control_status0 & 0x0F000100)) + || (0x0F000100 == (pcs_control_status1 & 0x0F000100))) + int_cnt++; + msleep(1); + } + if (int_cnt > 1) { + spin_lock_irqsave(&nesadapter->phy_lock, flags); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x0000F088); + mh_detected++; + reset_value = nes_read32(nesdev->regs+NES_SOFTWARE_RESET); + reset_value |= 0x0000003d; + nes_write32(nesdev->regs+NES_SOFTWARE_RESET, reset_value); + + while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET) + & 0x00000040) != 0x00000040) && (j++ < 5000)); + spin_unlock_irqrestore(&nesadapter->phy_lock, flags); + + pcs_control_status0 = nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0); + pcs_control_status1 = nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200); + + for (i = 0; i < NES_MAX_LINK_CHECK; i++) { + pcs_control_status0 = nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0); + pcs_control_status1 = nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200); + if ((0x0F000100 == (pcs_control_status0 & 0x0F000100)) + 
|| (0x0F000100 == (pcs_control_status1 & 0x0F000100))) { + if (++ext_cnt > int_cnt) { + spin_lock_irqsave(&nesadapter->phy_lock, flags); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, + 0x0000F0C8); + mh_detected++; + reset_value = nes_read32(nesdev->regs+NES_SOFTWARE_RESET); + reset_value |= 0x0000003d; + nes_write32(nesdev->regs+NES_SOFTWARE_RESET, reset_value); + + while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET) + & 0x00000040) != 0x00000040) && (j++ < 5000)); + spin_unlock_irqrestore(&nesadapter->phy_lock, flags); + break; + } + } + msleep(1); + } + } + } + + if (nesadapter->hw_rev == NE020_REV) { + init_timer(&nesadapter->mh_timer); + nesadapter->mh_timer.function = nes_mh_fix; + nesadapter->mh_timer.expires = jiffies + (HZ/5); /* 1 second */ + nesadapter->mh_timer.data = (unsigned long)nesdev; + add_timer(&nesadapter->mh_timer); + } else { + nes_write32(nesdev->regs+NES_INTF_INT_STAT, 0x0f000000); + } + + init_timer(&nesadapter->lc_timer); + nesadapter->lc_timer.function = nes_clc; + nesadapter->lc_timer.expires = jiffies + 3600 * HZ; /* 1 hour */ + nesadapter->lc_timer.data = (unsigned long)nesdev; + add_timer(&nesadapter->lc_timer); + + list_add_tail(&nesadapter->list, &nes_adapter_list); + + for (func_index = 0; func_index < 8; func_index++) { + pci_bus_read_config_word(nesdev->pcidev->bus, + PCI_DEVFN(PCI_SLOT(nesdev->pcidev->devfn), + func_index), 0, &vendor_id); + if (vendor_id == 0xffff) + break; + } + nes_debug(NES_DBG_INIT, "%s %d functions found for %s.\n", __FUNCTION__, + func_index, pci_name(nesdev->pcidev)); + nesadapter->adapter_fcn_count = func_index; + + return nesadapter; +} + + +/** + * nes_reset_adapter_ne020 + */ +unsigned int nes_reset_adapter_ne020(struct nes_device *nesdev, u8 *OneG_Mode) +{ + u32 port_count; + u32 u32temp; + u32 i; + + u32temp = nes_read32(nesdev->regs+NES_SOFTWARE_RESET); + port_count = ((u32temp & 0x00000300) >> 8) + 1; + /* TODO: assuming that both SERDES are set the same for now */ + 
*OneG_Mode = (u32temp & 0x00003c00) ? 0 : 1; + nes_debug(NES_DBG_INIT, "Initial Software Reset = 0x%08X, port_count=%u\n", + u32temp, port_count); + if (*OneG_Mode) + nes_debug(NES_DBG_INIT, "Running in 1G mode.\n"); + u32temp &= 0xff00ffc0; + switch (port_count) { + case 1: + u32temp |= 0x00ee0000; + break; + case 2: + u32temp |= 0x00cc0000; + break; + case 4: + u32temp |= 0x00000000; + break; + default: + return 0; + break; + } + + /* check and do full reset if needed */ + if (nes_read_indexed(nesdev, NES_IDX_QP_CONTROL+(PCI_FUNC(nesdev->pcidev->devfn)*8))) { + nes_debug(NES_DBG_INIT, "Issuing Full Soft reset = 0x%08X\n", u32temp | 0xd); + nes_write32(nesdev->regs+NES_SOFTWARE_RESET, u32temp | 0xd); + + i = 0; + while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET) & 0x00000040) == 0) && i++ < 10000) + mdelay(1); + if (i >= 10000) { + nes_debug(NES_DBG_INIT, "Did not see full soft reset done.\n"); + return 0; + } + } + + /* port reset */ + switch (port_count) { + case 1: + u32temp |= 0x00ee0010; + break; + case 2: + u32temp |= 0x00cc0030; + break; + case 4: + u32temp |= 0x00000030; + break; + } + + nes_debug(NES_DBG_INIT, "Issuing Port Soft reset = 0x%08X\n", u32temp | 0xd); + nes_write32(nesdev->regs+NES_SOFTWARE_RESET, u32temp | 0xd); + + i = 0; + while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET) & 0x00000040) == 0) && i++ < 10000) + mdelay(1); + if (i >= 10000) { + nes_debug(NES_DBG_INIT, "Did not see port soft reset done.\n"); + return 0; + } + + /* serdes 0 */ + i = 0; + while (((u32temp = (nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS0) + & 0x0000000f)) != 0x0000000f) && i++ < 5000) + mdelay(1); + if (i >= 5000) { + nes_debug(NES_DBG_INIT, "Serdes 0 not ready, status=%x\n", u32temp); + return 0; + } + + /* serdes 1 */ + if (port_count > 1) { + i = 0; + while (((u32temp = (nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS1) + & 0x0000000f)) != 0x0000000f) && i++ < 5000) + mdelay(1); + if (i >= 5000) { + nes_debug(NES_DBG_INIT, "Serdes 1 
not ready, status=%x\n", u32temp); + return 0; + } + } + + + + i = 0; + while ((nes_read_indexed(nesdev, NES_IDX_INT_CPU_STATUS) != 0x80) && i++ < 10000) + mdelay(1); + if (i >= 10000) { + printk(KERN_ERR PFX "Internal CPU not ready, status = %02X\n", + nes_read_indexed(nesdev, NES_IDX_INT_CPU_STATUS)); + return 0; + } + + return port_count; +} + + +/** + * nes_init_serdes + */ +int nes_init_serdes(struct nes_device *nesdev, u8 hw_rev, u8 port_count, u8 OneG_Mode) +{ + int i; + u32 u32temp; + + if (hw_rev != NE020_REV) { + /* init serdes 0 */ + + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000FF); + if (!OneG_Mode) + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_HIGHZ_LANE_MODE0, 0x11110000); + if (port_count > 1) { + /* init serdes 1 */ + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL1, 0x000000FF); + if (!OneG_Mode) + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_HIGHZ_LANE_MODE1, 0x11110000); + } + } else { + /* init serdes 0 */ + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0, 0x00000008); + i = 0; + while (((u32temp = (nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS0) + & 0x0000000f)) != 0x0000000f) && i++ < 5000) + mdelay(1); + if (i >= 5000) { + nes_debug(NES_DBG_PHY, "Init: serdes 0 not ready, status=%x\n", u32temp); + return 1; + } + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP0, 0x000bdef7); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_DRIVE0, 0x9ce73000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_MODE0, 0x0ff00000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_SIGDET0, 0x00000000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_BYPASS0, 0x00000000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_LOOPBACK_CONTROL0, 0x00000000); + if (OneG_Mode) + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_CONTROL0, 0xf0182222); + else + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_CONTROL0, 0xf0042222); + + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 
0x000000ff); + if (port_count > 1) { + /* init serdes 1 */ + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x00000048); + i = 0; + while (((u32temp = (nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS1) + & 0x0000000f)) != 0x0000000f) && (i++ < 5000)) + mdelay(1); + if (i >= 5000) { + printk("%s: Init: serdes 1 not ready, status=%x\n", __FUNCTION__, u32temp); + /* return 1; */ + } + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP1, 0x000bdef7); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_DRIVE1, 0x9ce73000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_MODE1, 0x0ff00000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_SIGDET1, 0x00000000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_BYPASS1, 0x00000000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_LOOPBACK_CONTROL1, 0x00000000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_CONTROL1, 0xf0002222); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL1, 0x000000ff); + } + } + return 0; +} + + +/** + * nes_init_csr_ne020 + * Initialize registers for ne020 hardware + */ +void nes_init_csr_ne020(struct nes_device *nesdev, u8 hw_rev, u8 port_count) +{ + u32 u32temp; + + nes_debug(NES_DBG_INIT, "port_count=%d\n", port_count); + + nes_write_indexed(nesdev, 0x000001E4, 0x00000007); + /* nes_write_indexed(nesdev, 0x000001E8, 0x000208C4); */ + nes_write_indexed(nesdev, 0x000001E8, 0x00020874); + nes_write_indexed(nesdev, 0x000001D8, 0x00048002); + /* nes_write_indexed(nesdev, 0x000001D8, 0x0004B002); */ + nes_write_indexed(nesdev, 0x000001FC, 0x00050005); + nes_write_indexed(nesdev, 0x00000600, 0x55555555); + nes_write_indexed(nesdev, 0x00000604, 0x55555555); + + /* TODO: move these MAC register settings to NIC bringup */ + nes_write_indexed(nesdev, 0x00002000, 0x00000001); + nes_write_indexed(nesdev, 0x00002004, 0x00000001); + nes_write_indexed(nesdev, 0x00002008, 0x0000FFFF); + nes_write_indexed(nesdev, 0x0000200C, 0x00000001); + nes_write_indexed(nesdev, 
0x00002010, 0x000003c1); + nes_write_indexed(nesdev, 0x0000201C, 0x75345678); + if (port_count > 1) { + nes_write_indexed(nesdev, 0x00002200, 0x00000001); + nes_write_indexed(nesdev, 0x00002204, 0x00000001); + nes_write_indexed(nesdev, 0x00002208, 0x0000FFFF); + nes_write_indexed(nesdev, 0x0000220C, 0x00000001); + nes_write_indexed(nesdev, 0x00002210, 0x000003c1); + nes_write_indexed(nesdev, 0x0000221C, 0x75345678); + nes_write_indexed(nesdev, 0x00000908, 0x20000001); + } + if (port_count > 2) { + nes_write_indexed(nesdev, 0x00002400, 0x00000001); + nes_write_indexed(nesdev, 0x00002404, 0x00000001); + nes_write_indexed(nesdev, 0x00002408, 0x0000FFFF); + nes_write_indexed(nesdev, 0x0000240C, 0x00000001); + nes_write_indexed(nesdev, 0x00002410, 0x000003c1); + nes_write_indexed(nesdev, 0x0000241C, 0x75345678); + nes_write_indexed(nesdev, 0x00000910, 0x20000001); + + nes_write_indexed(nesdev, 0x00002600, 0x00000001); + nes_write_indexed(nesdev, 0x00002604, 0x00000001); + nes_write_indexed(nesdev, 0x00002608, 0x0000FFFF); + nes_write_indexed(nesdev, 0x0000260C, 0x00000001); + nes_write_indexed(nesdev, 0x00002610, 0x000003c1); + nes_write_indexed(nesdev, 0x0000261C, 0x75345678); + nes_write_indexed(nesdev, 0x00000918, 0x20000001); + } + + nes_write_indexed(nesdev, 0x00005000, 0x00018000); + /* nes_write_indexed(nesdev, 0x00005000, 0x00010000); */ + nes_write_indexed(nesdev, 0x00005004, 0x00020001); + nes_write_indexed(nesdev, 0x00005008, 0x1F1F1F1F); + nes_write_indexed(nesdev, 0x00005010, 0x1F1F1F1F); + nes_write_indexed(nesdev, 0x00005018, 0x1F1F1F1F); + nes_write_indexed(nesdev, 0x00005020, 0x1F1F1F1F); + nes_write_indexed(nesdev, 0x00006090, 0xFFFFFFFF); + + /* TODO: move this to code, get from EEPROM */ + nes_write_indexed(nesdev, 0x00000900, 0x20000001); + nes_write_indexed(nesdev, 0x000060C0, 0x0000028e); + nes_write_indexed(nesdev, 0x000060C8, 0x00000020); + // + nes_write_indexed(nesdev, 0x000001EC, 0x7b2625a0); + /* nes_write_indexed(nesdev, 0x000001EC, 
0x5f2625a0); */ + + if (hw_rev != NE020_REV) { + u32temp = nes_read_indexed(nesdev, 0x000008e8); + u32temp |= 0x80000000; + nes_write_indexed(nesdev, 0x000008e8, u32temp); + u32temp = nes_read_indexed(nesdev, 0x000021f8); + u32temp &= 0x7fffffff; + u32temp |= 0x7fff0010; + nes_write_indexed(nesdev, 0x000021f8, u32temp); + } +} + + +/** + * nes_destroy_adapter - destroy the adapter structure + */ +void nes_destroy_adapter(struct nes_adapter *nesadapter) +{ + struct nes_adapter *tmp_adapter; + + list_for_each_entry(tmp_adapter, &nes_adapter_list, list) { + nes_debug(NES_DBG_SHUTDOWN, "Nes Adapter list entry = 0x%p.\n", + tmp_adapter); + } + + nesadapter->ref_count--; + if (!nesadapter->ref_count) { + if (nesadapter->hw_rev == NE020_REV) { + del_timer(&nesadapter->mh_timer); + } + del_timer(&nesadapter->lc_timer); + + list_del(&nesadapter->list); + kfree(nesadapter); + } +} + + +/** + * nes_init_cqp + */ +int nes_init_cqp(struct nes_device *nesdev) +{ + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_hw_cqp_qp_context *cqp_qp_context; + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_hw_ceq *ceq; + struct nes_hw_ceq *nic_ceq; + struct nes_hw_aeq *aeq; + void *vmem; + dma_addr_t pmem; + u32 count=0; + u32 cqp_head; + u64 u64temp; + u32 u32temp; + + /* allocate CQP memory */ + /* Need to add max_cq to the aeq size once cq overflow checking is added back */ + /* SQ is 512 byte aligned, others are 256 byte aligned */ + nesdev->cqp_mem_size = 512 + + (sizeof(struct nes_hw_cqp_wqe) * NES_CQP_SQ_SIZE) + + (sizeof(struct nes_hw_cqe) * NES_CCQ_SIZE) + + max(((u32)sizeof(struct nes_hw_ceqe) * NES_CCEQ_SIZE), (u32)256) + + max(((u32)sizeof(struct nes_hw_ceqe) * NES_NIC_CEQ_SIZE), (u32)256) + + (sizeof(struct nes_hw_aeqe) * nesadapter->max_qp) + + sizeof(struct nes_hw_cqp_qp_context); + + nesdev->cqp_vbase = pci_alloc_consistent(nesdev->pcidev, nesdev->cqp_mem_size, + &nesdev->cqp_pbase); + if (!nesdev->cqp_vbase) { + nes_debug(NES_DBG_INIT, "Unable to allocate 
memory for host descriptor rings\n"); + return -ENOMEM; + } + memset(nesdev->cqp_vbase, 0, nesdev->cqp_mem_size); + + /* Allocate a twice the number of CQP requests as the SQ size */ + nesdev->nes_cqp_requests = kzalloc(sizeof(struct nes_cqp_request) * + 2 * NES_CQP_SQ_SIZE, GFP_KERNEL); + if (nesdev->nes_cqp_requests == NULL) { + nes_debug(NES_DBG_INIT, "Unable to allocate memory CQP request entries.\n"); + pci_free_consistent(nesdev->pcidev, nesdev->cqp_mem_size, nesdev->cqp.sq_vbase, + nesdev->cqp.sq_pbase); + return -ENOMEM; + } + + nes_debug(NES_DBG_INIT, "Allocated CQP structures at %p (phys = %016lX), size = %u.\n", + nesdev->cqp_vbase, (unsigned long)nesdev->cqp_pbase, nesdev->cqp_mem_size); + + spin_lock_init(&nesdev->cqp.lock); + init_waitqueue_head(&nesdev->cqp.waitq); + + /* Setup Various Structures */ + vmem = (void *)(((unsigned long)nesdev->cqp_vbase + (512 - 1)) & + ~(unsigned long)(512 - 1)); + pmem = (dma_addr_t)(((unsigned long long)nesdev->cqp_pbase + (512 - 1)) & + ~(unsigned long long)(512 - 1)); + + nesdev->cqp.sq_vbase = vmem; + nesdev->cqp.sq_pbase = pmem; + nesdev->cqp.sq_size = NES_CQP_SQ_SIZE; + nesdev->cqp.sq_head = 0; + nesdev->cqp.sq_tail = 0; + nesdev->cqp.qp_id = PCI_FUNC(nesdev->pcidev->devfn); + + vmem += (sizeof(struct nes_hw_cqp_wqe) * nesdev->cqp.sq_size); + pmem += (sizeof(struct nes_hw_cqp_wqe) * nesdev->cqp.sq_size); + + nesdev->ccq.cq_vbase = vmem; + nesdev->ccq.cq_pbase = pmem; + nesdev->ccq.cq_size = NES_CCQ_SIZE; + nesdev->ccq.cq_head = 0; + nesdev->ccq.ce_handler = nes_cqp_ce_handler; + nesdev->ccq.cq_number = PCI_FUNC(nesdev->pcidev->devfn); + + vmem += (sizeof(struct nes_hw_cqe) * nesdev->ccq.cq_size); + pmem += (sizeof(struct nes_hw_cqe) * nesdev->ccq.cq_size); + + nesdev->ceq_index = PCI_FUNC(nesdev->pcidev->devfn); + ceq = &nesadapter->ceq[nesdev->ceq_index]; + ceq->ceq_vbase = vmem; + ceq->ceq_pbase = pmem; + ceq->ceq_size = NES_CCEQ_SIZE; + ceq->ceq_head = 0; + + vmem += max(((u32)sizeof(struct nes_hw_ceqe) * 
ceq->ceq_size), (u32)256); + pmem += max(((u32)sizeof(struct nes_hw_ceqe) * ceq->ceq_size), (u32)256); + + nesdev->nic_ceq_index = PCI_FUNC(nesdev->pcidev->devfn) + 8; + nic_ceq = &nesadapter->ceq[nesdev->nic_ceq_index]; + nic_ceq->ceq_vbase = vmem; + nic_ceq->ceq_pbase = pmem; + nic_ceq->ceq_size = NES_NIC_CEQ_SIZE; + nic_ceq->ceq_head = 0; + + vmem += max(((u32)sizeof(struct nes_hw_ceqe) * nic_ceq->ceq_size), (u32)256); + pmem += max(((u32)sizeof(struct nes_hw_ceqe) * nic_ceq->ceq_size), (u32)256); + + aeq = &nesadapter->aeq[PCI_FUNC(nesdev->pcidev->devfn)]; + aeq->aeq_vbase = vmem; + aeq->aeq_pbase = pmem; + aeq->aeq_size = nesadapter->max_qp; + aeq->aeq_head = 0; + + /* Setup QP Context */ + vmem += (sizeof(struct nes_hw_aeqe) * aeq->aeq_size); + pmem += (sizeof(struct nes_hw_aeqe) * aeq->aeq_size); + + cqp_qp_context = vmem; + cqp_qp_context->context_words[0] = + cpu_to_le32((PCI_FUNC(nesdev->pcidev->devfn) << 12) + (2 << 10)); + cqp_qp_context->context_words[1] = 0; + cqp_qp_context->context_words[2] = cpu_to_le32((u32)nesdev->cqp.sq_pbase); + cqp_qp_context->context_words[3] = cpu_to_le32(((u64)nesdev->cqp.sq_pbase) >> 32); + + + /* Write the address to Create CQP */ + if ((sizeof(dma_addr_t) > 4)) { + nes_write_indexed(nesdev, + NES_IDX_CREATE_CQP_HIGH + (PCI_FUNC(nesdev->pcidev->devfn) * 8), + ((u64)pmem) >> 32); + } else { + nes_write_indexed(nesdev, + NES_IDX_CREATE_CQP_HIGH + (PCI_FUNC(nesdev->pcidev->devfn) * 8), 0); + } + nes_write_indexed(nesdev, + NES_IDX_CREATE_CQP_LOW + (PCI_FUNC(nesdev->pcidev->devfn) * 8), + (u32)pmem); + + INIT_LIST_HEAD(&nesdev->cqp_avail_reqs); + INIT_LIST_HEAD(&nesdev->cqp_pending_reqs); + + for (count = 0; count < 2*NES_CQP_SQ_SIZE; count++) { + init_waitqueue_head(&nesdev->nes_cqp_requests[count].waitq); + list_add_tail(&nesdev->nes_cqp_requests[count].list, &nesdev->cqp_avail_reqs); + } + + /* Write Create CCQ WQE */ + cqp_head = nesdev->cqp.sq_head++; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + 
nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, + (NES_CQP_CREATE_CQ | NES_CQP_CQ_CEQ_VALID | + NES_CQP_CQ_CHK_OVERFLOW | ((u32)nesdev->ccq.cq_size << 16))); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, + (nesdev->ccq.cq_number | + ((u32)nesdev->ceq_index << 16))); + u64temp = (u64)nesdev->ccq.cq_pbase; + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp); + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = 0; + u64temp = (unsigned long)&nesdev->ccq; + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_LOW_IDX] = + cpu_to_le32((u32)(u64temp >> 1)); + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = + cpu_to_le32(((u32)((u64temp) >> 33)) & 0x7FFFFFFF); + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX] = 0; + + /* Write Create CEQ WQE */ + cqp_head = nesdev->cqp.sq_head++; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, + (NES_CQP_CREATE_CEQ + ((u32)nesdev->ceq_index << 8))); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_CEQ_WQE_ELEMENT_COUNT_IDX, ceq->ceq_size); + u64temp = (u64)ceq->ceq_pbase; + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp); + + /* Write Create AEQ WQE */ + cqp_head = nesdev->cqp.sq_head++; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, + (NES_CQP_CREATE_AEQ + ((u32)PCI_FUNC(nesdev->pcidev->devfn) << 8))); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_AEQ_WQE_ELEMENT_COUNT_IDX, aeq->aeq_size); + u64temp = (u64)aeq->aeq_pbase; + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp); + + /* Write Create NIC CEQ WQE */ + cqp_head = nesdev->cqp.sq_head++; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + 
set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, + (NES_CQP_CREATE_CEQ + ((u32)nesdev->nic_ceq_index << 8))); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_CEQ_WQE_ELEMENT_COUNT_IDX, nic_ceq->ceq_size); + u64temp = (u64)nic_ceq->ceq_pbase; + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp); + + /* Poll until CCQP done */ + count = 0; + do { + if (count++ > 1000) { + printk(KERN_ERR PFX "Error creating CQP\n"); + pci_free_consistent(nesdev->pcidev, nesdev->cqp_mem_size, + nesdev->cqp_vbase, nesdev->cqp_pbase); + return -1; + } + udelay(10); + } while (!(nes_read_indexed(nesdev, + NES_IDX_QP_CONTROL + (PCI_FUNC(nesdev->pcidev->devfn) * 8)) & (1 << 8))); + + nes_debug(NES_DBG_INIT, "CQP Status = 0x%08X\n", nes_read_indexed(nesdev, + NES_IDX_QP_CONTROL+(PCI_FUNC(nesdev->pcidev->devfn)*8))); + + u32temp = 0x04800000; + nes_write32(nesdev->regs+NES_WQE_ALLOC, u32temp | nesdev->cqp.qp_id); + + /* wait for the CCQ, CEQ, and AEQ to get created */ + count = 0; + do { + if (count++ > 1000) { + printk(KERN_ERR PFX "Error creating CCQ, CEQ, and AEQ\n"); + pci_free_consistent(nesdev->pcidev, nesdev->cqp_mem_size, + nesdev->cqp_vbase, nesdev->cqp_pbase); + return -1; + } + udelay(10); + } while (((nes_read_indexed(nesdev, + NES_IDX_QP_CONTROL+(PCI_FUNC(nesdev->pcidev->devfn)*8)) & (15<<8)) != (15<<8))); + + /* dump the QP status value */ + nes_debug(NES_DBG_INIT, "QP Status = 0x%08X\n", nes_read_indexed(nesdev, + NES_IDX_QP_CONTROL+(PCI_FUNC(nesdev->pcidev->devfn)*8))); + + nesdev->cqp.sq_tail++; + + return 0; +} + + +/** + * nes_destroy_cqp + */ +int nes_destroy_cqp(struct nes_device *nesdev) +{ + struct nes_hw_cqp_wqe *cqp_wqe; + u32 count = 0; + u32 cqp_head; + unsigned long flags; + + do { + if (count++ > 1000) + break; + udelay(10); + } while (!(nesdev->cqp.sq_head == nesdev->cqp.sq_tail)); + + /* Reset CCQ */ + nes_write32(nesdev->regs+NES_CQE_ALLOC, NES_CQE_ALLOC_RESET | + nesdev->ccq.cq_number); + + /* Disable device 
interrupts */ + nes_write32(nesdev->regs+NES_INT_MASK, 0x7fffffff); + + spin_lock_irqsave(&nesdev->cqp.lock, flags); + + /* Destroy the AEQ */ + cqp_head = nesdev->cqp.sq_head++; + nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_DESTROY_AEQ | + ((u32)PCI_FUNC(nesdev->pcidev->devfn) << 8)); + cqp_wqe->wqe_words[NES_CQP_WQE_COMP_CTX_HIGH_IDX] = 0; + + /* Destroy the NIC CEQ */ + cqp_head = nesdev->cqp.sq_head++; + nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_DESTROY_CEQ | + ((u32)nesdev->nic_ceq_index << 8)); + + /* Destroy the CEQ */ + cqp_head = nesdev->cqp.sq_head++; + nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_DESTROY_CEQ | + (nesdev->ceq_index << 8)); + + /* Destroy the CCQ */ + cqp_head = nesdev->cqp.sq_head++; + nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_DESTROY_CQ); + cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(nesdev->ccq.cq_number | + ((u32)nesdev->ceq_index << 16)); + + /* Destroy CQP */ + cqp_head = nesdev->cqp.sq_head++; + nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_DESTROY_QP | + NES_CQP_QP_TYPE_CQP); + cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(nesdev->cqp.qp_id); + + barrier(); + /* Ring doorbell (5 WQEs) */ + nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x05800000 | nesdev->cqp.qp_id); + + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + + /* wait for the CCQ, CEQ, and AEQ to get destroyed */ + count = 0; + do { + if (count++ > 1000) { + printk(KERN_ERR PFX "Function%d: Error 
destroying CCQ, CEQ, and AEQ\n", + PCI_FUNC(nesdev->pcidev->devfn)); + break; + } + udelay(10); + } while (((nes_read_indexed(nesdev, + NES_IDX_QP_CONTROL + (PCI_FUNC(nesdev->pcidev->devfn)*8)) & (15 << 8)) != 0)); + + /* dump the QP status value */ + nes_debug(NES_DBG_SHUTDOWN, "Function%d: QP Status = 0x%08X\n", + PCI_FUNC(nesdev->pcidev->devfn), + nes_read_indexed(nesdev, + NES_IDX_QP_CONTROL+(PCI_FUNC(nesdev->pcidev->devfn)*8))); + + kfree(nesdev->nes_cqp_requests); + + /* Free the control structures */ + pci_free_consistent(nesdev->pcidev, nesdev->cqp_mem_size, nesdev->cqp.sq_vbase, + nesdev->cqp.sq_pbase); + + return 0; +} + + +/** + * nes_init_phy + */ +int nes_init_phy(struct nes_device *nesdev) +{ + struct nes_adapter *nesadapter = nesdev->nesadapter; + u32 counter = 0; + u32 mac_index = nesdev->mac_index; + u32 tx_config; + u16 phy_data; + + if (nesadapter->OneG_Mode) { + nes_debug(NES_DBG_PHY, "1G PHY, mac_index = %d.\n", mac_index); + if (nesadapter->phy_type[mac_index] == NES_PHY_TYPE_1G) { + printk(PFX "%s: Programming mdc config for 1G\n", __FUNCTION__); + tx_config = nes_read_indexed(nesdev, NES_IDX_MAC_TX_CONFIG); + tx_config |= 0x04; + nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONFIG, tx_config); + } + + nes_read_1G_phy_reg(nesdev, 1, nesadapter->phy_index[mac_index], &phy_data); + nes_debug(NES_DBG_PHY, "Phy data from register 1 phy address %u = 0x%X.\n", + nesadapter->phy_index[mac_index], phy_data); + nes_write_1G_phy_reg(nesdev, 23, nesadapter->phy_index[mac_index], 0xb000); + + /* Reset the PHY */ + nes_write_1G_phy_reg(nesdev, 0, nesadapter->phy_index[mac_index], 0x8000); + udelay(100); + counter = 0; + do { + nes_read_1G_phy_reg(nesdev, 0, nesadapter->phy_index[mac_index], &phy_data); + nes_debug(NES_DBG_PHY, "Phy data from register 0 = 0x%X.\n", phy_data); + if (counter++ > 100) break; + } while (phy_data & 0x8000); + + /* Setting no phy loopback */ + phy_data &= 0xbfff; + phy_data |= 0x1140; + nes_write_1G_phy_reg(nesdev, 0, 
nesadapter->phy_index[mac_index], phy_data); + nes_read_1G_phy_reg(nesdev, 0, nesadapter->phy_index[mac_index], &phy_data); + nes_debug(NES_DBG_PHY, "Phy data from register 0 = 0x%X.\n", phy_data); + + nes_read_1G_phy_reg(nesdev, 0x17, nesadapter->phy_index[mac_index], &phy_data); + nes_debug(NES_DBG_PHY, "Phy data from register 0x17 = 0x%X.\n", phy_data); + + nes_read_1G_phy_reg(nesdev, 0x1e, nesadapter->phy_index[mac_index], &phy_data); + nes_debug(NES_DBG_PHY, "Phy data from register 0x1e = 0x%X.\n", phy_data); + + /* Setting the interrupt mask */ + nes_read_1G_phy_reg(nesdev, 0x19, nesadapter->phy_index[mac_index], &phy_data); + nes_debug(NES_DBG_PHY, "Phy data from register 0x19 = 0x%X.\n", phy_data); + nes_write_1G_phy_reg(nesdev, 0x19, nesadapter->phy_index[mac_index], 0xffee); + + nes_read_1G_phy_reg(nesdev, 0x19, nesadapter->phy_index[mac_index], &phy_data); + nes_debug(NES_DBG_PHY, "Phy data from register 0x19 = 0x%X.\n", phy_data); + + /* turning on flow control */ + nes_read_1G_phy_reg(nesdev, 4, nesadapter->phy_index[mac_index], &phy_data); + nes_debug(NES_DBG_PHY, "Phy data from register 0x4 = 0x%X.\n", phy_data); + nes_write_1G_phy_reg(nesdev, 4, nesadapter->phy_index[mac_index], + (phy_data & ~(0x03E0)) | 0xc00); + /* nes_write_1G_phy_reg(nesdev, 4, nesadapter->phy_index[mac_index], + phy_data | 0xc00); */ + nes_read_1G_phy_reg(nesdev, 4, nesadapter->phy_index[mac_index], &phy_data); + nes_debug(NES_DBG_PHY, "Phy data from register 0x4 = 0x%X.\n", phy_data); + + nes_read_1G_phy_reg(nesdev, 9, nesadapter->phy_index[mac_index], &phy_data); + nes_debug(NES_DBG_PHY, "Phy data from register 0x9 = 0x%X.\n", phy_data); + /* Clear Half duplex */ + nes_write_1G_phy_reg(nesdev, 9, nesadapter->phy_index[mac_index], + phy_data & ~(0x0100)); + nes_read_1G_phy_reg(nesdev, 9, nesadapter->phy_index[mac_index], &phy_data); + nes_debug(NES_DBG_PHY, "Phy data from register 0x9 = 0x%X.\n", phy_data); + + nes_read_1G_phy_reg(nesdev, 0, nesadapter->phy_index[mac_index], 
&phy_data); + nes_write_1G_phy_reg(nesdev, 0, nesadapter->phy_index[mac_index], phy_data | 0x0300); + } else { + if (nesadapter->phy_type[mac_index] == NES_PHY_TYPE_IRIS) { + /* setup 10G MDIO operation */ + tx_config = nes_read_indexed(nesdev, NES_IDX_MAC_TX_CONFIG); + tx_config |= 0x14; + nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONFIG, tx_config); + } + } + return 0; +} + + +/** + * nes_replenish_nic_rq + */ +static void nes_replenish_nic_rq(struct nes_vnic *nesvnic) +{ + unsigned long flags; + dma_addr_t bus_address; + struct sk_buff *skb; + struct nes_hw_nic_rq_wqe *nic_rqe; + struct nes_hw_nic *nesnic; + struct nes_device *nesdev; + u32 rx_wqes_posted = 0; + + nesnic = &nesvnic->nic; + nesdev = nesvnic->nesdev; + spin_lock_irqsave(&nesnic->rq_lock, flags); + if (nesnic->replenishing_rq !=0) { + if (((nesnic->rq_size-1) == atomic_read(&nesvnic->rx_skbs_needed)) && + (atomic_read(&nesvnic->rx_skb_timer_running) == 0)) { + atomic_set(&nesvnic->rx_skb_timer_running, 1); + spin_unlock_irqrestore(&nesnic->rq_lock, flags); + nesvnic->rq_wqes_timer.expires = jiffies + (HZ/2); /* 1/2 second */ + add_timer(&nesvnic->rq_wqes_timer); + } else + spin_unlock_irqrestore(&nesnic->rq_lock, flags); + return; + } + nesnic->replenishing_rq = 1; + spin_unlock_irqrestore(&nesnic->rq_lock, flags); + do { + skb = dev_alloc_skb(nesvnic->max_frame_size); + if (skb) { + skb->dev = nesvnic->netdev; + + bus_address = pci_map_single(nesdev->pcidev, + skb->data, nesvnic->max_frame_size, PCI_DMA_FROMDEVICE); + + nic_rqe = &nesnic->rq_vbase[nesvnic->nic.rq_head]; + nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_1_0_IDX] = + cpu_to_le32(nesvnic->max_frame_size); + nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_3_2_IDX] = 0; + nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX] = + cpu_to_le32((u32)bus_address); + nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX] = + cpu_to_le32((u32)((u64)bus_address >> 32)); + nesnic->rx_skb[nesnic->rq_head] = skb; + nesnic->rq_head++; + nesnic->rq_head &= nesnic->rq_size 
- 1; + atomic_dec(&nesvnic->rx_skbs_needed); + barrier(); + if (++rx_wqes_posted == 255) { + nes_write32(nesdev->regs+NES_WQE_ALLOC, (rx_wqes_posted << 24) | nesnic->qp_id); + rx_wqes_posted = 0; + } + } else { + spin_lock_irqsave(&nesnic->rq_lock, flags); + if (((nesnic->rq_size-1) == atomic_read(&nesvnic->rx_skbs_needed)) && + (atomic_read(&nesvnic->rx_skb_timer_running) == 0)) { + atomic_set(&nesvnic->rx_skb_timer_running, 1); + spin_unlock_irqrestore(&nesnic->rq_lock, flags); + nesvnic->rq_wqes_timer.expires = jiffies + (HZ/2); /* 1/2 second */ + add_timer(&nesvnic->rq_wqes_timer); + } else + spin_unlock_irqrestore(&nesnic->rq_lock, flags); + break; + } + } while (atomic_read(&nesvnic->rx_skbs_needed)); + barrier(); + if (rx_wqes_posted) + nes_write32(nesdev->regs+NES_WQE_ALLOC, (rx_wqes_posted << 24) | nesnic->qp_id); + nesnic->replenishing_rq = 0; +} + + +/** + * nes_rq_wqes_timeout + */ +static void nes_rq_wqes_timeout(unsigned long parm) +{ + struct nes_vnic *nesvnic = (struct nes_vnic *)parm; + printk("%s: Timer fired.\n", __FUNCTION__); + atomic_set(&nesvnic->rx_skb_timer_running, 0); + if (atomic_read(&nesvnic->rx_skbs_needed)) + nes_replenish_nic_rq(nesvnic); +} + + +/** + * nes_init_nic_qp + */ +int nes_init_nic_qp(struct nes_device *nesdev, struct net_device *netdev) +{ + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_hw_nic_sq_wqe *nic_sqe; + struct nes_hw_nic_qp_context *nic_context; + struct sk_buff *skb; + struct nes_hw_nic_rq_wqe *nic_rqe; + struct nes_vnic *nesvnic = netdev_priv(netdev); + unsigned long flags; + void *vmem; + dma_addr_t pmem; + u64 u64temp; + int ret; + u32 cqp_head; + u32 counter; + u32 wqe_count; + u8 jumbomode=0; + + /* Allocate fragment, SQ, RQ, and CQ; Reuse CEQ based on the PCI function */ + nesvnic->nic_mem_size = 256 + + (NES_NIC_WQ_SIZE * sizeof(struct nes_first_frag)) + + (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_sq_wqe)) + + (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_rq_wqe)) + + (NES_NIC_WQ_SIZE * 2 * sizeof(struct 
nes_hw_nic_cqe)) + + sizeof(struct nes_hw_nic_qp_context); + + nesvnic->nic_vbase = pci_alloc_consistent(nesdev->pcidev, nesvnic->nic_mem_size, + &nesvnic->nic_pbase); + if (!nesvnic->nic_vbase) { + nes_debug(NES_DBG_INIT, "Unable to allocate memory for NIC host descriptor rings\n"); + return -ENOMEM; + } + memset(nesvnic->nic_vbase, 0, nesvnic->nic_mem_size); + nes_debug(NES_DBG_INIT, "Allocated NIC QP structures at %p (phys = %016lX), size = %u.\n", + nesvnic->nic_vbase, (unsigned long)nesvnic->nic_pbase, nesvnic->nic_mem_size); + + vmem = (void *)(((unsigned long)nesvnic->nic_vbase + (256 - 1)) & + ~(unsigned long)(256 - 1)); + pmem = (dma_addr_t)(((unsigned long long)nesvnic->nic_pbase + (256 - 1)) & + ~(unsigned long long)(256 - 1)); + + /* Setup the first Fragment buffers */ + nesvnic->nic.first_frag_vbase = vmem; + + for (counter = 0; counter < NES_NIC_WQ_SIZE; counter++) { + nesvnic->nic.frag_paddr[counter] = pmem; + pmem += sizeof(struct nes_first_frag); + } + + /* setup the SQ */ + vmem += (NES_NIC_WQ_SIZE * sizeof(struct nes_first_frag)); + + nesvnic->nic.sq_vbase = (void *)vmem; + nesvnic->nic.sq_pbase = pmem; + nesvnic->nic.sq_head = 0; + nesvnic->nic.sq_tail = 0; + nesvnic->nic.sq_size = NES_NIC_WQ_SIZE; + for (counter = 0; counter < NES_NIC_WQ_SIZE; counter++) { + nic_sqe = &nesvnic->nic.sq_vbase[counter]; + nic_sqe->wqe_words[NES_NIC_SQ_WQE_MISC_IDX] = + cpu_to_le32(NES_NIC_SQ_WQE_DISABLE_CHKSUM | + NES_NIC_SQ_WQE_COMPLETION); + nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX] = + cpu_to_le32((u32)NES_FIRST_FRAG_SIZE << 16); + nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_LOW_IDX] = + cpu_to_le32((u32)nesvnic->nic.frag_paddr[counter]); + nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_HIGH_IDX] = + cpu_to_le32((u32)((u64)nesvnic->nic.frag_paddr[counter] >> 32)); + } + + nesvnic->get_cqp_request = nes_get_cqp_request; + nesvnic->post_cqp_request = nes_post_cqp_request; + nesvnic->mcrq_mcast_filter = NULL; + + spin_lock_init(&nesvnic->nic.sq_lock); + 
spin_lock_init(&nesvnic->nic.rq_lock); + + /* setup the RQ */ + vmem += (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_sq_wqe)); + pmem += (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_sq_wqe)); + + + nesvnic->nic.rq_vbase = vmem; + nesvnic->nic.rq_pbase = pmem; + nesvnic->nic.rq_head = 0; + nesvnic->nic.rq_tail = 0; + nesvnic->nic.rq_size = NES_NIC_WQ_SIZE; + + /* setup the CQ */ + vmem += (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_rq_wqe)); + pmem += (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_rq_wqe)); + + if (nesdev->nesadapter->netdev_count > 2) + nesvnic->mcrq_qp_id = nesvnic->nic_index + 32; + else + nesvnic->mcrq_qp_id = nesvnic->nic.qp_id + 4; + + nesvnic->nic_cq.cq_vbase = vmem; + nesvnic->nic_cq.cq_pbase = pmem; + nesvnic->nic_cq.cq_head = 0; + nesvnic->nic_cq.cq_size = NES_NIC_WQ_SIZE * 2; + + nesvnic->nic_cq.ce_handler = nes_nic_napi_ce_handler; + + /* Send CreateCQ request to CQP */ + spin_lock_irqsave(&nesdev->cqp.lock, flags); + cqp_head = nesdev->cqp.sq_head; + + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32( + NES_CQP_CREATE_CQ | NES_CQP_CQ_CEQ_VALID | + ((u32)nesvnic->nic_cq.cq_size << 16)); + cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32( + nesvnic->nic_cq.cq_number | ((u32)nesdev->nic_ceq_index << 16)); + u64temp = (u64)nesvnic->nic_cq.cq_pbase; + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp); + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = 0; + u64temp = (unsigned long)&nesvnic->nic_cq; + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_LOW_IDX] = cpu_to_le32((u32)(u64temp >> 1)); + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = + cpu_to_le32(((u32)((u64temp) >> 33)) & 0x7FFFFFFF); + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX] = 0; + if (++cqp_head >= nesdev->cqp.sq_size) + cqp_head = 0; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + + /* 
Send CreateQP request to CQP */ + nic_context = (void *)(&nesvnic->nic_cq.cq_vbase[nesvnic->nic_cq.cq_size]); + nic_context->context_words[NES_NIC_CTX_MISC_IDX] = + cpu_to_le32((u32)NES_NIC_CTX_SIZE | + ((u32)PCI_FUNC(nesdev->pcidev->devfn) << 12)); + nes_debug(NES_DBG_INIT, "RX_WINDOW_BUFFER_PAGE_TABLE_SIZE = 0x%08X, RX_WINDOW_BUFFER_SIZE = 0x%08X\n", + nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_PAGE_TABLE_SIZE), + nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_SIZE)); + if (nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_SIZE) != 0) { + nic_context->context_words[NES_NIC_CTX_MISC_IDX] |= cpu_to_le32(NES_NIC_BACK_STORE); + } + + u64temp = (u64)nesvnic->nic.sq_pbase; + nic_context->context_words[NES_NIC_CTX_SQ_LOW_IDX] = cpu_to_le32((u32)u64temp); + nic_context->context_words[NES_NIC_CTX_SQ_HIGH_IDX] = cpu_to_le32((u32)(u64temp >> 32)); + u64temp = (u64)nesvnic->nic.rq_pbase; + nic_context->context_words[NES_NIC_CTX_RQ_LOW_IDX] = cpu_to_le32((u32)u64temp); + nic_context->context_words[NES_NIC_CTX_RQ_HIGH_IDX] = cpu_to_le32((u32)(u64temp >> 32)); + + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_CREATE_QP | + NES_CQP_QP_TYPE_NIC); + cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(nesvnic->nic.qp_id); + u64temp = (u64)nesvnic->nic_cq.cq_pbase + + (nesvnic->nic_cq.cq_size * sizeof(struct nes_hw_nic_cqe)); + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp); + + if (++cqp_head >= nesdev->cqp.sq_size) + cqp_head = 0; + nesdev->cqp.sq_head = cqp_head; + + barrier(); + + /* Ring doorbell (2 WQEs) */ + nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x02800000 | nesdev->cqp.qp_id); + + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + nes_debug(NES_DBG_INIT, "Waiting for create NIC QP%u to complete.\n", + nesvnic->nic.qp_id); + + ret = wait_event_timeout(nesdev->cqp.waitq, (nesdev->cqp.sq_tail == cqp_head), + NES_EVENT_TIMEOUT); + nes_debug(NES_DBG_INIT, "Create NIC QP%u completed, wait_event_timeout ret = 
%u.\n", + nesvnic->nic.qp_id, ret); + if (!ret) { + nes_debug(NES_DBG_INIT, "NIC QP%u create timeout expired\n", nesvnic->nic.qp_id); + pci_free_consistent(nesdev->pcidev, nesvnic->nic_mem_size, nesvnic->nic_vbase, + nesvnic->nic_pbase); + return -EIO; + } + + /* Populate the RQ */ + for (counter = 0; counter < (NES_NIC_WQ_SIZE - 1); counter++) { + skb = dev_alloc_skb(nesvnic->max_frame_size); + if (!skb) { + nes_debug(NES_DBG_INIT, "%s: out of memory for receive skb\n", netdev->name); + + nes_destroy_nic_qp(nesvnic); + return -ENOMEM; + } + + skb->dev = netdev; + + pmem = pci_map_single(nesdev->pcidev, skb->data, + nesvnic->max_frame_size, PCI_DMA_FROMDEVICE); + + nic_rqe = &nesvnic->nic.rq_vbase[counter]; + nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_1_0_IDX] = cpu_to_le32(nesvnic->max_frame_size); + nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_3_2_IDX] = 0; + nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX] = cpu_to_le32((u32)pmem); + nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX] = cpu_to_le32((u32)((u64)pmem >> 32)); + nesvnic->nic.rx_skb[counter] = skb; + } + + wqe_count = NES_NIC_WQ_SIZE - 1; + nesvnic->nic.rq_head = wqe_count; + barrier(); + do { + counter = min(wqe_count, ((u32)255)); + wqe_count -= counter; + nes_write32(nesdev->regs+NES_WQE_ALLOC, (counter << 24) | nesvnic->nic.qp_id); + } while (wqe_count); + init_timer(&nesvnic->rq_wqes_timer); + nesvnic->rq_wqes_timer.function = nes_rq_wqes_timeout; + nesvnic->rq_wqes_timer.data = (unsigned long)nesvnic; + nes_debug(NES_DBG_INIT, "NAPI support Enabled\n"); + + if (nesdev->nesadapter->et_use_adaptive_rx_coalesce) + { + nes_nic_init_timer(nesdev); + if (netdev->mtu > 1500) + jumbomode = 1; + nes_nic_init_timer_defaults(nesdev, jumbomode); + } + + return 0; +} + + +/** + * nes_destroy_nic_qp + */ +void nes_destroy_nic_qp(struct nes_vnic *nesvnic) +{ + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_hw_nic_rq_wqe *nic_rqe; + u64 wqe_frag; + u32 cqp_head; + unsigned 
long flags; + int ret; + + /* Free remaining NIC receive buffers */ + while (nesvnic->nic.rq_head != nesvnic->nic.rq_tail) { + nic_rqe = &nesvnic->nic.rq_vbase[nesvnic->nic.rq_tail]; + wqe_frag = (u64)le32_to_cpu(nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX]); + wqe_frag |= ((u64)le32_to_cpu(nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX])) << 32; + pci_unmap_single(nesdev->pcidev, (dma_addr_t)wqe_frag, + nesvnic->max_frame_size, PCI_DMA_FROMDEVICE); + dev_kfree_skb(nesvnic->nic.rx_skb[nesvnic->nic.rq_tail++]); + nesvnic->nic.rq_tail &= (nesvnic->nic.rq_size - 1); + } + + spin_lock_irqsave(&nesdev->cqp.lock, flags); + + /* Destroy NIC QP */ + cqp_head = nesdev->cqp.sq_head; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, + (NES_CQP_DESTROY_QP | NES_CQP_QP_TYPE_NIC)); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, + nesvnic->nic.qp_id); + + if (++cqp_head >= nesdev->cqp.sq_size) + cqp_head = 0; + + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + + /* Destroy NIC CQ */ + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, + (NES_CQP_DESTROY_CQ | ((u32)nesvnic->nic_cq.cq_size << 16))); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, + (nesvnic->nic_cq.cq_number | ((u32)nesdev->nic_ceq_index << 16))); + + if (++cqp_head >= nesdev->cqp.sq_size) + cqp_head = 0; + + nesdev->cqp.sq_head = cqp_head; + barrier(); + + /* Ring doorbell (2 WQEs) */ + nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x02800000 | nesdev->cqp.qp_id); + + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + nes_debug(NES_DBG_SHUTDOWN, "Waiting for CQP, cqp_head=%u, cqp.sq_head=%u," + " cqp.sq_tail=%u, cqp.sq_size=%u\n", + cqp_head, nesdev->cqp.sq_head, + nesdev->cqp.sq_tail, nesdev->cqp.sq_size); + + ret = wait_event_timeout(nesdev->cqp.waitq, (nesdev->cqp.sq_tail == cqp_head), + NES_EVENT_TIMEOUT); + + 
nes_debug(NES_DBG_SHUTDOWN, "Destroy NIC QP returned, wait_event_timeout ret = %u, cqp_head=%u," + " cqp.sq_head=%u, cqp.sq_tail=%u\n", + ret, cqp_head, nesdev->cqp.sq_head, nesdev->cqp.sq_tail); + if (!ret) { + nes_debug(NES_DBG_SHUTDOWN, "NIC QP%u destroy timeout expired\n", + nesvnic->nic.qp_id); + } + + pci_free_consistent(nesdev->pcidev, nesvnic->nic_mem_size, nesvnic->nic_vbase, + nesvnic->nic_pbase); +} + +/** + * nes_napi_isr + */ +int nes_napi_isr(struct nes_device *nesdev) +{ + struct nes_adapter *nesadapter = nesdev->nesadapter; + u32 int_stat; + + if (nesdev->napi_isr_ran) { + /* interrupt status has already been read in ISR */ + int_stat = nesdev->int_stat; + } else { + int_stat = nes_read32(nesdev->regs + NES_INT_STAT); + nesdev->int_stat = int_stat; + nesdev->napi_isr_ran = 1; + } + + int_stat &= nesdev->int_req; + /* iff NIC, process here, else wait for DPC */ + if ((int_stat) && ((int_stat & 0x0000ff00) == int_stat)) { + nesdev->napi_isr_ran = 0; + nes_write32(nesdev->regs+NES_INT_STAT, + (int_stat & + ~(NES_INT_INTF|NES_INT_TIMER|NES_INT_MAC0|NES_INT_MAC1|NES_INT_MAC2|NES_INT_MAC3))); + + /* Process the CEQs */ + nes_process_ceq(nesdev, &nesdev->nesadapter->ceq[nesdev->nic_ceq_index]); + + if (unlikely((((nesadapter->et_rx_coalesce_usecs_irq) && + (!nesadapter->et_use_adaptive_rx_coalesce)) || + ((nesadapter->et_use_adaptive_rx_coalesce) && + (nesdev->deepcq_count > nesadapter->et_pkt_rate_low)))) ) { + if ((nesdev->int_req & NES_INT_TIMER) == 0) { + /* Enable Periodic timer interrupts */ + nesdev->int_req |= NES_INT_TIMER; + /* ack any pending periodic timer interrupts so we don't get an immediate interrupt */ + /* TODO: need to also ack other unused periodic timer values, get from nesadapter */ + nes_write32(nesdev->regs+NES_TIMER_STAT, + nesdev->timer_int_req | ~(nesdev->nesadapter->timer_int_req)); + nes_write32(nesdev->regs+NES_INTF_INT_MASK, + ~(nesdev->intf_int_req | NES_INTF_PERIODIC_TIMER)); + } + + if 
(unlikely(nesadapter->et_use_adaptive_rx_coalesce)) + { + nes_nic_init_timer(nesdev); + } + /* Enable interrupts, except CEQs */ + nes_write32(nesdev->regs+NES_INT_MASK, 0x0000ffff | (~nesdev->int_req)); + } else { + /* Enable interrupts, make sure timer is off */ + nesdev->int_req &= ~NES_INT_TIMER; + nes_write32(nesdev->regs+NES_INTF_INT_MASK, ~(nesdev->intf_int_req)); + nes_write32(nesdev->regs+NES_INT_MASK, ~nesdev->int_req); + nesadapter->tune_timer.timer_in_use_old = 0; + } + nesdev->deepcq_count = 0; + return 1; + } else { + return 0; + } +} + + +/** + * nes_dpc + */ +void nes_dpc(unsigned long param) +{ + struct nes_device *nesdev = (struct nes_device *)param; + struct nes_adapter *nesadapter = nesdev->nesadapter; + u32 counter; + u32 loop_counter = 0; + u32 int_status_bit; + u32 int_stat; + u32 timer_stat; + u32 temp_int_stat; + u32 intf_int_stat; + u32 debug_error; + u32 processed_intf_int = 0; + u16 processed_timer_int = 0; + u16 completion_ints = 0; + u16 timer_ints = 0; + + /* nes_debug(NES_DBG_ISR, "\n"); */ + + do { + timer_stat = 0; + if (nesdev->napi_isr_ran) { + nesdev->napi_isr_ran = 0; + int_stat = nesdev->int_stat; + } else + int_stat = nes_read32(nesdev->regs+NES_INT_STAT); + if (processed_intf_int != 0) + int_stat &= nesdev->int_req & ~NES_INT_INTF; + else + int_stat &= nesdev->int_req; + if (processed_timer_int == 0) { + processed_timer_int = 1; + if (int_stat & NES_INT_TIMER) { + timer_stat = nes_read32(nesdev->regs + NES_TIMER_STAT); + if ((timer_stat & nesdev->timer_int_req) == 0) { + int_stat &= ~NES_INT_TIMER; + } + } + } else { + int_stat &= ~NES_INT_TIMER; + } + + if (int_stat) { + if (int_stat & ~(NES_INT_INTF|NES_INT_TIMER|NES_INT_MAC0| + NES_INT_MAC1|NES_INT_MAC2|NES_INT_MAC3)) { + /* Ack the interrupts */ + nes_write32(nesdev->regs+NES_INT_STAT, + (int_stat & ~(NES_INT_INTF|NES_INT_TIMER|NES_INT_MAC0| + NES_INT_MAC1|NES_INT_MAC2|NES_INT_MAC3))); + } + + temp_int_stat = int_stat; + for (counter = 0, int_status_bit = 1; counter < 
16; counter++) { + if (int_stat & int_status_bit) { + nes_process_ceq(nesdev, &nesadapter->ceq[counter]); + temp_int_stat &= ~int_status_bit; + completion_ints = 1; + } + if (!(temp_int_stat & 0x0000ffff)) + break; + int_status_bit <<= 1; + } + + /* Process the AEQ for this pci function */ + int_status_bit = 1 << (16 + PCI_FUNC(nesdev->pcidev->devfn)); + if (int_stat & int_status_bit) { + nes_process_aeq(nesdev, &nesadapter->aeq[PCI_FUNC(nesdev->pcidev->devfn)]); + } + + /* Process the MAC interrupt for this pci function */ + int_status_bit = 1 << (24 + nesdev->mac_index); + if (int_stat & int_status_bit) { + nes_process_mac_intr(nesdev, nesdev->mac_index); + } + + if (int_stat & NES_INT_TIMER) { + if (timer_stat & nesdev->timer_int_req) { + nes_write32(nesdev->regs + NES_TIMER_STAT, + (timer_stat & nesdev->timer_int_req) | + ~(nesdev->nesadapter->timer_int_req)); + timer_ints = 1; + } + } + + if (int_stat & NES_INT_INTF) { + processed_intf_int = 1; + intf_int_stat = nes_read32(nesdev->regs+NES_INTF_INT_STAT); + intf_int_stat &= nesdev->intf_int_req; + if (NES_INTF_INT_CRITERR & intf_int_stat) { + debug_error = nes_read_indexed(nesdev, NES_IDX_DEBUG_ERROR_CONTROL_STATUS); + printk(KERN_ERR PFX "Critical Error reported by device!!! 
0x%02X\n", + (u16)debug_error); + nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_CONTROL_STATUS, + 0x01010000 | (debug_error & 0x0000ffff)); + /* BUG(); */ + if (crit_err_count++ > 10) + nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS1, 1 << 0x17); + } + if (NES_INTF_INT_PCIERR & intf_int_stat) { + printk(KERN_ERR PFX "PCI Error reported by device!!!\n"); + BUG(); + } + if (NES_INTF_INT_AEQ_OFLOW & intf_int_stat) { + printk(KERN_ERR PFX "AEQ Overflow reported by device!!!\n"); + BUG(); + } + nes_write32(nesdev->regs+NES_INTF_INT_STAT, intf_int_stat); + } + + if (int_stat & NES_INT_TSW) { + } + } + /* Don't use the interface interrupt bit stay in loop */ + int_stat &= ~NES_INT_INTF|NES_INT_TIMER|NES_INT_MAC0| + NES_INT_MAC1|NES_INT_MAC2|NES_INT_MAC3; + } while ((int_stat != 0) && (loop_counter++ < MAX_DPC_ITERATIONS)); + + if (timer_ints == 1) { + if ((nesadapter->et_rx_coalesce_usecs_irq) || (nesadapter->et_use_adaptive_rx_coalesce)) { + if (completion_ints == 0) { + nesdev->timer_only_int_count++; + if (nesdev->timer_only_int_count>=nesadapter->timer_int_limit) { + nesdev->timer_only_int_count = 0; + nesdev->int_req &= ~NES_INT_TIMER; + nes_write32(nesdev->regs + NES_INTF_INT_MASK, ~(nesdev->intf_int_req)); + nes_write32(nesdev->regs+NES_INT_MASK, ~nesdev->int_req); + nesdev->nesadapter->tune_timer.timer_in_use_old = 0; + } else { + nes_write32(nesdev->regs+NES_INT_MASK, 0x0000ffff|(~nesdev->int_req)); + } + } else { + if (unlikely(nesadapter->et_use_adaptive_rx_coalesce)) + { + nes_nic_init_timer(nesdev); + } + nesdev->timer_only_int_count = 0; + nes_write32(nesdev->regs+NES_INT_MASK, 0x0000ffff|(~nesdev->int_req)); + } + } else { + nesdev->timer_only_int_count = 0; + nesdev->int_req &= ~NES_INT_TIMER; + nes_write32(nesdev->regs+NES_INTF_INT_MASK, ~(nesdev->intf_int_req)); + nes_write32(nesdev->regs+NES_TIMER_STAT, + nesdev->timer_int_req | ~(nesdev->nesadapter->timer_int_req)); + nes_write32(nesdev->regs+NES_INT_MASK, ~nesdev->int_req); + } + } else { + if ( 
(completion_ints == 1) && + (((nesadapter->et_rx_coalesce_usecs_irq) && + (!nesadapter->et_use_adaptive_rx_coalesce)) || + ((nesdev->deepcq_count > nesadapter->et_pkt_rate_low) && + (nesadapter->et_use_adaptive_rx_coalesce) )) ) { + /* nes_debug(NES_DBG_ISR, "Enabling periodic timer interrupt.\n" ); */ + nesdev->timer_only_int_count = 0; + nesdev->int_req |= NES_INT_TIMER; + nes_write32(nesdev->regs+NES_TIMER_STAT, + nesdev->timer_int_req | ~(nesdev->nesadapter->timer_int_req)); + nes_write32(nesdev->regs+NES_INTF_INT_MASK, + ~(nesdev->intf_int_req | NES_INTF_PERIODIC_TIMER)); + nes_write32(nesdev->regs+NES_INT_MASK, 0x0000ffff | (~nesdev->int_req)); + } else { + nes_write32(nesdev->regs+NES_INT_MASK, ~nesdev->int_req); + } + } + nesdev->deepcq_count = 0; +} + + +/** + * nes_process_ceq + */ +void nes_process_ceq(struct nes_device *nesdev, struct nes_hw_ceq *ceq) +{ + u64 u64temp; + struct nes_hw_cq *cq; + u32 head; + u32 ceq_size; + + /* nes_debug(NES_DBG_CQ, "\n"); */ + head = ceq->ceq_head; + ceq_size = ceq->ceq_size; + + do { + if (le32_to_cpu(ceq->ceq_vbase[head].ceqe_words[NES_CEQE_CQ_CTX_HIGH_IDX]) & + NES_CEQE_VALID) { + u64temp = (((u64)(le32_to_cpu(ceq->ceq_vbase[head].ceqe_words[NES_CEQE_CQ_CTX_HIGH_IDX])))<<32) | + ((u64)(le32_to_cpu(ceq->ceq_vbase[head].ceqe_words[NES_CEQE_CQ_CTX_LOW_IDX]))); + u64temp <<= 1; + cq = *((struct nes_hw_cq **)&u64temp); + /* nes_debug(NES_DBG_CQ, "pCQ = %p\n", cq); */ + barrier(); + ceq->ceq_vbase[head].ceqe_words[NES_CEQE_CQ_CTX_HIGH_IDX] = 0; + + /* call the event handler */ + cq->ce_handler(nesdev, cq); + + if (++head >= ceq_size) + head = 0; + } else { + break; + } + + } while (1); + + ceq->ceq_head = head; +} + + +/** + * nes_process_aeq + */ +void nes_process_aeq(struct nes_device *nesdev, struct nes_hw_aeq *aeq) +{ +// u64 u64temp; + u32 head; + u32 aeq_size; + u32 aeqe_misc; + u32 aeqe_cq_id; + struct nes_hw_aeqe volatile *aeqe; + + head = aeq->aeq_head; + aeq_size = aeq->aeq_size; + + do { + aeqe = 
&aeq->aeq_vbase[head]; + if ((le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]) & NES_AEQE_VALID) == 0) + break; + aeqe_misc = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]); + aeqe_cq_id = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]); + if (aeqe_misc & (NES_AEQE_QP|NES_AEQE_CQ)) { + if (aeqe_cq_id >= NES_FIRST_QPN) { + /* dealing with an accelerated QP related AE */ +// u64temp = (((u64)(le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_HIGH_IDX])))<<32) | +// ((u64)(le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_LOW_IDX]))); + nes_process_iwarp_aeqe(nesdev, (struct nes_hw_aeqe *)aeqe); + } else { + /* TODO: dealing with a CQP related AE */ + nes_debug(NES_DBG_AEQ, "Processing CQP related AE, misc = 0x%04X\n", + (u16)(aeqe_misc >> 16)); + } + } + + aeqe->aeqe_words[NES_AEQE_MISC_IDX] = 0; + + if (++head >= aeq_size) + head = 0; + } + while (1); + aeq->aeq_head = head; +} + +static void nes_reset_link(struct nes_device *nesdev, u32 mac_index) +{ + struct nes_adapter *nesadapter = nesdev->nesadapter; + u32 reset_value; + u32 i=0; + u32 u32temp; + + if (nesadapter->hw_rev == NE020_REV) { + return; + } + mh_detected++; + + reset_value = nes_read32(nesdev->regs+NES_SOFTWARE_RESET); + + if ((mac_index == 0) || ((mac_index == 1) && (nesadapter->OneG_Mode))) + reset_value |= 0x0000001d; + else + reset_value |= 0x0000002d; + + if (4 <= (nesadapter->link_interrupt_count[mac_index] / ((u16)NES_MAX_LINK_INTERRUPTS))) { + if ((!nesadapter->OneG_Mode) && (nesadapter->port_count == 2)) { + nesadapter->link_interrupt_count[0] = 0; + nesadapter->link_interrupt_count[1] = 0; + u32temp = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1); + if (0x00000040 & u32temp) + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x0000F088); + else + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x0000F0C8); + + reset_value |= 0x0000003d; + } + nesadapter->link_interrupt_count[mac_index] = 0; + } + + nes_write32(nesdev->regs+NES_SOFTWARE_RESET, 
reset_value); + + while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET) + & 0x00000040) != 0x00000040) && (i++ < 5000)); + + if (0x0000003d == (reset_value & 0x0000003d)) { + u32 pcs_control_status0, pcs_control_status1; + + for (i = 0; i < 10; i++) { + pcs_control_status0 = nes_read_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0); + pcs_control_status1 = nes_read_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200); + if (((0x0F000000 == (pcs_control_status0 & 0x0F000000)) + && (pcs_control_status0 & 0x00100000)) + || ((0x0F000000 == (pcs_control_status1 & 0x0F000000)) + && (pcs_control_status1 & 0x00100000))) + continue; + else + break; + } + if (10 == i) { + u32temp = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1); + if (0x00000040 & u32temp) + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x0000F088); + else + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x0000F0C8); + + nes_write32(nesdev->regs+NES_SOFTWARE_RESET, reset_value); + + while (((nes_read32(nesdev->regs + NES_SOFTWARE_RESET) + & 0x00000040) != 0x00000040) && (i++ < 5000)); + } + } +} + +/** + * nes_process_mac_intr + */ +void nes_process_mac_intr(struct nes_device *nesdev, u32 mac_number) +{ + unsigned long flags; + u32 pcs_control_status; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_vnic *nesvnic; + u32 mac_status; + u32 mac_index = nesdev->mac_index; + u32 u32temp; + u16 phy_data; + u16 temp_phy_data; + + spin_lock_irqsave(&nesadapter->phy_lock, flags); + if (nesadapter->mac_sw_state[mac_number] != NES_MAC_SW_IDLE) { + spin_unlock_irqrestore(&nesadapter->phy_lock, flags); + return; + } + nesadapter->mac_sw_state[mac_number] = NES_MAC_SW_INTERRUPT; + spin_unlock_irqrestore(&nesadapter->phy_lock, flags); + + /* ack the MAC interrupt */ + mac_status = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS + (mac_index * 0x200)); + /* Clear the interrupt */ + nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS + (mac_index * 0x200), 
mac_status); + + nes_debug(NES_DBG_PHY, "MAC%u interrupt status = 0x%X.\n", mac_number, mac_status); + + if (mac_status & (NES_MAC_INT_LINK_STAT_CHG | NES_MAC_INT_XGMII_EXT)) { + nesdev->link_status_interrupts++; + if (0 == (++nesadapter->link_interrupt_count[mac_index] % ((u16)NES_MAX_LINK_INTERRUPTS))) { + spin_lock_irqsave(&nesadapter->phy_lock, flags); + nes_reset_link(nesdev, mac_index); + spin_unlock_irqrestore(&nesadapter->phy_lock, flags); + } + /* read the PHY interrupt status register */ + if (nesadapter->OneG_Mode) { + do { + nes_read_1G_phy_reg(nesdev, 0x1a, + nesadapter->phy_index[mac_index], &phy_data); + nes_debug(NES_DBG_PHY, "Phy%d data from register 0x1a = 0x%X.\n", + nesadapter->phy_index[mac_index], phy_data); + } while (phy_data&0x8000); + + temp_phy_data = 0; + do { + nes_read_1G_phy_reg(nesdev, 0x11, + nesadapter->phy_index[mac_index], &phy_data); + nes_debug(NES_DBG_PHY, "Phy%d data from register 0x11 = 0x%X.\n", + nesadapter->phy_index[mac_index], phy_data); + if (temp_phy_data == phy_data) + break; + temp_phy_data = phy_data; + } while (1); + + nes_read_1G_phy_reg(nesdev, 0x1e, + nesadapter->phy_index[mac_index], &phy_data); + nes_debug(NES_DBG_PHY, "Phy%d data from register 0x1e = 0x%X.\n", + nesadapter->phy_index[mac_index], phy_data); + + nes_read_1G_phy_reg(nesdev, 1, + nesadapter->phy_index[mac_index], &phy_data); + nes_debug(NES_DBG_PHY, "1G phy%u data from register 1 = 0x%X\n", + nesadapter->phy_index[mac_index], phy_data); + + if (temp_phy_data & 0x1000) { + nes_debug(NES_DBG_PHY, "The Link is up according to the PHY\n"); + phy_data = 4; + } else { + nes_debug(NES_DBG_PHY, "The Link is down according to the PHY\n"); + } + } + nes_debug(NES_DBG_PHY, "Eth SERDES Common Status: 0=0x%08X, 1=0x%08X\n", + nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS0), + nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS0+0x200)); + pcs_control_status = nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0 + 
((mac_index&1)*0x200)); + pcs_control_status = nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0 + ((mac_index&1)*0x200)); + nes_debug(NES_DBG_PHY, "PCS PHY Control/Status%u: 0x%08X\n", + mac_index, pcs_control_status); + if (nesadapter->OneG_Mode) { + u32temp = 0x01010000; + if (nesadapter->port_count > 2) { + u32temp |= 0x02020000; + } + if ((pcs_control_status & u32temp)!= u32temp) { + phy_data = 0; + nes_debug(NES_DBG_PHY, "PCS says the link is down\n"); + } + } else if (nesadapter->phy_type[mac_index] == NES_PHY_TYPE_IRIS) { + nes_read_10G_phy_reg(nesdev, 1, nesadapter->phy_index[mac_index]); + temp_phy_data = (u16)nes_read_indexed(nesdev, + NES_IDX_MAC_MDIO_CONTROL); + u32temp = 20; + do { + nes_read_10G_phy_reg(nesdev, 1, nesadapter->phy_index[mac_index]); + phy_data = (u16)nes_read_indexed(nesdev, + NES_IDX_MAC_MDIO_CONTROL); + if ((phy_data == temp_phy_data) || (!(--u32temp))) + break; + temp_phy_data = phy_data; + } while (1); + nes_debug(NES_DBG_PHY, "%s: Phy data = 0x%04X, link was %s.\n", + __FUNCTION__, phy_data, nesadapter->mac_link_down ? "DOWN" : "UP"); + + } else { + phy_data = (0x0f0f0000 == (pcs_control_status & 0x0f1f0000)) ? 4 : 0; + } + + if (phy_data & 0x0004) { + nesadapter->mac_link_down[mac_index] = 0; + list_for_each_entry(nesvnic, &nesadapter->nesvnic_list[mac_index], list) { + nes_debug(NES_DBG_PHY, "The Link is UP!!. linkup was %d\n", + nesvnic->linkup); + if (nesvnic->linkup == 0) { + printk(PFX "The Link is now up for port %u, netdev %p.\n", + mac_index, nesvnic->netdev); + if (netif_queue_stopped(nesvnic->netdev)) + netif_start_queue(nesvnic->netdev); + nesvnic->linkup = 1; + netif_carrier_on(nesvnic->netdev); + } + } + } else { + nesadapter->mac_link_down[mac_index] = 1; + list_for_each_entry(nesvnic, &nesadapter->nesvnic_list[mac_index], list) { + nes_debug(NES_DBG_PHY, "The Link is Down!!. 
linkup was %d\n", + nesvnic->linkup); + if (nesvnic->linkup == 1) { + printk(PFX "The Link is now down for port %u, netdev %p.\n", + mac_index, nesvnic->netdev); + if (!(netif_queue_stopped(nesvnic->netdev))) + netif_stop_queue(nesvnic->netdev); + nesvnic->linkup = 0; + netif_carrier_off(nesvnic->netdev); + } + } + } + } + + nesadapter->mac_sw_state[mac_number] = NES_MAC_SW_IDLE; +} + + + +void nes_nic_napi_ce_handler(struct nes_device *nesdev, struct nes_hw_nic_cq *cq) +{ + struct nes_vnic *nesvnic = container_of(cq, struct nes_vnic, nic_cq); + + netif_rx_schedule(nesdev->netdev[nesvnic->netdev_index], &nesvnic->napi); +} + + +/* The MAX_RQES_TO_PROCESS defines how many max read requests to complete before +* getting out of nic_ce_handler +*/ +#define MAX_RQES_TO_PROCESS 384 + +/** + * nes_nic_ce_handler + */ +void nes_nic_ce_handler(struct nes_device *nesdev, struct nes_hw_nic_cq *cq) +{ + u64 u64temp; + dma_addr_t bus_address; + struct nes_hw_nic *nesnic; + struct nes_vnic *nesvnic = container_of(cq, struct nes_vnic, nic_cq); + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_hw_nic_rq_wqe *nic_rqe; + struct nes_hw_nic_sq_wqe *nic_sqe; + struct sk_buff *skb; + struct sk_buff *rx_skb; + __le16 *wqe_fragment_length; + u32 head; + u32 cq_size; + u32 rx_pkt_size; + u32 cqe_count=0; + u32 cqe_errv; + u32 cqe_misc; + u16 wqe_fragment_index = 1; /* first fragment (0) is used by copy buffer */ + u16 vlan_tag; + u16 pkt_type; + u16 rqes_processed = 0; + u8 sq_cqes = 0; + + head = cq->cq_head; + cq_size = cq->cq_size; + cq->cqes_pending = 1; + do { + if (le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_NIC_CQE_MISC_IDX]) & + NES_NIC_CQE_VALID) { + nesnic = &nesvnic->nic; + cqe_misc = le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_NIC_CQE_MISC_IDX]); + if (cqe_misc & NES_NIC_CQE_SQ) { + sq_cqes++; + wqe_fragment_index = 1; + nic_sqe = &nesnic->sq_vbase[nesnic->sq_tail]; + skb = nesnic->tx_skb[nesnic->sq_tail]; + wqe_fragment_length = (__le16 
*)&nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX]; + /* bump past the vlan tag */ + wqe_fragment_length++; + if (le16_to_cpu(wqe_fragment_length[wqe_fragment_index]) != 0) { + u64temp = (u64) le32_to_cpu(nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_LOW_IDX+wqe_fragment_index*2]); + u64temp += ((u64)le32_to_cpu(nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_HIGH_IDX+wqe_fragment_index*2]))<<32; + bus_address = (dma_addr_t)u64temp; + if (test_and_clear_bit(nesnic->sq_tail, nesnic->first_frag_overflow)) { + pci_unmap_single(nesdev->pcidev, + bus_address, + le16_to_cpu(wqe_fragment_length[wqe_fragment_index++]), + PCI_DMA_TODEVICE); + } + for (; wqe_fragment_index < 5; wqe_fragment_index++) { + if (wqe_fragment_length[wqe_fragment_index]) { + u64temp = le32_to_cpu(nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_LOW_IDX+wqe_fragment_index*2]); + u64temp += ((u64)le32_to_cpu(nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_HIGH_IDX+wqe_fragment_index*2]))<<32; + bus_address = (dma_addr_t)u64temp; + pci_unmap_page(nesdev->pcidev, + bus_address, + le16_to_cpu(wqe_fragment_length[wqe_fragment_index]), + PCI_DMA_TODEVICE); + } else + break; + } + if (skb) + dev_kfree_skb_any(skb); + } + nesnic->sq_tail++; + nesnic->sq_tail &= nesnic->sq_size-1; + if (sq_cqes > 128) { + barrier(); + /* restart the queue if it had been stopped */ + if (netif_queue_stopped(nesvnic->netdev)) + netif_wake_queue(nesvnic->netdev); + sq_cqes = 0; + } + } else { + rqes_processed ++; + + cq->rx_cqes_completed++; + cq->rx_pkts_indicated++; + rx_pkt_size = cqe_misc & 0x0000ffff; + nic_rqe = &nesnic->rq_vbase[nesnic->rq_tail]; + /* Get the skb */ + rx_skb = nesnic->rx_skb[nesnic->rq_tail]; + nic_rqe = &nesnic->rq_vbase[nesvnic->nic.rq_tail]; + bus_address = (dma_addr_t)le32_to_cpu(nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX]); + bus_address += ((u64)le32_to_cpu(nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX])) << 32; + pci_unmap_single(nesdev->pcidev, bus_address, + nesvnic->max_frame_size, PCI_DMA_FROMDEVICE); + /* 
rx_skb->tail = rx_skb->data + rx_pkt_size; */ + /* rx_skb->len = rx_pkt_size; */ + rx_skb->len = 0; /* TODO: see if this is necessary */ + skb_put(rx_skb, rx_pkt_size); + rx_skb->protocol = eth_type_trans(rx_skb, nesvnic->netdev); + nesnic->rq_tail++; + nesnic->rq_tail &= nesnic->rq_size - 1; + + atomic_inc(&nesvnic->rx_skbs_needed); + if (atomic_read(&nesvnic->rx_skbs_needed) > (nesvnic->nic.rq_size>>1)) { + nes_write32(nesdev->regs+NES_CQE_ALLOC, + cq->cq_number | (cqe_count << 16)); +// nesadapter->tune_timer.cq_count += cqe_count; + nesdev->currcq_count += cqe_count; + cqe_count = 0; + nes_replenish_nic_rq(nesvnic); + } + pkt_type = (u16)(le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_NIC_CQE_TAG_PKT_TYPE_IDX])); + cqe_errv = (cqe_misc & NES_NIC_CQE_ERRV_MASK) >> NES_NIC_CQE_ERRV_SHIFT; + rx_skb->ip_summed = CHECKSUM_NONE; + + if ((NES_PKT_TYPE_TCPV4_BITS == (pkt_type & NES_PKT_TYPE_TCPV4_MASK)) || + (NES_PKT_TYPE_UDPV4_BITS == (pkt_type & NES_PKT_TYPE_UDPV4_MASK))) { + if ((cqe_errv & + (NES_NIC_ERRV_BITS_IPV4_CSUM_ERR | NES_NIC_ERRV_BITS_TCPUDP_CSUM_ERR | + NES_NIC_ERRV_BITS_IPH_ERR | NES_NIC_ERRV_BITS_WQE_OVERRUN)) == 0) { + if (nesvnic->rx_checksum_disabled == 0) { + rx_skb->ip_summed = CHECKSUM_UNNECESSARY; + } + } else + nes_debug(NES_DBG_CQ, "%s: unsuccessfully checksummed TCP or UDP packet." + " errv = 0x%X, pkt_type = 0x%X.\n", + nesvnic->netdev->name, cqe_errv, pkt_type); + + } else if ((pkt_type & NES_PKT_TYPE_IPV4_MASK) == NES_PKT_TYPE_IPV4_BITS) { + if ((cqe_errv & + (NES_NIC_ERRV_BITS_IPV4_CSUM_ERR | NES_NIC_ERRV_BITS_IPH_ERR | + NES_NIC_ERRV_BITS_WQE_OVERRUN)) == 0) { + if (nesvnic->rx_checksum_disabled == 0) { + rx_skb->ip_summed = CHECKSUM_UNNECESSARY; + /* nes_debug(NES_DBG_CQ, "%s: Reporting successfully checksummed IPv4 packet.\n", + nesvnic->netdev->name); */ + } + } else + nes_debug(NES_DBG_CQ, "%s: unsuccessfully checksummed TCP or UDP packet." 
+ " errv = 0x%X, pkt_type = 0x%X.\n", + nesvnic->netdev->name, cqe_errv, pkt_type); + } + /* nes_debug(NES_DBG_CQ, "pkt_type=%x, APBVT_MASK=%x\n", + pkt_type, (pkt_type & NES_PKT_TYPE_APBVT_MASK)); */ + + if ((pkt_type & NES_PKT_TYPE_APBVT_MASK) == NES_PKT_TYPE_APBVT_BITS) { + nes_cm_recv(rx_skb, nesvnic->netdev); + } else { + if ((cqe_misc & NES_NIC_CQE_TAG_VALID) && (nesvnic->vlan_grp != NULL)) { + vlan_tag = (u16)(le32_to_cpu( + cq->cq_vbase[head].cqe_words[NES_NIC_CQE_TAG_PKT_TYPE_IDX]) + >> 16); + nes_debug(NES_DBG_CQ, "%s: Reporting stripped VLAN packet. Tag = 0x%04X\n", + nesvnic->netdev->name, vlan_tag); + nes_vlan_rx(rx_skb, nesvnic->vlan_grp, vlan_tag); + } else { + nes_netif_rx(rx_skb); + } + } + + nesvnic->netdev->last_rx = jiffies; + /* nesvnic->netstats.rx_packets++; */ + /* nesvnic->netstats.rx_bytes += rx_pkt_size; */ + } + + cq->cq_vbase[head].cqe_words[NES_NIC_CQE_MISC_IDX] = 0; + /* Accounting... */ + cqe_count++; + if (++head >= cq_size) + head = 0; + if (cqe_count == 255) { + /* Replenish Nic CQ */ + nes_write32(nesdev->regs+NES_CQE_ALLOC, + cq->cq_number | (cqe_count << 16)); +// nesdev->nesadapter->tune_timer.cq_count += cqe_count; + nesdev->currcq_count += cqe_count; + cqe_count = 0; + } + + if (cq->rx_cqes_completed >= nesvnic->budget) + break; + } else { + cq->cqes_pending = 0; + break; + } + + } while (1); + + if (sq_cqes) { + barrier(); + /* restart the queue if it had been stopped */ + if (netif_queue_stopped(nesvnic->netdev)) + netif_wake_queue(nesvnic->netdev); + } + + cq->cq_head = head; + /* nes_debug(NES_DBG_CQ, "CQ%u Processed = %u cqes, new head = %u.\n", + cq->cq_number, cqe_count, cq->cq_head); */ + cq->cqe_allocs_pending = cqe_count; + if (unlikely(nesadapter->et_use_adaptive_rx_coalesce)) + { +// nesdev->nesadapter->tune_timer.cq_count += cqe_count; + nesdev->currcq_count += cqe_count; + nes_nic_tune_timer(nesdev); + } + if (atomic_read(&nesvnic->rx_skbs_needed)) + nes_replenish_nic_rq(nesvnic); + } + + +/** + * 
nes_cqp_ce_handler + */ +void nes_cqp_ce_handler(struct nes_device *nesdev, struct nes_hw_cq *cq) +{ + u64 u64temp; + unsigned long flags; + struct nes_hw_cqp *cqp = NULL; + struct nes_cqp_request *cqp_request; + struct nes_hw_cqp_wqe *cqp_wqe; + u32 head; + u32 cq_size; + u32 cqe_count=0; + u32 error_code; + /* u32 counter; */ + + head = cq->cq_head; + cq_size = cq->cq_size; + + do { + /* process the CQE */ + /* nes_debug(NES_DBG_CQP, "head=%u cqe_words=%08X\n", head, + le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX])); */ + + if (le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX]) & NES_CQE_VALID) { + u64temp = (((u64)(le32_to_cpu(cq->cq_vbase[head]. + cqe_words[NES_CQE_COMP_COMP_CTX_HIGH_IDX])))<<32) | + ((u64)(le32_to_cpu(cq->cq_vbase[head]. + cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX]))); + cqp = *((struct nes_hw_cqp **)&u64temp); + + error_code = le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_CQE_ERROR_CODE_IDX]); + if (error_code) { + nes_debug(NES_DBG_CQP, "Bad Completion code for opcode 0x%02X from CQP," + " Major/Minor codes = 0x%04X:%04X.\n", + le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX])&0x3f, + (u16)(error_code >> 16), + (u16)error_code); + nes_debug(NES_DBG_CQP, "cqp: qp_id=%u, sq_head=%u, sq_tail=%u\n", + cqp->qp_id, cqp->sq_head, cqp->sq_tail); + } + + u64temp = (((u64)(le32_to_cpu(nesdev->cqp.sq_vbase[cqp->sq_tail]. + wqe_words[NES_CQP_WQE_COMP_SCRATCH_HIGH_IDX])))<<32) | + ((u64)(le32_to_cpu(nesdev->cqp.sq_vbase[cqp->sq_tail]. 
+ wqe_words[NES_CQP_WQE_COMP_SCRATCH_LOW_IDX]))); + cqp_request = *((struct nes_cqp_request **)&u64temp); + if (cqp_request) { + if (cqp_request->waiting) { + /* nes_debug(NES_DBG_CQP, "%s: Waking up requestor\n"); */ + cqp_request->major_code = (u16)(error_code >> 16); + cqp_request->minor_code = (u16)error_code; + barrier(); + cqp_request->request_done = 1; + wake_up(&cqp_request->waitq); + if (atomic_dec_and_test(&cqp_request->refcount)) { + nes_debug(NES_DBG_CQP, "CQP request %p (opcode 0x%02X) freed.\n", + cqp_request, + le32_to_cpu(cqp_request->cqp_wqe.wqe_words[NES_CQP_WQE_OPCODE_IDX])&0x3f); + if (cqp_request->dynamic) { + kfree(cqp_request); + } else { + spin_lock_irqsave(&nesdev->cqp.lock, flags); + list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs); + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + } + } + } else if (cqp_request->callback) { + /* Envoke the callback routine */ + cqp_request->cqp_callback(nesdev, cqp_request); + if (cqp_request->dynamic) { + kfree(cqp_request); + } else { + spin_lock_irqsave(&nesdev->cqp.lock, flags); + list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs); + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + } + } else { + nes_debug(NES_DBG_CQP, "CQP request %p (opcode 0x%02X) freed.\n", + cqp_request, + le32_to_cpu(cqp_request->cqp_wqe.wqe_words[NES_CQP_WQE_OPCODE_IDX])&0x3f); + if (cqp_request->dynamic) { + kfree(cqp_request); + } else { + spin_lock_irqsave(&nesdev->cqp.lock, flags); + list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs); + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + } + } + } else { + wake_up(&nesdev->cqp.waitq); + } + + cq->cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX] = 0; + nes_write32(nesdev->regs+NES_CQE_ALLOC, cq->cq_number | (1 << 16)); + if (++cqp->sq_tail >= cqp->sq_size) + cqp->sq_tail = 0; + + /* Accounting... 
*/ + cqe_count++; + if (++head >= cq_size) + head = 0; + } else { + break; + } + } while (1); + cq->cq_head = head; + + spin_lock_irqsave(&nesdev->cqp.lock, flags); + while ((!list_empty(&nesdev->cqp_pending_reqs)) && + ((((nesdev->cqp.sq_tail+nesdev->cqp.sq_size)-nesdev->cqp.sq_head) & + (nesdev->cqp.sq_size - 1)) != 1)) { + cqp_request = list_entry(nesdev->cqp_pending_reqs.next, + struct nes_cqp_request, list); + list_del_init(&cqp_request->list); + head = nesdev->cqp.sq_head++; + nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1; + cqp_wqe = &nesdev->cqp.sq_vbase[head]; + memcpy(cqp_wqe, &cqp_request->cqp_wqe, sizeof(*cqp_wqe)); + barrier(); + cqp_wqe->wqe_words[NES_CQP_WQE_COMP_SCRATCH_LOW_IDX] = + cpu_to_le32((u32)((unsigned long)cqp_request)); + cqp_wqe->wqe_words[NES_CQP_WQE_COMP_SCRATCH_HIGH_IDX] = + cpu_to_le32((u32)(upper_32_bits((unsigned long)cqp_request))); + nes_debug(NES_DBG_CQP, "CQP request %p (opcode 0x%02X) put on CQPs SQ wqe%u.\n", + cqp_request, le32_to_cpu(cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX])&0x3f, head); + /* Ring doorbell (1 WQEs) */ + barrier(); + nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x01800000 | nesdev->cqp.qp_id); + } + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + + /* Arm the CCQ */ + nes_write32(nesdev->regs+NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT | + cq->cq_number); + nes_read32(nesdev->regs+NES_CQE_ALLOC); +} + + +/** + * nes_process_iwarp_aeqe + */ +void nes_process_iwarp_aeqe(struct nes_device *nesdev, struct nes_hw_aeqe *aeqe) +{ + u64 context; + u64 aeqe_context = 0; + unsigned long flags; + struct nes_qp *nesqp; + int resource_allocated; + /* struct iw_cm_id *cm_id; */ + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct ib_event ibevent; + /* struct iw_cm_event cm_event; */ + u32 aeq_info; + u32 next_iwarp_state = 0; + u16 async_event_id; + u8 tcp_state; + u8 iwarp_state; + + nes_debug(NES_DBG_AEQ, "\n"); + aeq_info = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]); + if ((NES_AEQE_INBOUND_RDMA&aeq_info) 
|| (!(NES_AEQE_QP&aeq_info))) { + context = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_LOW_IDX]); + context += ((u64)le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_HIGH_IDX])) << 32; + } else { + aeqe_context = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_LOW_IDX]); + aeqe_context += ((u64)le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_HIGH_IDX])) << 32; + context = (unsigned long)nesadapter->qp_table[le32_to_cpu( + aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX])-NES_FIRST_QPN]; + BUG_ON(!context); + } + + async_event_id = (u16)aeq_info; + tcp_state = (aeq_info & NES_AEQE_TCP_STATE_MASK) >> NES_AEQE_TCP_STATE_SHIFT; + iwarp_state = (aeq_info & NES_AEQE_IWARP_STATE_MASK) >> NES_AEQE_IWARP_STATE_SHIFT; + nes_debug(NES_DBG_AEQ, "aeid = 0x%04X, qp-cq id = %d, aeqe = %p," + " Tcp state = %s, iWARP state = %s\n", + async_event_id, + le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]), aeqe, + nes_tcp_state_str[tcp_state], nes_iwarp_state_str[iwarp_state]); + + + switch (async_event_id) { + case NES_AEQE_AEID_LLP_FIN_RECEIVED: + nesqp = *((struct nes_qp **)&context); + if (atomic_inc_return(&nesqp->close_timer_started) == 1) { + nesqp->cm_id->add_ref(nesqp->cm_id); + nes_add_ref(&nesqp->ibqp); + schedule_nes_timer(nesqp->cm_node, (struct sk_buff *)nesqp, + NES_TIMER_TYPE_CLOSE, 1, 0); + nes_debug(NES_DBG_AEQ, "QP%u Not decrementing QP refcount (%d)," + " need ae to finish up, original_last_aeq = 0x%04X." + " last_aeq = 0x%04X, scheduling timer. 
TCP state = %d\n", + nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount), + async_event_id, nesqp->last_aeq, tcp_state); + } + if ((tcp_state != NES_AEQE_TCP_STATE_CLOSE_WAIT) || + (nesqp->ibqp_state != IB_QPS_RTS)) { + /* FIN Received but tcp state or IB state moved on, + should expect a close complete */ + return; + } + case NES_AEQE_AEID_LLP_CLOSE_COMPLETE: + case NES_AEQE_AEID_LLP_CONNECTION_RESET: + case NES_AEQE_AEID_TERMINATE_SENT: + case NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE: + case NES_AEQE_AEID_RESET_SENT: + nesqp = *((struct nes_qp **)&context); + if (async_event_id == NES_AEQE_AEID_RESET_SENT) { + tcp_state = NES_AEQE_TCP_STATE_CLOSED; + } + nes_add_ref(&nesqp->ibqp); + spin_lock_irqsave(&nesqp->lock, flags); + nesqp->hw_iwarp_state = iwarp_state; + nesqp->hw_tcp_state = tcp_state; + nesqp->last_aeq = async_event_id; + + if ((tcp_state == NES_AEQE_TCP_STATE_CLOSED) || + (tcp_state == NES_AEQE_TCP_STATE_TIME_WAIT)) { + nesqp->hte_added = 0; + spin_unlock_irqrestore(&nesqp->lock, flags); + nes_debug(NES_DBG_AEQ, "issuing hw modifyqp for QP%u to remove hte\n", + nesqp->hwqp.qp_id); + nes_hw_modify_qp(nesdev, nesqp, + NES_CQP_QP_IWARP_STATE_ERROR | NES_CQP_QP_DEL_HTE, 0); + spin_lock_irqsave(&nesqp->lock, flags); + } + + if ((nesqp->ibqp_state == IB_QPS_RTS) && + ((tcp_state == NES_AEQE_TCP_STATE_CLOSE_WAIT) || + (async_event_id == NES_AEQE_AEID_LLP_CONNECTION_RESET))) { + switch (nesqp->hw_iwarp_state) { + case NES_AEQE_IWARP_STATE_RTS: + next_iwarp_state = NES_CQP_QP_IWARP_STATE_CLOSING; + nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_CLOSING; + break; + case NES_AEQE_IWARP_STATE_TERMINATE: + next_iwarp_state = NES_CQP_QP_IWARP_STATE_TERMINATE; + nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_TERMINATE; + if (async_event_id == NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE) { + next_iwarp_state |= 0x02000000; + nesqp->hw_tcp_state = NES_AEQE_TCP_STATE_CLOSED; + } + break; + default: + next_iwarp_state = 0; + } + spin_unlock_irqrestore(&nesqp->lock, flags); + if 
(next_iwarp_state) { + nes_add_ref(&nesqp->ibqp); + nes_debug(NES_DBG_AEQ, "issuing hw modifyqp for QP%u. next state = 0x%08X," + " also added another reference\n", + nesqp->hwqp.qp_id, next_iwarp_state); + nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0); + } + nes_cm_disconn(nesqp); + } else { + if (async_event_id == NES_AEQE_AEID_LLP_FIN_RECEIVED) { + /* FIN Received but ib state not RTS, + close complete will be on its way */ + spin_unlock_irqrestore(&nesqp->lock, flags); + nes_rem_ref(&nesqp->ibqp); + return; + } + spin_unlock_irqrestore(&nesqp->lock, flags); + if (async_event_id == NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE) { + next_iwarp_state = NES_CQP_QP_IWARP_STATE_TERMINATE | 0x02000000; + nesqp->hw_tcp_state = NES_AEQE_TCP_STATE_CLOSED; + nes_debug(NES_DBG_AEQ, "issuing hw modifyqp for QP%u. next state = 0x%08X," + " also added another reference\n", + nesqp->hwqp.qp_id, next_iwarp_state); + nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0); + } + nes_cm_disconn(nesqp); + } + break; + case NES_AEQE_AEID_LLP_TERMINATE_RECEIVED: + nesqp = *((struct nes_qp **)&context); + spin_lock_irqsave(&nesqp->lock, flags); + nesqp->hw_iwarp_state = iwarp_state; + nesqp->hw_tcp_state = tcp_state; + nesqp->last_aeq = async_event_id; + spin_unlock_irqrestore(&nesqp->lock, flags); + nes_debug(NES_DBG_AEQ, "Processing an NES_AEQE_AEID_LLP_TERMINATE_RECEIVED" + " event on QP%u \n Q2 Data:\n", + nesqp->hwqp.qp_id); + if (nesqp->ibqp.event_handler) { + ibevent.device = nesqp->ibqp.device; + ibevent.element.qp = &nesqp->ibqp; + ibevent.event = IB_EVENT_QP_FATAL; + nesqp->ibqp.event_handler(&ibevent, nesqp->ibqp.qp_context); + } + if ((tcp_state == NES_AEQE_TCP_STATE_CLOSE_WAIT) || + ((nesqp->ibqp_state == IB_QPS_RTS)&& + (async_event_id == NES_AEQE_AEID_LLP_CONNECTION_RESET))) { + nes_add_ref(&nesqp->ibqp); + nes_cm_disconn(nesqp); + } else { + nesqp->in_disconnect = 0; + wake_up(&nesqp->kick_waitq); + } + break; + case NES_AEQE_AEID_LLP_TOO_MANY_RETRIES: + nesqp = *((struct 
nes_qp **)&context); + nes_add_ref(&nesqp->ibqp); + spin_lock_irqsave(&nesqp->lock, flags); + nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_ERROR; + nesqp->hw_tcp_state = NES_AEQE_TCP_STATE_CLOSED; + nesqp->last_aeq = async_event_id; + if (nesqp->cm_id) { + nes_debug(NES_DBG_AEQ, "Processing an NES_AEQE_AEID_LLP_TOO_MANY_RETRIES" + " event on QP%u, remote IP = 0x%08X \n", + nesqp->hwqp.qp_id, + ntohl(nesqp->cm_id->remote_addr.sin_addr.s_addr)); + } else { + nes_debug(NES_DBG_AEQ, "Processing an NES_AEQE_AEID_LLP_TOO_MANY_RETRIES" + " event on QP%u \n", + nesqp->hwqp.qp_id); + } + spin_unlock_irqrestore(&nesqp->lock, flags); + next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR | NES_CQP_QP_RESET; + nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0); + if (nesqp->ibqp.event_handler) { + ibevent.device = nesqp->ibqp.device; + ibevent.element.qp = &nesqp->ibqp; + ibevent.event = IB_EVENT_QP_FATAL; + nesqp->ibqp.event_handler(&ibevent, nesqp->ibqp.qp_context); + } + break; + case NES_AEQE_AEID_AMP_BAD_STAG_INDEX: + if (NES_AEQE_INBOUND_RDMA&aeq_info) { + nesqp = nesadapter->qp_table[le32_to_cpu( + aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX])-NES_FIRST_QPN]; + } else { + /* TODO: get the actual WQE and mask off wqe index */ + context &= ~((u64)511); + nesqp = *((struct nes_qp **)&context); + } + spin_lock_irqsave(&nesqp->lock, flags); + nesqp->hw_iwarp_state = iwarp_state; + nesqp->hw_tcp_state = tcp_state; + nesqp->last_aeq = async_event_id; + spin_unlock_irqrestore(&nesqp->lock, flags); + nes_debug(NES_DBG_AEQ, "Processing an NES_AEQE_AEID_AMP_BAD_STAG_INDEX event on QP%u\n", + nesqp->hwqp.qp_id); + if (nesqp->ibqp.event_handler) { + ibevent.device = nesqp->ibqp.device; + ibevent.element.qp = &nesqp->ibqp; + ibevent.event = IB_EVENT_QP_ACCESS_ERR; + nesqp->ibqp.event_handler(&ibevent, nesqp->ibqp.qp_context); + } + break; + case NES_AEQE_AEID_AMP_UNALLOCATED_STAG: + nesqp = *((struct nes_qp **)&context); + spin_lock_irqsave(&nesqp->lock, flags); + nesqp->hw_iwarp_state 
= iwarp_state; + nesqp->hw_tcp_state = tcp_state; + nesqp->last_aeq = async_event_id; + spin_unlock_irqrestore(&nesqp->lock, flags); + nes_debug(NES_DBG_AEQ, "Processing an NES_AEQE_AEID_AMP_UNALLOCATED_STAG event on QP%u\n", + nesqp->hwqp.qp_id); + if (nesqp->ibqp.event_handler) { + ibevent.device = nesqp->ibqp.device; + ibevent.element.qp = &nesqp->ibqp; + ibevent.event = IB_EVENT_QP_ACCESS_ERR; + nesqp->ibqp.event_handler(&ibevent, nesqp->ibqp.qp_context); + } + break; + case NES_AEQE_AEID_PRIV_OPERATION_DENIED: + nesqp = nesadapter->qp_table[le32_to_cpu(aeqe->aeqe_words + [NES_AEQE_COMP_QP_CQ_ID_IDX])-NES_FIRST_QPN]; + spin_lock_irqsave(&nesqp->lock, flags); + nesqp->hw_iwarp_state = iwarp_state; + nesqp->hw_tcp_state = tcp_state; + nesqp->last_aeq = async_event_id; + spin_unlock_irqrestore(&nesqp->lock, flags); + nes_debug(NES_DBG_AEQ, "Processing an NES_AEQE_AEID_PRIV_OPERATION_DENIED event on QP%u," + " nesqp = %p, AE reported %p\n", + nesqp->hwqp.qp_id, nesqp, *((struct nes_qp **)&context)); + if (nesqp->ibqp.event_handler) { + ibevent.device = nesqp->ibqp.device; + ibevent.element.qp = &nesqp->ibqp; + ibevent.event = IB_EVENT_QP_ACCESS_ERR; + nesqp->ibqp.event_handler(&ibevent, nesqp->ibqp.qp_context); + } + break; + case NES_AEQE_AEID_CQ_OPERATION_ERROR: + context <<= 1; + nes_debug(NES_DBG_AEQ, "Processing an NES_AEQE_AEID_CQ_OPERATION_ERROR event on CQ%u, %p\n", + le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]), (void *)(unsigned long)context); + resource_allocated = nes_is_resource_allocated(nesadapter, nesadapter->allocated_cqs, + le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX])); + if (resource_allocated) { + printk(KERN_ERR PFX "%s: Processing an NES_AEQE_AEID_CQ_OPERATION_ERROR event on CQ%u\n", + __FUNCTION__, le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX])); + } + break; + case NES_AEQE_AEID_DDP_UBE_DDP_MESSAGE_TOO_LONG_FOR_AVAILABLE_BUFFER: + nesqp = nesadapter->qp_table[le32_to_cpu( + 
aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX])-NES_FIRST_QPN]; + spin_lock_irqsave(&nesqp->lock, flags); + nesqp->hw_iwarp_state = iwarp_state; + nesqp->hw_tcp_state = tcp_state; + nesqp->last_aeq = async_event_id; + spin_unlock_irqrestore(&nesqp->lock, flags); + nes_debug(NES_DBG_AEQ, "Processing an NES_AEQE_AEID_DDP_UBE_DDP_MESSAGE_TOO_LONG" + "_FOR_AVAILABLE_BUFFER event on QP%u\n", + nesqp->hwqp.qp_id); + if (nesqp->ibqp.event_handler) { + ibevent.device = nesqp->ibqp.device; + ibevent.element.qp = &nesqp->ibqp; + ibevent.event = IB_EVENT_QP_ACCESS_ERR; + nesqp->ibqp.event_handler(&ibevent, nesqp->ibqp.qp_context); + } + /* tell cm to disconnect, cm will queue work to thread */ + nes_add_ref(&nesqp->ibqp); + nes_cm_disconn(nesqp); + break; + case NES_AEQE_AEID_DDP_UBE_INVALID_MSN_NO_BUFFER_AVAILABLE: + nesqp = *((struct nes_qp **)&context); + spin_lock_irqsave(&nesqp->lock, flags); + nesqp->hw_iwarp_state = iwarp_state; + nesqp->hw_tcp_state = tcp_state; + nesqp->last_aeq = async_event_id; + spin_unlock_irqrestore(&nesqp->lock, flags); + nes_debug(NES_DBG_AEQ, "Processing an NES_AEQE_AEID_DDP_UBE_INVALID_MSN" + "_NO_BUFFER_AVAILABLE event on QP%u\n", + nesqp->hwqp.qp_id); + if (nesqp->ibqp.event_handler) { + ibevent.device = nesqp->ibqp.device; + ibevent.element.qp = &nesqp->ibqp; + ibevent.event = IB_EVENT_QP_FATAL; + nesqp->ibqp.event_handler(&ibevent, nesqp->ibqp.qp_context); + } + /* tell cm to disconnect, cm will queue work to thread */ + nes_add_ref(&nesqp->ibqp); + nes_cm_disconn(nesqp); + break; + case NES_AEQE_AEID_LLP_RECEIVED_MPA_CRC_ERROR: + nesqp = *((struct nes_qp **)&context); + spin_lock_irqsave(&nesqp->lock, flags); + nesqp->hw_iwarp_state = iwarp_state; + nesqp->hw_tcp_state = tcp_state; + nesqp->last_aeq = async_event_id; + spin_unlock_irqrestore(&nesqp->lock, flags); + nes_debug(NES_DBG_AEQ, "Processing an NES_AEQE_AEID_LLP_RECEIVED_MPA_CRC_ERROR" + " event on QP%u \n Q2 Data:\n", + nesqp->hwqp.qp_id); + if (nesqp->ibqp.event_handler) { + 
ibevent.device = nesqp->ibqp.device; + ibevent.element.qp = &nesqp->ibqp; + ibevent.event = IB_EVENT_QP_FATAL; + nesqp->ibqp.event_handler(&ibevent, nesqp->ibqp.qp_context); + } + /* tell cm to disconnect, cm will queue work to thread */ + nes_add_ref(&nesqp->ibqp); + nes_cm_disconn(nesqp); + break; + /* TODO: additional AEs need to be here */ + default: + nes_debug(NES_DBG_AEQ, "Processing an iWARP related AE for QP, misc = 0x%04X\n", + async_event_id); + break; + } + +} + + +/** + * nes_iwarp_ce_handler + */ +void nes_iwarp_ce_handler(struct nes_device *nesdev, struct nes_hw_cq *hw_cq) +{ + struct nes_cq *nescq = container_of(hw_cq, struct nes_cq, hw_cq); + + /* nes_debug(NES_DBG_CQ, "Processing completion event for iWARP CQ%u.\n", + nescq->hw_cq.cq_number); */ + nes_write32(nesdev->regs+NES_CQ_ACK, nescq->hw_cq.cq_number); + + if (nescq->ibcq.comp_handler) + nescq->ibcq.comp_handler(&nescq->ibcq, nescq->ibcq.cq_context); + + return; +} + + +/** + * nes_manage_apbvt() + */ +int nes_manage_apbvt(struct nes_vnic *nesvnic, u32 accel_local_port, + u32 nic_index, u32 add_port) +{ + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_hw_cqp_wqe *cqp_wqe; + unsigned long flags; + struct nes_cqp_request *cqp_request; + int ret = 0; + u16 major_code; + + /* Send manage APBVT request to CQP */ + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + nes_debug(NES_DBG_QP, "Failed to get a cqp_request.\n"); + return -ENOMEM; + } + cqp_request->waiting = 1; + cqp_wqe = &cqp_request->cqp_wqe; + + nes_debug(NES_DBG_QP, "%s APBV for local port=%u(0x%04x), nic_index=%u\n", + (add_port == NES_MANAGE_APBVT_ADD) ? "ADD" : "DEL", + accel_local_port, accel_local_port, nic_index); + + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, (NES_CQP_MANAGE_APBVT | + ((add_port == NES_MANAGE_APBVT_ADD) ? 
NES_CQP_APBVT_ADD : 0))); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, + ((nic_index << NES_CQP_APBVT_NIC_SHIFT) | accel_local_port)); + + nes_debug(NES_DBG_QP, "Waiting for CQP completion for APBVT.\n"); + + atomic_set(&cqp_request->refcount, 2); + nes_post_cqp_request(nesdev, cqp_request, NES_CQP_REQUEST_RING_DOORBELL); + + if (add_port == NES_MANAGE_APBVT_ADD) + ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0), + NES_EVENT_TIMEOUT); + nes_debug(NES_DBG_QP, "Completed, ret=%u, CQP Major:Minor codes = 0x%04X:0x%04X\n", + ret, cqp_request->major_code, cqp_request->minor_code); + major_code = cqp_request->major_code; + if (atomic_dec_and_test(&cqp_request->refcount)) { + if (cqp_request->dynamic) { + kfree(cqp_request); + } else { + spin_lock_irqsave(&nesdev->cqp.lock, flags); + list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs); + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + } + } + if (!ret) + return -ETIME; + else if (major_code) + return -EIO; + else + return 0; +} + + +/** + * nes_manage_arp_cache - update the driver ARP table and post a CQP MANAGE_ARP_CACHE request + * @netdev: net device whose netdev_priv() is the nes_vnic that supplies nesdev + * @mac_addr: MAC address bytes mac_addr[0] through mac_addr[5]; handed to nes_arp_table() and, for NES_ARP_ADD, packed into the WQE + * @ip_addr: IP address handed to nes_arp_table() + * @action: handed to nes_arp_table(); NES_ARP_ADD sets NES_CQP_ARP_VALID and loads the MAC words, any other value zeroes them + * + * Returns nothing. Returns without logging when nes_arp_table() yields -1, and logs through nes_debug() when no cqp_request can be obtained. The request is posted with waiting = 0 and a refcount of 1; this function does not wait for it to complete. + */ +void nes_manage_arp_cache(struct net_device *netdev, unsigned char *mac_addr, + u32 ip_addr, u32 action) +{ + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev; + struct nes_cqp_request *cqp_request; + int arp_index; + + nesdev = nesvnic->nesdev; + arp_index = nes_arp_table(nesdev, ip_addr, mac_addr, action); + if (arp_index == -1) { + return; + } + + /* update the ARP entry */ + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + nes_debug(NES_DBG_NETDEV, "Failed to get a cqp_request.\n"); + return; + } + cqp_request->waiting = 0; + cqp_wqe = &cqp_request->cqp_wqe; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32( + NES_CQP_MANAGE_ARP_CACHE | NES_CQP_ARP_PERM); + /* OR in this PCI function number at NES_CQP_ARP_AEQ_INDEX_SHIFT */ + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] |= cpu_to_le32( + (u32)PCI_FUNC(nesdev->pcidev->devfn) << 
NES_CQP_ARP_AEQ_INDEX_SHIFT); + cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(arp_index); + + /* NES_ARP_ADD marks the entry valid and loads the MAC; every other action clears both MAC words */ + if (action == NES_ARP_ADD) { + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] |= cpu_to_le32(NES_CQP_ARP_VALID); + cqp_wqe->wqe_words[NES_CQP_ARP_WQE_MAC_ADDR_LOW_IDX] = cpu_to_le32( + (((u32)mac_addr[2]) << 24) | (((u32)mac_addr[3]) << 16) | + (((u32)mac_addr[4]) << 8) | (u32)mac_addr[5]); + /* NOTE(review): mac_addr[0] is shifted by 16 here, leaving bits 8-15 of the high word zero; a packed 16-bit high half would use a shift of 8 - confirm against the hardware ARP WQE layout */ + cqp_wqe->wqe_words[NES_CQP_ARP_WQE_MAC_HIGH_IDX] = cpu_to_le32( + (((u32)mac_addr[0]) << 16) | (u32)mac_addr[1]); + } else { + cqp_wqe->wqe_words[NES_CQP_ARP_WQE_MAC_ADDR_LOW_IDX] = 0; + cqp_wqe->wqe_words[NES_CQP_ARP_WQE_MAC_HIGH_IDX] = 0; + } + + nes_debug(NES_DBG_NETDEV, "Not waiting for CQP, cqp.sq_head=%u, cqp.sq_tail=%u\n", + nesdev->cqp.sq_head, nesdev->cqp.sq_tail); + + /* posted without a waiter: waiting is 0 and the request carries a single reference */ + atomic_set(&cqp_request->refcount, 1); + nes_post_cqp_request(nesdev, cqp_request, NES_CQP_REQUEST_RING_DOORBELL); +} + + +/** + * flush_wqes - post a CQP FLUSH_WQES request for one QP + * @nesdev: device the request is obtained from and posted to + * @nesqp: QP to flush; nesqp->hwqp.qp_id is written to the WQE ID word + * @which_wq: bits OR-ed into the opcode word with NES_CQP_FLUSH_WQES (presumably NES_CQP_FLUSH_SQ and/or NES_CQP_FLUSH_RQ - confirm against callers) + * @wait_completion: non-zero to sleep, for at most NES_EVENT_TIMEOUT, until request_done is set + * + * Returns nothing. A failed request allocation is only logged through nes_debug(), and a timed-out wait is visible only as ret in the debug output. NOTE(review): the completion debug text always mentions SQ, whatever which_wq selects. + */ +void flush_wqes(struct nes_device *nesdev, struct nes_qp *nesqp, + u32 which_wq, u32 wait_completion) +{ + unsigned long flags; + struct nes_cqp_request *cqp_request; + struct nes_hw_cqp_wqe *cqp_wqe; + int ret; + + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + nes_debug(NES_DBG_QP, "Failed to get a cqp_request.\n"); + return; + } + /* a waiting submitter sets refcount to 2 before posting (presumably one reference for this function and one for the completion path - confirm); the non-waiting path leaves refcount untouched */ + if (wait_completion) { + cqp_request->waiting = 1; + atomic_set(&cqp_request->refcount, 2); + } else { + cqp_request->waiting = 0; + } + cqp_wqe = &cqp_request->cqp_wqe; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = + cpu_to_le32(NES_CQP_FLUSH_WQES | which_wq); + cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(nesqp->hwqp.qp_id); + + nes_post_cqp_request(nesdev, cqp_request, NES_CQP_REQUEST_RING_DOORBELL); + + if (wait_completion) { + /* Wait for CQP */ + ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0), + NES_EVENT_TIMEOUT); + nes_debug(NES_DBG_QP, "Flush SQ QP WQEs completed, ret=%u," + " CQP Major:Minor codes = 
0x%04X:0x%04X\n", + ret, cqp_request->major_code, cqp_request->minor_code); + /* drop our reference; if it was the last one, kfree() a dynamic request or put it back on cqp_avail_reqs under cqp.lock */ + if (atomic_dec_and_test(&cqp_request->refcount)) { + if (cqp_request->dynamic) { + kfree(cqp_request); + } else { + spin_lock_irqsave(&nesdev->cqp.lock, flags); + list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs); + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + } + } + } +} diff --git a/drivers/infiniband/hw/nes/nes_hw.h b/drivers/infiniband/hw/nes/nes_hw.h new file mode 100644 index 0000000..1e10df5 --- /dev/null +++ b/drivers/infiniband/hw/nes/nes_hw.h @@ -0,0 +1,1206 @@ +/* +* Copyright (c) 2006 - 2008 NetEffect, Inc. All rights reserved. +* +* This software is available to you under a choice of one of two +* licenses. You may choose to be licensed under the terms of the GNU +* General Public License (GPL) Version 2, available from the file +* COPYING in the main directory of this source tree, or the +* OpenIB.org BSD license below: +* +* Redistribution and use in source and binary forms, with or +* without modification, are permitted provided that the following +* conditions are met: +* +* - Redistributions of source code must retain the above +* copyright notice, this list of conditions and the following +* disclaimer. +* +* - Redistributions in binary form must reproduce the above +* copyright notice, this list of conditions and the following +* disclaimer in the documentation and/or other materials +* provided with the distribution. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ + +#ifndef __NES_HW_H +#define __NES_HW_H + +#define NES_PHY_TYPE_1G 2 +#define NES_PHY_TYPE_IRIS 3 +#define NES_PHY_TYPE_PUMA_10G 6 + +#define NES_MULTICAST_PF_MAX 8 + +enum pci_regs { + NES_INT_STAT = 0x0000, + NES_INT_MASK = 0x0004, + NES_INT_PENDING = 0x0008, + NES_INTF_INT_STAT = 0x000C, + NES_INTF_INT_MASK = 0x0010, + NES_TIMER_STAT = 0x0014, + NES_PERIODIC_CONTROL = 0x0018, + NES_ONE_SHOT_CONTROL = 0x001C, + NES_EEPROM_COMMAND = 0x0020, + NES_EEPROM_DATA = 0x0024, + NES_FLASH_COMMAND = 0x0028, + NES_FLASH_DATA = 0x002C, + NES_SOFTWARE_RESET = 0x0030, + NES_CQ_ACK = 0x0034, + NES_WQE_ALLOC = 0x0040, + NES_CQE_ALLOC = 0x0044, +}; + +enum indexed_regs { + NES_IDX_CREATE_CQP_LOW = 0x0000, + NES_IDX_CREATE_CQP_HIGH = 0x0004, + NES_IDX_QP_CONTROL = 0x0040, + NES_IDX_FLM_CONTROL = 0x0080, + NES_IDX_INT_CPU_STATUS = 0x00a0, + NES_IDX_GPIO_CONTROL = 0x00f0, + NES_IDX_GPIO_DATA = 0x00f4, + NES_IDX_TCP_CONFIG0 = 0x01e4, + NES_IDX_TCP_TIMER_CONFIG = 0x01ec, + NES_IDX_TCP_NOW = 0x01f0, + NES_IDX_QP_MAX_CFG_SIZES = 0x0200, + NES_IDX_QP_CTX_SIZE = 0x0218, + NES_IDX_TCP_TIMER_SIZE0 = 0x0238, + NES_IDX_TCP_TIMER_SIZE1 = 0x0240, + NES_IDX_ARP_CACHE_SIZE = 0x0258, + NES_IDX_CQ_CTX_SIZE = 0x0260, + NES_IDX_MRT_SIZE = 0x0278, + NES_IDX_PBL_REGION_SIZE = 0x0280, + NES_IDX_IRRQ_COUNT = 0x02b0, + NES_IDX_RX_WINDOW_BUFFER_PAGE_TABLE_SIZE = 0x02f0, + NES_IDX_RX_WINDOW_BUFFER_SIZE = 0x0300, + NES_IDX_DST_IP_ADDR = 0x0400, + NES_IDX_PCIX_DIAG = 0x08e8, + NES_IDX_MPP_DEBUG = 0x0a00, + NES_IDX_PORT_RX_DISCARDS = 0x0a30, + NES_IDX_PORT_TX_DISCARDS = 0x0a34, + NES_IDX_MPP_LB_DEBUG = 0x0b00, + NES_IDX_DENALI_CTL_22 = 0x1058, + NES_IDX_MAC_TX_CONTROL = 0x2000, + NES_IDX_MAC_TX_CONFIG = 0x2004, + NES_IDX_MAC_TX_PAUSE_QUANTA = 0x2008, + NES_IDX_MAC_RX_CONTROL = 0x200c, + NES_IDX_MAC_RX_CONFIG = 0x2010, + NES_IDX_MAC_EXACT_MATCH_BOTTOM = 0x201c, + NES_IDX_MAC_MDIO_CONTROL = 0x2084, + NES_IDX_MAC_TX_OCTETS_LOW = 0x2100, + NES_IDX_MAC_TX_OCTETS_HIGH = 0x2104, + NES_IDX_MAC_TX_FRAMES_LOW = 
0x2108, + NES_IDX_MAC_TX_FRAMES_HIGH = 0x210c, + NES_IDX_MAC_TX_PAUSE_FRAMES = 0x2118, + NES_IDX_MAC_TX_ERRORS = 0x2138, + NES_IDX_MAC_RX_OCTETS_LOW = 0x213c, + NES_IDX_MAC_RX_OCTETS_HIGH = 0x2140, + NES_IDX_MAC_RX_FRAMES_LOW = 0x2144, + NES_IDX_MAC_RX_FRAMES_HIGH = 0x2148, + NES_IDX_MAC_RX_BC_FRAMES_LOW = 0x214c, + NES_IDX_MAC_RX_MC_FRAMES_HIGH = 0x2150, + NES_IDX_MAC_RX_PAUSE_FRAMES = 0x2154, + NES_IDX_MAC_RX_SHORT_FRAMES = 0x2174, + NES_IDX_MAC_RX_OVERSIZED_FRAMES = 0x2178, + NES_IDX_MAC_RX_JABBER_FRAMES = 0x217c, + NES_IDX_MAC_RX_CRC_ERR_FRAMES = 0x2180, + NES_IDX_MAC_RX_LENGTH_ERR_FRAMES = 0x2184, + NES_IDX_MAC_RX_SYMBOL_ERR_FRAMES = 0x2188, + NES_IDX_MAC_INT_STATUS = 0x21f0, + NES_IDX_MAC_INT_MASK = 0x21f4, + NES_IDX_PHY_PCS_CONTROL_STATUS0 = 0x2800, + NES_IDX_PHY_PCS_CONTROL_STATUS1 = 0x2a00, + NES_IDX_ETH_SERDES_COMMON_CONTROL0 = 0x2808, + NES_IDX_ETH_SERDES_COMMON_CONTROL1 = 0x2a08, + NES_IDX_ETH_SERDES_COMMON_STATUS0 = 0x280c, + NES_IDX_ETH_SERDES_COMMON_STATUS1 = 0x2a0c, + NES_IDX_ETH_SERDES_TX_EMP0 = 0x2810, + NES_IDX_ETH_SERDES_TX_EMP1 = 0x2a10, + NES_IDX_ETH_SERDES_TX_DRIVE0 = 0x2814, + NES_IDX_ETH_SERDES_TX_DRIVE1 = 0x2a14, + NES_IDX_ETH_SERDES_RX_MODE0 = 0x2818, + NES_IDX_ETH_SERDES_RX_MODE1 = 0x2a18, + NES_IDX_ETH_SERDES_RX_SIGDET0 = 0x281c, + NES_IDX_ETH_SERDES_RX_SIGDET1 = 0x2a1c, + NES_IDX_ETH_SERDES_BYPASS0 = 0x2820, + NES_IDX_ETH_SERDES_BYPASS1 = 0x2a20, + NES_IDX_ETH_SERDES_LOOPBACK_CONTROL0 = 0x2824, + NES_IDX_ETH_SERDES_LOOPBACK_CONTROL1 = 0x2a24, + NES_IDX_ETH_SERDES_RX_EQ_CONTROL0 = 0x2828, + NES_IDX_ETH_SERDES_RX_EQ_CONTROL1 = 0x2a28, + NES_IDX_ETH_SERDES_RX_EQ_STATUS0 = 0x282c, + NES_IDX_ETH_SERDES_RX_EQ_STATUS1 = 0x2a2c, + NES_IDX_ETH_SERDES_CDR_RESET0 = 0x2830, + NES_IDX_ETH_SERDES_CDR_RESET1 = 0x2a30, + NES_IDX_ETH_SERDES_CDR_CONTROL0 = 0x2834, + NES_IDX_ETH_SERDES_CDR_CONTROL1 = 0x2a34, + NES_IDX_ETH_SERDES_TX_HIGHZ_LANE_MODE0 = 0x2838, + NES_IDX_ETH_SERDES_TX_HIGHZ_LANE_MODE1 = 0x2a38, + NES_IDX_ENDNODE0_NSTAT_RX_DISCARD = 0x3080, 
+ NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_LO = 0x3000, + NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_HI = 0x3004, + NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_LO = 0x3008, + NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_HI = 0x300c, + NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_LO = 0x7000, + NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_HI = 0x7004, + NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_LO = 0x7008, + NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_HI = 0x700c, + NES_IDX_CM_CONFIG = 0x5100, + NES_IDX_NIC_LOGPORT_TO_PHYPORT = 0x6000, + NES_IDX_NIC_PHYPORT_TO_USW = 0x6008, + NES_IDX_NIC_ACTIVE = 0x6010, + NES_IDX_NIC_UNICAST_ALL = 0x6018, + NES_IDX_NIC_MULTICAST_ALL = 0x6020, + NES_IDX_NIC_MULTICAST_ENABLE = 0x6028, + NES_IDX_NIC_BROADCAST_ON = 0x6030, + NES_IDX_USED_CHUNKS_TX = 0x60b0, + NES_IDX_TX_POOL_SIZE = 0x60b8, + NES_IDX_QUAD_HASH_TABLE_SIZE = 0x6148, + NES_IDX_PERFECT_FILTER_LOW = 0x6200, + NES_IDX_PERFECT_FILTER_HIGH = 0x6204, + NES_IDX_IPV4_TCP_REXMITS = 0x7080, + NES_IDX_DEBUG_ERROR_CONTROL_STATUS = 0x913c, + NES_IDX_DEBUG_ERROR_MASKS0 = 0x9140, + NES_IDX_DEBUG_ERROR_MASKS1 = 0x9144, + NES_IDX_DEBUG_ERROR_MASKS2 = 0x9148, + NES_IDX_DEBUG_ERROR_MASKS3 = 0x914c, + NES_IDX_DEBUG_ERROR_MASKS4 = 0x9150, + NES_IDX_DEBUG_ERROR_MASKS5 = 0x9154, +}; + +#define NES_IDX_MAC_TX_CONFIG_ENABLE_PAUSE 1 +#define NES_IDX_MPP_DEBUG_PORT_DISABLE_PAUSE (1 << 17) + +enum nes_cqp_opcodes { + NES_CQP_CREATE_QP = 0x00, + NES_CQP_MODIFY_QP = 0x01, + NES_CQP_DESTROY_QP = 0x02, + NES_CQP_CREATE_CQ = 0x03, + NES_CQP_MODIFY_CQ = 0x04, + NES_CQP_DESTROY_CQ = 0x05, + NES_CQP_ALLOCATE_STAG = 0x09, + NES_CQP_REGISTER_STAG = 0x0a, + NES_CQP_QUERY_STAG = 0x0b, + NES_CQP_REGISTER_SHARED_STAG = 0x0c, + NES_CQP_DEALLOCATE_STAG = 0x0d, + NES_CQP_MANAGE_ARP_CACHE = 0x0f, + NES_CQP_SUSPEND_QPS = 0x11, + NES_CQP_UPLOAD_CONTEXT = 0x13, + NES_CQP_CREATE_CEQ = 0x16, + NES_CQP_DESTROY_CEQ = 0x18, + NES_CQP_CREATE_AEQ = 0x19, + NES_CQP_DESTROY_AEQ = 0x1b, + NES_CQP_LMI_ACCESS = 0x20, + NES_CQP_FLUSH_WQES = 0x22, + NES_CQP_MANAGE_APBVT = 0x23 +}; + +enum nes_cqp_wqe_word_idx { + 
NES_CQP_WQE_OPCODE_IDX = 0, + NES_CQP_WQE_ID_IDX = 1, + NES_CQP_WQE_COMP_CTX_LOW_IDX = 2, + NES_CQP_WQE_COMP_CTX_HIGH_IDX = 3, + NES_CQP_WQE_COMP_SCRATCH_LOW_IDX = 4, + NES_CQP_WQE_COMP_SCRATCH_HIGH_IDX = 5, +}; + +enum nes_cqp_cq_wqeword_idx { + NES_CQP_CQ_WQE_PBL_LOW_IDX = 6, + NES_CQP_CQ_WQE_PBL_HIGH_IDX = 7, + NES_CQP_CQ_WQE_CQ_CONTEXT_LOW_IDX = 8, + NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX = 9, + NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX = 10, +}; + +enum nes_cqp_stag_wqeword_idx { + NES_CQP_STAG_WQE_PBL_BLK_COUNT_IDX = 1, + NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX = 6, + NES_CQP_STAG_WQE_LEN_LOW_IDX = 7, + NES_CQP_STAG_WQE_STAG_IDX = 8, + NES_CQP_STAG_WQE_VA_LOW_IDX = 10, + NES_CQP_STAG_WQE_VA_HIGH_IDX = 11, + NES_CQP_STAG_WQE_PA_LOW_IDX = 12, + NES_CQP_STAG_WQE_PA_HIGH_IDX = 13, + NES_CQP_STAG_WQE_PBL_LEN_IDX = 14 +}; + +#define NES_CQP_OP_IWARP_STATE_SHIFT 28 + +enum nes_cqp_qp_bits { + NES_CQP_QP_ARP_VALID = (1<<8), + NES_CQP_QP_WINBUF_VALID = (1<<9), + NES_CQP_QP_CONTEXT_VALID = (1<<10), + NES_CQP_QP_ORD_VALID = (1<<11), + NES_CQP_QP_WINBUF_DATAIND_EN = (1<<12), + NES_CQP_QP_VIRT_WQS = (1<<13), + NES_CQP_QP_DEL_HTE = (1<<14), + NES_CQP_QP_CQS_VALID = (1<<15), + NES_CQP_QP_TYPE_TSA = 0, + NES_CQP_QP_TYPE_IWARP = (1<<16), + NES_CQP_QP_TYPE_CQP = (4<<16), + NES_CQP_QP_TYPE_NIC = (5<<16), + NES_CQP_QP_MSS_CHG = (1<<20), + NES_CQP_QP_STATIC_RESOURCES = (1<<21), + NES_CQP_QP_IGNORE_MW_BOUND = (1<<22), + NES_CQP_QP_VWQ_USE_LMI = (1<<23), + NES_CQP_QP_IWARP_STATE_IDLE = (1<<NES_CQP_OP_IWARP_STATE_SHIFT), + NES_CQP_QP_IWARP_STATE_RTS = (2<<NES_CQP_OP_IWARP_STATE_SHIFT), + NES_CQP_QP_IWARP_STATE_CLOSING = (3<<NES_CQP_OP_IWARP_STATE_SHIFT), + NES_CQP_QP_IWARP_STATE_TERMINATE = (5<<NES_CQP_OP_IWARP_STATE_SHIFT), + NES_CQP_QP_IWARP_STATE_ERROR = (6<<NES_CQP_OP_IWARP_STATE_SHIFT), + NES_CQP_QP_IWARP_STATE_MASK = (7<<NES_CQP_OP_IWARP_STATE_SHIFT), + NES_CQP_QP_RESET = (1<<31), +}; + +enum nes_cqp_qp_wqe_word_idx { + NES_CQP_QP_WQE_CONTEXT_LOW_IDX = 6, + NES_CQP_QP_WQE_CONTEXT_HIGH_IDX = 
7, + NES_CQP_QP_WQE_NEW_MSS_IDX = 15, +}; + +enum nes_nic_ctx_bits { + NES_NIC_CTX_RQ_SIZE_32 = (3<<8), + NES_NIC_CTX_RQ_SIZE_512 = (3<<8), + NES_NIC_CTX_SQ_SIZE_32 = (1<<10), + NES_NIC_CTX_SQ_SIZE_512 = (3<<10), +}; + +enum nes_nic_qp_ctx_word_idx { + NES_NIC_CTX_MISC_IDX = 0, + NES_NIC_CTX_SQ_LOW_IDX = 2, + NES_NIC_CTX_SQ_HIGH_IDX = 3, + NES_NIC_CTX_RQ_LOW_IDX = 4, + NES_NIC_CTX_RQ_HIGH_IDX = 5, +}; + +enum nes_cqp_cq_bits { + NES_CQP_CQ_CEQE_MASK = (1<<9), + NES_CQP_CQ_CEQ_VALID = (1<<10), + NES_CQP_CQ_RESIZE = (1<<11), + NES_CQP_CQ_CHK_OVERFLOW = (1<<12), + NES_CQP_CQ_4KB_CHUNK = (1<<14), + NES_CQP_CQ_VIRT = (1<<15), +}; + +enum nes_cqp_stag_bits { + NES_CQP_STAG_VA_TO = (1<<9), + NES_CQP_STAG_DEALLOC_PBLS = (1<<10), + NES_CQP_STAG_PBL_BLK_SIZE = (1<<11), + NES_CQP_STAG_MR = (1<<13), + NES_CQP_STAG_RIGHTS_LOCAL_READ = (1<<16), + NES_CQP_STAG_RIGHTS_LOCAL_WRITE = (1<<17), + NES_CQP_STAG_RIGHTS_REMOTE_READ = (1<<18), + NES_CQP_STAG_RIGHTS_REMOTE_WRITE = (1<<19), + NES_CQP_STAG_RIGHTS_WINDOW_BIND = (1<<20), + NES_CQP_STAG_REM_ACC_EN = (1<<21), + NES_CQP_STAG_LEAVE_PENDING = (1<<31), +}; + +enum nes_cqp_ceq_wqeword_idx { + NES_CQP_CEQ_WQE_ELEMENT_COUNT_IDX = 1, + NES_CQP_CEQ_WQE_PBL_LOW_IDX = 6, + NES_CQP_CEQ_WQE_PBL_HIGH_IDX = 7, +}; + +enum nes_cqp_ceq_bits { + NES_CQP_CEQ_4KB_CHUNK = (1<<14), + NES_CQP_CEQ_VIRT = (1<<15), +}; + +enum nes_cqp_aeq_wqeword_idx { + NES_CQP_AEQ_WQE_ELEMENT_COUNT_IDX = 1, + NES_CQP_AEQ_WQE_PBL_LOW_IDX = 6, + NES_CQP_AEQ_WQE_PBL_HIGH_IDX = 7, +}; + +enum nes_cqp_aeq_bits { + NES_CQP_AEQ_4KB_CHUNK = (1<<14), + NES_CQP_AEQ_VIRT = (1<<15), +}; + +enum nes_cqp_lmi_wqeword_idx { + NES_CQP_LMI_WQE_LMI_OFFSET_IDX = 1, + NES_CQP_LMI_WQE_FRAG_LOW_IDX = 8, + NES_CQP_LMI_WQE_FRAG_HIGH_IDX = 9, + NES_CQP_LMI_WQE_FRAG_LEN_IDX = 10, +}; + +enum nes_cqp_arp_wqeword_idx { + NES_CQP_ARP_WQE_MAC_ADDR_LOW_IDX = 6, + NES_CQP_ARP_WQE_MAC_HIGH_IDX = 7, + NES_CQP_ARP_WQE_REACHABILITY_MAX_IDX = 1, +}; + +enum nes_cqp_upload_wqeword_idx { + 
NES_CQP_UPLOAD_WQE_CTXT_LOW_IDX = 6, + NES_CQP_UPLOAD_WQE_CTXT_HIGH_IDX = 7, + NES_CQP_UPLOAD_WQE_HTE_IDX = 8, +}; + +enum nes_cqp_arp_bits { + NES_CQP_ARP_VALID = (1<<8), + NES_CQP_ARP_PERM = (1<<9), +}; + +enum nes_cqp_flush_bits { + NES_CQP_FLUSH_SQ = (1<<30), + NES_CQP_FLUSH_RQ = (1<<31), +}; + +enum nes_cqe_opcode_bits { + NES_CQE_STAG_VALID = (1<<6), + NES_CQE_ERROR = (1<<7), + NES_CQE_SQ = (1<<8), + NES_CQE_SE = (1<<9), + NES_CQE_PSH = (1<<29), + NES_CQE_FIN = (1<<30), + NES_CQE_VALID = (1<<31), +}; + + +enum nes_cqe_word_idx { + NES_CQE_PAYLOAD_LENGTH_IDX = 0, + NES_CQE_COMP_COMP_CTX_LOW_IDX = 2, + NES_CQE_COMP_COMP_CTX_HIGH_IDX = 3, + NES_CQE_INV_STAG_IDX = 4, + NES_CQE_QP_ID_IDX = 5, + NES_CQE_ERROR_CODE_IDX = 6, + NES_CQE_OPCODE_IDX = 7, +}; + +enum nes_ceqe_word_idx { + NES_CEQE_CQ_CTX_LOW_IDX = 0, + NES_CEQE_CQ_CTX_HIGH_IDX = 1, +}; + +enum nes_ceqe_status_bit { + NES_CEQE_VALID = (1<<31), +}; + +enum nes_int_bits { + NES_INT_CEQ0 = (1<<0), + NES_INT_CEQ1 = (1<<1), + NES_INT_CEQ2 = (1<<2), + NES_INT_CEQ3 = (1<<3), + NES_INT_CEQ4 = (1<<4), + NES_INT_CEQ5 = (1<<5), + NES_INT_CEQ6 = (1<<6), + NES_INT_CEQ7 = (1<<7), + NES_INT_CEQ8 = (1<<8), + NES_INT_CEQ9 = (1<<9), + NES_INT_CEQ10 = (1<<10), + NES_INT_CEQ11 = (1<<11), + NES_INT_CEQ12 = (1<<12), + NES_INT_CEQ13 = (1<<13), + NES_INT_CEQ14 = (1<<14), + NES_INT_CEQ15 = (1<<15), + NES_INT_AEQ0 = (1<<16), + NES_INT_AEQ1 = (1<<17), + NES_INT_AEQ2 = (1<<18), + NES_INT_AEQ3 = (1<<19), + NES_INT_AEQ4 = (1<<20), + NES_INT_AEQ5 = (1<<21), + NES_INT_AEQ6 = (1<<22), + NES_INT_AEQ7 = (1<<23), + NES_INT_MAC0 = (1<<24), + NES_INT_MAC1 = (1<<25), + NES_INT_MAC2 = (1<<26), + NES_INT_MAC3 = (1<<27), + NES_INT_TSW = (1<<28), + NES_INT_TIMER = (1<<29), + NES_INT_INTF = (1<<30), +}; + +enum nes_intf_int_bits { + NES_INTF_INT_PCIERR = (1<<0), + NES_INTF_PERIODIC_TIMER = (1<<2), + NES_INTF_ONE_SHOT_TIMER = (1<<3), + NES_INTF_INT_CRITERR = (1<<14), + NES_INTF_INT_AEQ0_OFLOW = (1<<16), + NES_INTF_INT_AEQ1_OFLOW = (1<<17), + 
NES_INTF_INT_AEQ2_OFLOW = (1<<18), + NES_INTF_INT_AEQ3_OFLOW = (1<<19), + NES_INTF_INT_AEQ4_OFLOW = (1<<20), + NES_INTF_INT_AEQ5_OFLOW = (1<<21), + NES_INTF_INT_AEQ6_OFLOW = (1<<22), + NES_INTF_INT_AEQ7_OFLOW = (1<<23), + NES_INTF_INT_AEQ_OFLOW = (0xff<<16), +}; + +enum nes_mac_int_bits { + NES_MAC_INT_LINK_STAT_CHG = (1<<1), + NES_MAC_INT_XGMII_EXT = (1<<2), + NES_MAC_INT_TX_UNDERFLOW = (1<<6), + NES_MAC_INT_TX_ERROR = (1<<7), +}; + +enum nes_cqe_allocate_bits { + NES_CQE_ALLOC_INC_SELECT = (1<<28), + NES_CQE_ALLOC_NOTIFY_NEXT = (1<<29), + NES_CQE_ALLOC_NOTIFY_SE = (1<<30), + NES_CQE_ALLOC_RESET = (1<<31), +}; + +enum nes_nic_rq_wqe_word_idx { + NES_NIC_RQ_WQE_LENGTH_1_0_IDX = 0, + NES_NIC_RQ_WQE_LENGTH_3_2_IDX = 1, + NES_NIC_RQ_WQE_FRAG0_LOW_IDX = 2, + NES_NIC_RQ_WQE_FRAG0_HIGH_IDX = 3, + NES_NIC_RQ_WQE_FRAG1_LOW_IDX = 4, + NES_NIC_RQ_WQE_FRAG1_HIGH_IDX = 5, + NES_NIC_RQ_WQE_FRAG2_LOW_IDX = 6, + NES_NIC_RQ_WQE_FRAG2_HIGH_IDX = 7, + NES_NIC_RQ_WQE_FRAG3_LOW_IDX = 8, + NES_NIC_RQ_WQE_FRAG3_HIGH_IDX = 9, +}; + +enum nes_nic_sq_wqe_word_idx { + NES_NIC_SQ_WQE_MISC_IDX = 0, + NES_NIC_SQ_WQE_TOTAL_LENGTH_IDX = 1, + NES_NIC_SQ_WQE_LSO_INFO_IDX = 2, + NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX = 3, + NES_NIC_SQ_WQE_LENGTH_2_1_IDX = 4, + NES_NIC_SQ_WQE_LENGTH_4_3_IDX = 5, + NES_NIC_SQ_WQE_FRAG0_LOW_IDX = 6, + NES_NIC_SQ_WQE_FRAG0_HIGH_IDX = 7, + NES_NIC_SQ_WQE_FRAG1_LOW_IDX = 8, + NES_NIC_SQ_WQE_FRAG1_HIGH_IDX = 9, + NES_NIC_SQ_WQE_FRAG2_LOW_IDX = 10, + NES_NIC_SQ_WQE_FRAG2_HIGH_IDX = 11, + NES_NIC_SQ_WQE_FRAG3_LOW_IDX = 12, + NES_NIC_SQ_WQE_FRAG3_HIGH_IDX = 13, + NES_NIC_SQ_WQE_FRAG4_LOW_IDX = 14, + NES_NIC_SQ_WQE_FRAG4_HIGH_IDX = 15, +}; + +enum nes_iwarp_sq_wqe_word_idx { + NES_IWARP_SQ_WQE_MISC_IDX = 0, + NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX = 1, + NES_IWARP_SQ_WQE_COMP_CTX_LOW_IDX = 2, + NES_IWARP_SQ_WQE_COMP_CTX_HIGH_IDX = 3, + NES_IWARP_SQ_WQE_COMP_SCRATCH_LOW_IDX = 4, + NES_IWARP_SQ_WQE_COMP_SCRATCH_HIGH_IDX = 5, + NES_IWARP_SQ_WQE_INV_STAG_LOW_IDX = 7, + 
NES_IWARP_SQ_WQE_RDMA_TO_LOW_IDX = 8, + NES_IWARP_SQ_WQE_RDMA_TO_HIGH_IDX = 9, + NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX = 10, + NES_IWARP_SQ_WQE_RDMA_STAG_IDX = 11, + NES_IWARP_SQ_WQE_IMM_DATA_START_IDX = 12, + NES_IWARP_SQ_WQE_FRAG0_LOW_IDX = 16, + NES_IWARP_SQ_WQE_FRAG0_HIGH_IDX = 17, + NES_IWARP_SQ_WQE_LENGTH0_IDX = 18, + NES_IWARP_SQ_WQE_STAG0_IDX = 19, + NES_IWARP_SQ_WQE_FRAG1_LOW_IDX = 20, + NES_IWARP_SQ_WQE_FRAG1_HIGH_IDX = 21, + NES_IWARP_SQ_WQE_LENGTH1_IDX = 22, + NES_IWARP_SQ_WQE_STAG1_IDX = 23, + NES_IWARP_SQ_WQE_FRAG2_LOW_IDX = 24, + NES_IWARP_SQ_WQE_FRAG2_HIGH_IDX = 25, + NES_IWARP_SQ_WQE_LENGTH2_IDX = 26, + NES_IWARP_SQ_WQE_STAG2_IDX = 27, + NES_IWARP_SQ_WQE_FRAG3_LOW_IDX = 28, + NES_IWARP_SQ_WQE_FRAG3_HIGH_IDX = 29, + NES_IWARP_SQ_WQE_LENGTH3_IDX = 30, + NES_IWARP_SQ_WQE_STAG3_IDX = 31, +}; + +enum nes_iwarp_sq_bind_wqe_word_idx { + NES_IWARP_SQ_BIND_WQE_MR_IDX = 6, + NES_IWARP_SQ_BIND_WQE_MW_IDX = 7, + NES_IWARP_SQ_BIND_WQE_LENGTH_LOW_IDX = 8, + NES_IWARP_SQ_BIND_WQE_LENGTH_HIGH_IDX = 9, + NES_IWARP_SQ_BIND_WQE_VA_FBO_LOW_IDX = 10, + NES_IWARP_SQ_BIND_WQE_VA_FBO_HIGH_IDX = 11, +}; + +enum nes_iwarp_sq_fmr_wqe_word_idx { + NES_IWARP_SQ_FMR_WQE_MR_STAG_IDX = 7, + NES_IWARP_SQ_FMR_WQE_LENGTH_LOW_IDX = 8, + NES_IWARP_SQ_FMR_WQE_LENGTH_HIGH_IDX = 9, + NES_IWARP_SQ_FMR_WQE_VA_FBO_LOW_IDX = 10, + NES_IWARP_SQ_FMR_WQE_VA_FBO_HIGH_IDX = 11, + NES_IWARP_SQ_FMR_WQE_PBL_ADDR_LOW_IDX = 12, + NES_IWARP_SQ_FMR_WQE_PBL_ADDR_HIGH_IDX = 13, + NES_IWARP_SQ_FMR_WQE_PBL_LENGTH_IDX = 14, +}; + +enum nes_iwarp_sq_locinv_wqe_word_idx { + NES_IWARP_SQ_LOCINV_WQE_INV_STAG_IDX = 6, +}; + + +enum nes_iwarp_rq_wqe_word_idx { + NES_IWARP_RQ_WQE_TOTAL_PAYLOAD_IDX = 1, + NES_IWARP_RQ_WQE_COMP_CTX_LOW_IDX = 2, + NES_IWARP_RQ_WQE_COMP_CTX_HIGH_IDX = 3, + NES_IWARP_RQ_WQE_COMP_SCRATCH_LOW_IDX = 4, + NES_IWARP_RQ_WQE_COMP_SCRATCH_HIGH_IDX = 5, + NES_IWARP_RQ_WQE_FRAG0_LOW_IDX = 8, + NES_IWARP_RQ_WQE_FRAG0_HIGH_IDX = 9, + NES_IWARP_RQ_WQE_LENGTH0_IDX = 10, + NES_IWARP_RQ_WQE_STAG0_IDX = 11, 
+ NES_IWARP_RQ_WQE_FRAG1_LOW_IDX = 12, + NES_IWARP_RQ_WQE_FRAG1_HIGH_IDX = 13, + NES_IWARP_RQ_WQE_LENGTH1_IDX = 14, + NES_IWARP_RQ_WQE_STAG1_IDX = 15, + NES_IWARP_RQ_WQE_FRAG2_LOW_IDX = 16, + NES_IWARP_RQ_WQE_FRAG2_HIGH_IDX = 17, + NES_IWARP_RQ_WQE_LENGTH2_IDX = 18, + NES_IWARP_RQ_WQE_STAG2_IDX = 19, + NES_IWARP_RQ_WQE_FRAG3_LOW_IDX = 20, + NES_IWARP_RQ_WQE_FRAG3_HIGH_IDX = 21, + NES_IWARP_RQ_WQE_LENGTH3_IDX = 22, + NES_IWARP_RQ_WQE_STAG3_IDX = 23, +}; + +enum nes_nic_sq_wqe_bits { + NES_NIC_SQ_WQE_PHDR_CS_READY = (1<<21), + NES_NIC_SQ_WQE_LSO_ENABLE = (1<<22), + NES_NIC_SQ_WQE_TAGVALUE_ENABLE = (1<<23), + NES_NIC_SQ_WQE_DISABLE_CHKSUM = (1<<30), + NES_NIC_SQ_WQE_COMPLETION = (1<<31), +}; + +enum nes_nic_cqe_word_idx { + NES_NIC_CQE_ACCQP_ID_IDX = 0, + NES_NIC_CQE_TAG_PKT_TYPE_IDX = 2, + NES_NIC_CQE_MISC_IDX = 3, +}; + +#define NES_PKT_TYPE_APBVT_BITS 0xC112 +#define NES_PKT_TYPE_APBVT_MASK 0xff3e + +#define NES_PKT_TYPE_PVALID_BITS 0x10000000 +#define NES_PKT_TYPE_PVALID_MASK 0x30000000 + +#define NES_PKT_TYPE_TCPV4_BITS 0x0110 +#define NES_PKT_TYPE_TCPV4_MASK 0x3f30 + +#define NES_PKT_TYPE_UDPV4_BITS 0x0210 +#define NES_PKT_TYPE_UDPV4_MASK 0x3f30 + +#define NES_PKT_TYPE_IPV4_BITS 0x0010 +#define NES_PKT_TYPE_IPV4_MASK 0x3f30 + +#define NES_PKT_TYPE_OTHER_BITS 0x0000 +#define NES_PKT_TYPE_OTHER_MASK 0x0030 + +#define NES_NIC_CQE_ERRV_SHIFT 16 +enum nes_nic_ev_bits { + NES_NIC_ERRV_BITS_MODE = (1<<0), + NES_NIC_ERRV_BITS_IPV4_CSUM_ERR = (1<<1), + NES_NIC_ERRV_BITS_TCPUDP_CSUM_ERR = (1<<2), + NES_NIC_ERRV_BITS_WQE_OVERRUN = (1<<3), + NES_NIC_ERRV_BITS_IPH_ERR = (1<<4), +}; + +enum nes_nic_cqe_bits { + NES_NIC_CQE_ERRV_MASK = (0xff<<NES_NIC_CQE_ERRV_SHIFT), + NES_NIC_CQE_SQ = (1<<24), + NES_NIC_CQE_ACCQP_PORT = (1<<28), + NES_NIC_CQE_ACCQP_VALID = (1<<29), + NES_NIC_CQE_TAG_VALID = (1<<30), + NES_NIC_CQE_VALID = (1<<31), +}; + +enum nes_aeqe_word_idx { + NES_AEQE_COMP_CTXT_LOW_IDX = 0, + NES_AEQE_COMP_CTXT_HIGH_IDX = 1, + NES_AEQE_COMP_QP_CQ_ID_IDX = 2, + 
NES_AEQE_MISC_IDX = 3, +}; + +enum nes_aeqe_bits { + NES_AEQE_QP = (1<<16), + NES_AEQE_CQ = (1<<17), + NES_AEQE_SQ = (1<<18), + NES_AEQE_INBOUND_RDMA = (1<<19), + NES_AEQE_IWARP_STATE_MASK = (7<<20), + NES_AEQE_TCP_STATE_MASK = (0xf<<24), + NES_AEQE_VALID = (1<<31), +}; + +#define NES_AEQE_IWARP_STATE_SHIFT 20 +#define NES_AEQE_TCP_STATE_SHIFT 24 + +enum nes_aeqe_iwarp_state { + NES_AEQE_IWARP_STATE_NON_EXISTANT = 0, + NES_AEQE_IWARP_STATE_IDLE = 1, + NES_AEQE_IWARP_STATE_RTS = 2, + NES_AEQE_IWARP_STATE_CLOSING = 3, + NES_AEQE_IWARP_STATE_TERMINATE = 5, + NES_AEQE_IWARP_STATE_ERROR = 6 +}; + +enum nes_aeqe_tcp_state { + NES_AEQE_TCP_STATE_NON_EXISTANT = 0, + NES_AEQE_TCP_STATE_CLOSED = 1, + NES_AEQE_TCP_STATE_LISTEN = 2, + NES_AEQE_TCP_STATE_SYN_SENT = 3, + NES_AEQE_TCP_STATE_SYN_RCVD = 4, + NES_AEQE_TCP_STATE_ESTABLISHED = 5, + NES_AEQE_TCP_STATE_CLOSE_WAIT = 6, + NES_AEQE_TCP_STATE_FIN_WAIT_1 = 7, + NES_AEQE_TCP_STATE_CLOSING = 8, + NES_AEQE_TCP_STATE_LAST_ACK = 9, + NES_AEQE_TCP_STATE_FIN_WAIT_2 = 10, + NES_AEQE_TCP_STATE_TIME_WAIT = 11 +}; + +enum nes_aeqe_aeid { + NES_AEQE_AEID_AMP_UNALLOCATED_STAG = 0x0102, + NES_AEQE_AEID_AMP_INVALID_STAG = 0x0103, + NES_AEQE_AEID_AMP_BAD_QP = 0x0104, + NES_AEQE_AEID_AMP_BAD_PD = 0x0105, + NES_AEQE_AEID_AMP_BAD_STAG_KEY = 0x0106, + NES_AEQE_AEID_AMP_BAD_STAG_INDEX = 0x0107, + NES_AEQE_AEID_AMP_BOUNDS_VIOLATION = 0x0108, + NES_AEQE_AEID_AMP_RIGHTS_VIOLATION = 0x0109, + NES_AEQE_AEID_AMP_TO_WRAP = 0x010a, + NES_AEQE_AEID_AMP_FASTREG_SHARED = 0x010b, + NES_AEQE_AEID_AMP_FASTREG_VALID_STAG = 0x010c, + NES_AEQE_AEID_AMP_FASTREG_MW_STAG = 0x010d, + NES_AEQE_AEID_AMP_FASTREG_INVALID_RIGHTS = 0x010e, + NES_AEQE_AEID_AMP_FASTREG_PBL_TABLE_OVERFLOW = 0x010f, + NES_AEQE_AEID_AMP_FASTREG_INVALID_LENGTH = 0x0110, + NES_AEQE_AEID_AMP_INVALIDATE_SHARED = 0x0111, + NES_AEQE_AEID_AMP_INVALIDATE_NO_REMOTE_ACCESS_RIGHTS = 0x0112, + NES_AEQE_AEID_AMP_INVALIDATE_MR_WITH_BOUND_WINDOWS = 0x0113, + NES_AEQE_AEID_AMP_MWBIND_VALID_STAG = 0x0114, + 
NES_AEQE_AEID_AMP_MWBIND_OF_MR_STAG = 0x0115, + NES_AEQE_AEID_AMP_MWBIND_TO_ZERO_BASED_STAG = 0x0116, + NES_AEQE_AEID_AMP_MWBIND_TO_MW_STAG = 0x0117, + NES_AEQE_AEID_AMP_MWBIND_INVALID_RIGHTS = 0x0118, + NES_AEQE_AEID_AMP_MWBIND_INVALID_BOUNDS = 0x0119, + NES_AEQE_AEID_AMP_MWBIND_TO_INVALID_PARENT = 0x011a, + NES_AEQE_AEID_AMP_MWBIND_BIND_DISABLED = 0x011b, + NES_AEQE_AEID_BAD_CLOSE = 0x0201, + NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE = 0x0202, + NES_AEQE_AEID_CQ_OPERATION_ERROR = 0x0203, + NES_AEQE_AEID_PRIV_OPERATION_DENIED = 0x0204, + NES_AEQE_AEID_RDMA_READ_WHILE_ORD_ZERO = 0x0205, + NES_AEQE_AEID_STAG_ZERO_INVALID = 0x0206, + NES_AEQE_AEID_DDP_INVALID_MSN_GAP_IN_MSN = 0x0301, + NES_AEQE_AEID_DDP_INVALID_MSN_RANGE_IS_NOT_VALID = 0x0302, + NES_AEQE_AEID_DDP_UBE_DDP_MESSAGE_TOO_LONG_FOR_AVAILABLE_BUFFER = 0x0303, + NES_AEQE_AEID_DDP_UBE_INVALID_DDP_VERSION = 0x0304, + NES_AEQE_AEID_DDP_UBE_INVALID_MO = 0x0305, + NES_AEQE_AEID_DDP_UBE_INVALID_MSN_NO_BUFFER_AVAILABLE = 0x0306, + NES_AEQE_AEID_DDP_UBE_INVALID_QN = 0x0307, + NES_AEQE_AEID_DDP_NO_L_BIT = 0x0308, + NES_AEQE_AEID_RDMAP_ROE_INVALID_RDMAP_VERSION = 0x0311, + NES_AEQE_AEID_RDMAP_ROE_UNEXPECTED_OPCODE = 0x0312, + NES_AEQE_AEID_ROE_INVALID_RDMA_READ_REQUEST = 0x0313, + NES_AEQE_AEID_ROE_INVALID_RDMA_WRITE_OR_READ_RESP = 0x0314, + NES_AEQE_AEID_INVALID_ARP_ENTRY = 0x0401, + NES_AEQE_AEID_INVALID_TCP_OPTION_RCVD = 0x0402, + NES_AEQE_AEID_STALE_ARP_ENTRY = 0x0403, + NES_AEQE_AEID_LLP_CLOSE_COMPLETE = 0x0501, + NES_AEQE_AEID_LLP_CONNECTION_RESET = 0x0502, + NES_AEQE_AEID_LLP_FIN_RECEIVED = 0x0503, + NES_AEQE_AEID_LLP_RECEIVED_MARKER_AND_LENGTH_FIELDS_DONT_MATCH = 0x0504, + NES_AEQE_AEID_LLP_RECEIVED_MPA_CRC_ERROR = 0x0505, + NES_AEQE_AEID_LLP_SEGMENT_TOO_LARGE = 0x0506, + NES_AEQE_AEID_LLP_SEGMENT_TOO_SMALL = 0x0507, + NES_AEQE_AEID_LLP_SYN_RECEIVED = 0x0508, + NES_AEQE_AEID_LLP_TERMINATE_RECEIVED = 0x0509, + NES_AEQE_AEID_LLP_TOO_MANY_RETRIES = 0x050a, + NES_AEQE_AEID_LLP_TOO_MANY_KEEPALIVE_RETRIES = 0x050b, + 
NES_AEQE_AEID_RESET_SENT = 0x0601, + NES_AEQE_AEID_TERMINATE_SENT = 0x0602, + NES_AEQE_AEID_DDP_LCE_LOCAL_CATASTROPHIC = 0x0700 +}; + +enum nes_iwarp_sq_opcodes { + NES_IWARP_SQ_WQE_WRPDU = (1<<15), + NES_IWARP_SQ_WQE_PSH = (1<<21), + NES_IWARP_SQ_WQE_STREAMING = (1<<23), + NES_IWARP_SQ_WQE_IMM_DATA = (1<<28), + NES_IWARP_SQ_WQE_READ_FENCE = (1<<29), + NES_IWARP_SQ_WQE_LOCAL_FENCE = (1<<30), + NES_IWARP_SQ_WQE_SIGNALED_COMPL = (1<<31), +}; + +enum nes_iwarp_sq_wqe_bits { + NES_IWARP_SQ_OP_RDMAW = 0, + NES_IWARP_SQ_OP_RDMAR = 1, + NES_IWARP_SQ_OP_SEND = 3, + NES_IWARP_SQ_OP_SENDINV = 4, + NES_IWARP_SQ_OP_SENDSE = 5, + NES_IWARP_SQ_OP_SENDSEINV = 6, + NES_IWARP_SQ_OP_BIND = 8, + NES_IWARP_SQ_OP_FAST_REG = 9, + NES_IWARP_SQ_OP_LOCINV = 10, + NES_IWARP_SQ_OP_RDMAR_LOCINV = 11, + NES_IWARP_SQ_OP_NOP = 12, +}; + +#define NES_EEPROM_READ_REQUEST (1<<16) +#define NES_MAC_ADDR_VALID (1<<20) + +/* + * NES index registers init values. + */ +struct nes_init_values { + u32 index; + u32 data; + u8 wrt; +}; + +/* + * NES registers in BAR0. 
+ */ +struct nes_pci_regs { + u32 int_status; + u32 int_mask; + u32 int_pending; + u32 intf_int_status; + u32 intf_int_mask; + u32 other_regs[59]; /* pad out to 256 bytes for now */ +}; + +#define NES_CQP_SQ_SIZE 128 +#define NES_CCQ_SIZE 128 +/* NES_NIC_WQ_SIZE also sizes the tx_skb, rx_skb and frag_paddr arrays in struct nes_hw_nic */ +#define NES_NIC_WQ_SIZE 512 +#define NES_NIC_CTX_SIZE ((NES_NIC_CTX_RQ_SIZE_512) | (NES_NIC_CTX_SQ_SIZE_512)) +#define NES_NIC_BACK_STORE 0x00038000 + +struct nes_device; + +/* + * Queue element and context layouts, each a fixed-length array of little-endian 32-bit words. For nes_hw_cqp_wqe the words are addressed with the NES_CQP_WQE_ and NES_CQP_ARP_WQE_ index enums; the other layouts presumably pair with the matching index enums in this header - confirm. + */ +struct nes_hw_nic_qp_context { + __le32 context_words[6]; +}; + +struct nes_hw_nic_sq_wqe { + __le32 wqe_words[16]; +}; + +struct nes_hw_nic_rq_wqe { + __le32 wqe_words[16]; +}; + +struct nes_hw_nic_cqe { + __le32 cqe_words[4]; +}; + +struct nes_hw_cqp_qp_context { + __le32 context_words[4]; +}; + +struct nes_hw_cqp_wqe { + __le32 wqe_words[16]; +}; + +struct nes_hw_qp_wqe { + __le32 wqe_words[32]; +}; + +struct nes_hw_cqe { + __le32 cqe_words[8]; +}; + +struct nes_hw_ceqe { + __le32 ceqe_words[2]; +}; + +struct nes_hw_aeqe { + __le32 aeqe_words[4]; +}; + +/* + * One CQP request. As used by nes_manage_arp_cache() and flush_wqes(): the submitter obtains it from nes_get_cqp_request(), fills cqp_wqe, sets waiting (and refcount where it is going to wait or fire and forget), then hands it to nes_post_cqp_request(). + */ +struct nes_cqp_request { + union { + u64 cqp_callback_context; + void *cqp_callback_pointer; + }; + wait_queue_head_t waitq; /* waiting submitters sleep here until request_done is non-zero */ + struct nes_hw_cqp_wqe cqp_wqe; /* the WQE the submitter fills in before posting */ + struct list_head list; /* links the request onto nesdev->cqp_avail_reqs when it is recycled */ + atomic_t refcount; /* whoever drops this to zero frees or recycles the request */ + void (*cqp_callback)(struct nes_device *nesdev, struct nes_cqp_request *cqp_request); + u16 major_code; /* completion code; nes_manage_apbvt() returns -EIO when it is non-zero */ + u16 minor_code; /* completion code, reported in debug output */ + u8 waiting; /* 1 when the submitter will wait on waitq, 0 otherwise */ + u8 request_done; /* condition tested by wait_event_timeout() */ + u8 dynamic; /* non-zero: released with kfree() instead of going back on cqp_avail_reqs */ + u8 callback; +}; + +struct nes_hw_cqp { + struct nes_hw_cqp_wqe *sq_vbase; + dma_addr_t sq_pbase; + spinlock_t lock; /* presumably the nesdev->cqp.lock taken around list_add_tail() onto cqp_avail_reqs - confirm */ + wait_queue_head_t waitq; + u16 qp_id; + u16 sq_head; + u16 sq_tail; + u16 sq_size; +}; + +#define NES_FIRST_FRAG_SIZE 128 +struct nes_first_frag { + u8 buffer[NES_FIRST_FRAG_SIZE]; +}; + +struct nes_hw_nic { + struct nes_first_frag *first_frag_vbase; /* virtual address of first frags */ + struct nes_hw_nic_sq_wqe *sq_vbase; /* virtual address of sq */ + struct nes_hw_nic_rq_wqe *rq_vbase; /* virtual address of rq */ + struct sk_buff *tx_skb[NES_NIC_WQ_SIZE]; + struct sk_buff *rx_skb[NES_NIC_WQ_SIZE]; + dma_addr_t 
frag_paddr[NES_NIC_WQ_SIZE]; + unsigned long first_frag_overflow[BITS_TO_LONGS(NES_NIC_WQ_SIZE)]; + dma_addr_t sq_pbase; /* PCI memory for host rings */ + dma_addr_t rq_pbase; /* PCI memory for host rings */ + + u16 qp_id; + u16 sq_head; + u16 sq_tail; + u16 sq_size; + u16 rq_head; + u16 rq_tail; + u16 rq_size; + u8 replenishing_rq; + u8 reserved; + + spinlock_t sq_lock; + spinlock_t rq_lock; +}; + +struct nes_hw_nic_cq { + struct nes_hw_nic_cqe volatile *cq_vbase; /* PCI memory for host rings */ + void (*ce_handler)(struct nes_device *nesdev, struct nes_hw_nic_cq *cq); + dma_addr_t cq_pbase; /* PCI memory for host rings */ + int rx_cqes_completed; + int cqe_allocs_pending; + int rx_pkts_indicated; + u16 cq_head; + u16 cq_size; + u16 cq_number; + u8 cqes_pending; +}; + +struct nes_hw_qp { + struct nes_hw_qp_wqe *sq_vbase; /* PCI memory for host rings */ + struct nes_hw_qp_wqe *rq_vbase; /* PCI memory for host rings */ + void *q2_vbase; /* PCI memory for host rings */ + dma_addr_t sq_pbase; /* PCI memory for host rings */ + dma_addr_t rq_pbase; /* PCI memory for host rings */ + dma_addr_t q2_pbase; /* PCI memory for host rings */ + u32 qp_id; + u16 sq_head; + u16 sq_tail; + u16 sq_size; + u16 rq_head; + u16 rq_tail; + u16 rq_size; + u8 rq_encoded_size; + u8 sq_encoded_size; +}; + +struct nes_hw_cq { + struct nes_hw_cqe volatile *cq_vbase; /* PCI memory for host rings */ + void (*ce_handler)(struct nes_device *nesdev, struct nes_hw_cq *cq); + dma_addr_t cq_pbase; /* PCI memory for host rings */ + u16 cq_head; + u16 cq_size; + u16 cq_number; +}; + +struct nes_hw_ceq { + struct nes_hw_ceqe volatile *ceq_vbase; /* PCI memory for host rings */ + dma_addr_t ceq_pbase; /* PCI memory for host rings */ + u16 ceq_head; + u16 ceq_size; +}; + +struct nes_hw_aeq { + struct nes_hw_aeqe volatile *aeq_vbase; /* PCI memory for host rings */ + dma_addr_t aeq_pbase; /* PCI memory for host rings */ + u16 aeq_head; + u16 aeq_size; +}; + +struct nic_qp_map { + u8 qpid; + u8 nic_index; + 
u8 logical_port; + u8 is_hnic; +}; + +/* MANAGE_ARP_CACHE opcode word: nes_manage_arp_cache() shifts the PCI function number left by NES_CQP_ARP_AEQ_INDEX_SHIFT */ +#define NES_CQP_ARP_AEQ_INDEX_MASK 0x000f0000 +#define NES_CQP_ARP_AEQ_INDEX_SHIFT 16 + +/* MANAGE_APBVT: nes_manage_apbvt() writes (nic_index << NES_CQP_APBVT_NIC_SHIFT) | port to the WQE ID word and selects NES_CQP_APBVT_ADD or 0 with a conditional */ +#define NES_CQP_APBVT_ADD 0x00008000 +#define NES_CQP_APBVT_NIC_SHIFT 16 + +/* action values for nes_manage_arp_cache(), which forwards them to nes_arp_table(); only NES_ARP_ADD loads a MAC into the WQE */ +#define NES_ARP_ADD 1 +#define NES_ARP_DELETE 2 +#define NES_ARP_RESOLVE 3 + +#define NES_MAC_SW_IDLE 0 +#define NES_MAC_SW_INTERRUPT 1 +#define NES_MAC_SW_MH 2 + +/* element type of nes_adapter.arp_table */ +struct nes_arp_entry { + u32 ip_addr; + u8 mac_addr[ETH_ALEN]; +}; + +#define NES_NIC_FAST_TIMER 96 +#define NES_NIC_FAST_TIMER_LOW 40 +#define NES_NIC_FAST_TIMER_HIGH 1000 +#define DEFAULT_NES_QL_HIGH 256 +#define DEFAULT_NES_QL_LOW 16 +#define DEFAULT_NES_QL_TARGET 64 +#define DEFAULT_JUMBO_NES_QL_LOW 12 +#define DEFAULT_JUMBO_NES_QL_TARGET 40 +#define DEFAULT_JUMBO_NES_QL_HIGH 128 +#define NES_NIC_CQ_DOWNWARD_TREND 8 + +struct nes_hw_tune_timer { + //u16 cq_count; + u16 threshold_low; + u16 threshold_target; + u16 threshold_high; + u16 timer_in_use; + u16 timer_in_use_old; + u16 timer_in_use_min; + u16 timer_in_use_max; + u8 timer_direction_upward; + u8 timer_direction_downward; + u16 cq_count_old; + u8 cq_direction_downward; +}; + +#define NES_TIMER_INT_LIMIT 2 +#define NES_TIMER_INT_LIMIT_DYNAMIC 10 +#define NES_TIMER_ENABLE_LIMIT 4 +#define NES_MAX_LINK_INTERRUPTS 128 +#define NES_MAX_LINK_CHECK 200 + +struct nes_adapter { + u64 fw_ver; + unsigned long *allocated_qps; + unsigned long *allocated_cqs; + unsigned long *allocated_mrs; + unsigned long *allocated_pds; + unsigned long *allocated_arps; + struct nes_qp **qp_table; + struct workqueue_struct *work_q; + + struct list_head list; + struct list_head active_listeners; + /* list of the netdevs associated with each logical port */ + struct list_head nesvnic_list[4]; + + struct timer_list mh_timer; + struct timer_list lc_timer; + struct work_struct work; + spinlock_t resource_lock; + spinlock_t phy_lock; + spinlock_t pbl_lock; + spinlock_t periodic_timer_lock; + + struct nes_arp_entry arp_table[NES_MAX_ARP_TABLE_SIZE]; + + /* Adapter CEQ and AEQs */ 
+ struct nes_hw_ceq ceq[16]; + struct nes_hw_aeq aeq[8]; + + struct nes_hw_tune_timer tune_timer; + + unsigned long doorbell_start; + + u32 hw_rev; + u32 vendor_id; + u32 vendor_part_id; + u32 device_cap_flags; + u32 tick_delta; + u32 timer_int_req; + u32 arp_table_size; + u32 next_arp_index; + + u32 max_mr; + u32 max_256pbl; + u32 max_4kpbl; + u32 free_256pbl; + u32 free_4kpbl; + u32 max_mr_size; + u32 max_qp; + u32 next_qp; + u32 max_irrq; + u32 max_qp_wr; + u32 max_sge; + u32 max_cq; + u32 next_cq; + u32 max_cqe; + u32 max_pd; + u32 base_pd; + u32 next_pd; + u32 hte_index_mask; + + /* EEPROM information */ + u32 rx_pool_size; + u32 tx_pool_size; + u32 rx_threshold; + u32 tcp_timer_core_clk_divisor; + u32 iwarp_config; + u32 cm_config; + u32 sws_timer_config; + u32 tcp_config1; + u32 wqm_wat; + u32 core_clock; + u32 firmware_version; + + u32 nic_rx_eth_route_err; + + u32 et_rx_coalesce_usecs; + u32 et_rx_max_coalesced_frames; + u32 et_rx_coalesce_usecs_irq; + u32 et_rx_max_coalesced_frames_irq; + u32 et_pkt_rate_low; + u32 et_rx_coalesce_usecs_low; + u32 et_rx_max_coalesced_frames_low; + u32 et_pkt_rate_high; + u32 et_rx_coalesce_usecs_high; + u32 et_rx_max_coalesced_frames_high; + u32 et_rate_sample_interval; + u32 timer_int_limit; + + /* Adapter base MAC address */ + u32 mac_addr_low; + u16 mac_addr_high; + + u16 firmware_eeprom_offset; + u16 software_eeprom_offset; + + u16 max_irrq_wr; + + /* pd config for each port */ + u16 pd_config_size[4]; + u16 pd_config_base[4]; + + u16 link_interrupt_count[4]; + + /* the phy index for each port */ + u8 phy_index[4]; + u8 mac_sw_state[4]; + u8 mac_link_down[4]; + u8 phy_type[4]; + + /* PCI information */ + unsigned int devfn; + unsigned char bus_number; + unsigned char OneG_Mode; + + unsigned char ref_count; + u8 netdev_count; + u8 netdev_max; /* from host nic address count in EEPROM */ + u8 port_count; + u8 virtwq; + u8 et_use_adaptive_rx_coalesce; + u8 adapter_fcn_count; +}; + +struct nes_pbl { + u64 *pbl_vbase; + 
dma_addr_t pbl_pbase; + struct page *page; + unsigned long user_base; + u32 pbl_size; + struct list_head list; + /* TODO: need to add list for two level tables */ +}; + +struct nes_listener { + struct work_struct work; + struct workqueue_struct *wq; + struct nes_vnic *nesvnic; + struct iw_cm_id *cm_id; + struct list_head list; + unsigned long socket; + u8 accept_failed; +}; + +struct nes_ib_device; + +struct nes_vnic { + struct nes_ib_device *nesibdev; + u64 sq_full; + u64 sq_locked; + u64 tso_requests; + u64 segmented_tso_requests; + u64 linearized_skbs; + u64 tx_sw_dropped; + u64 endnode_nstat_rx_discard; + u64 endnode_nstat_rx_octets; + u64 endnode_nstat_rx_frames; + u64 endnode_nstat_tx_octets; + u64 endnode_nstat_tx_frames; + u64 endnode_ipv4_tcp_retransmits; + /* void *mem; */ + struct nes_device *nesdev; + struct net_device *netdev; + struct vlan_group *vlan_grp; + atomic_t rx_skbs_needed; + atomic_t rx_skb_timer_running; + int budget; + u32 msg_enable; + /* u32 tx_avail; */ + __be32 local_ipaddr; + struct napi_struct napi; + spinlock_t tx_lock; /* could use netdev tx lock? 
*/ + struct timer_list rq_wqes_timer; + u32 nic_mem_size; + void *nic_vbase; + dma_addr_t nic_pbase; + struct nes_hw_nic nic; + struct nes_hw_nic_cq nic_cq; + u32 mcrq_qp_id; + struct nes_ucontext *mcrq_ucontext; + struct nes_cqp_request* (*get_cqp_request)(struct nes_device *nesdev); + void (*post_cqp_request)(struct nes_device*, struct nes_cqp_request *, int); + int (*mcrq_mcast_filter)( struct nes_vnic* nesvnic, __u8* dmi_addr ); + struct net_device_stats netstats; + /* used to put the netdev on the adapters logical port list */ + struct list_head list; + u16 max_frame_size; + u8 netdev_open; + u8 linkup; + u8 logical_port; + u8 netdev_index; /* might not be needed, indexes nesdev->netdev */ + u8 perfect_filter_index; + u8 nic_index; + u8 qp_nic_index[4]; + u8 next_qp_nic_index; + u8 of_device_registered; + u8 rdma_enabled; + u8 rx_checksum_disabled; +}; + +struct nes_ib_device { + struct ib_device ibdev; + struct nes_vnic *nesvnic; + + /* Virtual RNIC Limits */ + u32 max_mr; + u32 max_qp; + u32 max_cq; + u32 max_pd; + u32 num_mr; + u32 num_qp; + u32 num_cq; + u32 num_pd; +}; + +#define nes_vlan_rx vlan_hwaccel_receive_skb +#define nes_netif_rx netif_receive_skb + +#endif /* __NES_HW_H */ diff --git a/drivers/infiniband/hw/nes/nes_nic.c b/drivers/infiniband/hw/nes/nes_nic.c new file mode 100644 index 0000000..b6cc265 --- /dev/null +++ b/drivers/infiniband/hw/nes/nes_nic.c @@ -0,0 +1,1703 @@ +/* + * Copyright (c) 2006 - 2008 NetEffect, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/ip.h> +#include <linux/tcp.h> +#include <linux/if_arp.h> +#include <linux/if_vlan.h> +#include <linux/ethtool.h> +#include <net/tcp.h> + +#include <net/inet_common.h> +#include <linux/inet.h> + +#include "nes.h" + +static struct nic_qp_map nic_qp_mapping_0[] = { + {16,0,0,1},{24,4,0,0},{28,8,0,0},{32,12,0,0}, + {20,2,2,1},{26,6,2,0},{30,10,2,0},{34,14,2,0}, + {18,1,1,1},{25,5,1,0},{29,9,1,0},{33,13,1,0}, + {22,3,3,1},{27,7,3,0},{31,11,3,0},{35,15,3,0} +}; + +static struct nic_qp_map nic_qp_mapping_1[] = { + {18,1,1,1},{25,5,1,0},{29,9,1,0},{33,13,1,0}, + {22,3,3,1},{27,7,3,0},{31,11,3,0},{35,15,3,0} +}; + +static struct nic_qp_map nic_qp_mapping_2[] = { + {20,2,2,1},{26,6,2,0},{30,10,2,0},{34,14,2,0} +}; + +static struct nic_qp_map nic_qp_mapping_3[] = { + {22,3,3,1},{27,7,3,0},{31,11,3,0},{35,15,3,0} +}; + +static struct nic_qp_map nic_qp_mapping_4[] = { + {28,8,0,0},{32,12,0,0} +}; + +static struct nic_qp_map nic_qp_mapping_5[] = { + {29,9,1,0},{33,13,1,0} +}; + +static struct nic_qp_map nic_qp_mapping_6[] = { + {30,10,2,0},{34,14,2,0} +}; + +static struct nic_qp_map nic_qp_mapping_7[] = { + {31,11,3,0},{35,15,3,0} +}; + +static struct nic_qp_map *nic_qp_mapping_per_function[] = { + nic_qp_mapping_0, nic_qp_mapping_1, nic_qp_mapping_2, nic_qp_mapping_3, + nic_qp_mapping_4, nic_qp_mapping_5, nic_qp_mapping_6, nic_qp_mapping_7 +}; + +static const u32 default_msg = NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK + | NETIF_MSG_IFUP | NETIF_MSG_IFDOWN; +static int debug = -1; + + +static int nes_netdev_open(struct net_device *); +static int nes_netdev_stop(struct net_device *); +static int nes_netdev_start_xmit(struct sk_buff *, struct net_device *); +static struct net_device_stats *nes_netdev_get_stats(struct net_device *); +static void nes_netdev_tx_timeout(struct net_device *); +static int 
nes_netdev_set_mac_address(struct net_device *, void *); +static int nes_netdev_change_mtu(struct net_device *, int); + +/** + * nes_netdev_poll - NAPI poll routine for a vnic's NIC completion queue + * @napi: NAPI context embedded in the owning struct nes_vnic + * @budget: poll budget; stored in nesvnic->budget before the CQ is processed + * + * Resets the per-poll counters of the vnic's NIC CQ, runs + * nes_nic_ce_handler() on it and then writes the CQ number together with + * cqe_allocs_pending to the NES_CQE_ALLOC register. When no CQEs are left + * pending the poll is completed with netif_rx_complete() and the CQ is + * re-armed with NES_CQE_ALLOC_NOTIFY_NEXT; otherwise the CQ is left unarmed. + * + * Returns nescq->rx_pkts_indicated, which is zeroed before the handler runs. + */ +static int nes_netdev_poll(struct napi_struct *napi, int budget) +{ + struct nes_vnic *nesvnic = container_of(napi, struct nes_vnic, napi); + struct net_device *netdev = nesvnic->netdev; + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_hw_nic_cq *nescq = &nesvnic->nic_cq; + /* remember the budget and start this poll with zeroed CQ counters */ + nesvnic->budget = budget; + nescq->cqes_pending = 0; + nescq->rx_cqes_completed = 0; + nescq->cqe_allocs_pending = 0; + nescq->rx_pkts_indicated = 0; + + nes_nic_ce_handler(nesdev, nescq); + /* only complete the poll and re-arm when nothing is left pending */ + if (nescq->cqes_pending == 0) { + netif_rx_complete(netdev, napi); + /* clear out completed cqes and arm */ + nes_write32(nesdev->regs+NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT | + nescq->cq_number | (nescq->cqe_allocs_pending << 16)); + nes_read32(nesdev->regs+NES_CQE_ALLOC); /* read back; presumably flushes the posted write -- TODO confirm */ + } else { + /* clear out completed cqes but don't arm */ + nes_write32(nesdev->regs+NES_CQE_ALLOC, + nescq->cq_number | (nescq->cqe_allocs_pending << 16)); + nes_debug(NES_DBG_NETDEV, "%s: exiting with work pending\n", + nesvnic->netdev->name); + } + return nescq->rx_pkts_indicated; +} + + +/** + * nes_netdev_open - Activate the network interface; ifconfig + * ethx up. 
+ */ +static int nes_netdev_open(struct net_device *netdev) +{ + u32 macaddr_low; + u16 macaddr_high; + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + int ret; + int i; + struct nes_vnic *first_nesvnic; + u32 nic_active_bit; + u32 nic_active; + + assert(nesdev != NULL); + + first_nesvnic = list_entry(nesdev->nesadapter->nesvnic_list[nesdev->mac_index].next, + struct nes_vnic, list); + + if (netif_msg_ifup(nesvnic)) + printk(KERN_INFO PFX "%s: enabling interface\n", netdev->name); + + ret = nes_init_nic_qp(nesdev, netdev); + if (ret) { + return ret; + } + + netif_carrier_off(netdev); + netif_stop_queue(netdev); + + if ((!nesvnic->of_device_registered) && (nesvnic->rdma_enabled)) { + nesvnic->nesibdev = nes_init_ofa_device(netdev); + if (nesvnic->nesibdev == NULL) { + printk(KERN_ERR PFX "%s: nesvnic->nesibdev alloc failed", netdev->name); + } else { + nesvnic->nesibdev->nesvnic = nesvnic; + ret = nes_register_ofa_device(nesvnic->nesibdev); + if (ret) { + printk(KERN_ERR PFX "%s: Unable to register RDMA device, ret = %d\n", + netdev->name, ret); + } + } + } + /* Set packet filters */ + nic_active_bit = 1 << nesvnic->nic_index; + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_ACTIVE); + nic_active |= nic_active_bit; + nes_write_indexed(nesdev, NES_IDX_NIC_ACTIVE, nic_active); + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ENABLE); + nic_active |= nic_active_bit; + nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ENABLE, nic_active); + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_BROADCAST_ON); + nic_active |= nic_active_bit; + nes_write_indexed(nesdev, NES_IDX_NIC_BROADCAST_ON, nic_active); + + macaddr_high = ((u16)netdev->dev_addr[0]) << 8; + macaddr_high += (u16)netdev->dev_addr[1]; + macaddr_low = ((u32)netdev->dev_addr[2]) << 24; + macaddr_low += ((u32)netdev->dev_addr[3]) << 16; + macaddr_low += ((u32)netdev->dev_addr[4]) << 8; + macaddr_low += (u32)netdev->dev_addr[5]; + + /* Program the 
various MAC regs */ + for (i = 0; i < NES_MAX_PORT_COUNT; i++) { + if (nesvnic->qp_nic_index[i] == 0xf) { + break; + } + nes_debug(NES_DBG_NETDEV, "i=%d, perfect filter table index= %d, PERF FILTER LOW" + " (Addr:%08X) = %08X, HIGH = %08X.\n", + i, nesvnic->qp_nic_index[i], + NES_IDX_PERFECT_FILTER_LOW+((nesvnic->perfect_filter_index + i) * 8), + macaddr_low, + (u32)macaddr_high | NES_MAC_ADDR_VALID | + ((((u32)nesvnic->nic_index) << 16))); + nes_write_indexed(nesdev, + NES_IDX_PERFECT_FILTER_LOW + (nesvnic->qp_nic_index[i] * 8), + macaddr_low); + nes_write_indexed(nesdev, + NES_IDX_PERFECT_FILTER_HIGH + (nesvnic->qp_nic_index[i] * 8), + (u32)macaddr_high | NES_MAC_ADDR_VALID | + ((((u32)nesvnic->nic_index) << 16))); + } + + + nes_write32(nesdev->regs+NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT | + nesvnic->nic_cq.cq_number); + nes_read32(nesdev->regs+NES_CQE_ALLOC); + + if (first_nesvnic->linkup) { + /* Enable network packets */ + nesvnic->linkup = 1; + netif_start_queue(netdev); + netif_carrier_on(netdev); + } + napi_enable(&nesvnic->napi); + nesvnic->netdev_open = 1; + + return 0; +} + + +/** + * nes_netdev_stop + */ +static int nes_netdev_stop(struct net_device *netdev) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + u32 nic_active_mask; + u32 nic_active; + + nes_debug(NES_DBG_SHUTDOWN, "nesvnic=%p, nesdev=%p, netdev=%p %s\n", + nesvnic, nesdev, netdev, netdev->name); + if (nesvnic->netdev_open == 0) + return 0; + + if (netif_msg_ifdown(nesvnic)) + printk(KERN_INFO PFX "%s: disabling interface\n", netdev->name); + + /* Disable network packets */ + napi_disable(&nesvnic->napi); + netif_stop_queue(netdev); + if ((nesdev->netdev[0] == netdev) & (nesvnic->logical_port == nesdev->mac_index)) { + nes_write_indexed(nesdev, + NES_IDX_MAC_INT_MASK+(0x200*nesdev->mac_index), 0xffffffff); + } + + nic_active_mask = ~((u32)(1 << nesvnic->nic_index)); + nes_write_indexed(nesdev, NES_IDX_PERFECT_FILTER_HIGH+ + 
(nesvnic->perfect_filter_index*8), 0); + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_ACTIVE); + nic_active &= nic_active_mask; + nes_write_indexed(nesdev, NES_IDX_NIC_ACTIVE, nic_active); + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL); + nic_active &= nic_active_mask; + nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL, nic_active); + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ENABLE); + nic_active &= nic_active_mask; + nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ENABLE, nic_active); + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL); + nic_active &= nic_active_mask; + nes_write_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL, nic_active); + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_BROADCAST_ON); + nic_active &= nic_active_mask; + nes_write_indexed(nesdev, NES_IDX_NIC_BROADCAST_ON, nic_active); + + + if (nesvnic->of_device_registered) { + nes_destroy_ofa_device(nesvnic->nesibdev); + nesvnic->nesibdev = NULL; + nesvnic->of_device_registered = 0; + } + nes_destroy_nic_qp(nesvnic); + + nesvnic->netdev_open = 0; + + return 0; +} + + +/** + * nes_nic_send + */ +static int nes_nic_send(struct sk_buff *skb, struct net_device *netdev) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_hw_nic *nesnic = &nesvnic->nic; + struct nes_hw_nic_sq_wqe *nic_sqe; + struct tcphdr *tcph; + __le16 *wqe_fragment_length; + u32 wqe_misc; + u16 wqe_fragment_index = 1; /* first fragment (0) is used by copy buffer */ + u16 skb_fragment_index; + dma_addr_t bus_address; + + nic_sqe = &nesnic->sq_vbase[nesnic->sq_head]; + wqe_fragment_length = (__le16 *)&nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX]; + + /* setup the VLAN tag if present */ + if (vlan_tx_tag_present(skb)) { + nes_debug(NES_DBG_NIC_TX, "%s: VLAN packet to send... 
VLAN = %08X\n", + netdev->name, vlan_tx_tag_get(skb)); + wqe_misc = NES_NIC_SQ_WQE_TAGVALUE_ENABLE; + wqe_fragment_length[0] = (__force __le16) vlan_tx_tag_get(skb); + } else + wqe_misc = 0; + + /* bump past the vlan tag */ + wqe_fragment_length++; + /* wqe_fragment_address = (u64 *)&nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_LOW_IDX]; */ + + if (skb->ip_summed == CHECKSUM_PARTIAL) { + tcph = tcp_hdr(skb); + if (1) { + if (skb_is_gso(skb)) { + /* nes_debug(NES_DBG_NIC_TX, "%s: TSO request... seg size = %u\n", + netdev->name, skb_is_gso(skb)); */ + wqe_misc |= NES_NIC_SQ_WQE_LSO_ENABLE | + NES_NIC_SQ_WQE_COMPLETION | (u16)skb_is_gso(skb); + set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_LSO_INFO_IDX, + ((u32)tcph->doff) | + (((u32)(((unsigned char *)tcph) - skb->data)) << 4)); + } else { + wqe_misc |= NES_NIC_SQ_WQE_COMPLETION; + } + } + } else { /* CHECKSUM_HW */ + wqe_misc |= NES_NIC_SQ_WQE_DISABLE_CHKSUM | NES_NIC_SQ_WQE_COMPLETION; + } + + set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_TOTAL_LENGTH_IDX, + skb->len); + memcpy(&nesnic->first_frag_vbase[nesnic->sq_head].buffer, + skb->data, min(((unsigned int)NES_FIRST_FRAG_SIZE), skb_headlen(skb))); + wqe_fragment_length[0] = cpu_to_le16(min(((unsigned int)NES_FIRST_FRAG_SIZE), + skb_headlen(skb))); + wqe_fragment_length[1] = 0; + if (skb_headlen(skb) > NES_FIRST_FRAG_SIZE) { + if ((skb_shinfo(skb)->nr_frags + 1) > 4) { + nes_debug(NES_DBG_NIC_TX, "%s: Packet with %u fragments not sent, skb_headlen=%u\n", + netdev->name, skb_shinfo(skb)->nr_frags + 2, skb_headlen(skb)); + kfree_skb(skb); + nesvnic->tx_sw_dropped++; + return NETDEV_TX_LOCKED; + } + set_bit(nesnic->sq_head, nesnic->first_frag_overflow); + bus_address = pci_map_single(nesdev->pcidev, skb->data + NES_FIRST_FRAG_SIZE, + skb_headlen(skb) - NES_FIRST_FRAG_SIZE, PCI_DMA_TODEVICE); + wqe_fragment_length[wqe_fragment_index++] = + cpu_to_le16(skb_headlen(skb) - NES_FIRST_FRAG_SIZE); + wqe_fragment_length[wqe_fragment_index] = 0; + 
set_wqe_64bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_FRAG1_LOW_IDX, + ((u64)(bus_address))); + nesnic->tx_skb[nesnic->sq_head] = skb; + } + + if (skb_headlen(skb) == skb->len) { + if (skb_headlen(skb) <= NES_FIRST_FRAG_SIZE) { + nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_2_1_IDX] = 0; + nesnic->tx_skb[nesnic->sq_head] = NULL; + dev_kfree_skb(skb); + } + } else { + /* Deal with Fragments */ + nesnic->tx_skb[nesnic->sq_head] = skb; + for (skb_fragment_index = 0; skb_fragment_index < skb_shinfo(skb)->nr_frags; + skb_fragment_index++) { + bus_address = pci_map_page( nesdev->pcidev, + skb_shinfo(skb)->frags[skb_fragment_index].page, + skb_shinfo(skb)->frags[skb_fragment_index].page_offset, + skb_shinfo(skb)->frags[skb_fragment_index].size, + PCI_DMA_TODEVICE); + wqe_fragment_length[wqe_fragment_index] = + cpu_to_le16(skb_shinfo(skb)->frags[skb_fragment_index].size); + set_wqe_64bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_FRAG0_LOW_IDX+(2*wqe_fragment_index), + bus_address); + wqe_fragment_index++; + if (wqe_fragment_index < 5) + wqe_fragment_length[wqe_fragment_index] = 0; + } + } + + set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_MISC_IDX, wqe_misc); + nesnic->sq_head++; + nesnic->sq_head &= nesnic->sq_size - 1; + + return NETDEV_TX_OK; +} + + +/** + * nes_netdev_start_xmit + */ +static int nes_netdev_start_xmit(struct sk_buff *skb, struct net_device *netdev) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_hw_nic *nesnic = &nesvnic->nic; + struct nes_hw_nic_sq_wqe *nic_sqe; + struct tcphdr *tcph; + /* struct udphdr *udph; */ +#define NES_MAX_TSO_FRAGS 18 + /* 64K segment plus overflow on each side */ + dma_addr_t tso_bus_address[NES_MAX_TSO_FRAGS]; + dma_addr_t bus_address; + u32 tso_frag_index; + u32 tso_frag_count; + u32 tso_wqe_length; + u32 curr_tcp_seq; + u32 wqe_count=1; + u32 send_rc; + struct iphdr *iph; + unsigned long flags; + __le16 *wqe_fragment_length; + u32 nr_frags; + u32 
original_first_length; +// u64 *wqe_fragment_address; + /* first fragment (0) is used by copy buffer */ + u16 wqe_fragment_index=1; + u16 hoffset; + u16 nhoffset; + u16 wqes_needed; + u16 wqes_available; + u32 old_head; + u32 wqe_misc; + + /* nes_debug(NES_DBG_NIC_TX, "%s Request to tx NIC packet length %u, headlen %u," + " (%u frags), tso_size=%u\n", + netdev->name, skb->len, skb_headlen(skb), + skb_shinfo(skb)->nr_frags, skb_is_gso(skb)); + */ + + if (!netif_carrier_ok(netdev)) + return NETDEV_TX_OK; + + if (netif_queue_stopped(netdev)) + return NETDEV_TX_BUSY; + + local_irq_save(flags); + if (!spin_trylock(&nesnic->sq_lock)) { + local_irq_restore(flags); + nesvnic->sq_locked++; + return NETDEV_TX_LOCKED; + } + + /* Check if SQ is full */ + if ((((nesnic->sq_tail+(nesnic->sq_size*2))-nesnic->sq_head) & (nesnic->sq_size - 1)) == 1) { + if (!netif_queue_stopped(netdev)) { + netif_stop_queue(netdev); + barrier(); + if ((((((volatile u16)nesnic->sq_tail)+(nesnic->sq_size*2))-nesnic->sq_head) & (nesnic->sq_size - 1)) != 1) { + netif_start_queue(netdev); + goto sq_no_longer_full; + } + } + nesvnic->sq_full++; + spin_unlock_irqrestore(&nesnic->sq_lock, flags); + return NETDEV_TX_BUSY; + } + +sq_no_longer_full: + nr_frags = skb_shinfo(skb)->nr_frags; + if (skb_headlen(skb) > NES_FIRST_FRAG_SIZE) { + nr_frags++; + } + /* Check if too many fragments */ + if (unlikely((nr_frags > 4))) { + if (skb_is_gso(skb)) { + nesvnic->segmented_tso_requests++; + nesvnic->tso_requests++; + old_head = nesnic->sq_head; + /* Basically 4 fragments available per WQE with extended fragments */ + wqes_needed = nr_frags >> 2; + wqes_needed += (nr_frags&3)?1:0; + wqes_available = (((nesnic->sq_tail+nesnic->sq_size)-nesnic->sq_head) - 1) & + (nesnic->sq_size - 1); + + if (unlikely(wqes_needed > wqes_available)) { + if (!netif_queue_stopped(netdev)) { + netif_stop_queue(netdev); + barrier(); + wqes_available = (((((volatile u16)nesnic->sq_tail)+nesnic->sq_size)-nesnic->sq_head) - 1) & + 
(nesnic->sq_size - 1); + if (wqes_needed <= wqes_available) { + netif_start_queue(netdev); + goto tso_sq_no_longer_full; + } + } + nesvnic->sq_full++; + spin_unlock_irqrestore(&nesnic->sq_lock, flags); + nes_debug(NES_DBG_NIC_TX, "%s: HNIC SQ full- TSO request has too many frags!\n", + netdev->name); + return NETDEV_TX_BUSY; + } +tso_sq_no_longer_full: + /* Map all the buffers */ + for (tso_frag_count=0; tso_frag_count < skb_shinfo(skb)->nr_frags; + tso_frag_count++) { + tso_bus_address[tso_frag_count] = pci_map_page( nesdev->pcidev, + skb_shinfo(skb)->frags[tso_frag_count].page, + skb_shinfo(skb)->frags[tso_frag_count].page_offset, + skb_shinfo(skb)->frags[tso_frag_count].size, + PCI_DMA_TODEVICE); + } + + tso_frag_index = 0; + curr_tcp_seq = ntohl(tcp_hdr(skb)->seq); + hoffset = skb_transport_header(skb) - skb->data; + nhoffset = skb_network_header(skb) - skb->data; + original_first_length = hoffset + ((((struct tcphdr *)skb_transport_header(skb))->doff)<<2); + + for (wqe_count=0; wqe_count<((u32)wqes_needed); wqe_count++) { + tso_wqe_length = 0; + nic_sqe = &nesnic->sq_vbase[nesnic->sq_head]; + wqe_fragment_length = + (__le16 *)&nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX]; + /* setup the VLAN tag if present */ + if (vlan_tx_tag_present(skb)) { + nes_debug(NES_DBG_NIC_TX, "%s: VLAN packet to send... 
VLAN = %08X\n", + netdev->name, vlan_tx_tag_get(skb) ); + wqe_misc = NES_NIC_SQ_WQE_TAGVALUE_ENABLE; + wqe_fragment_length[0] = (__force __le16) vlan_tx_tag_get(skb); + } else + wqe_misc = 0; + + /* bump past the vlan tag */ + wqe_fragment_length++; + + /* Assumes header totally fits in allocated buffer and is in first fragment */ + if (original_first_length > NES_FIRST_FRAG_SIZE) { + nes_debug(NES_DBG_NIC_TX, "ERROR: SKB header too big, headlen=%u, FIRST_FRAG_SIZE=%u\n", + original_first_length, NES_FIRST_FRAG_SIZE); + nes_debug(NES_DBG_NIC_TX, "%s Request to tx NIC packet length %u, headlen %u," + " (%u frags), tso_size=%u\n", + netdev->name, + skb->len, skb_headlen(skb), + skb_shinfo(skb)->nr_frags, skb_is_gso(skb)); + } + memcpy(&nesnic->first_frag_vbase[nesnic->sq_head].buffer, + skb->data, min(((unsigned int)NES_FIRST_FRAG_SIZE), + original_first_length)); + iph = (struct iphdr *) + (&nesnic->first_frag_vbase[nesnic->sq_head].buffer[nhoffset]); + tcph = (struct tcphdr *) + (&nesnic->first_frag_vbase[nesnic->sq_head].buffer[hoffset]); + if ((wqe_count+1)!=(u32)wqes_needed) { + tcph->fin = 0; + tcph->psh = 0; + tcph->rst = 0; + tcph->urg = 0; + } + if (wqe_count) { + tcph->syn = 0; + } + tcph->seq = htonl(curr_tcp_seq); + wqe_fragment_length[0] = cpu_to_le16(min(((unsigned int)NES_FIRST_FRAG_SIZE), + original_first_length)); + + wqe_fragment_index = 1; + if ((wqe_count==0) && (skb_headlen(skb) > original_first_length)) { + set_bit(nesnic->sq_head, nesnic->first_frag_overflow); + bus_address = pci_map_single(nesdev->pcidev, skb->data + original_first_length, + skb_headlen(skb) - original_first_length, PCI_DMA_TODEVICE); + wqe_fragment_length[wqe_fragment_index++] = + cpu_to_le16(skb_headlen(skb) - original_first_length); + wqe_fragment_length[wqe_fragment_index] = 0; + set_wqe_64bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_FRAG1_LOW_IDX, + bus_address); + } + while (wqe_fragment_index < 5) { + wqe_fragment_length[wqe_fragment_index] = + 
cpu_to_le16(skb_shinfo(skb)->frags[tso_frag_index].size); + set_wqe_64bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_FRAG0_LOW_IDX+(2*wqe_fragment_index), + (u64)tso_bus_address[tso_frag_index]); + wqe_fragment_index++; + tso_wqe_length += skb_shinfo(skb)->frags[tso_frag_index++].size; + if (wqe_fragment_index < 5) + wqe_fragment_length[wqe_fragment_index] = 0; + if (tso_frag_index == tso_frag_count) + break; + } + if ((wqe_count+1) == (u32)wqes_needed) { + nesnic->tx_skb[nesnic->sq_head] = skb; + } else { + nesnic->tx_skb[nesnic->sq_head] = NULL; + } + wqe_misc |= NES_NIC_SQ_WQE_COMPLETION | (u16)skb_is_gso(skb); + if ((tso_wqe_length + original_first_length) > skb_is_gso(skb)) { + wqe_misc |= NES_NIC_SQ_WQE_LSO_ENABLE; + } else { + iph->tot_len = htons(tso_wqe_length + original_first_length - nhoffset); + } + + set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_MISC_IDX, + wqe_misc); + set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_LSO_INFO_IDX, + ((u32)tcph->doff) | (((u32)hoffset) << 4)); + + set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_TOTAL_LENGTH_IDX, + tso_wqe_length + original_first_length); + curr_tcp_seq += tso_wqe_length; + nesnic->sq_head++; + nesnic->sq_head &= nesnic->sq_size-1; + } + } else { + nesvnic->linearized_skbs++; + hoffset = skb_transport_header(skb) - skb->data; + nhoffset = skb_network_header(skb) - skb->data; + skb_linearize(skb); + skb_set_transport_header(skb, hoffset); + skb_set_network_header(skb, nhoffset); + send_rc = nes_nic_send(skb, netdev); + if (send_rc != NETDEV_TX_OK) { + spin_unlock_irqrestore(&nesnic->sq_lock, flags); + return NETDEV_TX_OK; + } + } + } else { + send_rc = nes_nic_send(skb, netdev); + if (send_rc != NETDEV_TX_OK) { + spin_unlock_irqrestore(&nesnic->sq_lock, flags); + return NETDEV_TX_OK; + } + } + + barrier(); + + if (wqe_count) + nes_write32(nesdev->regs+NES_WQE_ALLOC, + (wqe_count << 24) | (1 << 23) | nesvnic->nic.qp_id); + + netdev->trans_start = jiffies; + 
spin_unlock_irqrestore(&nesnic->sq_lock, flags); + + return NETDEV_TX_OK; +} + + +/** + * nes_netdev_get_stats + */ +static struct net_device_stats *nes_netdev_get_stats(struct net_device *netdev) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + u64 u64temp; + u32 u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_RX_DISCARD + (nesvnic->nic_index*0x200)); + nesvnic->netstats.rx_dropped += u32temp; + nesvnic->endnode_nstat_rx_discard += u32temp; + + u64temp = (u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_LO + (nesvnic->nic_index*0x200)); + u64temp += ((u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_HI + (nesvnic->nic_index*0x200))) << 32; + + nesvnic->endnode_nstat_rx_octets += u64temp; + nesvnic->netstats.rx_bytes += u64temp; + + u64temp = (u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_LO + (nesvnic->nic_index*0x200)); + u64temp += ((u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_HI + (nesvnic->nic_index*0x200))) << 32; + + nesvnic->endnode_nstat_rx_frames += u64temp; + nesvnic->netstats.rx_packets += u64temp; + + u64temp = (u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_LO + (nesvnic->nic_index*0x200)); + u64temp += ((u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_HI + (nesvnic->nic_index*0x200))) << 32; + + nesvnic->endnode_nstat_tx_octets += u64temp; + nesvnic->netstats.tx_bytes += u64temp; + + u64temp = (u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_LO + (nesvnic->nic_index*0x200)); + u64temp += ((u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_HI + (nesvnic->nic_index*0x200))) << 32; + + nesvnic->endnode_nstat_tx_frames += u64temp; + nesvnic->netstats.tx_packets += u64temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_RX_SHORT_FRAMES + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->netstats.rx_dropped += u32temp; + 
nesvnic->nesdev->mac_rx_errors += u32temp; + nesvnic->nesdev->mac_rx_short_frames += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_RX_OVERSIZED_FRAMES + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->netstats.rx_dropped += u32temp; + nesvnic->nesdev->mac_rx_errors += u32temp; + nesvnic->nesdev->mac_rx_oversized_frames += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_RX_JABBER_FRAMES + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->netstats.rx_dropped += u32temp; + nesvnic->nesdev->mac_rx_errors += u32temp; + nesvnic->nesdev->mac_rx_jabber_frames += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_RX_SYMBOL_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->netstats.rx_dropped += u32temp; + nesvnic->nesdev->mac_rx_errors += u32temp; + nesvnic->nesdev->mac_rx_symbol_err_frames += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_RX_LENGTH_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->netstats.rx_length_errors += u32temp; + nesvnic->nesdev->mac_rx_errors += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_RX_CRC_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->nesdev->mac_rx_errors += u32temp; + nesvnic->nesdev->mac_rx_crc_errors += u32temp; + nesvnic->netstats.rx_crc_errors += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_TX_ERRORS + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->nesdev->mac_tx_errors += u32temp; + nesvnic->netstats.tx_errors += u32temp; + + return &nesvnic->netstats; +} + + +/** + * nes_netdev_tx_timeout + */ +static void nes_netdev_tx_timeout(struct net_device *netdev) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + + if (netif_msg_timer(nesvnic)) + nes_debug(NES_DBG_NIC_TX, "%s: tx timeout\n", netdev->name); +} + + +/** + * nes_netdev_set_mac_address - change the interface's MAC address + * @netdev: network device being updated + * @p: struct sockaddr whose sa_data carries the new address + * + * Rejects addresses that fail is_valid_ether_addr(). Otherwise copies the + * address into netdev->dev_addr and rewrites the perfect filter entries + * listed in nesvnic->qp_nic_index[] (up to the first 0xf entry) with it. + * + * Returns 0 on success or -EADDRNOTAVAIL for an invalid address. + */ +static int nes_netdev_set_mac_address(struct net_device *netdev, void *p) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = 
nesvnic->nesdev; + struct sockaddr *mac_addr = p; + int i; + u32 macaddr_low; + u16 macaddr_high; + + if (!is_valid_ether_addr(mac_addr->sa_data)) + return -EADDRNOTAVAIL; + + memcpy(netdev->dev_addr, mac_addr->sa_data, netdev->addr_len); + /* + * Print from dev_addr (unsigned char) rather than sa_data: sa_data is + * plain char, so where char is signed a byte >= 0x80 is sign-extended + * and %02X would print it as FFFFFFxx. + */ + printk(PFX "%s: Address length = %d, Address = %02X%02X%02X%02X%02X%02X..\n", + __FUNCTION__, netdev->addr_len, + netdev->dev_addr[0], netdev->dev_addr[1], + netdev->dev_addr[2], netdev->dev_addr[3], + netdev->dev_addr[4], netdev->dev_addr[5]); + /* split the 6-byte MAC into the 16-bit high / 32-bit low register halves */ + macaddr_high = ((u16)netdev->dev_addr[0]) << 8; + macaddr_high += (u16)netdev->dev_addr[1]; + macaddr_low = ((u32)netdev->dev_addr[2]) << 24; + macaddr_low += ((u32)netdev->dev_addr[3]) << 16; + macaddr_low += ((u32)netdev->dev_addr[4]) << 8; + macaddr_low += (u32)netdev->dev_addr[5]; + + for (i = 0; i < NES_MAX_PORT_COUNT; i++) { + /* an entry of 0xf ends the list of filter slots used by this vnic */ + if (nesvnic->qp_nic_index[i] == 0xf) { + break; + } + nes_write_indexed(nesdev, + NES_IDX_PERFECT_FILTER_LOW + (nesvnic->qp_nic_index[i] * 8), + macaddr_low); + nes_write_indexed(nesdev, + NES_IDX_PERFECT_FILTER_HIGH + (nesvnic->qp_nic_index[i] * 8), + (u32)macaddr_high | NES_MAC_ADDR_VALID | + ((((u32)nesvnic->nic_index) << 16))); + } + return 0; +} + + +/** + * nes_netdev_set_multicast_list + */ +void nes_netdev_set_multicast_list(struct net_device *netdev) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct dev_mc_list *multicast_addr; + u32 nic_active_bit; + u32 nic_active; + u32 perfect_filter_register_address; + u32 macaddr_low; + u16 macaddr_high; + u8 mc_all_on = 0; + u8 mc_index; + int mc_nic_index = -1; + + nic_active_bit = 1 << nesvnic->nic_index; + + if (netdev->flags & IFF_PROMISC) { + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL); + nic_active |= nic_active_bit; + nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL, nic_active); + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL); + nic_active |= nic_active_bit; + nes_write_indexed(nesdev, 
NES_IDX_NIC_UNICAST_ALL, nic_active); + mc_all_on = 1; + } else if ((netdev->flags & IFF_ALLMULTI) || (netdev->mc_count > NES_MULTICAST_PF_MAX) || + (nesvnic->nic_index > 3)) { + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL); + nic_active |= nic_active_bit; + nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL, nic_active); + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL); + nic_active &= ~nic_active_bit; + nes_write_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL, nic_active); + mc_all_on = 1; + } else { + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL); + nic_active &= ~nic_active_bit; + nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL, nic_active); + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL); + nic_active &= ~nic_active_bit; + nes_write_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL, nic_active); + } + + nes_debug(NES_DBG_NIC_RX, "Number of MC entries = %d, Promiscous = %d, All Multicast = %d.\n", + netdev->mc_count, (netdev->flags & IFF_PROMISC)?1:0, + (netdev->flags & IFF_ALLMULTI)?1:0); + if (!mc_all_on) { + multicast_addr = netdev->mc_list; + perfect_filter_register_address = NES_IDX_PERFECT_FILTER_LOW + 0x80; + perfect_filter_register_address += nesvnic->nic_index*0x40; + for (mc_index=0; mc_index < NES_MULTICAST_PF_MAX; mc_index++) { + while (multicast_addr && nesvnic->mcrq_mcast_filter && ((mc_nic_index = nesvnic->mcrq_mcast_filter(nesvnic, multicast_addr->dmi_addr)) == 0)) + multicast_addr = multicast_addr->next; + + if (mc_nic_index < 0) + mc_nic_index = nesvnic->nic_index; + if (multicast_addr) { + nes_debug(NES_DBG_NIC_RX, "Assigning MC Address = %02X%02X%02X%02X%02X%02X to register 0x%04X nic_idx=%d\n", + multicast_addr->dmi_addr[0], multicast_addr->dmi_addr[1], + multicast_addr->dmi_addr[2], multicast_addr->dmi_addr[3], + multicast_addr->dmi_addr[4], multicast_addr->dmi_addr[5], + perfect_filter_register_address+(mc_index * 8), mc_nic_index); + macaddr_high = 
((u16)multicast_addr->dmi_addr[0]) << 8; + macaddr_high += (u16)multicast_addr->dmi_addr[1]; + macaddr_low = ((u32)multicast_addr->dmi_addr[2]) << 24; + macaddr_low += ((u32)multicast_addr->dmi_addr[3]) << 16; + macaddr_low += ((u32)multicast_addr->dmi_addr[4]) << 8; + macaddr_low += (u32)multicast_addr->dmi_addr[5]; + nes_write_indexed(nesdev, + perfect_filter_register_address+(mc_index * 8), + macaddr_low); + nes_write_indexed(nesdev, + perfect_filter_register_address+4+(mc_index * 8), + (u32)macaddr_high | NES_MAC_ADDR_VALID | + ((((u32)(1<<mc_nic_index)) << 16))); + multicast_addr = multicast_addr->next; + } else { + nes_debug(NES_DBG_NIC_RX, "Clearing MC Address at register 0x%04X\n", + perfect_filter_register_address+(mc_index * 8)); + nes_write_indexed(nesdev, + perfect_filter_register_address+4+(mc_index * 8), + 0); + } + } + } +} + + +/** + * nes_netdev_change_mtu + */ +static int nes_netdev_change_mtu(struct net_device *netdev, int new_mtu) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + int ret = 0; + u8 jumbomode=0; + + if ((new_mtu < ETH_ZLEN) || (new_mtu > max_mtu)) + return -EINVAL; + + netdev->mtu = new_mtu; + nesvnic->max_frame_size = new_mtu+ETH_HLEN; + + if (netdev->mtu > 1500) { + jumbomode=1; + } + nes_nic_init_timer_defaults(nesdev, jumbomode); + + if (netif_running(netdev)) { + nes_netdev_stop(netdev); + nes_netdev_open(netdev); + } + + return ret; +} + + +/** + * nes_netdev_exit - destroy network device + */ +void nes_netdev_exit(struct nes_vnic *nesvnic) +{ + struct net_device *netdev = nesvnic->netdev; + struct nes_ib_device *nesibdev = nesvnic->nesibdev; + + nes_debug(NES_DBG_SHUTDOWN, "\n"); + + // destroy the ibdevice if RDMA enabled + if ((nesvnic->rdma_enabled)&&(nesvnic->of_device_registered)) { + nes_destroy_ofa_device( nesibdev ); + nesvnic->of_device_registered = 0; + nesvnic->nesibdev = NULL; + } + unregister_netdev(netdev); + nes_debug(NES_DBG_SHUTDOWN, "\n"); +} + + +#define 
NES_ETHTOOL_STAT_COUNT 55 +static const char nes_ethtool_stringset[NES_ETHTOOL_STAT_COUNT][ETH_GSTRING_LEN] = { + "Link Change Interrupts", + "Linearized SKBs", + "T/GSO Requests", + "Pause Frames Sent", + "Pause Frames Received", + "Internal Routing Errors", + "SQ SW Dropped SKBs", + "SQ Locked", + "SQ Full", + "Segmented TSO Requests", + "Rx Symbol Errors", + "Rx Jabber Errors", + "Rx Oversized Frames", + "Rx Short Frames", + "Endnode Rx Discards", + "Endnode Rx Octets", + "Endnode Rx Frames", + "Endnode Tx Octets", + "Endnode Tx Frames", + "mh detected", + "mh pauses", + "Retransmission Count", + "CM Connects", + "CM Accepts", + "Disconnects", + "Connected Events", + "Connect Requests", + "CM Rejects", + "ModifyQP Timeouts", + "CreateQPs", + "SW DestroyQPs", + "DestroyQPs", + "CM Closes", + "CM Packets Sent", + "CM Packets Bounced", + "CM Packets Created", + "CM Packets Rcvd", + "CM Packets Dropped", + "CM Packets Retrans", + "CM Listens Created", + "CM Listens Destroyed", + "CM Backlog Drops", + "CM Loopbacks", + "CM Nodes Created", + "CM Nodes Destroyed", + "CM Accel Drops", + "CM Resets Received", + "Timer Inits", + "CQ Depth 1", + "CQ Depth 4", + "CQ Depth 16", + "CQ Depth 24", + "CQ Depth 32", + "CQ Depth 128", + "CQ Depth 256", +}; + + +/** + * nes_netdev_get_rx_csum + */ +static u32 nes_netdev_get_rx_csum (struct net_device *netdev) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + + if (nesvnic->rx_checksum_disabled) + return 0; + else + return 1; +} + + +/** + * nes_netdev_set_rx_csum + */ +static int nes_netdev_set_rx_csum(struct net_device *netdev, u32 enable) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + + if (enable) + nesvnic->rx_checksum_disabled = 0; + else + nesvnic->rx_checksum_disabled = 1; + return 0; +} + + +/** + * nes_netdev_get_stats_count + */ +static int nes_netdev_get_stats_count(struct net_device *netdev) +{ + return NES_ETHTOOL_STAT_COUNT; +} + + +/** + * nes_netdev_get_strings + */ +static void
nes_netdev_get_strings(struct net_device *netdev, u32 stringset, + u8 *ethtool_strings) +{ + if (stringset == ETH_SS_STATS) + memcpy(ethtool_strings, + &nes_ethtool_stringset, + sizeof(nes_ethtool_stringset)); +} + + +/** + * nes_netdev_get_ethtool_stats + */ +static void nes_netdev_get_ethtool_stats(struct net_device *netdev, + struct ethtool_stats *target_ethtool_stats, u64 *target_stat_values) +{ + u64 u64temp; + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + u32 nic_count; + u32 u32temp; + + target_ethtool_stats->n_stats = NES_ETHTOOL_STAT_COUNT; + target_stat_values[0] = nesvnic->nesdev->link_status_interrupts; + target_stat_values[1] = nesvnic->linearized_skbs; + target_stat_values[2] = nesvnic->tso_requests; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_TX_PAUSE_FRAMES + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->nesdev->mac_pause_frames_sent += u32temp; + target_stat_values[3] = nesvnic->nesdev->mac_pause_frames_sent; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_RX_PAUSE_FRAMES + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->nesdev->mac_pause_frames_received += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_PORT_RX_DISCARDS + (nesvnic->nesdev->mac_index*0x40)); + nesvnic->nesdev->port_rx_discards += u32temp; + nesvnic->netstats.rx_dropped += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_PORT_TX_DISCARDS + (nesvnic->nesdev->mac_index*0x40)); + nesvnic->nesdev->port_tx_discards += u32temp; + nesvnic->netstats.tx_dropped += u32temp; + + for (nic_count = 0; nic_count < NES_MAX_PORT_COUNT; nic_count++) { + if (nesvnic->qp_nic_index[nic_count] == 0xf) + break; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_RX_DISCARD + + (nesvnic->qp_nic_index[nic_count]*0x200)); + nesvnic->netstats.rx_dropped += u32temp; + nesvnic->endnode_nstat_rx_discard += u32temp; + + u64temp = (u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_LO + + 
(nesvnic->qp_nic_index[nic_count]*0x200)); + u64temp += ((u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_HI + + (nesvnic->qp_nic_index[nic_count]*0x200))) << 32; + + nesvnic->endnode_nstat_rx_octets += u64temp; + nesvnic->netstats.rx_bytes += u64temp; + + u64temp = (u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_LO + + (nesvnic->qp_nic_index[nic_count]*0x200)); + u64temp += ((u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_HI + + (nesvnic->qp_nic_index[nic_count]*0x200))) << 32; + + nesvnic->endnode_nstat_rx_frames += u64temp; + nesvnic->netstats.rx_packets += u64temp; + + u64temp = (u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_LO + + (nesvnic->qp_nic_index[nic_count]*0x200)); + u64temp += ((u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_HI + + (nesvnic->qp_nic_index[nic_count]*0x200))) << 32; + + nesvnic->endnode_nstat_tx_octets += u64temp; + nesvnic->netstats.tx_bytes += u64temp; + + u64temp = (u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_LO + + (nesvnic->qp_nic_index[nic_count]*0x200)); + u64temp += ((u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_HI + + (nesvnic->qp_nic_index[nic_count]*0x200))) << 32; + + nesvnic->endnode_nstat_tx_frames += u64temp; + nesvnic->netstats.tx_packets += u64temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_IPV4_TCP_REXMITS + (nesvnic->qp_nic_index[nic_count]*0x200)); + nesvnic->endnode_ipv4_tcp_retransmits += u32temp; + } + + target_stat_values[4] = nesvnic->nesdev->mac_pause_frames_received; + target_stat_values[5] = nesdev->nesadapter->nic_rx_eth_route_err; + target_stat_values[6] = nesvnic->tx_sw_dropped; + target_stat_values[7] = nesvnic->sq_locked; + target_stat_values[8] = nesvnic->sq_full; + target_stat_values[9] = nesvnic->segmented_tso_requests; + target_stat_values[10] = nesvnic->nesdev->mac_rx_symbol_err_frames; + target_stat_values[11] = nesvnic->nesdev->mac_rx_jabber_frames; + 
target_stat_values[12] = nesvnic->nesdev->mac_rx_oversized_frames; + target_stat_values[13] = nesvnic->nesdev->mac_rx_short_frames; + target_stat_values[14] = nesvnic->endnode_nstat_rx_discard; + target_stat_values[15] = nesvnic->endnode_nstat_rx_octets; + target_stat_values[16] = nesvnic->endnode_nstat_rx_frames; + target_stat_values[17] = nesvnic->endnode_nstat_tx_octets; + target_stat_values[18] = nesvnic->endnode_nstat_tx_frames; + target_stat_values[19] = mh_detected; + target_stat_values[20] = mh_pauses_sent; + target_stat_values[21] = nesvnic->endnode_ipv4_tcp_retransmits; + target_stat_values[22] = atomic_read(&cm_connects); + target_stat_values[23] = atomic_read(&cm_accepts); + target_stat_values[24] = atomic_read(&cm_disconnects); + target_stat_values[25] = atomic_read(&cm_connecteds); + target_stat_values[26] = atomic_read(&cm_connect_reqs); + target_stat_values[27] = atomic_read(&cm_rejects); + target_stat_values[28] = atomic_read(&mod_qp_timouts); + target_stat_values[29] = atomic_read(&qps_created); + target_stat_values[30] = atomic_read(&sw_qps_destroyed); + target_stat_values[31] = atomic_read(&qps_destroyed); + target_stat_values[32] = atomic_read(&cm_closes); + target_stat_values[33] = cm_packets_sent; + target_stat_values[34] = cm_packets_bounced; + target_stat_values[35] = cm_packets_created; + target_stat_values[36] = cm_packets_received; + target_stat_values[37] = cm_packets_dropped; + target_stat_values[38] = cm_packets_retrans; + target_stat_values[39] = cm_listens_created; + target_stat_values[40] = cm_listens_destroyed; + target_stat_values[41] = cm_backlog_drops; + target_stat_values[42] = atomic_read(&cm_loopbacks); + target_stat_values[43] = atomic_read(&cm_nodes_created); + target_stat_values[44] = atomic_read(&cm_nodes_destroyed); + target_stat_values[45] = atomic_read(&cm_accel_dropped_pkts); + target_stat_values[46] = atomic_read(&cm_resets_recvd); + target_stat_values[47] = int_mod_timer_init; + target_stat_values[48] = 
int_mod_cq_depth_1; + target_stat_values[49] = int_mod_cq_depth_4; + target_stat_values[50] = int_mod_cq_depth_16; + target_stat_values[51] = int_mod_cq_depth_24; + target_stat_values[52] = int_mod_cq_depth_32; + target_stat_values[53] = int_mod_cq_depth_128; + target_stat_values[54] = int_mod_cq_depth_256; + +} + + +/** + * nes_netdev_get_drvinfo + */ +static void nes_netdev_get_drvinfo(struct net_device *netdev, + struct ethtool_drvinfo *drvinfo) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + + strcpy(drvinfo->driver, DRV_NAME); + strcpy(drvinfo->bus_info, pci_name(nesvnic->nesdev->pcidev)); + strcpy(drvinfo->fw_version, "TBD"); + strcpy(drvinfo->version, DRV_VERSION); + drvinfo->n_stats = nes_netdev_get_stats_count(netdev); + drvinfo->testinfo_len = 0; + drvinfo->eedump_len = 0; + drvinfo->regdump_len = 0; +} + + +/** + * nes_netdev_set_coalesce + */ +static int nes_netdev_set_coalesce(struct net_device *netdev, + struct ethtool_coalesce *et_coalesce) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_hw_tune_timer *shared_timer = &nesadapter->tune_timer; + unsigned long flags; + + spin_lock_irqsave(&nesadapter->periodic_timer_lock, flags); + if (et_coalesce->rx_max_coalesced_frames_low) { + shared_timer->threshold_low = et_coalesce->rx_max_coalesced_frames_low; + } + if (et_coalesce->rx_max_coalesced_frames_irq) { + shared_timer->threshold_target = et_coalesce->rx_max_coalesced_frames_irq; + } + if (et_coalesce->rx_max_coalesced_frames_high) { + shared_timer->threshold_high = et_coalesce->rx_max_coalesced_frames_high; + } + if (et_coalesce->rx_coalesce_usecs_low) { + shared_timer->timer_in_use_min = et_coalesce->rx_coalesce_usecs_low; + } + if (et_coalesce->rx_coalesce_usecs_high) { + shared_timer->timer_in_use_max = et_coalesce->rx_coalesce_usecs_high; + } + spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags); + + /* using 
this to drive total interrupt moderation */ + nesadapter->et_rx_coalesce_usecs_irq = et_coalesce->rx_coalesce_usecs_irq; + if (et_coalesce->use_adaptive_rx_coalesce) { + nesadapter->et_use_adaptive_rx_coalesce = 1; + nesadapter->timer_int_limit = NES_TIMER_INT_LIMIT_DYNAMIC; + nesadapter->et_rx_coalesce_usecs_irq = 0; + if (et_coalesce->pkt_rate_low) { + nesadapter->et_pkt_rate_low = et_coalesce->pkt_rate_low; + } + } else { + nesadapter->et_use_adaptive_rx_coalesce = 0; + nesadapter->timer_int_limit = NES_TIMER_INT_LIMIT; + if (nesadapter->et_rx_coalesce_usecs_irq) { + nes_write32(nesdev->regs+NES_PERIODIC_CONTROL, + 0x80000000 | ((u32)(nesadapter->et_rx_coalesce_usecs_irq*8))); + } + } + return 0; +} + + +/** + * nes_netdev_get_coalesce + */ +static int nes_netdev_get_coalesce(struct net_device *netdev, + struct ethtool_coalesce *et_coalesce) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct ethtool_coalesce temp_et_coalesce; + struct nes_hw_tune_timer *shared_timer = &nesadapter->tune_timer; + unsigned long flags; + + memset(&temp_et_coalesce, 0, sizeof(temp_et_coalesce)); + temp_et_coalesce.rx_coalesce_usecs_irq = nesadapter->et_rx_coalesce_usecs_irq; + temp_et_coalesce.use_adaptive_rx_coalesce = nesadapter->et_use_adaptive_rx_coalesce; + temp_et_coalesce.rate_sample_interval = nesadapter->et_rate_sample_interval; + temp_et_coalesce.pkt_rate_low = nesadapter->et_pkt_rate_low; + spin_lock_irqsave(&nesadapter->periodic_timer_lock, flags); + temp_et_coalesce.rx_max_coalesced_frames_low = shared_timer->threshold_low; + temp_et_coalesce.rx_max_coalesced_frames_irq = shared_timer->threshold_target; + temp_et_coalesce.rx_max_coalesced_frames_high = shared_timer->threshold_high; + temp_et_coalesce.rx_coalesce_usecs_low = shared_timer->timer_in_use_min; + temp_et_coalesce.rx_coalesce_usecs_high = shared_timer->timer_in_use_max; + if 
(nesadapter->et_use_adaptive_rx_coalesce) { + temp_et_coalesce.rx_coalesce_usecs_irq = shared_timer->timer_in_use; + } + spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags); + memcpy(et_coalesce, &temp_et_coalesce, sizeof(*et_coalesce)); + return 0; +} + + +/** + * nes_netdev_get_pauseparam + */ +static void nes_netdev_get_pauseparam(struct net_device *netdev, + struct ethtool_pauseparam *et_pauseparam) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + + et_pauseparam->autoneg = 0; + et_pauseparam->rx_pause = (nesvnic->nesdev->disable_rx_flow_control == 0) ? 1:0; + et_pauseparam->tx_pause = (nesvnic->nesdev->disable_tx_flow_control == 0) ? 1:0; +} + + +/** + * nes_netdev_set_pauseparam - toggle TX/RX pause from ethtool; an autoneg request is accepted and ignored, otherwise each register (NES_IDX_MAC_TX_CONFIG for TX, NES_IDX_MPP_DEBUG for RX) is read, has its pause bit changed, and is written back at the same index; always returns 0 + */ +static int nes_netdev_set_pauseparam(struct net_device *netdev, + struct ethtool_pauseparam *et_pauseparam) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + u32 u32temp; + + if (et_pauseparam->autoneg) { + /* TODO: should return unsupported */ + return 0; + } + if ((et_pauseparam->tx_pause == 1) && (nesdev->disable_tx_flow_control == 1)) { + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_TX_CONFIG + (nesdev->mac_index*0x200)); + u32temp |= NES_IDX_MAC_TX_CONFIG_ENABLE_PAUSE; + nes_write_indexed(nesdev, + NES_IDX_MAC_TX_CONFIG + (nesdev->mac_index*0x200), u32temp); + nesdev->disable_tx_flow_control = 0; + } else if ((et_pauseparam->tx_pause == 0) && (nesdev->disable_tx_flow_control == 0)) { + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_TX_CONFIG + (nesdev->mac_index*0x200)); + u32temp &= ~NES_IDX_MAC_TX_CONFIG_ENABLE_PAUSE; + nes_write_indexed(nesdev, + NES_IDX_MAC_TX_CONFIG + (nesdev->mac_index*0x200), u32temp); + nesdev->disable_tx_flow_control = 1; + } + if ((et_pauseparam->rx_pause == 1) && (nesdev->disable_rx_flow_control == 1)) { + u32temp = nes_read_indexed(nesdev, + NES_IDX_MPP_DEBUG + (nesdev->mac_index*0x40)); + u32temp &= ~NES_IDX_MPP_DEBUG_PORT_DISABLE_PAUSE; +
nes_write_indexed(nesdev, + NES_IDX_MPP_DEBUG + (nesdev->mac_index*0x40), u32temp); + nesdev->disable_rx_flow_control = 0; + } else if ((et_pauseparam->rx_pause == 0) && (nesdev->disable_rx_flow_control == 0)) { + u32temp = nes_read_indexed(nesdev, + NES_IDX_MPP_DEBUG + (nesdev->mac_index*0x40)); + u32temp |= NES_IDX_MPP_DEBUG_PORT_DISABLE_PAUSE; + nes_write_indexed(nesdev, + NES_IDX_MPP_DEBUG + (nesdev->mac_index*0x40), u32temp); + nesdev->disable_rx_flow_control = 1; + } + + return 0; +} + + +/** + * nes_netdev_get_settings + */ +static int nes_netdev_get_settings(struct net_device *netdev, struct ethtool_cmd *et_cmd) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + u16 phy_data; + + et_cmd->duplex = DUPLEX_FULL; + et_cmd->port = PORT_MII; + if (nesadapter->OneG_Mode) { + et_cmd->supported = SUPPORTED_1000baseT_Full|SUPPORTED_Autoneg; + et_cmd->advertising = ADVERTISED_1000baseT_Full|ADVERTISED_Autoneg; + et_cmd->speed = SPEED_1000; + nes_read_1G_phy_reg(nesdev, 0, nesadapter->phy_index[nesdev->mac_index], + &phy_data); + if (phy_data&0x1000) { + et_cmd->autoneg = AUTONEG_ENABLE; + } else { + et_cmd->autoneg = AUTONEG_DISABLE; + } + et_cmd->transceiver = XCVR_EXTERNAL; + et_cmd->phy_address = nesadapter->phy_index[nesdev->mac_index]; + } else { + if (nesadapter->phy_type[nesvnic->logical_port] == NES_PHY_TYPE_IRIS) { + et_cmd->transceiver = XCVR_EXTERNAL; + et_cmd->port = PORT_FIBRE; + et_cmd->supported = SUPPORTED_FIBRE; + et_cmd->advertising = ADVERTISED_FIBRE; + et_cmd->phy_address = nesadapter->phy_index[nesdev->mac_index]; + } else { + et_cmd->transceiver = XCVR_INTERNAL; + et_cmd->supported = SUPPORTED_10000baseT_Full; + et_cmd->advertising = ADVERTISED_10000baseT_Full; + et_cmd->phy_address = nesdev->mac_index; + } + et_cmd->speed = SPEED_10000; + et_cmd->autoneg = AUTONEG_DISABLE; + } + et_cmd->maxtxpkt = 511; + et_cmd->maxrxpkt = 511; + return 0;
+} + + +/** + * nes_netdev_set_settings + */ +static int nes_netdev_set_settings(struct net_device *netdev, struct ethtool_cmd *et_cmd) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + u16 phy_data; + + if (nesadapter->OneG_Mode) { + nes_read_1G_phy_reg(nesdev, 0, nesadapter->phy_index[nesdev->mac_index], + &phy_data); + if (et_cmd->autoneg) { + /* Turn on Full duplex, Autoneg, and restart autonegotiation */ + phy_data |= 0x1300; + } else { + // Turn off autoneg + phy_data &= ~0x1000; + } + nes_write_1G_phy_reg(nesdev, 0, nesadapter->phy_index[nesdev->mac_index], + phy_data); + } + + return 0; +} + + +static struct ethtool_ops nes_ethtool_ops = { + .get_link = ethtool_op_get_link, + .get_settings = nes_netdev_get_settings, + .set_settings = nes_netdev_set_settings, + .get_tx_csum = ethtool_op_get_tx_csum, + .get_rx_csum = nes_netdev_get_rx_csum, + .get_sg = ethtool_op_get_sg, + .get_strings = nes_netdev_get_strings, + .get_stats_count = nes_netdev_get_stats_count, + .get_ethtool_stats = nes_netdev_get_ethtool_stats, + .get_drvinfo = nes_netdev_get_drvinfo, + .get_coalesce = nes_netdev_get_coalesce, + .set_coalesce = nes_netdev_set_coalesce, + .get_pauseparam = nes_netdev_get_pauseparam, + .set_pauseparam = nes_netdev_set_pauseparam, + .set_tx_csum = ethtool_op_set_tx_csum, + .set_rx_csum = nes_netdev_set_rx_csum, + .set_sg = ethtool_op_set_sg, + .get_tso = ethtool_op_get_tso, + .set_tso = ethtool_op_set_tso, +}; + + +static void nes_netdev_vlan_rx_register(struct net_device *netdev, struct vlan_group *grp) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + u32 u32temp; + + nesvnic->vlan_grp = grp; + + /* Enable/Disable VLAN Stripping */ + u32temp = nes_read_indexed(nesdev, NES_IDX_PCIX_DIAG); + if (grp) + u32temp &= 0xfdffffff; + else + u32temp |= 0x02000000; + + nes_write_indexed(nesdev, NES_IDX_PCIX_DIAG, 
u32temp); +} + + +/** + * nes_netdev_init - initialize network device + */ +struct net_device *nes_netdev_init(struct nes_device *nesdev, + void __iomem *mmio_addr) +{ + u64 u64temp; + struct nes_vnic *nesvnic = NULL; + struct net_device *netdev; + struct nic_qp_map *curr_qp_map; + u32 u32temp; + u16 phy_data; + u16 temp_phy_data; + + netdev = alloc_etherdev(sizeof(struct nes_vnic)); + if (!netdev) { + printk(KERN_ERR PFX "nesvnic etherdev alloc failed"); + return NULL; + } + + nes_debug(NES_DBG_INIT, "netdev = %p, %s\n", netdev, netdev->name); + + SET_NETDEV_DEV(netdev, &nesdev->pcidev->dev); + + nesvnic = netdev_priv(netdev); + memset(nesvnic, 0, sizeof(*nesvnic)); + + netdev->open = nes_netdev_open; + netdev->stop = nes_netdev_stop; + netdev->hard_start_xmit = nes_netdev_start_xmit; + netdev->get_stats = nes_netdev_get_stats; + netdev->tx_timeout = nes_netdev_tx_timeout; + netdev->set_mac_address = nes_netdev_set_mac_address; + netdev->set_multicast_list = nes_netdev_set_multicast_list; + netdev->change_mtu = nes_netdev_change_mtu; + netdev->watchdog_timeo = NES_TX_TIMEOUT; + netdev->irq = nesdev->pcidev->irq; + netdev->mtu = ETH_DATA_LEN; + netdev->hard_header_len = ETH_HLEN; + netdev->addr_len = ETH_ALEN; + netdev->type = ARPHRD_ETHER; + netdev->features = NETIF_F_HIGHDMA; + netdev->ethtool_ops = &nes_ethtool_ops; + netif_napi_add(netdev, &nesvnic->napi, nes_netdev_poll, 128); + nes_debug(NES_DBG_INIT, "Enabling VLAN Insert/Delete.\n"); + netdev->features |= NETIF_F_HW_VLAN_TX | NETIF_F_HW_VLAN_RX; + netdev->vlan_rx_register = nes_netdev_vlan_rx_register; + netdev->features |= NETIF_F_LLTX; + + /* Fill in the port structure */ + nesvnic->netdev = netdev; + nesvnic->nesdev = nesdev; + nesvnic->msg_enable = netif_msg_init(debug, default_msg); + nesvnic->netdev_index = nesdev->netdev_count; + nesvnic->perfect_filter_index = nesdev->nesadapter->netdev_count; + nesvnic->max_frame_size = netdev->mtu+netdev->hard_header_len; + + curr_qp_map = 
nic_qp_mapping_per_function[PCI_FUNC(nesdev->pcidev->devfn)]; + nesvnic->nic.qp_id = curr_qp_map[nesdev->netdev_count].qpid; + nesvnic->nic_index = curr_qp_map[nesdev->netdev_count].nic_index; + nesvnic->logical_port = curr_qp_map[nesdev->netdev_count].logical_port; + + /* Setup the burned in MAC address */ + u64temp = (u64)nesdev->nesadapter->mac_addr_low; + u64temp += ((u64)nesdev->nesadapter->mac_addr_high) << 32; + u64temp += nesvnic->nic_index; + netdev->dev_addr[0] = (u8)(u64temp>>40); + netdev->dev_addr[1] = (u8)(u64temp>>32); + netdev->dev_addr[2] = (u8)(u64temp>>24); + netdev->dev_addr[3] = (u8)(u64temp>>16); + netdev->dev_addr[4] = (u8)(u64temp>>8); + netdev->dev_addr[5] = (u8)u64temp; + memcpy(netdev->perm_addr, netdev->dev_addr, 6); + + if ((nesvnic->logical_port < 2) || (nesdev->nesadapter->hw_rev != NE020_REV)) { + netdev->features |= NETIF_F_TSO | NETIF_F_SG | NETIF_F_IP_CSUM; + netdev->features |= NETIF_F_GSO | NETIF_F_TSO | NETIF_F_SG | NETIF_F_IP_CSUM; + } else { + netdev->features |= NETIF_F_SG | NETIF_F_IP_CSUM; + } + + nes_debug(NES_DBG_INIT, "nesvnic = %p, reported features = 0x%lX, QPid = %d," + " nic_index = %d, logical_port = %d, mac_index = %d.\n", + nesvnic, (unsigned long)netdev->features, nesvnic->nic.qp_id, + nesvnic->nic_index, nesvnic->logical_port, nesdev->mac_index); + + if (nesvnic->nesdev->nesadapter->port_count == 1) { + nesvnic->qp_nic_index[0] = nesvnic->nic_index; + nesvnic->qp_nic_index[1] = nesvnic->nic_index + 1; + if (nes_drv_opt & NES_DRV_OPT_DUAL_LOGICAL_PORT) { + nesvnic->qp_nic_index[2] = 0xf; + nesvnic->qp_nic_index[3] = 0xf; + } else { + nesvnic->qp_nic_index[2] = nesvnic->nic_index + 2; + nesvnic->qp_nic_index[3] = nesvnic->nic_index + 3; + } + } else { + if (nesvnic->nesdev->nesadapter->port_count == 2) { + nesvnic->qp_nic_index[0] = nesvnic->nic_index; + nesvnic->qp_nic_index[1] = nesvnic->nic_index + 2; + nesvnic->qp_nic_index[2] = 0xf; + nesvnic->qp_nic_index[3] = 0xf; + } else { + nesvnic->qp_nic_index[0] = 
nesvnic->nic_index; + nesvnic->qp_nic_index[1] = 0xf; + nesvnic->qp_nic_index[2] = 0xf; + nesvnic->qp_nic_index[3] = 0xf; + } + } + nesvnic->next_qp_nic_index = 0; + + if (nesdev->netdev_count == 0) { + nesvnic->rdma_enabled = 1; + } else { + nesvnic->rdma_enabled = 0; + } + nesvnic->nic_cq.cq_number = nesvnic->nic.qp_id; + spin_lock_init(&nesvnic->tx_lock); + nesdev->netdev[nesdev->netdev_count] = netdev; + + nes_debug(NES_DBG_INIT, "Adding nesvnic (%p) to the adapters nesvnic_list for MAC%d.\n", + nesvnic, nesdev->mac_index); + list_add_tail(&nesvnic->list, &nesdev->nesadapter->nesvnic_list[nesdev->mac_index]); + + if ((nesdev->netdev_count == 0) && + (PCI_FUNC(nesdev->pcidev->devfn) == nesdev->mac_index)) { + nes_debug(NES_DBG_INIT, "Setting up PHY interrupt mask. Using register index 0x%04X\n", + NES_IDX_PHY_PCS_CONTROL_STATUS0+(0x200*(nesvnic->logical_port&1))); + u32temp = nes_read_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0 + + (0x200*(nesvnic->logical_port&1))); + u32temp |= 0x00200000; + nes_write_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0 + + (0x200*(nesvnic->logical_port&1)), u32temp); + u32temp = nes_read_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0 + + (0x200*(nesvnic->logical_port&1)) ); + if ((u32temp&0x0f1f0000) == 0x0f0f0000) { + if (nesdev->nesadapter->phy_type[nesvnic->logical_port] == NES_PHY_TYPE_IRIS) { + nes_init_phy(nesdev); + nes_read_10G_phy_reg(nesdev, 1, + nesdev->nesadapter->phy_index[nesvnic->logical_port]); + temp_phy_data = (u16)nes_read_indexed(nesdev, + NES_IDX_MAC_MDIO_CONTROL); + u32temp = 20; + do { + nes_read_10G_phy_reg(nesdev, 1, + nesdev->nesadapter->phy_index[nesvnic->logical_port]); + phy_data = (u16)nes_read_indexed(nesdev, + NES_IDX_MAC_MDIO_CONTROL); + if ((phy_data == temp_phy_data) || (!(--u32temp))) + break; + temp_phy_data = phy_data; + } while (1); + if (phy_data & 4) { + nes_debug(NES_DBG_INIT, "The Link is UP!!.\n"); + nesvnic->linkup = 1; + } else { + nes_debug(NES_DBG_INIT, "The Link is DOWN!!.\n"); 
+ } + } else { + nes_debug(NES_DBG_INIT, "The Link is UP!!.\n"); + nesvnic->linkup = 1; + } + } + nes_debug(NES_DBG_INIT, "Setting up MAC interrupt mask.\n"); + /* clear the MAC interrupt status, assumes direct logical to physical mapping */ + u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS+(0x200*nesvnic->logical_port)); + nes_debug(NES_DBG_INIT, "Phy interrupt status = 0x%X.\n", u32temp); + nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS+(0x200*nesvnic->logical_port), u32temp); + + if (nesdev->nesadapter->phy_type[nesvnic->logical_port] != NES_PHY_TYPE_IRIS) + nes_init_phy(nesdev); + + nes_write_indexed(nesdev, NES_IDX_MAC_INT_MASK+(0x200*nesvnic->logical_port), + ~(NES_MAC_INT_LINK_STAT_CHG | NES_MAC_INT_XGMII_EXT | + NES_MAC_INT_TX_UNDERFLOW | NES_MAC_INT_TX_ERROR)); + } + + return netdev; +} + + +/** + * nes_netdev_destroy - destroy network device structure + */ +void nes_netdev_destroy(struct net_device *netdev) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + + /* make sure 'stop' method is called by Linux stack */ + /* nes_netdev_stop(netdev); */ + + list_del(&nesvnic->list); + + if (nesvnic->of_device_registered) { + nes_destroy_ofa_device(nesvnic->nesibdev); + } + + free_netdev(netdev); +} + + +/** + * nes_nic_cm_xmit -- CM calls this to send out pkts + */ +int nes_nic_cm_xmit(struct sk_buff *skb, struct net_device *netdev) +{ + int ret; + + skb->dev = netdev; + ret = dev_queue_xmit(skb); + if (ret) { + nes_debug(NES_DBG_CM, "Bad return code from dev_queue_xmit %d\n", ret); + } + + return ret; +} diff --git a/drivers/infiniband/hw/nes/nes_user.h b/drivers/infiniband/hw/nes/nes_user.h new file mode 100644 index 0000000..e64306b --- /dev/null +++ b/drivers/infiniband/hw/nes/nes_user.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2006 - 2008 NetEffect. All rights reserved. + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Cisco Systems. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. 
All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef NES_USER_H +#define NES_USER_H + +#include <linux/types.h> + +#define NES_ABI_USERSPACE_VER 1 +#define NES_ABI_KERNEL_VER 1 + +/* + * Make sure that all structs defined in this file remain laid out so + * that they pack the same way on 32-bit and 64-bit architectures (to + * avoid incompatibility between 32-bit userspace and 64-bit kernels). + * In particular do not use pointer types -- pass pointers in __u64 + * instead. 
+ */ + +struct nes_alloc_ucontext_req { + __u32 reserved32; + __u8 userspace_ver; + __u8 reserved8[3]; +}; + +struct nes_alloc_ucontext_resp { + __u32 max_pds; /* maximum pds allowed for this user process */ + __u32 max_qps; /* maximum qps allowed for this user process */ + __u32 wq_size; /* size of the WQs (sq+rq) allocated to the mmaped area */ + __u8 virtwq; /* flag to indicate if virtual WQ are to be used or not */ + __u8 kernel_ver; + __u8 reserved[2]; +}; + +struct nes_alloc_pd_resp { + __u32 pd_id; + __u32 mmap_db_index; +}; + +struct nes_create_cq_req { + __u64 user_cq_buffer; + __u32 mcrqf; + __u8 reserved[4]; +}; + +struct nes_create_qp_req { + __u64 user_wqe_buffers; +}; + +enum iwnes_memreg_type { + IWNES_MEMREG_TYPE_MEM = 0x0000, + IWNES_MEMREG_TYPE_QP = 0x0001, + IWNES_MEMREG_TYPE_CQ = 0x0002, + IWNES_MEMREG_TYPE_MW = 0x0003, + IWNES_MEMREG_TYPE_FMR = 0x0004, +}; + +struct nes_mem_reg_req { + __u32 reg_type; /* indicates if id is memory, QP or CQ */ + __u32 reserved; +}; + +struct nes_create_cq_resp { + __u32 cq_id; + __u32 cq_size; + __u32 mmap_db_index; + __u32 reserved; +}; + +struct nes_create_qp_resp { + __u32 qp_id; + __u32 actual_sq_size; + __u32 actual_rq_size; + __u32 mmap_sq_db_index; + __u32 mmap_rq_db_index; + __u32 nes_drv_opt; +}; + +#endif /* NES_USER_H */ diff --git a/drivers/infiniband/hw/nes/nes_utils.c b/drivers/infiniband/hw/nes/nes_utils.c new file mode 100644 index 0000000..c4ec6ac --- /dev/null +++ b/drivers/infiniband/hw/nes/nes_utils.c @@ -0,0 +1,917 @@ +/* + * Copyright (c) 2006 - 2008 NetEffect, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/ethtool.h> +#include <linux/mii.h> +#include <linux/if_vlan.h> +#include <linux/crc32.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/tcp.h> +#include <linux/init.h> + +#include <asm/io.h> +#include <asm/irq.h> +#include <asm/byteorder.h> + +#include "nes.h" + + + +static u16 nes_read16_eeprom(void __iomem *addr, u16 offset); + +u32 mh_detected; +u32 mh_pauses_sent; + +/** + * nes_read_eeprom_values - + */ +int nes_read_eeprom_values(struct nes_device *nesdev, struct nes_adapter *nesadapter) +{ + u32 mac_addr_low; + u16 mac_addr_high; + u16 eeprom_data; + u16 eeprom_offset; + u16 next_section_address; + u16 sw_section_ver; + u8 major_ver = 0; + u8 minor_ver = 0; + + /* TODO: deal with EEPROM endian issues */ + if (nesadapter->firmware_eeprom_offset == 0) { + /* Read the EEPROM Parameters */ + eeprom_data = nes_read16_eeprom(nesdev->regs, 0); + nes_debug(NES_DBG_HW, "EEPROM Offset 0 = 0x%04X\n", eeprom_data); + eeprom_offset = 2 + (((eeprom_data & 0x007f) << 3) << + ((eeprom_data & 0x0080) >> 7)); + nes_debug(NES_DBG_HW, "Firmware Offset = 0x%04X\n", eeprom_offset); + nesadapter->firmware_eeprom_offset = eeprom_offset; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 4); + if (eeprom_data != 0x5746) { + nes_debug(NES_DBG_HW, "Not a valid Firmware Image = 0x%04X\n", eeprom_data); + return -1; + } + + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2); + nes_debug(NES_DBG_HW, "EEPROM Offset %u = 0x%04X\n", + eeprom_offset + 2, eeprom_data); + eeprom_offset += ((eeprom_data & 0x00ff) << 3) << ((eeprom_data & 0x0100) >> 8); + nes_debug(NES_DBG_HW, "Software Offset = 0x%04X\n", eeprom_offset); + nesadapter->software_eeprom_offset = eeprom_offset; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 4); + if (eeprom_data != 0x5753) { + printk("Not a valid Software 
Image = 0x%04X\n", eeprom_data); + return -1; + } + sw_section_ver = nes_read16_eeprom(nesdev->regs, nesadapter->software_eeprom_offset + 6); + nes_debug(NES_DBG_HW, "Software section version number = 0x%04X\n", + sw_section_ver); + + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2); + nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section) = 0x%04X\n", + eeprom_offset + 2, eeprom_data); + next_section_address = eeprom_offset + (((eeprom_data & 0x00ff) << 3) << + ((eeprom_data & 0x0100) >> 8)); + eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4); + if (eeprom_data != 0x414d) { + nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x414d but was 0x%04X\n", + eeprom_data); + goto no_fw_rev; + } + eeprom_offset = next_section_address; + + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2); + nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section) = 0x%04X\n", + eeprom_offset + 2, eeprom_data); + next_section_address = eeprom_offset + (((eeprom_data & 0x00ff) << 3) << + ((eeprom_data & 0x0100) >> 8)); + eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4); + if (eeprom_data != 0x4f52) { + nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x4f52 but was 0x%04X\n", + eeprom_data); + goto no_fw_rev; + } + eeprom_offset = next_section_address; + + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2); + nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section) = 0x%04X\n", + eeprom_offset + 2, eeprom_data); + next_section_address = eeprom_offset + ((eeprom_data & 0x00ff) << 3); + eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4); + if (eeprom_data != 0x5746) { + nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x5746 but was 0x%04X\n", + eeprom_data); + goto no_fw_rev; + } + eeprom_offset = next_section_address; + + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2); + nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section) = 0x%04X\n", + 
eeprom_offset + 2, eeprom_data); + next_section_address = eeprom_offset + ((eeprom_data & 0x00ff) << 3); + eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4); + if (eeprom_data != 0x5753) { + nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x5753 but was 0x%04X\n", + eeprom_data); + goto no_fw_rev; + } + eeprom_offset = next_section_address; + + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2); + nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section) = 0x%04X\n", + eeprom_offset + 2, eeprom_data); + next_section_address = eeprom_offset + ((eeprom_data & 0x00ff) << 3); + eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4); + if (eeprom_data != 0x414d) { + nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x414d but was 0x%04X\n", + eeprom_data); + goto no_fw_rev; + } + eeprom_offset = next_section_address; + + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2); + nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section) = 0x%04X\n", + eeprom_offset + 2, eeprom_data); + next_section_address = eeprom_offset + ((eeprom_data & 0x00ff) << 3); + eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4); + if (eeprom_data != 0x464e) { + nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x464e but was 0x%04X\n", + eeprom_data); + goto no_fw_rev; + } + eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 8); + printk(PFX "Firmware version %u.%u\n", (u8)(eeprom_data>>8), (u8)eeprom_data); + major_ver = (u8)(eeprom_data >> 8); + minor_ver = (u8)(eeprom_data); + + if (nes_drv_opt & NES_DRV_OPT_DISABLE_VIRT_WQ) { + nes_debug(NES_DBG_HW, "Virtual WQs have been disabled\n"); + } else if (((major_ver == 2) && (minor_ver > 21)) || ((major_ver > 2) && (major_ver != 255))) { + nesadapter->virtwq = 1; + } + nesadapter->firmware_version = (((u32)(u8)(eeprom_data>>8)) << 16) + + (u32)((u8)eeprom_data); + +no_fw_rev: + /* eeprom is valid */ + eeprom_offset = 
nesadapter->software_eeprom_offset; + eeprom_offset += 8; + nesadapter->netdev_max = (u8)nes_read16_eeprom(nesdev->regs, eeprom_offset); + eeprom_offset += 2; + mac_addr_high = nes_read16_eeprom(nesdev->regs, eeprom_offset); + eeprom_offset += 2; + mac_addr_low = (u32)nes_read16_eeprom(nesdev->regs, eeprom_offset); + eeprom_offset += 2; + mac_addr_low <<= 16; + mac_addr_low += (u32)nes_read16_eeprom(nesdev->regs, eeprom_offset); + nes_debug(NES_DBG_HW, "Base MAC Address = 0x%04X%08X\n", + mac_addr_high, mac_addr_low); + nes_debug(NES_DBG_HW, "MAC Address count = %u\n", nesadapter->netdev_max); + + nesadapter->mac_addr_low = mac_addr_low; + nesadapter->mac_addr_high = mac_addr_high; + + /* Read the Phy Type array */ + eeprom_offset += 10; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + nesadapter->phy_type[0] = (u8)(eeprom_data >> 8); + nesadapter->phy_type[1] = (u8)eeprom_data; + + /* Read the port array */ + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + nesadapter->phy_type[2] = (u8)(eeprom_data >> 8); + nesadapter->phy_type[3] = (u8)eeprom_data; + /* port_count is set by soft reset reg */ + nes_debug(NES_DBG_HW, "port_count = %u, port 0 -> %u, port 1 -> %u," + " port 2 -> %u, port 3 -> %u\n", + nesadapter->port_count, + nesadapter->phy_type[0], nesadapter->phy_type[1], + nesadapter->phy_type[2], nesadapter->phy_type[3]); + + /* Read PD config array */ + eeprom_offset += 10; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + nesadapter->pd_config_size[0] = eeprom_data; + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + nesadapter->pd_config_base[0] = eeprom_data; + nes_debug(NES_DBG_HW, "PD0 config, size=0x%04x, base=0x%04x\n", + nesadapter->pd_config_size[0], nesadapter->pd_config_base[0]); + + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + nesadapter->pd_config_size[1] = eeprom_data; + eeprom_offset += 2; + 
eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + nesadapter->pd_config_base[1] = eeprom_data; + nes_debug(NES_DBG_HW, "PD1 config, size=0x%04x, base=0x%04x\n", + nesadapter->pd_config_size[1], nesadapter->pd_config_base[1]); + + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + nesadapter->pd_config_size[2] = eeprom_data; + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + nesadapter->pd_config_base[2] = eeprom_data; + nes_debug(NES_DBG_HW, "PD2 config, size=0x%04x, base=0x%04x\n", + nesadapter->pd_config_size[2], nesadapter->pd_config_base[2]); + + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + nesadapter->pd_config_size[3] = eeprom_data; + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + nesadapter->pd_config_base[3] = eeprom_data; + nes_debug(NES_DBG_HW, "PD3 config, size=0x%04x, base=0x%04x\n", + nesadapter->pd_config_size[3], nesadapter->pd_config_base[3]); + + /* Read Rx Pool Size */ + eeprom_offset += 22; /* 46 */ + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + eeprom_offset += 2; + nesadapter->rx_pool_size = (((u32)eeprom_data) << 16) + + nes_read16_eeprom(nesdev->regs, eeprom_offset); + nes_debug(NES_DBG_HW, "rx_pool_size = 0x%08X\n", nesadapter->rx_pool_size); + + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + eeprom_offset += 2; + nesadapter->tx_pool_size = (((u32)eeprom_data) << 16) + + nes_read16_eeprom(nesdev->regs, eeprom_offset); + nes_debug(NES_DBG_HW, "tx_pool_size = 0x%08X\n", nesadapter->tx_pool_size); + + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + eeprom_offset += 2; + nesadapter->rx_threshold = (((u32)eeprom_data) << 16) + + nes_read16_eeprom(nesdev->regs, eeprom_offset); + nes_debug(NES_DBG_HW, "rx_threshold = 0x%08X\n", nesadapter->rx_threshold); + + eeprom_offset += 2; + eeprom_data 
= nes_read16_eeprom(nesdev->regs, eeprom_offset); + eeprom_offset += 2; + nesadapter->tcp_timer_core_clk_divisor = (((u32)eeprom_data) << 16) + + nes_read16_eeprom(nesdev->regs, eeprom_offset); + nes_debug(NES_DBG_HW, "tcp_timer_core_clk_divisor = 0x%08X\n", + nesadapter->tcp_timer_core_clk_divisor); + + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + eeprom_offset += 2; + nesadapter->iwarp_config = (((u32)eeprom_data) << 16) + + nes_read16_eeprom(nesdev->regs, eeprom_offset); + nes_debug(NES_DBG_HW, "iwarp_config = 0x%08X\n", nesadapter->iwarp_config); + + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + eeprom_offset += 2; + nesadapter->cm_config = (((u32)eeprom_data) << 16) + + nes_read16_eeprom(nesdev->regs, eeprom_offset); + nes_debug(NES_DBG_HW, "cm_config = 0x%08X\n", nesadapter->cm_config); + + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + eeprom_offset += 2; + nesadapter->sws_timer_config = (((u32)eeprom_data) << 16) + + nes_read16_eeprom(nesdev->regs, eeprom_offset); + nes_debug(NES_DBG_HW, "sws_timer_config = 0x%08X\n", nesadapter->sws_timer_config); + + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + eeprom_offset += 2; + nesadapter->tcp_config1 = (((u32)eeprom_data) << 16) + + nes_read16_eeprom(nesdev->regs, eeprom_offset); + nes_debug(NES_DBG_HW, "tcp_config1 = 0x%08X\n", nesadapter->tcp_config1); + + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + eeprom_offset += 2; + nesadapter->wqm_wat = (((u32)eeprom_data) << 16) + + nes_read16_eeprom(nesdev->regs, eeprom_offset); + nes_debug(NES_DBG_HW, "wqm_wat = 0x%08X\n", nesadapter->wqm_wat); + + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + eeprom_offset += 2; + nesadapter->core_clock = (((u32)eeprom_data) << 16) + + nes_read16_eeprom(nesdev->regs, eeprom_offset); + 
nes_debug(NES_DBG_HW, "core_clock = 0x%08X\n", nesadapter->core_clock); + + if ((sw_section_ver) && (nesadapter->hw_rev != NE020_REV)) { + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + nesadapter->phy_index[0] = (eeprom_data & 0xff00)>>8; + nesadapter->phy_index[1] = eeprom_data & 0x00ff; + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + nesadapter->phy_index[2] = (eeprom_data & 0xff00)>>8; + nesadapter->phy_index[3] = eeprom_data & 0x00ff; + } else { + nesadapter->phy_index[0] = 4; + nesadapter->phy_index[1] = 5; + nesadapter->phy_index[2] = 6; + nesadapter->phy_index[3] = 7; + } + nes_debug(NES_DBG_HW, "Phy address map = 0 > %u, 1 > %u, 2 > %u, 3 > %u\n", + nesadapter->phy_index[0],nesadapter->phy_index[1], + nesadapter->phy_index[2],nesadapter->phy_index[3]); + } + + return 0; +} + + +/** + * nes_read16_eeprom + */ +static u16 nes_read16_eeprom(void __iomem *addr, u16 offset) +{ + writel(NES_EEPROM_READ_REQUEST + (offset >> 1), + (void __iomem *)addr + NES_EEPROM_COMMAND); + + do { + } while (readl((void __iomem *)addr + NES_EEPROM_COMMAND) & + NES_EEPROM_READ_REQUEST); + + return readw((void __iomem *)addr + NES_EEPROM_DATA); +} + + +/** + * nes_write_1G_phy_reg + */ +void nes_write_1G_phy_reg(struct nes_device *nesdev, u8 phy_reg, u8 phy_addr, u16 data) +{ + struct nes_adapter *nesadapter = nesdev->nesadapter; + u32 u32temp; + u32 counter; + unsigned long flags; + + spin_lock_irqsave(&nesadapter->phy_lock, flags); + + nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL, + 0x50020000 | data | ((u32)phy_reg << 18) | ((u32)phy_addr << 23)); + for (counter = 0; counter < 100 ; counter++) { + udelay(30); + u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS); + if (u32temp & 1) { + /* nes_debug(NES_DBG_PHY, "Phy interrupt status = 0x%X.\n", u32temp); */ + nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1); + break; + } + } + if (!(u32temp & 1)) + nes_debug(NES_DBG_PHY, "Phy is not 
responding. interrupt status = 0x%X.\n", + u32temp); + + spin_unlock_irqrestore(&nesadapter->phy_lock, flags); +} + + +/** + * nes_read_1G_phy_reg + * This routine only issues the read, the data must be read + * separately. + */ +void nes_read_1G_phy_reg(struct nes_device *nesdev, u8 phy_reg, u8 phy_addr, u16 *data) +{ + struct nes_adapter *nesadapter = nesdev->nesadapter; + u32 u32temp; + u32 counter; + unsigned long flags; + + /* nes_debug(NES_DBG_PHY, "phy addr = %d, mac_index = %d\n", + phy_addr, nesdev->mac_index); */ + spin_lock_irqsave(&nesadapter->phy_lock, flags); + + nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL, + 0x60020000 | ((u32)phy_reg << 18) | ((u32)phy_addr << 23)); + for (counter = 0; counter < 100 ; counter++) { + udelay(30); + u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS); + if (u32temp & 1) { + /* nes_debug(NES_DBG_PHY, "Phy interrupt status = 0x%X.\n", u32temp); */ + nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1); + break; + } + } + if (!(u32temp & 1)) { + nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n", + u32temp); + *data = 0xffff; + } else { + *data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + } + spin_unlock_irqrestore(&nesadapter->phy_lock, flags); +} + + +/** + * nes_write_10G_phy_reg + */ +void nes_write_10G_phy_reg(struct nes_device *nesdev, u16 phy_reg, + u8 phy_addr, u16 data) +{ + u32 dev_addr; + u32 port_addr; + u32 u32temp; + u32 counter; + + dev_addr = 1; + port_addr = phy_addr; + + /* set address */ + nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL, + 0x00020000 | (u32)phy_reg | (((u32)dev_addr) << 18) | (((u32)port_addr) << 23)); + for (counter = 0; counter < 100 ; counter++) { + udelay(30); + u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS); + if (u32temp & 1) { + nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1); + break; + } + } + if (!(u32temp & 1)) + nes_debug(NES_DBG_PHY, "Phy is not responding. 
interrupt status = 0x%X.\n", + u32temp); + + /* set data */ + nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL, + 0x10020000 | (u32)data | (((u32)dev_addr) << 18) | (((u32)port_addr) << 23)); + for (counter = 0; counter < 100 ; counter++) { + udelay(30); + u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS); + if (u32temp & 1) { + nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1); + break; + } + } + if (!(u32temp & 1)) + nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n", + u32temp); +} + + +/** + * nes_read_10G_phy_reg + * This routine only issues the read, the data must be read + * separately. + */ +void nes_read_10G_phy_reg(struct nes_device *nesdev, u16 phy_reg, u8 phy_addr) +{ + u32 dev_addr; + u32 port_addr; + u32 u32temp; + u32 counter; + + dev_addr = 1; + port_addr = phy_addr; + + /* set address */ + nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL, + 0x00020000 | (u32)phy_reg | (((u32)dev_addr) << 18) | (((u32)port_addr) << 23)); + for (counter = 0; counter < 100 ; counter++) { + udelay(30); + u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS); + if (u32temp & 1) { + nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1); + break; + } + } + if (!(u32temp & 1)) + nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n", + u32temp); + + /* issue read */ + nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL, + 0x30020000 | (((u32)dev_addr) << 18) | (((u32)port_addr) << 23)); + for (counter = 0; counter < 100 ; counter++) { + udelay(30); + u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS); + if (u32temp & 1) { + nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1); + break; + } + } + if (!(u32temp & 1)) + nes_debug(NES_DBG_PHY, "Phy is not responding. 
interrupt status = 0x%X.\n", + u32temp); +} + + +/** + * nes_get_cqp_request + */ +struct nes_cqp_request *nes_get_cqp_request(struct nes_device *nesdev) +{ + unsigned long flags; + struct nes_cqp_request *cqp_request = NULL; + + if (!list_empty(&nesdev->cqp_avail_reqs)) { + spin_lock_irqsave(&nesdev->cqp.lock, flags); + cqp_request = list_entry(nesdev->cqp_avail_reqs.next, + struct nes_cqp_request, list); + list_del_init(&cqp_request->list); + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + } else { + cqp_request = kzalloc(sizeof(struct nes_cqp_request), GFP_KERNEL); + if (cqp_request) { + cqp_request->dynamic = 1; + INIT_LIST_HEAD(&cqp_request->list); + } + } + + if (cqp_request) { + init_waitqueue_head(&cqp_request->waitq); + cqp_request->waiting = 0; + cqp_request->request_done = 0; + cqp_request->callback = 0; + init_waitqueue_head(&cqp_request->waitq); + nes_debug(NES_DBG_CQP, "Got cqp request %p from the available list \n", + cqp_request); + } else + printk(KERN_ERR PFX "%s: Could not allocated a CQP request.\n", + __FUNCTION__); + + return cqp_request; +} + + +/** + * nes_post_cqp_request + */ +void nes_post_cqp_request(struct nes_device *nesdev, + struct nes_cqp_request *cqp_request, int ring_doorbell) +{ + struct nes_hw_cqp_wqe *cqp_wqe; + unsigned long flags; + u32 cqp_head; + u64 u64temp; + + spin_lock_irqsave(&nesdev->cqp.lock, flags); + + if (((((nesdev->cqp.sq_tail+(nesdev->cqp.sq_size*2))-nesdev->cqp.sq_head) & + (nesdev->cqp.sq_size - 1)) != 1) + && (list_empty(&nesdev->cqp_pending_reqs))) { + cqp_head = nesdev->cqp.sq_head++; + nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + memcpy(cqp_wqe, &cqp_request->cqp_wqe, sizeof(*cqp_wqe)); + barrier(); + u64temp = (unsigned long)cqp_request; + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_COMP_SCRATCH_LOW_IDX, + u64temp); + nes_debug(NES_DBG_CQP, "CQP request (opcode 0x%02X), line 1 = 0x%08X put on CQPs SQ," + " request = %p, cqp_head = %u, cqp_tail = 
%u, cqp_size = %u," + " waiting = %d, refcount = %d.\n", + le32_to_cpu(cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX])&0x3f, + le32_to_cpu(cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX]), cqp_request, + nesdev->cqp.sq_head, nesdev->cqp.sq_tail, nesdev->cqp.sq_size, + cqp_request->waiting, atomic_read(&cqp_request->refcount)); + barrier(); + if (ring_doorbell) { + /* Ring doorbell (1 WQEs) */ + nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x01800000 | nesdev->cqp.qp_id); + } + + barrier(); + } else { + nes_debug(NES_DBG_CQP, "CQP request %p (opcode 0x%02X), line 1 = 0x%08X" + " put on the pending queue.\n", + cqp_request, + le32_to_cpu(cqp_request->cqp_wqe.wqe_words[NES_CQP_WQE_OPCODE_IDX])&0x3f, + le32_to_cpu(cqp_request->cqp_wqe.wqe_words[NES_CQP_WQE_ID_IDX])); + list_add_tail(&cqp_request->list, &nesdev->cqp_pending_reqs); + } + + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + + return; +} + + +/** + * nes_arp_table + */ +int nes_arp_table(struct nes_device *nesdev, u32 ip_addr, u8 *mac_addr, u32 action) +{ + struct nes_adapter *nesadapter = nesdev->nesadapter; + int arp_index; + int err = 0; + + for (arp_index = 0; (u32) arp_index < nesadapter->arp_table_size; arp_index++) { + if (nesadapter->arp_table[arp_index].ip_addr == ip_addr) + break; + } + + if (action == NES_ARP_ADD) { + if (arp_index != nesadapter->arp_table_size) { + return -1; + } + + arp_index = 0; + err = nes_alloc_resource(nesadapter, nesadapter->allocated_arps, + nesadapter->arp_table_size, (u32 *)&arp_index, &nesadapter->next_arp_index); + if (err) { + nes_debug(NES_DBG_NETDEV, "nes_alloc_resource returned error = %u\n", err); + return err; + } + nes_debug(NES_DBG_NETDEV, "ADD, arp_index=%d\n", arp_index); + + nesadapter->arp_table[arp_index].ip_addr = ip_addr; + memcpy(nesadapter->arp_table[arp_index].mac_addr, mac_addr, ETH_ALEN); + return arp_index; + } + + /* DELETE or RESOLVE */ + if (arp_index == nesadapter->arp_table_size) { + nes_debug(NES_DBG_NETDEV, "mac address not in ARP table - cannot delete or 
resolve\n"); + return -1; + } + + if (action == NES_ARP_RESOLVE) { + nes_debug(NES_DBG_NETDEV, "RESOLVE, arp_index=%d\n", arp_index); + return arp_index; + } + + if (action == NES_ARP_DELETE) { + nes_debug(NES_DBG_NETDEV, "DELETE, arp_index=%d\n", arp_index); + nesadapter->arp_table[arp_index].ip_addr = 0; + memset(nesadapter->arp_table[arp_index].mac_addr, 0x00, ETH_ALEN); + nes_free_resource(nesadapter, nesadapter->allocated_arps, arp_index); + return arp_index; + } + + return -1; +} + + +/** + * nes_mh_fix + */ +void nes_mh_fix(unsigned long parm) +{ + unsigned long flags; + struct nes_device *nesdev = (struct nes_device *)parm; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_vnic *nesvnic; + u32 used_chunks_tx; + u32 temp_used_chunks_tx; + u32 temp_last_used_chunks_tx; + u32 used_chunks_mask; + u32 mac_tx_frames_low; + u32 mac_tx_frames_high; + u32 mac_tx_pauses; + u32 serdes_status; + u32 reset_value; + u32 tx_control; + u32 tx_config; + u32 tx_pause_quanta; + u32 rx_control; + u32 rx_config; + u32 mac_exact_match; + u32 mpp_debug; + u32 i=0; + u32 chunks_tx_progress = 0; + + spin_lock_irqsave(&nesadapter->phy_lock, flags); + if ((nesadapter->mac_sw_state[0] != NES_MAC_SW_IDLE) || (nesadapter->mac_link_down[0])) { + spin_unlock_irqrestore(&nesadapter->phy_lock, flags); + goto no_mh_work; + } + nesadapter->mac_sw_state[0] = NES_MAC_SW_MH; + spin_unlock_irqrestore(&nesadapter->phy_lock, flags); + do { + mac_tx_frames_low = nes_read_indexed(nesdev, NES_IDX_MAC_TX_FRAMES_LOW); + mac_tx_frames_high = nes_read_indexed(nesdev, NES_IDX_MAC_TX_FRAMES_HIGH); + mac_tx_pauses = nes_read_indexed(nesdev, NES_IDX_MAC_TX_PAUSE_FRAMES); + used_chunks_tx = nes_read_indexed(nesdev, NES_IDX_USED_CHUNKS_TX); + nesdev->mac_pause_frames_sent += mac_tx_pauses; + used_chunks_mask = 0; + temp_used_chunks_tx = used_chunks_tx; + temp_last_used_chunks_tx = nesdev->last_used_chunks_tx; + + if (nesdev->netdev[0]) { + nesvnic = netdev_priv(nesdev->netdev[0]); + } else { + 
break; + } + + for (i=0; i<4; i++) { + used_chunks_mask <<= 8; + if (nesvnic->qp_nic_index[i] != 0xff) { + used_chunks_mask |= 0xff; + if ((temp_used_chunks_tx&0xff)<(temp_last_used_chunks_tx&0xff)) { + chunks_tx_progress = 1; + } + } + temp_used_chunks_tx >>= 8; + temp_last_used_chunks_tx >>= 8; + } + if ((mac_tx_frames_low) || (mac_tx_frames_high) || + (!(used_chunks_tx&used_chunks_mask)) || + (!(nesdev->last_used_chunks_tx&used_chunks_mask)) || + (chunks_tx_progress) ) { + nesdev->last_used_chunks_tx = used_chunks_tx; + break; + } + nesdev->last_used_chunks_tx = used_chunks_tx; + barrier(); + + nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONTROL, 0x00000005); + mh_pauses_sent++; + mac_tx_pauses = nes_read_indexed(nesdev, NES_IDX_MAC_TX_PAUSE_FRAMES); + if (mac_tx_pauses) { + nesdev->mac_pause_frames_sent += mac_tx_pauses; + break; + } + + tx_control = nes_read_indexed(nesdev, NES_IDX_MAC_TX_CONTROL); + tx_config = nes_read_indexed(nesdev, NES_IDX_MAC_TX_CONFIG); + tx_pause_quanta = nes_read_indexed(nesdev, NES_IDX_MAC_TX_PAUSE_QUANTA); + rx_control = nes_read_indexed(nesdev, NES_IDX_MAC_RX_CONTROL); + rx_config = nes_read_indexed(nesdev, NES_IDX_MAC_RX_CONFIG); + mac_exact_match = nes_read_indexed(nesdev, NES_IDX_MAC_EXACT_MATCH_BOTTOM); + mpp_debug = nes_read_indexed(nesdev, NES_IDX_MPP_DEBUG); + + /* one last ditch effort to avoid a false positive */ + mac_tx_pauses = nes_read_indexed(nesdev, NES_IDX_MAC_TX_PAUSE_FRAMES); + if (mac_tx_pauses) { + nesdev->last_mac_tx_pauses = nesdev->mac_pause_frames_sent; + nes_debug(NES_DBG_HW, "failsafe caught slow outbound pause\n"); + break; + } + mh_detected++; + + nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONTROL, 0x00000000); + nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONFIG, 0x00000000); + reset_value = nes_read32(nesdev->regs+NES_SOFTWARE_RESET); + + nes_write32(nesdev->regs+NES_SOFTWARE_RESET, reset_value | 0x0000001d); + + while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET) + & 0x00000040) != 0x00000040) && (i++ < 
5000)) { + /* mdelay(1); */ + } + + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0, 0x00000008); + serdes_status = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS0); + + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP0, 0x000bdef7); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_DRIVE0, 0x9ce73000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_MODE0, 0x0ff00000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_SIGDET0, 0x00000000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_BYPASS0, 0x00000000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_LOOPBACK_CONTROL0, 0x00000000); + if (nesadapter->OneG_Mode) { + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_CONTROL0, 0xf0182222); + } else { + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_CONTROL0, 0xf0042222); + } + serdes_status = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_STATUS0); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000ff); + + nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONTROL, tx_control); + nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONFIG, tx_config); + nes_write_indexed(nesdev, NES_IDX_MAC_TX_PAUSE_QUANTA, tx_pause_quanta); + nes_write_indexed(nesdev, NES_IDX_MAC_RX_CONTROL, rx_control); + nes_write_indexed(nesdev, NES_IDX_MAC_RX_CONFIG, rx_config); + nes_write_indexed(nesdev, NES_IDX_MAC_EXACT_MATCH_BOTTOM, mac_exact_match); + nes_write_indexed(nesdev, NES_IDX_MPP_DEBUG, mpp_debug); + + } while (0); + + nesadapter->mac_sw_state[0] = NES_MAC_SW_IDLE; +no_mh_work: + nesdev->nesadapter->mh_timer.expires = jiffies + (HZ/5); + add_timer(&nesdev->nesadapter->mh_timer); +} + +/** + * nes_clc + */ +void nes_clc(unsigned long parm) +{ + unsigned long flags; + struct nes_device *nesdev = (struct nes_device *)parm; + struct nes_adapter *nesadapter = nesdev->nesadapter; + + spin_lock_irqsave(&nesadapter->phy_lock, flags); + nesadapter->link_interrupt_count[0] = 0; + nesadapter->link_interrupt_count[1] = 0; + 
nesadapter->link_interrupt_count[2] = 0; + nesadapter->link_interrupt_count[3] = 0; + spin_unlock_irqrestore(&nesadapter->phy_lock, flags); + + nesadapter->lc_timer.expires = jiffies + 3600 * HZ; /* 1 hour */ + add_timer(&nesadapter->lc_timer); +} + + +/** + * nes_dump_mem + */ +void nes_dump_mem(unsigned int dump_debug_level, void *addr, int length) +{ + char xlate[] = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', + 'a', 'b', 'c', 'd', 'e', 'f'}; + char *ptr; + char hex_buf[80]; + char ascii_buf[20]; + int num_char; + int num_ascii; + int num_hex; + + if (!(nes_debug_level & dump_debug_level)) { + return; + } + + ptr = addr; + if (length > 0x100) { + nes_debug(dump_debug_level, "Length truncated from %x to %x\n", length, 0x100); + length = 0x100; + } + nes_debug(dump_debug_level, "Address=0x%p, length=0x%x (%d)\n", ptr, length, length); + + memset(ascii_buf, 0, 20); + memset(hex_buf, 0, 80); + + num_ascii = 0; + num_hex = 0; + for (num_char = 0; num_char < length; num_char++) { + if (num_ascii == 8) { + ascii_buf[num_ascii++] = ' '; + hex_buf[num_hex++] = '-'; + hex_buf[num_hex++] = ' '; + } + + if (*ptr < 0x20 || *ptr > 0x7e) + ascii_buf[num_ascii++] = '.'; + else + ascii_buf[num_ascii++] = *ptr; + hex_buf[num_hex++] = xlate[((*ptr & 0xf0) >> 4)]; + hex_buf[num_hex++] = xlate[*ptr & 0x0f]; + hex_buf[num_hex++] = ' '; + ptr++; + + if (num_ascii >= 17) { + /* output line and reset */ + nes_debug(dump_debug_level, " %s | %s\n", hex_buf, ascii_buf); + memset(ascii_buf, 0, 20); + memset(hex_buf, 0, 80); + num_ascii = 0; + num_hex = 0; + } + } + + /* output the rest */ + if (num_ascii) { + while (num_ascii < 17) { + if (num_ascii == 8) { + hex_buf[num_hex++] = ' '; + hex_buf[num_hex++] = ' '; + } + hex_buf[num_hex++] = ' '; + hex_buf[num_hex++] = ' '; + hex_buf[num_hex++] = ' '; + num_ascii++; + } + + nes_debug(dump_debug_level, " %s | %s\n", hex_buf, ascii_buf); + } +} diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c new 
file mode 100644 index 0000000..ffd4b42 --- /dev/null +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -0,0 +1,3917 @@ +/* + * Copyright (c) 2006 - 2008 NetEffect, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/random.h> +#include <linux/highmem.h> +#include <asm/byteorder.h> + +#include <rdma/ib_verbs.h> +#include <rdma/iw_cm.h> +#include <rdma/ib_user_verbs.h> + +#include "nes.h" + +#include <rdma/ib_umem.h> + +atomic_t mod_qp_timouts; +atomic_t qps_created; +atomic_t sw_qps_destroyed; + + +/** + * nes_alloc_mw + */ +static struct ib_mw *nes_alloc_mw(struct ib_pd *ibpd) { + unsigned long flags; + struct nes_pd *nespd = to_nespd(ibpd); + struct nes_vnic *nesvnic = to_nesvnic(ibpd->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_cqp_request *cqp_request; + struct nes_mr *nesmr; + struct ib_mw *ibmw; + struct nes_hw_cqp_wqe *cqp_wqe; + int ret; + u32 stag; + u32 stag_index = 0; + u32 next_stag_index = 0; + u32 driver_key = 0; + u8 stag_key = 0; + + get_random_bytes(&next_stag_index, sizeof(next_stag_index)); + stag_key = (u8)next_stag_index; + + driver_key = 0; + + next_stag_index >>= 8; + next_stag_index %= nesadapter->max_mr; + + ret = nes_alloc_resource(nesadapter, nesadapter->allocated_mrs, + nesadapter->max_mr, &stag_index, &next_stag_index); + if (ret) { + return ERR_PTR(ret); + } + + nesmr = kzalloc(sizeof(*nesmr), GFP_KERNEL); + if (!nesmr) { + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + return ERR_PTR(-ENOMEM); + } + + stag = stag_index << 8; + stag |= driver_key; + stag += (u32)stag_key; + + nes_debug(NES_DBG_MR, "Registering STag 0x%08X, index = 0x%08X\n", + stag, stag_index); + + /* Register the region with the adapter */ + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + kfree(nesmr); + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + return ERR_PTR(-ENOMEM); + } + + cqp_request->waiting = 1; + cqp_wqe = &cqp_request->cqp_wqe; + + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = + cpu_to_le32( NES_CQP_ALLOCATE_STAG | 
NES_CQP_STAG_RIGHTS_REMOTE_READ | + NES_CQP_STAG_RIGHTS_REMOTE_WRITE | NES_CQP_STAG_VA_TO | + NES_CQP_STAG_REM_ACC_EN); + + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX, (nespd->pd_id & 0x00007fff)); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, stag); + + atomic_set(&cqp_request->refcount, 2); + nes_post_cqp_request(nesdev, cqp_request, NES_CQP_REQUEST_RING_DOORBELL); + + /* Wait for CQP */ + ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0), + NES_EVENT_TIMEOUT); + nes_debug(NES_DBG_MR, "Register STag 0x%08X completed, wait_event_timeout ret = %u," + " CQP Major:Minor codes = 0x%04X:0x%04X.\n", + stag, ret, cqp_request->major_code, cqp_request->minor_code); + if ((!ret) || (cqp_request->major_code)) { + if (atomic_dec_and_test(&cqp_request->refcount)) { + if (cqp_request->dynamic) { + kfree(cqp_request); + } else { + spin_lock_irqsave(&nesdev->cqp.lock, flags); + list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs); + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + } + } + kfree(nesmr); + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + if (!ret) { + return ERR_PTR(-ETIME); + } else { + return ERR_PTR(-ENOMEM); + } + } else { + if (atomic_dec_and_test(&cqp_request->refcount)) { + if (cqp_request->dynamic) { + kfree(cqp_request); + } else { + spin_lock_irqsave(&nesdev->cqp.lock, flags); + list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs); + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + } + } + } + + nesmr->ibmw.rkey = stag; + nesmr->mode = IWNES_MEMREG_TYPE_MW; + ibmw = &nesmr->ibmw; + nesmr->pbl_4k = 0; + nesmr->pbls_used = 0; + + return ibmw; +} + + +/** + * nes_dealloc_mw + */ +static int nes_dealloc_mw(struct ib_mw *ibmw) +{ + struct nes_mr *nesmr = to_nesmw(ibmw); + struct nes_vnic *nesvnic = to_nesvnic(ibmw->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter 
*nesadapter = nesdev->nesadapter; + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_cqp_request *cqp_request; + int err = 0; + unsigned long flags; + int ret; + + /* Deallocate the window with the adapter */ + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + nes_debug(NES_DBG_MR, "Failed to get a cqp_request.\n"); + return -ENOMEM; + } + cqp_request->waiting = 1; + cqp_wqe = &cqp_request->cqp_wqe; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, NES_CQP_DEALLOCATE_STAG); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, ibmw->rkey); + + atomic_set(&cqp_request->refcount, 2); + nes_post_cqp_request(nesdev, cqp_request, NES_CQP_REQUEST_RING_DOORBELL); + + /* Wait for CQP */ + nes_debug(NES_DBG_MR, "Waiting for deallocate STag 0x%08X to complete.\n", + ibmw->rkey); + ret = wait_event_timeout(cqp_request->waitq, (0 != cqp_request->request_done), + NES_EVENT_TIMEOUT); + nes_debug(NES_DBG_MR, "Deallocate STag completed, wait_event_timeout ret = %u," + " CQP Major:Minor codes = 0x%04X:0x%04X.\n", + ret, cqp_request->major_code, cqp_request->minor_code); + if ((!ret) || (cqp_request->major_code)) { + if (atomic_dec_and_test(&cqp_request->refcount)) { + if (cqp_request->dynamic) { + kfree(cqp_request); + } else { + spin_lock_irqsave(&nesdev->cqp.lock, flags); + list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs); + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + } + } + if (!ret) { + err = -ETIME; + } else { + err = -EIO; + } + } else { + if (atomic_dec_and_test(&cqp_request->refcount)) { + if (cqp_request->dynamic) { + kfree(cqp_request); + } else { + spin_lock_irqsave(&nesdev->cqp.lock, flags); + list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs); + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + } + } + } + + nes_free_resource(nesadapter, nesadapter->allocated_mrs, + (ibmw->rkey & 0x0fffff00) >> 8); + kfree(nesmr); + + return err; +} + + +/** + * 
nes_bind_mw + */ +static int nes_bind_mw(struct ib_qp *ibqp, struct ib_mw *ibmw, + struct ib_mw_bind *ibmw_bind) +{ + u64 u64temp; + struct nes_vnic *nesvnic = to_nesvnic(ibqp->device); + struct nes_device *nesdev = nesvnic->nesdev; + /* struct nes_mr *nesmr = to_nesmw(ibmw); */ + struct nes_qp *nesqp = to_nesqp(ibqp); + struct nes_hw_qp_wqe *wqe; + unsigned long flags = 0; + u32 head; + u32 wqe_misc = 0; + u32 qsize; + + if (nesqp->ibqp_state > IB_QPS_RTS) + return -EINVAL; + + spin_lock_irqsave(&nesqp->lock, flags); + + head = nesqp->hwqp.sq_head; + qsize = nesqp->hwqp.sq_tail; + + /* Check for SQ overflow */ + if (((head + (2 * qsize) - nesqp->hwqp.sq_tail) % qsize) == (qsize - 1)) { + spin_unlock_irqrestore(&nesqp->lock, flags); + return -EINVAL; + } + + wqe = &nesqp->hwqp.sq_vbase[head]; + /* nes_debug(NES_DBG_MR, "processing sq wqe at %p, head = %u.\n", wqe, head); */ + nes_fill_init_qp_wqe(wqe, nesqp, head); + u64temp = ibmw_bind->wr_id; + set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_SCRATCH_LOW_IDX, u64temp); + wqe_misc = NES_IWARP_SQ_OP_BIND; + + wqe_misc |= NES_IWARP_SQ_WQE_LOCAL_FENCE; + + if (ibmw_bind->send_flags & IB_SEND_SIGNALED) + wqe_misc |= NES_IWARP_SQ_WQE_SIGNALED_COMPL; + + if (ibmw_bind->mw_access_flags & IB_ACCESS_REMOTE_WRITE) { + wqe_misc |= NES_CQP_STAG_RIGHTS_REMOTE_WRITE; + } + if (ibmw_bind->mw_access_flags & IB_ACCESS_REMOTE_READ) { + wqe_misc |= NES_CQP_STAG_RIGHTS_REMOTE_READ; + } + + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_MISC_IDX, wqe_misc); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_BIND_WQE_MR_IDX, ibmw_bind->mr->lkey); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_BIND_WQE_MW_IDX, ibmw->rkey); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_BIND_WQE_LENGTH_LOW_IDX, + ibmw_bind->length); + wqe->wqe_words[NES_IWARP_SQ_BIND_WQE_LENGTH_HIGH_IDX] = 0; + u64temp = (u64)ibmw_bind->addr; + set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_BIND_WQE_VA_FBO_LOW_IDX, u64temp); + + head++; + if 
(head >= qsize) + head = 0; + + nesqp->hwqp.sq_head = head; + barrier(); + + nes_write32(nesdev->regs+NES_WQE_ALLOC, + (1 << 24) | 0x00800000 | nesqp->hwqp.qp_id); + + spin_unlock_irqrestore(&nesqp->lock, flags); + + return 0; +} + + +/** + * nes_alloc_fmr + */ +static struct ib_fmr *nes_alloc_fmr(struct ib_pd *ibpd, + int ibmr_access_flags, + struct ib_fmr_attr *ibfmr_attr) +{ + unsigned long flags; + struct nes_pd *nespd = to_nespd(ibpd); + struct nes_vnic *nesvnic = to_nesvnic(ibpd->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_fmr *nesfmr; + struct nes_cqp_request *cqp_request; + struct nes_hw_cqp_wqe *cqp_wqe; + int ret; + u32 stag; + u32 stag_index = 0; + u32 next_stag_index = 0; + u32 driver_key = 0; + u32 opcode = 0; + u8 stag_key = 0; + int i=0; + struct nes_vpbl vpbl; + + get_random_bytes(&next_stag_index, sizeof(next_stag_index)); + stag_key = (u8)next_stag_index; + + driver_key = 0; + + next_stag_index >>= 8; + next_stag_index %= nesadapter->max_mr; + + ret = nes_alloc_resource(nesadapter, nesadapter->allocated_mrs, + nesadapter->max_mr, &stag_index, &next_stag_index); + if (ret) { + goto failed_resource_alloc; + } + + nesfmr = kzalloc(sizeof(*nesfmr), GFP_KERNEL); + if (!nesfmr) { + ret = -ENOMEM; + goto failed_fmr_alloc; + } + + nesfmr->nesmr.mode = IWNES_MEMREG_TYPE_FMR; + if (ibfmr_attr->max_pages == 1) { + /* use zero length PBL */ + nesfmr->nesmr.pbl_4k = 0; + nesfmr->nesmr.pbls_used = 0; + } else if (ibfmr_attr->max_pages <= 32) { + /* use PBL 256 */ + nesfmr->nesmr.pbl_4k = 0; + nesfmr->nesmr.pbls_used = 1; + } else if (ibfmr_attr->max_pages <= 512) { + /* use 4K PBLs */ + nesfmr->nesmr.pbl_4k = 1; + nesfmr->nesmr.pbls_used = 1; + } else { + /* use two level 4K PBLs */ + /* add support for two level 256B PBLs */ + nesfmr->nesmr.pbl_4k = 1; + nesfmr->nesmr.pbls_used = 1 + (ibfmr_attr->max_pages >> 9) + + ((ibfmr_attr->max_pages & 511) ? 
1 : 0); + } + /* Register the region with the adapter */ + spin_lock_irqsave(&nesadapter->pbl_lock, flags); + + /* track PBL resources */ + if (nesfmr->nesmr.pbls_used != 0) { + if (nesfmr->nesmr.pbl_4k) { + if (nesfmr->nesmr.pbls_used > nesadapter->free_4kpbl) { + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + ret = -ENOMEM; + goto failed_vpbl_alloc; + } else { + nesadapter->free_4kpbl -= nesfmr->nesmr.pbls_used; + } + } else { + if (nesfmr->nesmr.pbls_used > nesadapter->free_256pbl) { + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + ret = -ENOMEM; + goto failed_vpbl_alloc; + } else { + nesadapter->free_256pbl -= nesfmr->nesmr.pbls_used; + } + } + } + + /* one level pbl */ + if (nesfmr->nesmr.pbls_used == 0) { + nesfmr->root_vpbl.pbl_vbase = NULL; + nes_debug(NES_DBG_MR, "zero level pbl \n"); + } else if (nesfmr->nesmr.pbls_used == 1) { + /* can change it to kmalloc & dma_map_single */ + nesfmr->root_vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 4096, + &nesfmr->root_vpbl.pbl_pbase); + if (!nesfmr->root_vpbl.pbl_vbase) { + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + ret = -ENOMEM; + goto failed_vpbl_alloc; + } + nesfmr->leaf_pbl_cnt = 0; + nes_debug(NES_DBG_MR, "one level pbl, root_vpbl.pbl_vbase=%p \n", + nesfmr->root_vpbl.pbl_vbase); + } + /* two level pbl */ + else { + nesfmr->root_vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 8192, + &nesfmr->root_vpbl.pbl_pbase); + if (!nesfmr->root_vpbl.pbl_vbase) { + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + ret = -ENOMEM; + goto failed_vpbl_alloc; + } + + nesfmr->root_vpbl.leaf_vpbl = kzalloc(sizeof(*nesfmr->root_vpbl.leaf_vpbl)*1024, GFP_KERNEL); + if (!nesfmr->root_vpbl.leaf_vpbl) { + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + ret = -ENOMEM; + goto failed_leaf_vpbl_alloc; + } + + nesfmr->leaf_pbl_cnt = nesfmr->nesmr.pbls_used-1; + nes_debug(NES_DBG_MR, "two level pbl, root_vpbl.pbl_vbase=%p" + " leaf_pbl_cnt=%d root_vpbl.leaf_vpbl=%p\n", + 
nesfmr->root_vpbl.pbl_vbase, nesfmr->leaf_pbl_cnt, nesfmr->root_vpbl.leaf_vpbl); + + for (i=0; i<nesfmr->leaf_pbl_cnt; i++) + nesfmr->root_vpbl.leaf_vpbl[i].pbl_vbase = NULL; + + for (i=0; i<nesfmr->leaf_pbl_cnt; i++) { + vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 4096, + &vpbl.pbl_pbase); + + if (!vpbl.pbl_vbase) { + ret = -ENOMEM; + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + goto failed_leaf_vpbl_pages_alloc; + } + + nesfmr->root_vpbl.pbl_vbase[i].pa_low = cpu_to_le32((u32)vpbl.pbl_pbase); + nesfmr->root_vpbl.pbl_vbase[i].pa_high = cpu_to_le32((u32)((((u64)vpbl.pbl_pbase)>>32))); + nesfmr->root_vpbl.leaf_vpbl[i] = vpbl; + + nes_debug(NES_DBG_MR, "pbase_low=0x%x, pbase_high=0x%x, vpbl=%p\n", + nesfmr->root_vpbl.pbl_vbase[i].pa_low, + nesfmr->root_vpbl.pbl_vbase[i].pa_high, + &nesfmr->root_vpbl.leaf_vpbl[i]); + } + } + nesfmr->ib_qp = NULL; + nesfmr->access_rights =0; + + stag = stag_index << 8; + stag |= driver_key; + stag += (u32)stag_key; + + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + nes_debug(NES_DBG_MR, "Failed to get a cqp_request.\n"); + ret = -ENOMEM; + goto failed_leaf_vpbl_pages_alloc; + } + cqp_request->waiting = 1; + cqp_wqe = &cqp_request->cqp_wqe; + + nes_debug(NES_DBG_MR, "Registering STag 0x%08X, index = 0x%08X\n", + stag, stag_index); + + opcode = NES_CQP_ALLOCATE_STAG | NES_CQP_STAG_VA_TO | NES_CQP_STAG_MR; + + if (nesfmr->nesmr.pbl_4k == 1) + opcode |= NES_CQP_STAG_PBL_BLK_SIZE; + + if (ibmr_access_flags & IB_ACCESS_REMOTE_WRITE) { + opcode |= NES_CQP_STAG_RIGHTS_REMOTE_WRITE | + NES_CQP_STAG_RIGHTS_LOCAL_WRITE | NES_CQP_STAG_REM_ACC_EN; + nesfmr->access_rights |= + NES_CQP_STAG_RIGHTS_REMOTE_WRITE | NES_CQP_STAG_RIGHTS_LOCAL_WRITE | + NES_CQP_STAG_REM_ACC_EN; + } + + if (ibmr_access_flags & IB_ACCESS_REMOTE_READ) { + opcode |= NES_CQP_STAG_RIGHTS_REMOTE_READ | + NES_CQP_STAG_RIGHTS_LOCAL_READ | NES_CQP_STAG_REM_ACC_EN; + 
nesfmr->access_rights |= + NES_CQP_STAG_RIGHTS_REMOTE_READ | NES_CQP_STAG_RIGHTS_LOCAL_READ | + NES_CQP_STAG_REM_ACC_EN; + } + + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX, (nespd->pd_id & 0x00007fff)); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, stag); + + cqp_wqe->wqe_words[NES_CQP_STAG_WQE_PBL_BLK_COUNT_IDX] = + cpu_to_le32((nesfmr->nesmr.pbls_used>1) ? + (nesfmr->nesmr.pbls_used-1) : nesfmr->nesmr.pbls_used); + + atomic_set(&cqp_request->refcount, 2); + nes_post_cqp_request(nesdev, cqp_request, NES_CQP_REQUEST_RING_DOORBELL); + + /* Wait for CQP */ + ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0), + NES_EVENT_TIMEOUT); + nes_debug(NES_DBG_MR, "Register STag 0x%08X completed, wait_event_timeout ret = %u," + " CQP Major:Minor codes = 0x%04X:0x%04X.\n", + stag, ret, cqp_request->major_code, cqp_request->minor_code); + + if ((!ret) || (cqp_request->major_code)) { + if (atomic_dec_and_test(&cqp_request->refcount)) { + if (cqp_request->dynamic) { + kfree(cqp_request); + } else { + spin_lock_irqsave(&nesdev->cqp.lock, flags); + list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs); + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + } + } + ret = (!ret) ? 
-ETIME : -EIO; + goto failed_leaf_vpbl_pages_alloc; + } else { + if (atomic_dec_and_test(&cqp_request->refcount)) { + if (cqp_request->dynamic) { + kfree(cqp_request); + } else { + spin_lock_irqsave(&nesdev->cqp.lock, flags); + list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs); + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + } + } + } + + nesfmr->nesmr.ibfmr.lkey = stag; + nesfmr->nesmr.ibfmr.rkey = stag; + nesfmr->attr = *ibfmr_attr; + + return &nesfmr->nesmr.ibfmr; + + failed_leaf_vpbl_pages_alloc: + /* unroll all allocated pages */ + for (i=0; i<nesfmr->leaf_pbl_cnt; i++) { + if (nesfmr->root_vpbl.leaf_vpbl[i].pbl_vbase) { + pci_free_consistent(nesdev->pcidev, 4096, nesfmr->root_vpbl.leaf_vpbl[i].pbl_vbase, + nesfmr->root_vpbl.leaf_vpbl[i].pbl_pbase); + } + } + if (nesfmr->root_vpbl.leaf_vpbl) + kfree(nesfmr->root_vpbl.leaf_vpbl); + + failed_leaf_vpbl_alloc: + if (nesfmr->leaf_pbl_cnt == 0) { + if (nesfmr->root_vpbl.pbl_vbase) + pci_free_consistent(nesdev->pcidev, 4096, nesfmr->root_vpbl.pbl_vbase, + nesfmr->root_vpbl.pbl_pbase); + } else + pci_free_consistent(nesdev->pcidev, 8192, nesfmr->root_vpbl.pbl_vbase, + nesfmr->root_vpbl.pbl_pbase); + + failed_vpbl_alloc: + kfree(nesfmr); + + failed_fmr_alloc: + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + + failed_resource_alloc: + return ERR_PTR(ret); +} + + +/** + * nes_dealloc_fmr + */ +static int nes_dealloc_fmr(struct ib_fmr *ibfmr) +{ + struct nes_mr *nesmr = to_nesmr_from_ibfmr(ibfmr); + struct nes_fmr *nesfmr = to_nesfmr(nesmr); + struct nes_vnic *nesvnic = to_nesvnic(ibfmr->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_mr temp_nesmr = *nesmr; + int i = 0; + + temp_nesmr.ibmw.device = ibfmr->device; + temp_nesmr.ibmw.pd = ibfmr->pd; + temp_nesmr.ibmw.rkey = ibfmr->rkey; + temp_nesmr.ibmw.uobject = NULL; + + /* free the resources */ + if (nesfmr->leaf_pbl_cnt == 0) { + /* single PBL case */ + if (nesfmr->root_vpbl.pbl_vbase) + 
pci_free_consistent(nesdev->pcidev, 4096, nesfmr->root_vpbl.pbl_vbase, + nesfmr->root_vpbl.pbl_pbase); + } else { + for (i = 0; i < nesfmr->leaf_pbl_cnt; i++) { + pci_free_consistent(nesdev->pcidev, 4096, nesfmr->root_vpbl.leaf_vpbl[i].pbl_vbase, + nesfmr->root_vpbl.leaf_vpbl[i].pbl_pbase); + } + kfree(nesfmr->root_vpbl.leaf_vpbl); + pci_free_consistent(nesdev->pcidev, 8192, nesfmr->root_vpbl.pbl_vbase, + nesfmr->root_vpbl.pbl_pbase); + } + + return nes_dealloc_mw(&temp_nesmr.ibmw); +} + + +/** + * nes_map_phys_fmr + */ +static int nes_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, + int list_len, u64 iova) +{ + return 0; +} + + +/** + * nes_unmap_frm + */ +static int nes_unmap_fmr(struct list_head *ibfmr_list) +{ + return 0; +} + + + +/** + * nes_query_device + */ +static int nes_query_device(struct ib_device *ibdev, struct ib_device_attr *props) +{ + struct nes_vnic *nesvnic = to_nesvnic(ibdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_ib_device *nesibdev = nesvnic->nesibdev; + + memset(props, 0, sizeof(*props)); + memcpy(&props->sys_image_guid, nesvnic->netdev->dev_addr, 6); + + props->fw_ver = nesdev->nesadapter->fw_ver; + props->device_cap_flags = nesdev->nesadapter->device_cap_flags; + props->vendor_id = nesdev->nesadapter->vendor_id; + props->vendor_part_id = nesdev->nesadapter->vendor_part_id; + props->hw_ver = nesdev->nesadapter->hw_rev; + props->max_mr_size = 0x80000000; + props->max_qp = nesibdev->max_qp; + props->max_qp_wr = nesdev->nesadapter->max_qp_wr - 2; + props->max_sge = nesdev->nesadapter->max_sge; + props->max_cq = nesibdev->max_cq; + props->max_cqe = nesdev->nesadapter->max_cqe - 1; + props->max_mr = nesibdev->max_mr; + props->max_mw = nesibdev->max_mr; + props->max_pd = nesibdev->max_pd; + props->max_sge_rd = 1; + switch (nesdev->nesadapter->max_irrq_wr) { + case 0: + props->max_qp_rd_atom = 1; + break; + case 1: + props->max_qp_rd_atom = 4; + break; + case 2: + props->max_qp_rd_atom = 16; + break; + case 3: + 
props->max_qp_rd_atom = 32; + break; + default: + props->max_qp_rd_atom = 0; + } + props->max_qp_init_rd_atom = props->max_qp_wr; + props->atomic_cap = IB_ATOMIC_NONE; + props->max_map_per_fmr = 1; + + return 0; +} + + +/** + * nes_query_port + */ +static int nes_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props) +{ + memset(props, 0, sizeof(*props)); + + props->max_mtu = IB_MTU_2048; + props->active_mtu = IB_MTU_2048; + props->lid = 1; + props->lmc = 0; + props->sm_lid = 0; + props->sm_sl = 0; + props->state = IB_PORT_ACTIVE; + props->phys_state = 0; + props->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_REINIT_SUP | + IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP; + props->gid_tbl_len = 1; + props->pkey_tbl_len = 1; + props->qkey_viol_cntr = 0; + props->active_width = IB_WIDTH_4X; + props->active_speed = 1; + props->max_msg_sz = 0x80000000; + + return 0; +} + + +/** + * nes_modify_port + */ +static int nes_modify_port(struct ib_device *ibdev, u8 port, + int port_modify_mask, struct ib_port_modify *props) +{ + return 0; +} + + +/** + * nes_query_pkey + */ +static int nes_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey) +{ + *pkey = 0; + return 0; +} + + +/** + * nes_query_gid + */ +static int nes_query_gid(struct ib_device *ibdev, u8 port, + int index, union ib_gid *gid) +{ + struct nes_vnic *nesvnic = to_nesvnic(ibdev); + + memset(&(gid->raw[0]), 0, sizeof(gid->raw)); + memcpy(&(gid->raw[0]), nesvnic->netdev->dev_addr, 6); + + return 0; +} + + +/** + * nes_alloc_ucontext - Allocate the user context data structure. This keeps track + * of all objects associated with a particular user-mode client. 
+ */ +static struct ib_ucontext *nes_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + struct nes_vnic *nesvnic = to_nesvnic(ibdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_alloc_ucontext_req req; + struct nes_alloc_ucontext_resp uresp; + struct nes_ucontext *nes_ucontext; + struct nes_ib_device *nesibdev = nesvnic->nesibdev; + + + if (ib_copy_from_udata(&req, udata, sizeof(struct nes_alloc_ucontext_req))) { + printk(KERN_ERR PFX "Invalid structure size on allocate user context.\n"); + return ERR_PTR(-EINVAL); + } + + if (req.userspace_ver != NES_ABI_USERSPACE_VER) { + printk(KERN_ERR PFX "Invalid userspace driver version detected. Detected version %d, should be %d\n", + req.userspace_ver, NES_ABI_USERSPACE_VER); + return ERR_PTR(-EINVAL); + } + + + memset(&uresp, 0, sizeof uresp); + + uresp.max_qps = nesibdev->max_qp; + uresp.max_pds = nesibdev->max_pd; + uresp.wq_size = nesdev->nesadapter->max_qp_wr * 2; + uresp.virtwq = nesadapter->virtwq; + uresp.kernel_ver = NES_ABI_KERNEL_VER; + + nes_ucontext = kzalloc(sizeof *nes_ucontext, GFP_KERNEL); + if (!nes_ucontext) + return ERR_PTR(-ENOMEM); + + nes_ucontext->nesdev = nesdev; + nes_ucontext->mmap_wq_offset = uresp.max_pds; + nes_ucontext->mmap_cq_offset = nes_ucontext->mmap_wq_offset + + ((sizeof(struct nes_hw_qp_wqe) * uresp.max_qps * 2) + PAGE_SIZE-1) / + PAGE_SIZE; + + + if (ib_copy_to_udata(udata, &uresp, sizeof uresp)) { + kfree(nes_ucontext); + return ERR_PTR(-EFAULT); + } + + INIT_LIST_HEAD(&nes_ucontext->cq_reg_mem_list); + INIT_LIST_HEAD(&nes_ucontext->qp_reg_mem_list); + atomic_set(&nes_ucontext->usecnt, 1); + return &nes_ucontext->ibucontext; +} + + +/** + * nes_dealloc_ucontext + */ +static int nes_dealloc_ucontext(struct ib_ucontext *context) +{ + /* struct nes_vnic *nesvnic = to_nesvnic(context->device); */ + /* struct nes_device *nesdev = nesvnic->nesdev; */ + struct nes_ucontext *nes_ucontext = 
to_nesucontext(context); + + if (!atomic_dec_and_test(&nes_ucontext->usecnt)) + return 0; + kfree(nes_ucontext); + return 0; +} + + +/** + * nes_mmap + */ +static int nes_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) +{ + unsigned long index; + struct nes_vnic *nesvnic = to_nesvnic(context->device); + struct nes_device *nesdev = nesvnic->nesdev; + /* struct nes_adapter *nesadapter = nesdev->nesadapter; */ + struct nes_ucontext *nes_ucontext; + struct nes_qp *nesqp; + + nes_ucontext = to_nesucontext(context); + + + if (vma->vm_pgoff >= nes_ucontext->mmap_wq_offset) { + index = (vma->vm_pgoff - nes_ucontext->mmap_wq_offset) * PAGE_SIZE; + index /= ((sizeof(struct nes_hw_qp_wqe) * nesdev->nesadapter->max_qp_wr * 2) + + PAGE_SIZE-1) & (~(PAGE_SIZE-1)); + if (!test_bit(index, nes_ucontext->allocated_wqs)) { + nes_debug(NES_DBG_MMAP, "wq %lu not allocated\n", index); + return -EFAULT; + } + nesqp = nes_ucontext->mmap_nesqp[index]; + if (nesqp == NULL) { + nes_debug(NES_DBG_MMAP, "wq %lu has a NULL QP base.\n", index); + return -EFAULT; + } + if (remap_pfn_range(vma, vma->vm_start, + virt_to_phys(nesqp->hwqp.sq_vbase) >> PAGE_SHIFT, + vma->vm_end - vma->vm_start, + vma->vm_page_prot)) { + nes_debug(NES_DBG_MMAP, "remap_pfn_range failed.\n"); + return -EAGAIN; + } + vma->vm_private_data = nesqp; + return 0; + } else { + index = vma->vm_pgoff; + if (!test_bit(index, nes_ucontext->allocated_doorbells)) + return -EFAULT; + + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + if (io_remap_pfn_range(vma, vma->vm_start, + (nesdev->doorbell_start + + ((nes_ucontext->mmap_db_index[index] - nesdev->base_doorbell_index) * 4096)) + >> PAGE_SHIFT, PAGE_SIZE, vma->vm_page_prot)) + return -EAGAIN; + vma->vm_private_data = nes_ucontext; + return 0; + } + + return -ENOSYS; +} + + +/** + * nes_alloc_pd + */ +static struct ib_pd *nes_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, struct ib_udata *udata) +{ + struct nes_pd *nespd; + struct nes_vnic 
*nesvnic = to_nesvnic(ibdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_ucontext *nesucontext; + struct nes_alloc_pd_resp uresp; + u32 pd_num = 0; + int err; + + nes_debug(NES_DBG_PD, "nesvnic=%p, netdev=%p %s, ibdev=%p, context=%p, netdev refcnt=%u\n", + nesvnic, nesdev->netdev[0], nesdev->netdev[0]->name, ibdev, context, + atomic_read(&nesvnic->netdev->refcnt)); + + err = nes_alloc_resource(nesadapter, nesadapter->allocated_pds, + nesadapter->max_pd, &pd_num, &nesadapter->next_pd); + if (err) { + return ERR_PTR(err); + } + + nespd = kzalloc(sizeof (struct nes_pd), GFP_KERNEL); + if (!nespd) { + nes_free_resource(nesadapter, nesadapter->allocated_pds, pd_num); + return ERR_PTR(-ENOMEM); + } + + nes_debug(NES_DBG_PD, "Allocating PD (%p) for ib device %s\n", + nespd, nesvnic->nesibdev->ibdev.name); + + nespd->pd_id = (pd_num << (PAGE_SHIFT-12)) + nesadapter->base_pd; + + if (context) { + nesucontext = to_nesucontext(context); + nespd->mmap_db_index = find_next_zero_bit(nesucontext->allocated_doorbells, + NES_MAX_USER_DB_REGIONS, nesucontext->first_free_db); + nes_debug(NES_DBG_PD, "find_first_zero_biton doorbells returned %u, mapping pd_id %u.\n", + nespd->mmap_db_index, nespd->pd_id); + if (nespd->mmap_db_index > NES_MAX_USER_DB_REGIONS) { + nes_debug(NES_DBG_PD, "mmap_db_index > MAX\n"); + nes_free_resource(nesadapter, nesadapter->allocated_pds, pd_num); + kfree(nespd); + return ERR_PTR(-ENOMEM); + } + + uresp.pd_id = nespd->pd_id; + uresp.mmap_db_index = nespd->mmap_db_index; + if (ib_copy_to_udata(udata, &uresp, sizeof (struct nes_alloc_pd_resp))) { + nes_free_resource(nesadapter, nesadapter->allocated_pds, pd_num); + kfree(nespd); + return ERR_PTR(-EFAULT); + } + + set_bit(nespd->mmap_db_index, nesucontext->allocated_doorbells); + nesucontext->mmap_db_index[nespd->mmap_db_index] = nespd->pd_id; + nesucontext->first_free_db = nespd->mmap_db_index + 1; + } + + nes_debug(NES_DBG_PD, "PD%u 
structure located @%p.\n", nespd->pd_id, nespd); + return &nespd->ibpd; +} + + +/** + * nes_dealloc_pd + */ +static int nes_dealloc_pd(struct ib_pd *ibpd) +{ + struct nes_ucontext *nesucontext; + struct nes_pd *nespd = to_nespd(ibpd); + struct nes_vnic *nesvnic = to_nesvnic(ibpd->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + + if ((ibpd->uobject) && (ibpd->uobject->context)) { + nesucontext = to_nesucontext(ibpd->uobject->context); + nes_debug(NES_DBG_PD, "Clearing bit %u from allocated doorbells\n", + nespd->mmap_db_index); + clear_bit(nespd->mmap_db_index, nesucontext->allocated_doorbells); + nesucontext->mmap_db_index[nespd->mmap_db_index] = 0; + if (nesucontext->first_free_db > nespd->mmap_db_index) { + nesucontext->first_free_db = nespd->mmap_db_index; + } + } + + nes_debug(NES_DBG_PD, "Deallocating PD%u structure located @%p.\n", + nespd->pd_id, nespd); + nes_free_resource(nesadapter, nesadapter->allocated_pds, + (nespd->pd_id-nesadapter->base_pd)>>(PAGE_SHIFT-12)); + kfree(nespd); + + return 0; +} + + +/** + * nes_create_ah + */ +static struct ib_ah *nes_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr) +{ + return ERR_PTR(-ENOSYS); +} + + +/** + * nes_destroy_ah + */ +static int nes_destroy_ah(struct ib_ah *ah) +{ + return -ENOSYS; +} + + +/** + * nes_get_encoded_size + */ +static inline u8 nes_get_encoded_size(int *size) +{ + u8 encoded_size = 0; + if (*size <= 32) { + *size = 32; + encoded_size = 1; + } else if (*size <= 128) { + *size = 128; + encoded_size = 2; + } else if (*size <= 512) { + *size = 512; + encoded_size = 3; + } + return (encoded_size); +} + + + +/** + * nes_setup_virt_qp + */ +static int nes_setup_virt_qp(struct nes_qp *nesqp, struct nes_pbl *nespbl, + struct nes_vnic *nesvnic, int sq_size, int rq_size) +{ + unsigned long flags; + void *mem; + __le64 *pbl = NULL; + __le64 *tpbl; + __le64 *pblbuffer; + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter 
*nesadapter = nesdev->nesadapter; + u32 pbl_entries; + u8 rq_pbl_entries; + u8 sq_pbl_entries; + + pbl_entries = nespbl->pbl_size >> 3; + nes_debug(NES_DBG_QP, "Userspace PBL, pbl_size=%u, pbl_entries = %d pbl_vbase=%p, pbl_pbase=%p\n", + nespbl->pbl_size, pbl_entries, + (void *)nespbl->pbl_vbase, + (void *)nespbl->pbl_pbase); + pbl = (__le64 *) nespbl->pbl_vbase; /* points to first pbl entry */ + /* now lets set the sq_vbase as well as rq_vbase addrs we will assign */ + /* the first pbl to be fro the rq_vbase... */ + rq_pbl_entries = (rq_size * sizeof(struct nes_hw_qp_wqe)) >> 12; + sq_pbl_entries = (sq_size * sizeof(struct nes_hw_qp_wqe)) >> 12; + nesqp->hwqp.sq_pbase = (le32_to_cpu(((__le32 *)pbl)[0])) | ((u64)((le32_to_cpu(((__le32 *)pbl)[1]))) << 32); + if (!nespbl->page) { + nes_debug(NES_DBG_QP, "QP nespbl->page is NULL \n"); + kfree(nespbl); + return -ENOMEM; + } + + nesqp->hwqp.sq_vbase = kmap(nespbl->page); + nesqp->page = nespbl->page; + if (!nesqp->hwqp.sq_vbase) { + nes_debug(NES_DBG_QP, "QP sq_vbase kmap failed\n"); + kfree(nespbl); + return -ENOMEM; + } + + /* Now to get to sq.. we need to calculate how many */ + /* PBL entries were used by the rq.. 
*/ + pbl += sq_pbl_entries; + nesqp->hwqp.rq_pbase = (le32_to_cpu(((__le32 *)pbl)[0])) | ((u64)((le32_to_cpu(((__le32 *)pbl)[1]))) << 32); + /* nesqp->hwqp.rq_vbase = bus_to_virt(*pbl); */ + /*nesqp->hwqp.rq_vbase = phys_to_virt(*pbl); */ + + nes_debug(NES_DBG_QP, "QP sq_vbase= %p sq_pbase=%p rq_vbase=%p rq_pbase=%p\n", + nesqp->hwqp.sq_vbase, (void *)nesqp->hwqp.sq_pbase, + nesqp->hwqp.rq_vbase, (void *)nesqp->hwqp.rq_pbase); + spin_lock_irqsave(&nesadapter->pbl_lock, flags); + if (!nesadapter->free_256pbl) { + pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase, + nespbl->pbl_pbase); + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + kunmap(nesqp->page); + kfree(nespbl); + return -ENOMEM; + } + nesadapter->free_256pbl--; + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + + nesqp->pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 256, &nesqp->pbl_pbase); + pblbuffer = nesqp->pbl_vbase; + if (!nesqp->pbl_vbase) { + /* memory allocated during nes_reg_user_mr() */ + pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase, + nespbl->pbl_pbase); + kfree(nespbl); + spin_lock_irqsave(&nesadapter->pbl_lock, flags); + nesadapter->free_256pbl++; + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + kunmap(nesqp->page); + return -ENOMEM; + } + memset(nesqp->pbl_vbase, 0, 256); + /* fill in the page address in the pbl buffer.. 
*/ + tpbl = pblbuffer + 16; + pbl = (__le64 *)nespbl->pbl_vbase; + while (sq_pbl_entries--) + *tpbl++ = *pbl++; + tpbl = pblbuffer; + while (rq_pbl_entries--) + *tpbl++ = *pbl++; + + /* done with memory allocated during nes_reg_user_mr() */ + pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase, + nespbl->pbl_pbase); + kfree(nespbl); + + nesqp->qp_mem_size = + max((u32)sizeof(struct nes_qp_context), ((u32)256)) + 256; /* this is Q2 */ + /* Round up to a multiple of a page */ + nesqp->qp_mem_size += PAGE_SIZE - 1; + nesqp->qp_mem_size &= ~(PAGE_SIZE - 1); + + mem = pci_alloc_consistent(nesdev->pcidev, nesqp->qp_mem_size, + &nesqp->hwqp.q2_pbase); + + if (!mem) { + pci_free_consistent(nesdev->pcidev, 256, nesqp->pbl_vbase, nesqp->pbl_pbase); + nesqp->pbl_vbase = NULL; + spin_lock_irqsave(&nesadapter->pbl_lock, flags); + nesadapter->free_256pbl++; + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + kunmap(nesqp->page); + return -ENOMEM; + } + nesqp->hwqp.q2_vbase = mem; + mem += 256; + memset(nesqp->hwqp.q2_vbase, 0, 256); + nesqp->nesqp_context = mem; + memset(nesqp->nesqp_context, 0, sizeof(*nesqp->nesqp_context)); + nesqp->nesqp_context_pbase = nesqp->hwqp.q2_pbase + 256; + + return 0; +} + + +/** + * nes_setup_mmap_qp + */ +static int nes_setup_mmap_qp(struct nes_qp *nesqp, struct nes_vnic *nesvnic, + int sq_size, int rq_size) +{ + void *mem; + struct nes_device *nesdev = nesvnic->nesdev; + + nesqp->qp_mem_size = (sizeof(struct nes_hw_qp_wqe) * sq_size) + + (sizeof(struct nes_hw_qp_wqe) * rq_size) + + max((u32)sizeof(struct nes_qp_context), ((u32)256)) + + 256; /* this is Q2 */ + /* Round up to a multiple of a page */ + nesqp->qp_mem_size += PAGE_SIZE - 1; + nesqp->qp_mem_size &= ~(PAGE_SIZE - 1); + + mem = pci_alloc_consistent(nesdev->pcidev, nesqp->qp_mem_size, + &nesqp->hwqp.sq_pbase); + if (!mem) + return -ENOMEM; + nes_debug(NES_DBG_QP, "PCI consistent memory for " + "host descriptor rings located @ %p (pa = 0x%08lX.) 
size = %u.\n", + mem, (unsigned long)nesqp->hwqp.sq_pbase, nesqp->qp_mem_size); + + memset(mem, 0, nesqp->qp_mem_size); + + nesqp->hwqp.sq_vbase = mem; + mem += sizeof(struct nes_hw_qp_wqe) * sq_size; + + nesqp->hwqp.rq_vbase = mem; + nesqp->hwqp.rq_pbase = nesqp->hwqp.sq_pbase + + sizeof(struct nes_hw_qp_wqe) * sq_size; + mem += sizeof(struct nes_hw_qp_wqe) * rq_size; + + nesqp->hwqp.q2_vbase = mem; + nesqp->hwqp.q2_pbase = nesqp->hwqp.rq_pbase + + sizeof(struct nes_hw_qp_wqe) * rq_size; + mem += 256; + memset(nesqp->hwqp.q2_vbase, 0, 256); + + nesqp->nesqp_context = mem; + nesqp->nesqp_context_pbase = nesqp->hwqp.q2_pbase + 256; + memset(nesqp->nesqp_context, 0, sizeof(*nesqp->nesqp_context)); + return 0; +} + + +/** + * nes_free_qp_mem() is to free up the qp's pci_alloc_consistent() memory. + */ +static inline void nes_free_qp_mem(struct nes_device *nesdev, + struct nes_qp *nesqp, int virt_wqs) +{ + unsigned long flags; + struct nes_adapter *nesadapter = nesdev->nesadapter; + if (!virt_wqs) { + pci_free_consistent(nesdev->pcidev, nesqp->qp_mem_size, + nesqp->hwqp.sq_vbase, nesqp->hwqp.sq_pbase); + }else { + spin_lock_irqsave(&nesadapter->pbl_lock, flags); + nesadapter->free_256pbl++; + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + pci_free_consistent(nesdev->pcidev, nesqp->qp_mem_size, nesqp->hwqp.q2_vbase, nesqp->hwqp.q2_pbase); + pci_free_consistent(nesdev->pcidev, 256, nesqp->pbl_vbase, nesqp->pbl_pbase ); + nesqp->pbl_vbase = NULL; + kunmap(nesqp->page); + } +} + + +/** + * nes_create_qp + */ +static struct ib_qp *nes_create_qp(struct ib_pd *ibpd, + struct ib_qp_init_attr *init_attr, struct ib_udata *udata) +{ + u64 u64temp= 0; + u64 u64nesqp = 0; + struct nes_pd *nespd = to_nespd(ibpd); + struct nes_vnic *nesvnic = to_nesvnic(ibpd->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_qp *nesqp; + struct nes_cq *nescq; + struct nes_ucontext *nes_ucontext; + struct nes_hw_cqp_wqe 
*cqp_wqe; + struct nes_cqp_request *cqp_request; + struct nes_create_qp_req req; + struct nes_create_qp_resp uresp; + struct nes_pbl *nespbl = NULL; + u32 qp_num = 0; + u32 opcode = 0; + /* u32 counter = 0; */ + void *mem; + unsigned long flags; + int ret; + int err; + int virt_wqs = 0; + int sq_size; + int rq_size; + u8 sq_encoded_size; + u8 rq_encoded_size; + /* int counter; */ + + atomic_inc(&qps_created); + switch (init_attr->qp_type) { + case IB_QPT_RC: + if (nes_drv_opt & NES_DRV_OPT_NO_INLINE_DATA) { + init_attr->cap.max_inline_data = 0; + } else { + init_attr->cap.max_inline_data = 64; + } + sq_size = init_attr->cap.max_send_wr; + rq_size = init_attr->cap.max_recv_wr; + + // check if the encoded sizes are OK or not... + sq_encoded_size = nes_get_encoded_size(&sq_size); + rq_encoded_size = nes_get_encoded_size(&rq_size); + + if ((!sq_encoded_size) || (!rq_encoded_size)) { + nes_debug(NES_DBG_QP, "ERROR bad rq (%u) or sq (%u) size\n", + rq_size, sq_size); + return ERR_PTR(-EINVAL); + } + + init_attr->cap.max_send_wr = sq_size -2; + init_attr->cap.max_recv_wr = rq_size -1; + nes_debug(NES_DBG_QP, "RQ size=%u, SQ Size=%u\n", rq_size, sq_size); + + ret = nes_alloc_resource(nesadapter, nesadapter->allocated_qps, + nesadapter->max_qp, &qp_num, &nesadapter->next_qp); + if (ret) { + return ERR_PTR(ret); + } + + /* Need 512 (actually now 1024) byte alignment on this structure */ + mem = kzalloc(sizeof(*nesqp)+NES_SW_CONTEXT_ALIGN-1, GFP_KERNEL); + if (!mem) { + nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num); + nes_debug(NES_DBG_QP, "Unable to allocate QP\n"); + return ERR_PTR(-ENOMEM); + } + u64nesqp = (unsigned long)mem; + u64nesqp += ((u64)NES_SW_CONTEXT_ALIGN) - 1; + u64temp = ((u64)NES_SW_CONTEXT_ALIGN) - 1; + u64nesqp &= ~u64temp; + nesqp = (struct nes_qp *)(unsigned long)u64nesqp; + /* nes_debug(NES_DBG_QP, "nesqp=%p, allocated buffer=%p. 
Rounded to closest %u\n", + nesqp, mem, NES_SW_CONTEXT_ALIGN); */ + nesqp->allocated_buffer = mem; + + if (udata) { + if (ib_copy_from_udata(&req, udata, sizeof(struct nes_create_qp_req))) { + nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num); + kfree(nesqp->allocated_buffer); + nes_debug(NES_DBG_QP, "ib_copy_from_udata() Failed \n"); + return NULL; + } + if (req.user_wqe_buffers) { + virt_wqs = 1; + } + if ((ibpd->uobject) && (ibpd->uobject->context)) { + nesqp->user_mode = 1; + nes_ucontext = to_nesucontext(ibpd->uobject->context); + if (virt_wqs) { + err = 1; + list_for_each_entry(nespbl, &nes_ucontext->qp_reg_mem_list, list) { + if (nespbl->user_base == (unsigned long )req.user_wqe_buffers) { + list_del(&nespbl->list); + err = 0; + nes_debug(NES_DBG_QP, "Found PBL for virtual QP. nespbl=%p. user_base=0x%lx\n", + nespbl, nespbl->user_base); + break; + } + } + if (err) { + nes_debug(NES_DBG_QP, "Didn't Find PBL for virtual QP. address = %llx.\n", + (long long unsigned int)req.user_wqe_buffers); + nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num); + kfree(nesqp->allocated_buffer); + return ERR_PTR(-ENOMEM); + } + } + + nes_ucontext = to_nesucontext(ibpd->uobject->context); + nesqp->mmap_sq_db_index = + find_next_zero_bit(nes_ucontext->allocated_wqs, + NES_MAX_USER_WQ_REGIONS, nes_ucontext->first_free_wq); + /* nes_debug(NES_DBG_QP, "find_first_zero_biton wqs returned %u\n", + nespd->mmap_db_index); */ + if (nesqp->mmap_sq_db_index > NES_MAX_USER_WQ_REGIONS) { + nes_debug(NES_DBG_QP, + "db index > max user regions, failing create QP\n"); + nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num); + if (virt_wqs) { + pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase, + nespbl->pbl_pbase); + kfree(nespbl); + } + kfree(nesqp->allocated_buffer); + return ERR_PTR(-ENOMEM); + } + set_bit(nesqp->mmap_sq_db_index, nes_ucontext->allocated_wqs); + nes_ucontext->mmap_nesqp[nesqp->mmap_sq_db_index] = nesqp; + 
nes_ucontext->first_free_wq = nesqp->mmap_sq_db_index + 1; + } else { + nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num); + kfree(nesqp->allocated_buffer); + return ERR_PTR(-EFAULT); + } + } + err = (!virt_wqs) ? nes_setup_mmap_qp(nesqp, nesvnic, sq_size, rq_size) : + nes_setup_virt_qp(nesqp, nespbl, nesvnic, sq_size, rq_size); + if (err) { + nes_debug(NES_DBG_QP, + "error geting qp mem code = %d\n", err); + nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num); + kfree(nesqp->allocated_buffer); + return ERR_PTR(-ENOMEM); + } + + nesqp->hwqp.sq_size = sq_size; + nesqp->hwqp.sq_encoded_size = sq_encoded_size; + nesqp->hwqp.sq_head = 1; + nesqp->hwqp.rq_size = rq_size; + nesqp->hwqp.rq_encoded_size = rq_encoded_size; + /* nes_debug(NES_DBG_QP, "nesqp->nesqp_context_pbase = %p\n", + (void *)nesqp->nesqp_context_pbase); + */ + nesqp->hwqp.qp_id = qp_num; + nesqp->ibqp.qp_num = nesqp->hwqp.qp_id; + nesqp->nespd = nespd; + + nescq = to_nescq(init_attr->send_cq); + nesqp->nesscq = nescq; + nescq = to_nescq(init_attr->recv_cq); + nesqp->nesrcq = nescq; + + nesqp->nesqp_context->misc |= cpu_to_le32((u32)PCI_FUNC(nesdev->pcidev->devfn) << + NES_QPCONTEXT_MISC_PCI_FCN_SHIFT); + nesqp->nesqp_context->misc |= cpu_to_le32((u32)nesqp->hwqp.rq_encoded_size << + NES_QPCONTEXT_MISC_RQ_SIZE_SHIFT); + nesqp->nesqp_context->misc |= cpu_to_le32((u32)nesqp->hwqp.sq_encoded_size << + NES_QPCONTEXT_MISC_SQ_SIZE_SHIFT); + nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_PRIV_EN); + nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_FAST_REGISTER_EN); + nesqp->nesqp_context->cqs = cpu_to_le32(nesqp->nesscq->hw_cq.cq_number + + ((u32)nesqp->nesrcq->hw_cq.cq_number << 16)); + u64temp = (u64)nesqp->hwqp.sq_pbase; + nesqp->nesqp_context->sq_addr_low = cpu_to_le32((u32)u64temp); + nesqp->nesqp_context->sq_addr_high = cpu_to_le32((u32)(u64temp >> 32)); + + + if (!virt_wqs) { + u64temp = (u64)nesqp->hwqp.sq_pbase; + nesqp->nesqp_context->sq_addr_low 
= cpu_to_le32((u32)u64temp); + nesqp->nesqp_context->sq_addr_high = cpu_to_le32((u32)(u64temp >> 32)); + u64temp = (u64)nesqp->hwqp.rq_pbase; + nesqp->nesqp_context->rq_addr_low = cpu_to_le32((u32)u64temp); + nesqp->nesqp_context->rq_addr_high = cpu_to_le32((u32)(u64temp >> 32)); + } else { + u64temp = (u64)nesqp->pbl_pbase; + nesqp->nesqp_context->rq_addr_low = cpu_to_le32((u32)u64temp); + nesqp->nesqp_context->rq_addr_high = cpu_to_le32((u32)(u64temp >> 32)); + } + + /* nes_debug(NES_DBG_QP, "next_qp_nic_index=%u, using nic_index=%d\n", + nesvnic->next_qp_nic_index, + nesvnic->qp_nic_index[nesvnic->next_qp_nic_index]); */ + spin_lock_irqsave(&nesdev->cqp.lock, flags); + nesqp->nesqp_context->misc2 |= cpu_to_le32( + (u32)nesvnic->qp_nic_index[nesvnic->next_qp_nic_index] << + NES_QPCONTEXT_MISC2_NIC_INDEX_SHIFT); + nesvnic->next_qp_nic_index++; + if ((nesvnic->next_qp_nic_index > 3) || + (nesvnic->qp_nic_index[nesvnic->next_qp_nic_index] == 0xf)) { + nesvnic->next_qp_nic_index = 0; + } + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + + nesqp->nesqp_context->pd_index_wscale |= cpu_to_le32((u32)nesqp->nespd->pd_id << 16); + u64temp = (u64)nesqp->hwqp.q2_pbase; + nesqp->nesqp_context->q2_addr_low = cpu_to_le32((u32)u64temp); + nesqp->nesqp_context->q2_addr_high = cpu_to_le32((u32)(u64temp >> 32)); + nesqp->nesqp_context->aeq_token_low = cpu_to_le32((u32)((unsigned long)(nesqp))); + nesqp->nesqp_context->aeq_token_high = cpu_to_le32((u32)(upper_32_bits((unsigned long)(nesqp)))); + nesqp->nesqp_context->ird_ord_sizes = cpu_to_le32(NES_QPCONTEXT_ORDIRD_ALSMM | + ((((u32)nesadapter->max_irrq_wr) << + NES_QPCONTEXT_ORDIRD_IRDSIZE_SHIFT) & NES_QPCONTEXT_ORDIRD_IRDSIZE_MASK)); + if (disable_mpa_crc) { + nes_debug(NES_DBG_QP, "Disabling MPA crc checking due to module option.\n"); + nesqp->nesqp_context->ird_ord_sizes |= cpu_to_le32(NES_QPCONTEXT_ORDIRD_RNMC); + } + + + /* Create the QP */ + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + 
nes_debug(NES_DBG_QP, "Failed to get a cqp_request\n"); + nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num); + nes_free_qp_mem(nesdev, nesqp,virt_wqs); + kfree(nesqp->allocated_buffer); + return ERR_PTR(-ENOMEM); + } + cqp_request->waiting = 1; + cqp_wqe = &cqp_request->cqp_wqe; + + if (!virt_wqs) { + opcode = NES_CQP_CREATE_QP | NES_CQP_QP_TYPE_IWARP | + NES_CQP_QP_IWARP_STATE_IDLE; + } else { + opcode = NES_CQP_CREATE_QP | NES_CQP_QP_TYPE_IWARP | NES_CQP_QP_VIRT_WQS | + NES_CQP_QP_IWARP_STATE_IDLE; + } + opcode |= NES_CQP_QP_CQS_VALID; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, nesqp->hwqp.qp_id); + + u64temp = (u64)nesqp->nesqp_context_pbase; + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp); + + atomic_set(&cqp_request->refcount, 2); + nes_post_cqp_request(nesdev, cqp_request, NES_CQP_REQUEST_RING_DOORBELL); + + /* Wait for CQP */ + nes_debug(NES_DBG_QP, "Waiting for create iWARP QP%u to complete.\n", + nesqp->hwqp.qp_id); + ret = wait_event_timeout(cqp_request->waitq, + (cqp_request->request_done != 0), NES_EVENT_TIMEOUT); + nes_debug(NES_DBG_QP, "Create iwarp QP%u completed, wait_event_timeout ret=%u," + " nesdev->cqp_head = %u, nesdev->cqp.sq_tail = %u," + " CQP Major:Minor codes = 0x%04X:0x%04X.\n", + nesqp->hwqp.qp_id, ret, nesdev->cqp.sq_head, nesdev->cqp.sq_tail, + cqp_request->major_code, cqp_request->minor_code); + if ((!ret) || (cqp_request->major_code)) { + if (atomic_dec_and_test(&cqp_request->refcount)) { + if (cqp_request->dynamic) { + kfree(cqp_request); + } else { + spin_lock_irqsave(&nesdev->cqp.lock, flags); + list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs); + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + } + } + nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num); + nes_free_qp_mem(nesdev, nesqp,virt_wqs); + 
kfree(nesqp->allocated_buffer); + if (!ret) { + return ERR_PTR(-ETIME); + } else { + return ERR_PTR(-EIO); + } + } else { + if (atomic_dec_and_test(&cqp_request->refcount)) { + if (cqp_request->dynamic) { + kfree(cqp_request); + } else { + spin_lock_irqsave(&nesdev->cqp.lock, flags); + list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs); + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + } + } + } + + if (ibpd->uobject) { + uresp.mmap_sq_db_index = nesqp->mmap_sq_db_index; + uresp.actual_sq_size = sq_size; + uresp.actual_rq_size = rq_size; + uresp.qp_id = nesqp->hwqp.qp_id; + uresp.nes_drv_opt = nes_drv_opt; + if (ib_copy_to_udata(udata, &uresp, sizeof uresp)) { + nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num); + nes_free_qp_mem(nesdev, nesqp,virt_wqs); + kfree(nesqp->allocated_buffer); + return ERR_PTR(-EFAULT); + } + } + + nes_debug(NES_DBG_QP, "QP%u structure located @%p.Size = %u.\n", + nesqp->hwqp.qp_id, nesqp, (u32)sizeof(*nesqp)); + spin_lock_init(&nesqp->lock); + init_waitqueue_head(&nesqp->state_waitq); + init_waitqueue_head(&nesqp->kick_waitq); + nes_add_ref(&nesqp->ibqp); + break; + default: + nes_debug(NES_DBG_QP, "Invalid QP type: %d\n", init_attr->qp_type); + return ERR_PTR(-EINVAL); + break; + } + + /* update the QP table */ + nesdev->nesadapter->qp_table[nesqp->hwqp.qp_id-NES_FIRST_QPN] = nesqp; + nes_debug(NES_DBG_QP, "netdev refcnt=%u\n", + atomic_read(&nesvnic->netdev->refcnt)); + + return &nesqp->ibqp; +} + + +/** + * nes_destroy_qp + */ +static int nes_destroy_qp(struct ib_qp *ibqp) +{ + struct nes_qp *nesqp = to_nesqp(ibqp); + /* struct nes_vnic *nesvnic = to_nesvnic(ibqp->device); */ + struct nes_ucontext *nes_ucontext; + struct ib_qp_attr attr; + struct iw_cm_id *cm_id; + struct iw_cm_event cm_event; + int ret; + + atomic_inc(&sw_qps_destroyed); + nesqp->destroyed = 1; + + /* Blow away the connection if it exists. 
*/ + if (nesqp->ibqp_state >= IB_QPS_INIT && nesqp->ibqp_state <= IB_QPS_RTS) { + /* if (nesqp->ibqp_state == IB_QPS_RTS) { */ + attr.qp_state = IB_QPS_ERR; + nes_modify_qp(&nesqp->ibqp, &attr, IB_QP_STATE, NULL); + } + + if (((nesqp->ibqp_state == IB_QPS_INIT) || + (nesqp->ibqp_state == IB_QPS_RTR)) && (nesqp->cm_id)) { + cm_id = nesqp->cm_id; + cm_event.event = IW_CM_EVENT_CONNECT_REPLY; + cm_event.status = IW_CM_EVENT_STATUS_TIMEOUT; + cm_event.local_addr = cm_id->local_addr; + cm_event.remote_addr = cm_id->remote_addr; + cm_event.private_data = NULL; + cm_event.private_data_len = 0; + + nes_debug(NES_DBG_QP, "Generating a CM Timeout Event for " + "QP%u. cm_id = %p, refcount = %u. \n", + nesqp->hwqp.qp_id, cm_id, atomic_read(&nesqp->refcount)); + + cm_id->rem_ref(cm_id); + ret = cm_id->event_handler(cm_id, &cm_event); + if (ret) + nes_debug(NES_DBG_QP, "OFA CM event_handler returned, ret=%d\n", ret); + } + + + if (nesqp->user_mode) { + if ((ibqp->uobject)&&(ibqp->uobject->context)) { + nes_ucontext = to_nesucontext(ibqp->uobject->context); + clear_bit(nesqp->mmap_sq_db_index, nes_ucontext->allocated_wqs); + nes_ucontext->mmap_nesqp[nesqp->mmap_sq_db_index] = NULL; + if (nes_ucontext->first_free_wq > nesqp->mmap_sq_db_index) { + nes_ucontext->first_free_wq = nesqp->mmap_sq_db_index; + } + } + if (nesqp->pbl_pbase) + kunmap(nesqp->page); + } + + nes_rem_ref(&nesqp->ibqp); + return 0; +} + + +/** + * nes_create_cq + */ +static struct ib_cq *nes_create_cq(struct ib_device *ibdev, int entries, + int comp_vector, + struct ib_ucontext *context, struct ib_udata *udata) +{ + u64 u64temp; + struct nes_vnic *nesvnic = to_nesvnic(ibdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_cq *nescq; + struct nes_ucontext *nes_ucontext = NULL; + struct nes_cqp_request *cqp_request; + void *mem = NULL; + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_pbl *nespbl = NULL; + struct nes_create_cq_req req; + struct 
nes_create_cq_resp resp; + u32 cq_num = 0; + u32 opcode = 0; + u32 pbl_entries = 1; + int err; + unsigned long flags; + int ret; + + err = nes_alloc_resource(nesadapter, nesadapter->allocated_cqs, + nesadapter->max_cq, &cq_num, &nesadapter->next_cq); + if (err) { + return ERR_PTR(err); + } + + nescq = kzalloc(sizeof(struct nes_cq), GFP_KERNEL); + if (!nescq) { + nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); + nes_debug(NES_DBG_CQ, "Unable to allocate nes_cq struct\n"); + return ERR_PTR(-ENOMEM); + } + + nescq->hw_cq.cq_size = max(entries + 1, 5); + nescq->hw_cq.cq_number = cq_num; + nescq->ibcq.cqe = nescq->hw_cq.cq_size - 1; + + + if (context) { + nes_ucontext = to_nesucontext(context); + if (ib_copy_from_udata(&req, udata, sizeof (struct nes_create_cq_req))) { + nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); + kfree(nescq); + return ERR_PTR(-EFAULT); + } + nesvnic->mcrq_ucontext = nes_ucontext; + nes_ucontext->mcrqf = req.mcrqf; + if (nes_ucontext->mcrqf) { + if (nes_ucontext->mcrqf & 0x80000000) + nescq->hw_cq.cq_number = nesvnic->nic.qp_id + 12 + (nes_ucontext->mcrqf & 0xf) - 1; + else if (nes_ucontext->mcrqf & 0x40000000) + nescq->hw_cq.cq_number = nes_ucontext->mcrqf & 0xffff; + else + nescq->hw_cq.cq_number = nesvnic->mcrq_qp_id + nes_ucontext->mcrqf-1; + nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); + } + nes_debug(NES_DBG_CQ, "CQ Virtual Address = %08lX, size = %u.\n", + (unsigned long)req.user_cq_buffer, entries); + list_for_each_entry(nespbl, &nes_ucontext->cq_reg_mem_list, list) { + if (nespbl->user_base == (unsigned long )req.user_cq_buffer) { + list_del(&nespbl->list); + err = 0; + nes_debug(NES_DBG_CQ, "Found PBL for virtual CQ. 
nespbl=%p.\n", + nespbl); + break; + } + } + if (err) { + nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); + kfree(nescq); + return ERR_PTR(err); + } + + pbl_entries = nespbl->pbl_size >> 3; + nescq->cq_mem_size = 0; + } else { + nescq->cq_mem_size = nescq->hw_cq.cq_size * sizeof(struct nes_hw_cqe); + nes_debug(NES_DBG_CQ, "Attempting to allocate pci memory (%u entries, %u bytes) for CQ%u.\n", + entries, nescq->cq_mem_size, nescq->hw_cq.cq_number); + + /* allocate the physical buffer space */ + mem = pci_alloc_consistent(nesdev->pcidev, nescq->cq_mem_size, + &nescq->hw_cq.cq_pbase); + if (!mem) { + printk(KERN_ERR PFX "Unable to allocate pci memory for cq\n"); + nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); + kfree(nescq); + return ERR_PTR(-ENOMEM); + } + + memset(mem, 0, nescq->cq_mem_size); + nescq->hw_cq.cq_vbase = mem; + nescq->hw_cq.cq_head = 0; + nes_debug(NES_DBG_CQ, "CQ%u virtual address @ %p, phys = 0x%08X\n", + nescq->hw_cq.cq_number, nescq->hw_cq.cq_vbase, + (u32)nescq->hw_cq.cq_pbase); + } + + nescq->hw_cq.ce_handler = nes_iwarp_ce_handler; + spin_lock_init(&nescq->lock); + + /* send CreateCQ request to CQP */ + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + nes_debug(NES_DBG_CQ, "Failed to get a cqp_request.\n"); + if (!context) + pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, mem, + nescq->hw_cq.cq_pbase); + nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); + kfree(nescq); + return ERR_PTR(-ENOMEM); + } + cqp_request->waiting = 1; + cqp_wqe = &cqp_request->cqp_wqe; + + opcode = NES_CQP_CREATE_CQ | NES_CQP_CQ_CEQ_VALID | + NES_CQP_CQ_CHK_OVERFLOW | + NES_CQP_CQ_CEQE_MASK | ((u32)nescq->hw_cq.cq_size << 16); + + spin_lock_irqsave(&nesadapter->pbl_lock, flags); + + if (pbl_entries != 1) { + if (pbl_entries > 32) { + /* use 4k pbl */ + nes_debug(NES_DBG_CQ, "pbl_entries=%u, use a 4k PBL\n", pbl_entries); + if (nesadapter->free_4kpbl == 0) { + if (cqp_request->dynamic) 
{
+						spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+						kfree(cqp_request);
+					} else {
+						/*
+						 * cqp_avail_reqs is modified under nesdev->cqp.lock
+						 * everywhere else in this file, so drop pbl_lock and
+						 * take the matching lock before returning the request.
+						 */
+						spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+						spin_lock_irqsave(&nesdev->cqp.lock, flags);
+						list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
+						spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+					}
+					/* Only kernel-mode CQs own a pci_alloc_consistent() buffer. */
+					if (!context)
+						pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, mem,
+								nescq->hw_cq.cq_pbase);
+					nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
+					kfree(nescq);
+					return ERR_PTR(-ENOMEM);
+				} else {
+					opcode |= (NES_CQP_CQ_VIRT | NES_CQP_CQ_4KB_CHUNK);
+					/* virtual_cq == 2 tells nes_destroy_cq() to give a 4K PBL back. */
+					nescq->virtual_cq = 2;
+					nesadapter->free_4kpbl--;
+				}
+			} else {
+				/* use 256 byte pbl */
+				nes_debug(NES_DBG_CQ, "pbl_entries=%u, use a 256 byte PBL\n", pbl_entries);
+				if (nesadapter->free_256pbl == 0) {
+					if (cqp_request->dynamic) {
+						spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+						kfree(cqp_request);
+					} else {
+						/* Same as the 4K case: return the request under cqp.lock. */
+						spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+						spin_lock_irqsave(&nesdev->cqp.lock, flags);
+						list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
+						spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+					}
+					if (!context)
+						pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, mem,
+								nescq->hw_cq.cq_pbase);
+					nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
+					kfree(nescq);
+					return ERR_PTR(-ENOMEM);
+				} else {
+					opcode |= NES_CQP_CQ_VIRT;
+					/* virtual_cq == 1 tells nes_destroy_cq() to give a 256B PBL back. */
+					nescq->virtual_cq = 1;
+					nesadapter->free_256pbl--;
+				}
+			}
+	}
+
+	spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+
+	/* Build the CreateCQ work request. */
+	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode);
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX,
+			(nescq->hw_cq.cq_number | ((u32)nesdev->ceq_index << 16)));
+
+	if (context) {
+		/*
+		 * User CQ: with more than one PBL entry pass the PBL's own
+		 * address, otherwise pass the single address stored in entry 0.
+		 */
+		if (pbl_entries != 1)
+			u64temp = (u64)nespbl->pbl_pbase;
+		else
+			u64temp = le64_to_cpu(nespbl->pbl_vbase[0]);
+		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX,
+				nes_ucontext->mmap_db_index[0]);
+	} else {
+		u64temp = (u64)nescq->hw_cq.cq_pbase;
+		cqp_wqe->wqe_words[NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX] = 0;
+	}
+	set_wqe_64bit_value(cqp_wqe->wqe_words, 
NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp); + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = 0; + u64temp = (u64)(unsigned long)&nescq->hw_cq; + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_LOW_IDX] = + cpu_to_le32((u32)(u64temp >> 1)); + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = + cpu_to_le32(((u32)((u64temp) >> 33)) & 0x7FFFFFFF); + + atomic_set(&cqp_request->refcount, 2); + nes_post_cqp_request(nesdev, cqp_request, NES_CQP_REQUEST_RING_DOORBELL); + + /* Wait for CQP */ + nes_debug(NES_DBG_CQ, "Waiting for create iWARP CQ%u to complete.\n", + nescq->hw_cq.cq_number); + ret = wait_event_timeout(cqp_request->waitq, (0 != cqp_request->request_done), + NES_EVENT_TIMEOUT * 2); + nes_debug(NES_DBG_CQ, "Create iWARP CQ%u completed, wait_event_timeout ret = %d.\n", + nescq->hw_cq.cq_number, ret); + if ((!ret) || (cqp_request->major_code)) { + if (atomic_dec_and_test(&cqp_request->refcount)) { + if (cqp_request->dynamic) { + kfree(cqp_request); + } else { + spin_lock_irqsave(&nesdev->cqp.lock, flags); + list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs); + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + } + } + nes_debug(NES_DBG_CQ, "iWARP CQ%u create timeout expired, major code = 0x%04X," + " minor code = 0x%04X\n", + nescq->hw_cq.cq_number, cqp_request->major_code, cqp_request->minor_code); + if (!context) + pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, mem, + nescq->hw_cq.cq_pbase); + nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); + kfree(nescq); + return ERR_PTR(-EIO); + } else { + if (atomic_dec_and_test(&cqp_request->refcount)) { + if (cqp_request->dynamic) { + kfree(cqp_request); + } else { + spin_lock_irqsave(&nesdev->cqp.lock, flags); + list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs); + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + } + } + } + + if (context) { + /* free the nespbl */ + pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase, + nespbl->pbl_pbase); + 
kfree(nespbl); + resp.cq_id = nescq->hw_cq.cq_number; + resp.cq_size = nescq->hw_cq.cq_size; + resp.mmap_db_index = 0; + if (ib_copy_to_udata(udata, &resp, sizeof resp)) { + nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); + kfree(nescq); + return ERR_PTR(-EFAULT); + } + } + + return &nescq->ibcq; +} + + +/** + * nes_destroy_cq + */ +static int nes_destroy_cq(struct ib_cq *ib_cq) +{ + struct nes_cq *nescq; + struct nes_device *nesdev; + struct nes_vnic *nesvnic; + struct nes_adapter *nesadapter; + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_cqp_request *cqp_request; + unsigned long flags; + u32 opcode = 0; + int ret; + + if (ib_cq == NULL) + return 0; + + nescq = to_nescq(ib_cq); + nesvnic = to_nesvnic(ib_cq->device); + nesdev = nesvnic->nesdev; + nesadapter = nesdev->nesadapter; + + nes_debug(NES_DBG_CQ, "Destroy CQ%u\n", nescq->hw_cq.cq_number); + + /* Send DestroyCQ request to CQP */ + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + nes_debug(NES_DBG_CQ, "Failed to get a cqp_request.\n"); + return -ENOMEM; + } + cqp_request->waiting = 1; + cqp_wqe = &cqp_request->cqp_wqe; + opcode = NES_CQP_DESTROY_CQ | (nescq->hw_cq.cq_size << 16); + spin_lock_irqsave(&nesadapter->pbl_lock, flags); + if (nescq->virtual_cq == 1) { + nesadapter->free_256pbl++; + if (nesadapter->free_256pbl > nesadapter->max_256pbl) { + printk(KERN_ERR PFX "%s: free 256B PBLs(%u) has exceeded the max(%u)\n", + __FUNCTION__, nesadapter->free_256pbl, nesadapter->max_256pbl); + } + } else if (nescq->virtual_cq == 2) { + nesadapter->free_4kpbl++; + if (nesadapter->free_4kpbl > nesadapter->max_4kpbl) { + printk(KERN_ERR PFX "%s: free 4K PBLs(%u) has exceeded the max(%u)\n", + __FUNCTION__, nesadapter->free_4kpbl, nesadapter->max_4kpbl); + } + opcode |= NES_CQP_CQ_4KB_CHUNK; + } + + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode); + 
set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, + (nescq->hw_cq.cq_number | ((u32)PCI_FUNC(nesdev->pcidev->devfn) << 16))); + nes_free_resource(nesadapter, nesadapter->allocated_cqs, nescq->hw_cq.cq_number); + atomic_set(&cqp_request->refcount, 2); + nes_post_cqp_request(nesdev, cqp_request, NES_CQP_REQUEST_RING_DOORBELL); + + /* Wait for CQP */ + nes_debug(NES_DBG_CQ, "Waiting for destroy iWARP CQ%u to complete.\n", + nescq->hw_cq.cq_number); + ret = wait_event_timeout(cqp_request->waitq, (0 != cqp_request->request_done), + NES_EVENT_TIMEOUT); + nes_debug(NES_DBG_CQ, "Destroy iWARP CQ%u completed, wait_event_timeout ret = %u," + " CQP Major:Minor codes = 0x%04X:0x%04X.\n", + nescq->hw_cq.cq_number, ret, cqp_request->major_code, + cqp_request->minor_code); + if ((!ret) || (cqp_request->major_code)) { + if (atomic_dec_and_test(&cqp_request->refcount)) { + if (cqp_request->dynamic) { + kfree(cqp_request); + } else { + spin_lock_irqsave(&nesdev->cqp.lock, flags); + list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs); + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + } + } + if (!ret) { + nes_debug(NES_DBG_CQ, "iWARP CQ%u destroy timeout expired\n", + nescq->hw_cq.cq_number); + ret = -ETIME; + } else { + nes_debug(NES_DBG_CQ, "iWARP CQ%u destroy failed\n", + nescq->hw_cq.cq_number); + ret = -EIO; + } + } else { + ret = 0; + if (atomic_dec_and_test(&cqp_request->refcount)) { + if (cqp_request->dynamic) { + kfree(cqp_request); + } else { + spin_lock_irqsave(&nesdev->cqp.lock, flags); + list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs); + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + } + } + } + + if (nescq->cq_mem_size) + pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, + (void *)nescq->hw_cq.cq_vbase, nescq->hw_cq.cq_pbase); + kfree(nescq); + + return ret; +} + + +/** + * nes_reg_mr + */ +static int nes_reg_mr(struct nes_device *nesdev, struct nes_pd *nespd, + u32 stag, u64 region_length, struct nes_root_vpbl *root_vpbl, + 
dma_addr_t single_buffer, u16 pbl_count, u16 residual_page_count,
+		int acc, u64 *iova_start)
+{
+	struct nes_hw_cqp_wqe *cqp_wqe;
+	struct nes_cqp_request *cqp_request;
+	unsigned long flags;
+	int ret;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	/* int count; */
+	u32 opcode = 0;
+	u16 major_code;
+
+	/* Register the region with the adapter */
+	cqp_request = nes_get_cqp_request(nesdev);
+	if (cqp_request == NULL) {
+		nes_debug(NES_DBG_MR, "Failed to get a cqp_request.\n");
+		return -ENOMEM;
+	}
+	cqp_request->waiting = 1;
+	cqp_wqe = &cqp_request->cqp_wqe;
+
+	spin_lock_irqsave(&nesadapter->pbl_lock, flags);
+	/*
+	 * track PBL resources
+	 *
+	 * Each failure path below hands an unused, non-dynamic cqp_request
+	 * back to cqp_avail_reqs.  That list is modified under
+	 * nesdev->cqp.lock everywhere else in this file, so pbl_lock is
+	 * dropped first and the list update is done under cqp.lock.
+	 */
+	if (pbl_count != 0) {
+		if (pbl_count > 1) {
+			/* Two level PBL */
+			if ((pbl_count+1) > nesadapter->free_4kpbl) {
+				nes_debug(NES_DBG_MR, "Out of 4KB Pbls for two level request.\n");
+				if (cqp_request->dynamic) {
+					spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+					kfree(cqp_request);
+				} else {
+					spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+					spin_lock_irqsave(&nesdev->cqp.lock, flags);
+					list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
+					spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+				}
+				return -ENOMEM;
+			} else {
+				/* pbl_count leaf PBLs plus one root PBL. */
+				nesadapter->free_4kpbl -= pbl_count+1;
+			}
+		} else if (residual_page_count > 32) {
+			/* More than 32 entries in the single PBL: it must be a 4KB one. */
+			if (pbl_count > nesadapter->free_4kpbl) {
+				nes_debug(NES_DBG_MR, "Out of 4KB Pbls.\n");
+				if (cqp_request->dynamic) {
+					spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+					kfree(cqp_request);
+				} else {
+					spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+					spin_lock_irqsave(&nesdev->cqp.lock, flags);
+					list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
+					spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+				}
+				return -ENOMEM;
+			} else {
+				nesadapter->free_4kpbl -= pbl_count;
+			}
+		} else {
+			if (pbl_count > nesadapter->free_256pbl) {
+				nes_debug(NES_DBG_MR, "Out of 256B Pbls.\n");
+				if (cqp_request->dynamic) {
+					spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+					kfree(cqp_request);
+				} else {
+					spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+					spin_lock_irqsave(&nesdev->cqp.lock, flags);
+					list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
+					spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+				}
+				return -ENOMEM;
+			} else {
+				
nesadapter->free_256pbl -= pbl_count; + } + } + } + + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + + opcode = NES_CQP_REGISTER_STAG | NES_CQP_STAG_RIGHTS_LOCAL_READ | + NES_CQP_STAG_VA_TO | NES_CQP_STAG_MR; + if (acc & IB_ACCESS_LOCAL_WRITE) + opcode |= NES_CQP_STAG_RIGHTS_LOCAL_WRITE; + if (acc & IB_ACCESS_REMOTE_WRITE) + opcode |= NES_CQP_STAG_RIGHTS_REMOTE_WRITE | NES_CQP_STAG_REM_ACC_EN; + if (acc & IB_ACCESS_REMOTE_READ) + opcode |= NES_CQP_STAG_RIGHTS_REMOTE_READ | NES_CQP_STAG_REM_ACC_EN; + if (acc & IB_ACCESS_MW_BIND) + opcode |= NES_CQP_STAG_RIGHTS_WINDOW_BIND | NES_CQP_STAG_REM_ACC_EN; + + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode); + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_VA_LOW_IDX, *iova_start); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_LEN_LOW_IDX, region_length); + + cqp_wqe->wqe_words[NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX] = + cpu_to_le32((u32)(region_length >> 8) & 0xff000000); + cqp_wqe->wqe_words[NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX] |= + cpu_to_le32(nespd->pd_id & 0x00007fff); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, stag); + + if (pbl_count == 0) { + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PA_LOW_IDX, single_buffer); + } else { + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PA_LOW_IDX, root_vpbl->pbl_pbase); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PBL_BLK_COUNT_IDX, pbl_count); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PBL_LEN_IDX, + (((pbl_count - 1) * 4096) + (residual_page_count*8))); + + if ((pbl_count > 1) || (residual_page_count > 32)) + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] |= cpu_to_le32(NES_CQP_STAG_PBL_BLK_SIZE); + } + barrier(); + + atomic_set(&cqp_request->refcount, 2); + nes_post_cqp_request(nesdev, cqp_request, NES_CQP_REQUEST_RING_DOORBELL); + + /* Wait for CQP */ + ret = wait_event_timeout(cqp_request->waitq, (0 != 
cqp_request->request_done), + NES_EVENT_TIMEOUT); + nes_debug(NES_DBG_MR, "Register STag 0x%08X completed, wait_event_timeout ret = %u," + " CQP Major:Minor codes = 0x%04X:0x%04X.\n", + stag, ret, cqp_request->major_code, cqp_request->minor_code); + major_code = cqp_request->major_code; + if (atomic_dec_and_test(&cqp_request->refcount)) { + if (cqp_request->dynamic) { + kfree(cqp_request); + } else { + spin_lock_irqsave(&nesdev->cqp.lock, flags); + list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs); + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + } + } + if (!ret) + return -ETIME; + else if (major_code) + return -EIO; + else + return 0; + + return 0; +} + + +/** + * nes_reg_phys_mr + */ +static struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd, + struct ib_phys_buf *buffer_list, int num_phys_buf, int acc, + u64 * iova_start) +{ + u64 region_length; + struct nes_pd *nespd = to_nespd(ib_pd); + struct nes_vnic *nesvnic = to_nesvnic(ib_pd->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_mr *nesmr; + struct ib_mr *ibmr; + struct nes_vpbl vpbl; + struct nes_root_vpbl root_vpbl; + u32 stag; + u32 i; + u32 stag_index = 0; + u32 next_stag_index = 0; + u32 driver_key = 0; + u32 root_pbl_index = 0; + u32 cur_pbl_index = 0; + int err = 0, pbl_depth = 0; + int ret = 0; + u16 pbl_count = 0; + u8 single_page = 1; + u8 stag_key = 0; + + pbl_depth = 0; + region_length = 0; + vpbl.pbl_vbase = NULL; + root_vpbl.pbl_vbase = NULL; + root_vpbl.pbl_pbase = 0; + + get_random_bytes(&next_stag_index, sizeof(next_stag_index)); + stag_key = (u8)next_stag_index; + + driver_key = 0; + + next_stag_index >>= 8; + next_stag_index %= nesadapter->max_mr; + if (num_phys_buf > (1024*512)) { + return ERR_PTR(-E2BIG); + } + + err = nes_alloc_resource(nesadapter, nesadapter->allocated_mrs, nesadapter->max_mr, + &stag_index, &next_stag_index); + if (err) { + return ERR_PTR(err); + } + + nesmr = 
kzalloc(sizeof(*nesmr), GFP_KERNEL); + if (!nesmr) { + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + return ERR_PTR(-ENOMEM); + } + + for (i = 0; i < num_phys_buf; i++) { + + if ((i & 0x01FF) == 0) { + if (root_pbl_index == 1) { + /* Allocate the root PBL */ + root_vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 8192, + &root_vpbl.pbl_pbase); + nes_debug(NES_DBG_MR, "Allocating root PBL, va = %p, pa = 0x%08X\n", + root_vpbl.pbl_vbase, (unsigned int)root_vpbl.pbl_pbase); + if (!root_vpbl.pbl_vbase) { + pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, + vpbl.pbl_pbase); + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + kfree(nesmr); + return ERR_PTR(-ENOMEM); + } + root_vpbl.leaf_vpbl = kzalloc(sizeof(*root_vpbl.leaf_vpbl)*1024, GFP_KERNEL); + if (!root_vpbl.leaf_vpbl) { + pci_free_consistent(nesdev->pcidev, 8192, root_vpbl.pbl_vbase, + root_vpbl.pbl_pbase); + pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, + vpbl.pbl_pbase); + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + kfree(nesmr); + return ERR_PTR(-ENOMEM); + } + root_vpbl.pbl_vbase[0].pa_low = cpu_to_le32((u32)vpbl.pbl_pbase); + root_vpbl.pbl_vbase[0].pa_high = + cpu_to_le32((u32)((((u64)vpbl.pbl_pbase) >> 32))); + root_vpbl.leaf_vpbl[0] = vpbl; + } + /* Allocate a 4K buffer for the PBL */ + vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 4096, + &vpbl.pbl_pbase); + nes_debug(NES_DBG_MR, "Allocating leaf PBL, va = %p, pa = 0x%016lX\n", + vpbl.pbl_vbase, (unsigned long)vpbl.pbl_pbase); + if (!vpbl.pbl_vbase) { + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + ibmr = ERR_PTR(-ENOMEM); + kfree(nesmr); + goto reg_phys_err; + } + /* Fill in the root table */ + if (1 <= root_pbl_index) { + root_vpbl.pbl_vbase[root_pbl_index].pa_low = + cpu_to_le32((u32)vpbl.pbl_pbase); + root_vpbl.pbl_vbase[root_pbl_index].pa_high = + cpu_to_le32((u32)((((u64)vpbl.pbl_pbase) >> 32))); + 
root_vpbl.leaf_vpbl[root_pbl_index] = vpbl; + } + root_pbl_index++; + cur_pbl_index = 0; + } + if (buffer_list[i].addr & ~PAGE_MASK) { + /* TODO: Unwind allocated buffers */ + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + nes_debug(NES_DBG_MR, "Unaligned Memory Buffer: 0x%x\n", + (unsigned int) buffer_list[i].addr); + ibmr = ERR_PTR(-EINVAL); + kfree(nesmr); + goto reg_phys_err; + } + + if (!buffer_list[i].size) { + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + nes_debug(NES_DBG_MR, "Invalid Buffer Size\n"); + ibmr = ERR_PTR(-EINVAL); + kfree(nesmr); + goto reg_phys_err; + } + + region_length += buffer_list[i].size; + if ((i != 0) && (single_page)) { + if ((buffer_list[i-1].addr+PAGE_SIZE) != buffer_list[i].addr) + single_page = 0; + } + vpbl.pbl_vbase[cur_pbl_index].pa_low = cpu_to_le32((u32)buffer_list[i].addr); + vpbl.pbl_vbase[cur_pbl_index++].pa_high = + cpu_to_le32((u32)((((u64)buffer_list[i].addr) >> 32))); + } + + stag = stag_index << 8; + stag |= driver_key; + stag += (u32)stag_key; + + nes_debug(NES_DBG_MR, "Registering STag 0x%08X, VA = 0x%016lX," + " length = 0x%016lX, index = 0x%08X\n", + stag, (unsigned long)*iova_start, (unsigned long)region_length, stag_index); + + region_length -= (*iova_start)&PAGE_MASK; + + /* Make the leaf PBL the root if only one PBL */ + if (root_pbl_index == 1) { + root_vpbl.pbl_pbase = vpbl.pbl_pbase; + } + + if (single_page) { + pbl_count = 0; + } else { + pbl_count = root_pbl_index; + } + ret = nes_reg_mr(nesdev, nespd, stag, region_length, &root_vpbl, + buffer_list[0].addr, pbl_count, (u16)cur_pbl_index, acc, iova_start); + + if (ret == 0) { + nesmr->ibmr.rkey = stag; + nesmr->ibmr.lkey = stag; + nesmr->mode = IWNES_MEMREG_TYPE_MEM; + ibmr = &nesmr->ibmr; + nesmr->pbl_4k = ((pbl_count > 1) || (cur_pbl_index > 32)) ? 
1 : 0; + nesmr->pbls_used = pbl_count; + if (pbl_count > 1) { + nesmr->pbls_used++; + } + } else { + kfree(nesmr); + ibmr = ERR_PTR(-ENOMEM); + } + + reg_phys_err: + /* free the resources */ + if (root_pbl_index == 1) { + /* single PBL case */ + pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, vpbl.pbl_pbase); + } else { + for (i=0; i<root_pbl_index; i++) { + pci_free_consistent(nesdev->pcidev, 4096, root_vpbl.leaf_vpbl[i].pbl_vbase, + root_vpbl.leaf_vpbl[i].pbl_pbase); + } + kfree(root_vpbl.leaf_vpbl); + pci_free_consistent(nesdev->pcidev, 8192, root_vpbl.pbl_vbase, + root_vpbl.pbl_pbase); + } + + return ibmr; +} + + +/** + * nes_get_dma_mr + */ +static struct ib_mr *nes_get_dma_mr(struct ib_pd *pd, int acc) +{ + struct ib_phys_buf bl; + u64 kva = 0; + + nes_debug(NES_DBG_MR, "\n"); + + bl.size = (u64)0xffffffffffULL; + bl.addr = 0; + return nes_reg_phys_mr(pd, &bl, 1, acc, &kva); +} + + +/** + * nes_reg_user_mr + */ +static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt, int acc, struct ib_udata *udata) +{ + u64 iova_start; + __le64 *pbl; + u64 region_length; + dma_addr_t last_dma_addr = 0; + dma_addr_t first_dma_addr = 0; + struct nes_pd *nespd = to_nespd(pd); + struct nes_vnic *nesvnic = to_nesvnic(pd->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct ib_mr *ibmr = ERR_PTR(-EINVAL); + struct ib_umem_chunk *chunk; + struct nes_ucontext *nes_ucontext; + struct nes_pbl *nespbl; + struct nes_mr *nesmr; + struct ib_umem *region; + struct nes_mem_reg_req req; + struct nes_vpbl vpbl; + struct nes_root_vpbl root_vpbl; + int nmap_index, page_index; + int page_count = 0; + int err, pbl_depth = 0; + int chunk_pages; + int ret; + u32 stag; + u32 stag_index = 0; + u32 next_stag_index; + u32 driver_key; + u32 root_pbl_index = 0; + u32 cur_pbl_index = 0; + u32 skip_pages; + u16 pbl_count; + u8 single_page = 1; + u8 stag_key; + + region = 
ib_umem_get(pd->uobject->context, start, length, acc); + if (IS_ERR(region)) { + return (struct ib_mr *)region; + } + + nes_debug(NES_DBG_MR, "User base = 0x%lX, Virt base = 0x%lX, length = %u," + " offset = %u, page size = %u.\n", + (unsigned long int)start, (unsigned long int)virt, (u32)length, + region->offset, region->page_size); + + skip_pages = ((u32)region->offset) >> 12; + + if (ib_copy_from_udata(&req, udata, sizeof(req))) + return ERR_PTR(-EFAULT); + nes_debug(NES_DBG_MR, "Memory Registration type = %08X.\n", req.reg_type); + + switch (req.reg_type) { + case IWNES_MEMREG_TYPE_MEM: + pbl_depth = 0; + region_length = 0; + vpbl.pbl_vbase = NULL; + root_vpbl.pbl_vbase = NULL; + root_vpbl.pbl_pbase = 0; + + get_random_bytes(&next_stag_index, sizeof(next_stag_index)); + stag_key = (u8)next_stag_index; + + driver_key = next_stag_index & 0x70000000; + + next_stag_index >>= 8; + next_stag_index %= nesadapter->max_mr; + + err = nes_alloc_resource(nesadapter, nesadapter->allocated_mrs, + nesadapter->max_mr, &stag_index, &next_stag_index); + if (err) { + ib_umem_release(region); + return ERR_PTR(err); + } + + nesmr = kzalloc(sizeof(*nesmr), GFP_KERNEL); + if (!nesmr) { + ib_umem_release(region); + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + return ERR_PTR(-ENOMEM); + } + nesmr->region = region; + + list_for_each_entry(chunk, &region->chunk_list, list) { + nes_debug(NES_DBG_MR, "Chunk: nents = %u, nmap = %u .\n", + chunk->nents, chunk->nmap); + for (nmap_index = 0; nmap_index < chunk->nmap; ++nmap_index) { + if (sg_dma_address(&chunk->page_list[nmap_index]) & ~PAGE_MASK) { + ib_umem_release(region); + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + nes_debug(NES_DBG_MR, "Unaligned Memory Buffer: 0x%x\n", + (unsigned int) sg_dma_address(&chunk->page_list[nmap_index])); + ibmr = ERR_PTR(-EINVAL); + kfree(nesmr); + goto reg_user_mr_err; + } + + if (!sg_dma_len(&chunk->page_list[nmap_index])) { + ib_umem_release(region); + 
nes_free_resource(nesadapter, nesadapter->allocated_mrs, + stag_index); + nes_debug(NES_DBG_MR, "Invalid Buffer Size\n"); + ibmr = ERR_PTR(-EINVAL); + kfree(nesmr); + goto reg_user_mr_err; + } + + region_length += sg_dma_len(&chunk->page_list[nmap_index]); + chunk_pages = sg_dma_len(&chunk->page_list[nmap_index]) >> 12; + region_length -= skip_pages << 12; + for (page_index=skip_pages; page_index < chunk_pages; page_index++) { + skip_pages = 0; + if ((page_count!=0)&&(page_count<<12)-(region->offset&(4096-1))>=region->length) + goto enough_pages; + if ((page_count&0x01FF) == 0) { + if (page_count>(1024*512)) { + ib_umem_release(region); + pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, + vpbl.pbl_pbase); + nes_free_resource(nesadapter, + nesadapter->allocated_mrs, stag_index); + kfree(nesmr); + ibmr = ERR_PTR(-E2BIG); + goto reg_user_mr_err; + } + if (root_pbl_index == 1) { + root_vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, + 8192, &root_vpbl.pbl_pbase); + nes_debug(NES_DBG_MR, "Allocating root PBL, va = %p, pa = 0x%08X\n", + root_vpbl.pbl_vbase, (unsigned int)root_vpbl.pbl_pbase); + if (!root_vpbl.pbl_vbase) { + ib_umem_release(region); + pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, + vpbl.pbl_pbase); + nes_free_resource(nesadapter, nesadapter->allocated_mrs, + stag_index); + kfree(nesmr); + ibmr = ERR_PTR(-ENOMEM); + goto reg_user_mr_err; + } + root_vpbl.leaf_vpbl = kzalloc(sizeof(*root_vpbl.leaf_vpbl)*1024, + GFP_KERNEL); + if (!root_vpbl.leaf_vpbl) { + ib_umem_release(region); + pci_free_consistent(nesdev->pcidev, 8192, root_vpbl.pbl_vbase, + root_vpbl.pbl_pbase); + pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, + vpbl.pbl_pbase); + nes_free_resource(nesadapter, nesadapter->allocated_mrs, + stag_index); + kfree(nesmr); + ibmr = ERR_PTR(-ENOMEM); + goto reg_user_mr_err; + } + root_vpbl.pbl_vbase[0].pa_low = + cpu_to_le32((u32)vpbl.pbl_pbase); + root_vpbl.pbl_vbase[0].pa_high = + cpu_to_le32((u32)((((u64)vpbl.pbl_pbase) 
>> 32))); + root_vpbl.leaf_vpbl[0] = vpbl; + } + vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 4096, + &vpbl.pbl_pbase); + nes_debug(NES_DBG_MR, "Allocating leaf PBL, va = %p, pa = 0x%08X\n", + vpbl.pbl_vbase, (unsigned int)vpbl.pbl_pbase); + if (!vpbl.pbl_vbase) { + ib_umem_release(region); + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + ibmr = ERR_PTR(-ENOMEM); + kfree(nesmr); + goto reg_user_mr_err; + } + if (1 <= root_pbl_index) { + root_vpbl.pbl_vbase[root_pbl_index].pa_low = + cpu_to_le32((u32)vpbl.pbl_pbase); + root_vpbl.pbl_vbase[root_pbl_index].pa_high = + cpu_to_le32((u32)((((u64)vpbl.pbl_pbase)>>32))); + root_vpbl.leaf_vpbl[root_pbl_index] = vpbl; + } + root_pbl_index++; + cur_pbl_index = 0; + } + if (single_page) { + if (page_count != 0) { + if ((last_dma_addr+4096) != + (sg_dma_address(&chunk->page_list[nmap_index])+ + (page_index*4096))) + single_page = 0; + last_dma_addr = sg_dma_address(&chunk->page_list[nmap_index])+ + (page_index*4096); + } else { + first_dma_addr = sg_dma_address(&chunk->page_list[nmap_index])+ + (page_index*4096); + last_dma_addr = first_dma_addr; + } + } + + vpbl.pbl_vbase[cur_pbl_index].pa_low = + cpu_to_le32((u32)(sg_dma_address(&chunk->page_list[nmap_index])+ + (page_index*4096))); + vpbl.pbl_vbase[cur_pbl_index].pa_high = + cpu_to_le32((u32)((((u64)(sg_dma_address(&chunk->page_list[nmap_index])+ + (page_index*4096))) >> 32))); + cur_pbl_index++; + page_count++; + } + } + } + enough_pages: + nes_debug(NES_DBG_MR, "calculating stag, stag_index=0x%08x, driver_key=0x%08x," + " stag_key=0x%08x\n", + stag_index, driver_key, stag_key); + stag = stag_index << 8; + stag |= driver_key; + stag += (u32)stag_key; + if (stag == 0) { + stag = 1; + } + + iova_start = virt; + /* Make the leaf PBL the root if only one PBL */ + if (root_pbl_index == 1) { + root_vpbl.pbl_pbase = vpbl.pbl_pbase; + } + + if (single_page) { + pbl_count = 0; + } else { + pbl_count = root_pbl_index; + first_dma_addr = 0; + } + 
nes_debug(NES_DBG_MR, "Registering STag 0x%08X, VA = 0x%08X, length = 0x%08X," + " index = 0x%08X, region->length=0x%08llx, pbl_count = %u\n", + stag, (unsigned int)iova_start, + (unsigned int)region_length, stag_index, + (unsigned long long)region->length, pbl_count); + ret = nes_reg_mr( nesdev, nespd, stag, region->length, &root_vpbl, + first_dma_addr, pbl_count, (u16)cur_pbl_index, acc, &iova_start); + + nes_debug(NES_DBG_MR, "ret=%d\n", ret); + + if (ret == 0) { + nesmr->ibmr.rkey = stag; + nesmr->ibmr.lkey = stag; + nesmr->mode = IWNES_MEMREG_TYPE_MEM; + ibmr = &nesmr->ibmr; + nesmr->pbl_4k = ((pbl_count > 1) || (cur_pbl_index > 32)) ? 1 : 0; + nesmr->pbls_used = pbl_count; + if (pbl_count > 1) { + nesmr->pbls_used++; + } + } else { + ib_umem_release(region); + kfree(nesmr); + ibmr = ERR_PTR(-ENOMEM); + } + + reg_user_mr_err: + /* free the resources */ + if (root_pbl_index == 1) { + pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, + vpbl.pbl_pbase); + } else { + for (page_index=0; page_index<root_pbl_index; page_index++) { + pci_free_consistent(nesdev->pcidev, 4096, + root_vpbl.leaf_vpbl[page_index].pbl_vbase, + root_vpbl.leaf_vpbl[page_index].pbl_pbase); + } + kfree(root_vpbl.leaf_vpbl); + pci_free_consistent(nesdev->pcidev, 8192, root_vpbl.pbl_vbase, + root_vpbl.pbl_pbase); + } + + nes_debug(NES_DBG_MR, "Leaving, ibmr=%p", ibmr); + + return ibmr; + break; + case IWNES_MEMREG_TYPE_QP: + case IWNES_MEMREG_TYPE_CQ: + nespbl = kzalloc(sizeof(*nespbl), GFP_KERNEL); + if (!nespbl) { + nes_debug(NES_DBG_MR, "Unable to allocate PBL\n"); + ib_umem_release(region); + return ERR_PTR(-ENOMEM); + } + nesmr = kzalloc(sizeof(*nesmr), GFP_KERNEL); + if (!nesmr) { + ib_umem_release(region); + kfree(nespbl); + nes_debug(NES_DBG_MR, "Unable to allocate nesmr\n"); + return ERR_PTR(-ENOMEM); + } + nesmr->region = region; + nes_ucontext = to_nesucontext(pd->uobject->context); + pbl_depth = region->length >> 12; + pbl_depth += (region->length & (4096-1)) ? 
1 : 0; + nespbl->pbl_size = pbl_depth*sizeof(u64); + if (req.reg_type == IWNES_MEMREG_TYPE_QP) { + nes_debug(NES_DBG_MR, "Attempting to allocate QP PBL memory"); + } else { + nes_debug(NES_DBG_MR, "Attempting to allocate CP PBL memory"); + } + + nes_debug(NES_DBG_MR, " %u bytes, %u entries.\n", + nespbl->pbl_size, pbl_depth); + pbl = pci_alloc_consistent(nesdev->pcidev, nespbl->pbl_size, + &nespbl->pbl_pbase); + if (!pbl) { + ib_umem_release(region); + kfree(nesmr); + kfree(nespbl); + nes_debug(NES_DBG_MR, "Unable to allocate PBL memory\n"); + return ERR_PTR(-ENOMEM); + } + + nespbl->pbl_vbase = (u64 *)pbl; + nespbl->user_base = start; + nes_debug(NES_DBG_MR, "Allocated PBL memory, %u bytes, pbl_pbase=%p," + " pbl_vbase=%p user_base=0x%lx\n", + nespbl->pbl_size, (void *)nespbl->pbl_pbase, + (void*)nespbl->pbl_vbase, nespbl->user_base); + + list_for_each_entry(chunk, &region->chunk_list, list) { + for (nmap_index = 0; nmap_index < chunk->nmap; ++nmap_index) { + chunk_pages = sg_dma_len(&chunk->page_list[nmap_index]) >> 12; + chunk_pages += (sg_dma_len(&chunk->page_list[nmap_index]) & (4096-1)) ? 
1 : 0; + nespbl->page = sg_page(&chunk->page_list[0]); + for (page_index=0; page_index<chunk_pages; page_index++) { + ((__le32 *)pbl)[0] = cpu_to_le32((u32) + (sg_dma_address(&chunk->page_list[nmap_index])+ + (page_index*4096))); + ((__le32 *)pbl)[1] = cpu_to_le32(((u64) + (sg_dma_address(&chunk->page_list[nmap_index])+ + (page_index*4096)))>>32); + nes_debug(NES_DBG_MR, "pbl=%p, *pbl=0x%016llx, 0x%08x%08x\n", pbl, + (unsigned long long)*pbl, + le32_to_cpu(((__le32 *)pbl)[1]), le32_to_cpu(((__le32 *)pbl)[0])); + pbl++; + } + } + } + if (req.reg_type == IWNES_MEMREG_TYPE_QP) { + list_add_tail(&nespbl->list, &nes_ucontext->qp_reg_mem_list); + } else { + list_add_tail(&nespbl->list, &nes_ucontext->cq_reg_mem_list); + } + nesmr->ibmr.rkey = -1; + nesmr->ibmr.lkey = -1; + nesmr->mode = req.reg_type; + return &nesmr->ibmr; + break; + } + + return ERR_PTR(-ENOSYS); +} + + +/** + * nes_dereg_mr + */ +static int nes_dereg_mr(struct ib_mr *ib_mr) +{ + struct nes_mr *nesmr = to_nesmr(ib_mr); + struct nes_vnic *nesvnic = to_nesvnic(ib_mr->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_cqp_request *cqp_request; + unsigned long flags; + int ret; + u16 major_code; + u16 minor_code; + + if (nesmr->region) { + ib_umem_release(nesmr->region); + } + if (nesmr->mode != IWNES_MEMREG_TYPE_MEM) { + kfree(nesmr); + return 0; + } + + /* Deallocate the region with the adapter */ + + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + nes_debug(NES_DBG_MR, "Failed to get a cqp_request.\n"); + return -ENOMEM; + } + cqp_request->waiting = 1; + cqp_wqe = &cqp_request->cqp_wqe; + + spin_lock_irqsave(&nesadapter->pbl_lock, flags); + if (nesmr->pbls_used != 0) { + if (nesmr->pbl_4k) { + nesadapter->free_4kpbl += nesmr->pbls_used; + if (nesadapter->free_4kpbl > nesadapter->max_4kpbl) { + printk(KERN_ERR PFX "free 4KB PBLs(%u) has exceeded the max(%u)\n", + 
nesadapter->free_4kpbl, nesadapter->max_4kpbl); + } + } else { + nesadapter->free_256pbl += nesmr->pbls_used; + if (nesadapter->free_256pbl > nesadapter->max_256pbl) { + printk(KERN_ERR PFX "free 256B PBLs(%u) has exceeded the max(%u)\n", + nesadapter->free_256pbl, nesadapter->max_256pbl); + } + } + } + + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, + NES_CQP_DEALLOCATE_STAG | NES_CQP_STAG_VA_TO | + NES_CQP_STAG_DEALLOC_PBLS | NES_CQP_STAG_MR); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, ib_mr->rkey); + + atomic_set(&cqp_request->refcount, 2); + nes_post_cqp_request(nesdev, cqp_request, NES_CQP_REQUEST_RING_DOORBELL); + + /* Wait for CQP */ + nes_debug(NES_DBG_MR, "Waiting for deallocate STag 0x%08X completed\n", ib_mr->rkey); + ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0), + NES_EVENT_TIMEOUT); + nes_debug(NES_DBG_MR, "Deallocate STag 0x%08X completed, wait_event_timeout ret = %u," + " CQP Major:Minor codes = 0x%04X:0x%04X\n", + ib_mr->rkey, ret, cqp_request->major_code, cqp_request->minor_code); + + nes_free_resource(nesadapter, nesadapter->allocated_mrs, + (ib_mr->rkey & 0x0fffff00) >> 8); + + kfree(nesmr); + + major_code = cqp_request->major_code; + minor_code = cqp_request->minor_code; + if (atomic_dec_and_test(&cqp_request->refcount)) { + if (cqp_request->dynamic) { + kfree(cqp_request); + } else { + spin_lock_irqsave(&nesdev->cqp.lock, flags); + list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs); + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + } + } + if (!ret) { + nes_debug(NES_DBG_MR, "Timeout waiting to destroy STag," + " ib_mr=%p, rkey = 0x%08X\n", + ib_mr, ib_mr->rkey); + return -ETIME; + } else if (major_code) { + nes_debug(NES_DBG_MR, "Error (0x%04X:0x%04X) while attempting" + " to destroy STag, ib_mr=%p, rkey = 0x%08X\n", + major_code, minor_code, ib_mr, 
ib_mr->rkey); + return -EIO; + } else + return 0; +} + + +/** + * show_rev + */ +static ssize_t show_rev(struct class_device *cdev, char *buf) +{ + struct nes_ib_device *nesibdev = + container_of(cdev, struct nes_ib_device, ibdev.class_dev); + struct nes_vnic *nesvnic = nesibdev->nesvnic; + + nes_debug(NES_DBG_INIT, "\n"); + return sprintf(buf, "%x\n", nesvnic->nesdev->nesadapter->hw_rev); +} + + +/** + * show_fw_ver + */ +static ssize_t show_fw_ver(struct class_device *cdev, char *buf) +{ + struct nes_ib_device *nesibdev = + container_of(cdev, struct nes_ib_device, ibdev.class_dev); + struct nes_vnic *nesvnic = nesibdev->nesvnic; + + nes_debug(NES_DBG_INIT, "\n"); + return sprintf(buf, "%x.%x.%x\n", + (int)(nesvnic->nesdev->nesadapter->fw_ver >> 32), + (int)(nesvnic->nesdev->nesadapter->fw_ver >> 16) & 0xffff, + (int)(nesvnic->nesdev->nesadapter->fw_ver & 0xffff)); +} + + +/** + * show_hca + */ +static ssize_t show_hca(struct class_device *cdev, char *buf) +{ + nes_debug(NES_DBG_INIT, "\n"); + return sprintf(buf, "NES020\n"); +} + + +/** + * show_board + */ +static ssize_t show_board(struct class_device *cdev, char *buf) +{ + nes_debug(NES_DBG_INIT, "\n"); + return sprintf(buf, "%.*s\n", 32, "NES020 Board ID"); +} + + +static CLASS_DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static CLASS_DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); +static CLASS_DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); +static CLASS_DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); + +static struct class_device_attribute *nes_class_attributes[] = { + &class_device_attr_hw_rev, + &class_device_attr_fw_ver, + &class_device_attr_hca_type, + &class_device_attr_board_id +}; + + +/** + * nes_query_qp + */ +static int nes_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_qp_init_attr *init_attr) +{ + struct nes_qp *nesqp = to_nesqp(ibqp); + + nes_debug(NES_DBG_QP, "\n"); + + attr->qp_access_flags = 0; + attr->cap.max_send_wr = nesqp->hwqp.sq_size; + 
attr->cap.max_recv_wr = nesqp->hwqp.rq_size; + attr->cap.max_recv_sge = 1; + if (nes_drv_opt & NES_DRV_OPT_NO_INLINE_DATA) { + init_attr->cap.max_inline_data = 0; + } else { + init_attr->cap.max_inline_data = 64; + } + + init_attr->event_handler = nesqp->ibqp.event_handler; + init_attr->qp_context = nesqp->ibqp.qp_context; + init_attr->send_cq = nesqp->ibqp.send_cq; + init_attr->recv_cq = nesqp->ibqp.recv_cq; + init_attr->srq = nesqp->ibqp.srq = nesqp->ibqp.srq; + init_attr->cap = attr->cap; + + return 0; +} + + +/** + * nes_hw_modify_qp + */ +int nes_hw_modify_qp(struct nes_device *nesdev, struct nes_qp *nesqp, + u32 next_iwarp_state, u32 wait_completion) +{ + struct nes_hw_cqp_wqe *cqp_wqe; + /* struct iw_cm_id *cm_id = nesqp->cm_id; */ + /* struct iw_cm_event cm_event; */ + struct nes_cqp_request *cqp_request; + unsigned long flags; + int ret; + u16 major_code; + + nes_debug(NES_DBG_MOD_QP, "QP%u, refcount=%d\n", + nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount)); + + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + nes_debug(NES_DBG_MOD_QP, "Failed to get a cqp_request.\n"); + return -ENOMEM; + } + if (wait_completion) { + cqp_request->waiting = 1; + } else { + cqp_request->waiting = 0; + } + cqp_wqe = &cqp_request->cqp_wqe; + + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, + NES_CQP_MODIFY_QP | NES_CQP_QP_TYPE_IWARP | next_iwarp_state); + nes_debug(NES_DBG_MOD_QP, "using next_iwarp_state=%08x, wqe_words=%08x\n", + next_iwarp_state, le32_to_cpu(cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX])); + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, nesqp->hwqp.qp_id); + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, (u64)nesqp->nesqp_context_pbase); + + atomic_set(&cqp_request->refcount, 2); + nes_post_cqp_request(nesdev, cqp_request, NES_CQP_REQUEST_RING_DOORBELL); + + /* Wait for CQP */ + if (wait_completion) { + /* nes_debug(NES_DBG_MOD_QP, 
"Waiting for modify iWARP QP%u to complete.\n", + nesqp->hwqp.qp_id); */ + ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0), + NES_EVENT_TIMEOUT); + nes_debug(NES_DBG_MOD_QP, "Modify iwarp QP%u completed, wait_event_timeout ret=%u, " + "CQP Major:Minor codes = 0x%04X:0x%04X.\n", + nesqp->hwqp.qp_id, ret, cqp_request->major_code, cqp_request->minor_code); + major_code = cqp_request->major_code; + if (major_code) { + nes_debug(NES_DBG_MOD_QP, "Modify iwarp QP%u failed" + "CQP Major:Minor codes = 0x%04X:0x%04X, intended next state = 0x%08X.\n", + nesqp->hwqp.qp_id, cqp_request->major_code, + cqp_request->minor_code, next_iwarp_state); + } + if (atomic_dec_and_test(&cqp_request->refcount)) { + if (cqp_request->dynamic) { + kfree(cqp_request); + } else { + spin_lock_irqsave(&nesdev->cqp.lock, flags); + list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs); + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + } + } + if (!ret) + return -ETIME; + else if (major_code) + return -EIO; + else + return 0; + } else { + return 0; + } +} + + +/** + * nes_modify_qp + */ +int nes_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct nes_qp *nesqp = to_nesqp(ibqp); + struct nes_vnic *nesvnic = to_nesvnic(ibqp->device); + struct nes_device *nesdev = nesvnic->nesdev; + /* u32 cqp_head; */ + /* u32 counter; */ + u32 next_iwarp_state = 0; + int err; + unsigned long qplockflags; + int ret; + u16 original_last_aeq; + u8 issue_modify_qp = 0; + u8 issue_disconnect = 0; + u8 dont_wait = 0; + + nes_debug(NES_DBG_MOD_QP, "QP%u: QP State=%u, cur QP State=%u," + " iwarp_state=0x%X, refcount=%d\n", + nesqp->hwqp.qp_id, attr->qp_state, nesqp->ibqp_state, + nesqp->iwarp_state, atomic_read(&nesqp->refcount)); + + nes_add_ref(&nesqp->ibqp); + spin_lock_irqsave(&nesqp->lock, qplockflags); + + nes_debug(NES_DBG_MOD_QP, "QP%u: hw_iwarp_state=0x%X, hw_tcp_state=0x%X," + " QP Access Flags=0x%X, attr_mask = 0x%0x\n", + 
nesqp->hwqp.qp_id, nesqp->hw_iwarp_state, + nesqp->hw_tcp_state, attr->qp_access_flags, attr_mask); + + if (attr_mask & IB_QP_STATE) { + switch (attr->qp_state) { + case IB_QPS_INIT: + nes_debug(NES_DBG_MOD_QP, "QP%u: new state = init\n", + nesqp->hwqp.qp_id); + if (nesqp->iwarp_state > (u32)NES_CQP_QP_IWARP_STATE_IDLE) { + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + nes_rem_ref(&nesqp->ibqp); + return -EINVAL; + } + next_iwarp_state = NES_CQP_QP_IWARP_STATE_IDLE; + issue_modify_qp = 1; + break; + case IB_QPS_RTR: + nes_debug(NES_DBG_MOD_QP, "QP%u: new state = rtr\n", + nesqp->hwqp.qp_id); + if (nesqp->iwarp_state>(u32)NES_CQP_QP_IWARP_STATE_IDLE) { + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + nes_rem_ref(&nesqp->ibqp); + return -EINVAL; + } + next_iwarp_state = NES_CQP_QP_IWARP_STATE_IDLE; + issue_modify_qp = 1; + break; + case IB_QPS_RTS: + nes_debug(NES_DBG_MOD_QP, "QP%u: new state = rts\n", + nesqp->hwqp.qp_id); + if (nesqp->iwarp_state>(u32)NES_CQP_QP_IWARP_STATE_RTS) { + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + nes_rem_ref(&nesqp->ibqp); + return -EINVAL; + } + if (nesqp->cm_id == NULL) { + nes_debug(NES_DBG_MOD_QP, "QP%u: Failing attempt to move QP to RTS without a CM_ID. \n", + nesqp->hwqp.qp_id ); + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + nes_rem_ref(&nesqp->ibqp); + return -EINVAL; + } + next_iwarp_state = NES_CQP_QP_IWARP_STATE_RTS; + if (nesqp->iwarp_state != NES_CQP_QP_IWARP_STATE_RTS) + next_iwarp_state |= NES_CQP_QP_CONTEXT_VALID | + NES_CQP_QP_ARP_VALID | NES_CQP_QP_ORD_VALID; + issue_modify_qp = 1; + nesqp->hw_tcp_state = NES_AEQE_TCP_STATE_ESTABLISHED; + nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_RTS; + nesqp->hte_added = 1; + break; + case IB_QPS_SQD: + issue_modify_qp = 1; + nes_debug(NES_DBG_MOD_QP, "QP%u: new state=closing. 
SQ head=%u, SQ tail=%u\n", + nesqp->hwqp.qp_id, nesqp->hwqp.sq_head, nesqp->hwqp.sq_tail); + if (nesqp->iwarp_state == (u32)NES_CQP_QP_IWARP_STATE_CLOSING) { + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + nes_rem_ref(&nesqp->ibqp); + return 0; + } else { + if (nesqp->iwarp_state > (u32)NES_CQP_QP_IWARP_STATE_CLOSING) { + nes_debug(NES_DBG_MOD_QP, "QP%u: State change to closing" + " ignored due to current iWARP state\n", + nesqp->hwqp.qp_id); + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + nes_rem_ref(&nesqp->ibqp); + return -EINVAL; + } + if (nesqp->hw_iwarp_state != NES_AEQE_IWARP_STATE_RTS) { + nes_debug(NES_DBG_MOD_QP, "QP%u: State change to closing" + " already done based on hw state.\n", + nesqp->hwqp.qp_id); + issue_modify_qp = 0; + nesqp->in_disconnect = 0; + } + switch (nesqp->hw_iwarp_state) { + case NES_AEQE_IWARP_STATE_CLOSING: + next_iwarp_state = NES_CQP_QP_IWARP_STATE_CLOSING; + case NES_AEQE_IWARP_STATE_TERMINATE: + next_iwarp_state = NES_CQP_QP_IWARP_STATE_TERMINATE; + break; + case NES_AEQE_IWARP_STATE_ERROR: + next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR; + break; + default: + next_iwarp_state = NES_CQP_QP_IWARP_STATE_CLOSING; + nesqp->in_disconnect = 1; + nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_CLOSING; + break; + } + } + break; + case IB_QPS_SQE: + nes_debug(NES_DBG_MOD_QP, "QP%u: new state = terminate\n", + nesqp->hwqp.qp_id); + if (nesqp->iwarp_state>=(u32)NES_CQP_QP_IWARP_STATE_TERMINATE) { + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + nes_rem_ref(&nesqp->ibqp); + return -EINVAL; + } + /* next_iwarp_state = (NES_CQP_QP_IWARP_STATE_TERMINATE | 0x02000000); */ + next_iwarp_state = NES_CQP_QP_IWARP_STATE_TERMINATE; + nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_TERMINATE; + issue_modify_qp = 1; + nesqp->in_disconnect = 1; + break; + case IB_QPS_ERR: + case IB_QPS_RESET: + if (nesqp->iwarp_state == (u32)NES_CQP_QP_IWARP_STATE_ERROR) { + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + nes_rem_ref(&nesqp->ibqp); 
+ return -EINVAL; + } + nes_debug(NES_DBG_MOD_QP, "QP%u: new state = error\n", + nesqp->hwqp.qp_id); + next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR; + /* next_iwarp_state = (NES_CQP_QP_IWARP_STATE_TERMINATE | 0x02000000); */ + if (nesqp->hte_added) { + nes_debug(NES_DBG_MOD_QP, "set CQP_QP_DEL_HTE\n"); + next_iwarp_state |= NES_CQP_QP_DEL_HTE; + nesqp->hte_added = 0; + } + if ((nesqp->hw_tcp_state > NES_AEQE_TCP_STATE_CLOSED) && + (nesqp->hw_tcp_state != NES_AEQE_TCP_STATE_TIME_WAIT)) { + next_iwarp_state |= NES_CQP_QP_RESET; + nesqp->in_disconnect = 1; + } else { + nes_debug(NES_DBG_MOD_QP, "QP%u NOT setting NES_CQP_QP_RESET since TCP state = %u\n", + nesqp->hwqp.qp_id, nesqp->hw_tcp_state); + dont_wait = 1; + } + issue_modify_qp = 1; + nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_ERROR; + break; + default: + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + nes_rem_ref(&nesqp->ibqp); + return -EINVAL; + break; + } + + nesqp->ibqp_state = attr->qp_state; + if (((nesqp->iwarp_state & NES_CQP_QP_IWARP_STATE_MASK) == + (u32)NES_CQP_QP_IWARP_STATE_RTS) && + ((next_iwarp_state & NES_CQP_QP_IWARP_STATE_MASK) > + (u32)NES_CQP_QP_IWARP_STATE_RTS)) { + nesqp->iwarp_state = next_iwarp_state & NES_CQP_QP_IWARP_STATE_MASK; + nes_debug(NES_DBG_MOD_QP, "Change nesqp->iwarp_state=%08x\n", + nesqp->iwarp_state); + issue_disconnect = 1; + } else { + nesqp->iwarp_state = next_iwarp_state & NES_CQP_QP_IWARP_STATE_MASK; + nes_debug(NES_DBG_MOD_QP, "Change nesqp->iwarp_state=%08x\n", + nesqp->iwarp_state); + } + } + + if (attr_mask & IB_QP_ACCESS_FLAGS) { + if (attr->qp_access_flags & IB_ACCESS_LOCAL_WRITE) { + nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_RDMA_WRITE_EN | + NES_QPCONTEXT_MISC_RDMA_READ_EN); + issue_modify_qp = 1; + } + if (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE) { + nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_RDMA_WRITE_EN); + issue_modify_qp = 1; + } + if (attr->qp_access_flags & IB_ACCESS_REMOTE_READ) { + 
nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_RDMA_READ_EN); + issue_modify_qp = 1; + } + if (attr->qp_access_flags & IB_ACCESS_MW_BIND) { + nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_WBIND_EN); + issue_modify_qp = 1; + } + + if (nesqp->user_mode) { + nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_RDMA_WRITE_EN | + NES_QPCONTEXT_MISC_RDMA_READ_EN); + issue_modify_qp = 1; + } + } + + original_last_aeq = nesqp->last_aeq; + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + + nes_debug(NES_DBG_MOD_QP, "issue_modify_qp=%u\n", issue_modify_qp); + + ret = 0; + + + if (issue_modify_qp) { + nes_debug(NES_DBG_MOD_QP, "call nes_hw_modify_qp\n"); + ret = nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 1); + if (ret) + nes_debug(NES_DBG_MOD_QP, "nes_hw_modify_qp (next_iwarp_state = 0x%08X)" + " failed for QP%u.\n", + next_iwarp_state, nesqp->hwqp.qp_id); + + } + + if ((issue_modify_qp) && (nesqp->ibqp_state > IB_QPS_RTS)) { + nes_debug(NES_DBG_MOD_QP, "QP%u Issued ModifyQP refcount (%d)," + " original_last_aeq = 0x%04X. last_aeq = 0x%04X.\n", + nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount), + original_last_aeq, nesqp->last_aeq); + if ((!ret) || + ((original_last_aeq != NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE) && + (ret))) { + if (dont_wait) { + if (nesqp->cm_id && nesqp->hw_tcp_state != 0) { + nes_debug(NES_DBG_MOD_QP, "QP%u Queuing fake disconnect for QP refcount (%d)," + " original_last_aeq = 0x%04X. 
last_aeq = 0x%04X.\n", + nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount), + original_last_aeq, nesqp->last_aeq); + /* this one is for the cm_disconnect thread */ + nes_add_ref(&nesqp->ibqp); + spin_lock_irqsave(&nesqp->lock, qplockflags); + nesqp->hw_tcp_state = NES_AEQE_TCP_STATE_CLOSED; + nesqp->last_aeq = NES_AEQE_AEID_RESET_SENT; + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + nes_cm_disconn(nesqp); + } else { + nes_debug(NES_DBG_MOD_QP, "QP%u No fake disconnect, QP refcount=%d\n", + nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount)); + nes_rem_ref(&nesqp->ibqp); + } + } else { + spin_lock_irqsave(&nesqp->lock, qplockflags); + if (nesqp->cm_id) { + /* These two are for the timer thread */ + if (atomic_inc_return(&nesqp->close_timer_started) == 1) { + nes_add_ref(&nesqp->ibqp); + nesqp->cm_id->add_ref(nesqp->cm_id); + nes_debug(NES_DBG_MOD_QP, "QP%u Not decrementing QP refcount (%d)," + " need ae to finish up, original_last_aeq = 0x%04X." + " last_aeq = 0x%04X, scheduling timer.\n", + nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount), + original_last_aeq, nesqp->last_aeq); + schedule_nes_timer(nesqp->cm_node, (struct sk_buff *) nesqp, NES_TIMER_TYPE_CLOSE, 1, 0); + } + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + } else { + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + nes_debug(NES_DBG_MOD_QP, "QP%u Not decrementing QP refcount (%d)," + " need ae to finish up, original_last_aeq = 0x%04X." + " last_aeq = 0x%04X.\n", + nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount), + original_last_aeq, nesqp->last_aeq); + } + } + } else { + nes_debug(NES_DBG_MOD_QP, "QP%u Decrementing QP refcount (%d), No ae to finish up," + " original_last_aeq = 0x%04X. last_aeq = 0x%04X.\n", + nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount), + original_last_aeq, nesqp->last_aeq); + nes_rem_ref(&nesqp->ibqp); + } + } else { + nes_debug(NES_DBG_MOD_QP, "QP%u Decrementing QP refcount (%d), No ae to finish up," + " original_last_aeq = 0x%04X. 
last_aeq = 0x%04X.\n", + nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount), + original_last_aeq, nesqp->last_aeq); + nes_rem_ref(&nesqp->ibqp); + } + + err = 0; + + nes_debug(NES_DBG_MOD_QP, "QP%u Leaving, refcount=%d\n", + nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount)); + + return err; +} + + +/** + * nes_muticast_attach + */ +static int nes_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + nes_debug(NES_DBG_INIT, "\n"); + return -ENOSYS; +} + + +/** + * nes_multicast_detach + */ +static int nes_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + nes_debug(NES_DBG_INIT, "\n"); + return -ENOSYS; +} + + +/** + * nes_process_mad + */ +static int nes_process_mad(struct ib_device *ibdev, int mad_flags, + u8 port_num, struct ib_wc *in_wc, struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + nes_debug(NES_DBG_INIT, "\n"); + return -ENOSYS; +} + +static inline void +fill_wqe_sg_send(struct nes_hw_qp_wqe *wqe, struct ib_send_wr *ib_wr, u32 uselkey) +{ + int sge_index; + int total_payload_length = 0; + for (sge_index = 0; sge_index < ib_wr->num_sge; sge_index++) { + set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_FRAG0_LOW_IDX+(sge_index*4), + ib_wr->sg_list[sge_index].addr); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_LENGTH0_IDX + (sge_index*4), + ib_wr->sg_list[sge_index].length); + if (uselkey) + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_STAG0_IDX + (sge_index*4), + (ib_wr->sg_list[sge_index].lkey)); + else + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_STAG0_IDX + (sge_index*4), 0); + + total_payload_length += ib_wr->sg_list[sge_index].length; + } + nes_debug(NES_DBG_IW_TX, "UC UC UC, sending total_payload_length=%u \n", + total_payload_length); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX, + total_payload_length); +} + +/** + * nes_post_send + */ +static int nes_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr, + struct ib_send_wr 
**bad_wr) +{ + u64 u64temp; + unsigned long flags = 0; + struct nes_vnic *nesvnic = to_nesvnic(ibqp->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_qp *nesqp = to_nesqp(ibqp); + struct nes_hw_qp_wqe *wqe; + int err; + u32 qsize = nesqp->hwqp.sq_size; + u32 head; + u32 wqe_misc; + u32 wqe_count; + u32 counter; + u32 total_payload_length; + + err = 0; + wqe_misc = 0; + wqe_count = 0; + total_payload_length = 0; + + if (nesqp->ibqp_state > IB_QPS_RTS) + return -EINVAL; + + spin_lock_irqsave(&nesqp->lock, flags); + + head = nesqp->hwqp.sq_head; + + while (ib_wr) { + /* Check for SQ overflow */ + if (((head + (2 * qsize) - nesqp->hwqp.sq_tail) % qsize) == (qsize - 1)) { + err = -EINVAL; + break; + } + + wqe = &nesqp->hwqp.sq_vbase[head]; + /* nes_debug(NES_DBG_IW_TX, "processing sq wqe for QP%u at %p, head = %u.\n", + nesqp->hwqp.qp_id, wqe, head); */ + nes_fill_init_qp_wqe(wqe, nesqp, head); + u64temp = (u64)(ib_wr->wr_id); + set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_SCRATCH_LOW_IDX, + u64temp); + switch (ib_wr->opcode) { + case IB_WR_SEND: + if (ib_wr->send_flags & IB_SEND_SOLICITED) { + wqe_misc = NES_IWARP_SQ_OP_SENDSE; + } else { + wqe_misc = NES_IWARP_SQ_OP_SEND; + } + if (ib_wr->num_sge > nesdev->nesadapter->max_sge) { + err = -EINVAL; + break; + } + if (ib_wr->send_flags & IB_SEND_FENCE) { + wqe_misc |= NES_IWARP_SQ_WQE_LOCAL_FENCE; + } + if ((ib_wr->send_flags & IB_SEND_INLINE) && + ((nes_drv_opt & NES_DRV_OPT_NO_INLINE_DATA) == 0) && + (ib_wr->sg_list[0].length <= 64)) { + memcpy(&wqe->wqe_words[NES_IWARP_SQ_WQE_IMM_DATA_START_IDX], + (void *)(unsigned long)ib_wr->sg_list[0].addr, ib_wr->sg_list[0].length); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX, + ib_wr->sg_list[0].length); + wqe_misc |= NES_IWARP_SQ_WQE_IMM_DATA; + } else { + fill_wqe_sg_send(wqe, ib_wr, 1); + } + + break; + case IB_WR_RDMA_WRITE: + wqe_misc = NES_IWARP_SQ_OP_RDMAW; + if (ib_wr->num_sge > nesdev->nesadapter->max_sge) { + 
nes_debug(NES_DBG_IW_TX, "Exceeded max sge, ib_wr=%u, max=%u\n", + ib_wr->num_sge, + nesdev->nesadapter->max_sge); + err = -EINVAL; + break; + } + if (ib_wr->send_flags & IB_SEND_FENCE) { + wqe_misc |= NES_IWARP_SQ_WQE_LOCAL_FENCE; + } + + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_STAG_IDX, + ib_wr->wr.rdma.rkey); + set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_TO_LOW_IDX, + ib_wr->wr.rdma.remote_addr); + + if ((ib_wr->send_flags & IB_SEND_INLINE) && + ((nes_drv_opt & NES_DRV_OPT_NO_INLINE_DATA) == 0) && + (ib_wr->sg_list[0].length <= 64)) { + memcpy(&wqe->wqe_words[NES_IWARP_SQ_WQE_IMM_DATA_START_IDX], + (void *)(unsigned long)ib_wr->sg_list[0].addr, ib_wr->sg_list[0].length); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX, + ib_wr->sg_list[0].length); + wqe_misc |= NES_IWARP_SQ_WQE_IMM_DATA; + } else { + fill_wqe_sg_send(wqe, ib_wr, 1); + } + wqe->wqe_words[NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX] = + wqe->wqe_words[NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX]; + break; + case IB_WR_RDMA_READ: + /* iWARP only supports 1 sge for RDMA reads */ + if (ib_wr->num_sge > 1) { + nes_debug(NES_DBG_IW_TX, "Exceeded max sge, ib_wr=%u, max=1\n", + ib_wr->num_sge); + err = -EINVAL; + break; + } + wqe_misc = NES_IWARP_SQ_OP_RDMAR; + set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_TO_LOW_IDX, + ib_wr->wr.rdma.remote_addr); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_STAG_IDX, + ib_wr->wr.rdma.rkey); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX, + ib_wr->sg_list->length); + set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_FRAG0_LOW_IDX, + ib_wr->sg_list->addr); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_STAG0_IDX, + ib_wr->sg_list->lkey); + break; + default: + /* error */ + err = -EINVAL; + break; + } + + if (ib_wr->send_flags & IB_SEND_SIGNALED) { + wqe_misc |= NES_IWARP_SQ_WQE_SIGNALED_COMPL; + } + wqe->wqe_words[NES_IWARP_SQ_WQE_MISC_IDX] = cpu_to_le32(wqe_misc); + + 
ib_wr = ib_wr->next; + head++; + wqe_count++; + if (head >= qsize) + head = 0; + + } + + nesqp->hwqp.sq_head = head; + barrier(); + while (wqe_count) { + counter = min(wqe_count, ((u32)255)); + wqe_count -= counter; + nes_write32(nesdev->regs + NES_WQE_ALLOC, + (counter << 24) | 0x00800000 | nesqp->hwqp.qp_id); + } + + spin_unlock_irqrestore(&nesqp->lock, flags); + + if (err) + *bad_wr = ib_wr; + return err; +} + + +/** + * nes_post_recv + */ +static int nes_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *ib_wr, + struct ib_recv_wr **bad_wr) +{ + u64 u64temp; + unsigned long flags = 0; + struct nes_vnic *nesvnic = to_nesvnic(ibqp->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_qp *nesqp = to_nesqp(ibqp); + struct nes_hw_qp_wqe *wqe; + int err = 0; + int sge_index; + u32 qsize = nesqp->hwqp.rq_size; + u32 head; + u32 wqe_count = 0; + u32 counter; + u32 total_payload_length; + + if (nesqp->ibqp_state > IB_QPS_RTS) + return -EINVAL; + + spin_lock_irqsave(&nesqp->lock, flags); + + head = nesqp->hwqp.rq_head; + + while (ib_wr) { + if (ib_wr->num_sge > nesdev->nesadapter->max_sge) { + err = -EINVAL; + break; + } + /* Check for RQ overflow */ + if (((head + (2 * qsize) - nesqp->hwqp.rq_tail) % qsize) == (qsize - 1)) { + err = -EINVAL; + break; + } + + nes_debug(NES_DBG_IW_RX, "ibwr sge count = %u.\n", ib_wr->num_sge); + wqe = &nesqp->hwqp.rq_vbase[head]; + + /* nes_debug(NES_DBG_IW_RX, "QP%u:processing rq wqe at %p, head = %u.\n", + nesqp->hwqp.qp_id, wqe, head); */ + nes_fill_init_qp_wqe(wqe, nesqp, head); + u64temp = (u64)(ib_wr->wr_id); + set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_SCRATCH_LOW_IDX, + u64temp); + total_payload_length = 0; + for (sge_index=0; sge_index < ib_wr->num_sge; sge_index++) { + set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_RQ_WQE_FRAG0_LOW_IDX+(sge_index*4), + ib_wr->sg_list[sge_index].addr); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_RQ_WQE_LENGTH0_IDX+(sge_index*4), + ib_wr->sg_list[sge_index].length); 
+ set_wqe_32bit_value(wqe->wqe_words,NES_IWARP_RQ_WQE_STAG0_IDX+(sge_index*4), + ib_wr->sg_list[sge_index].lkey); + + total_payload_length += ib_wr->sg_list[sge_index].length; + } + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_RQ_WQE_TOTAL_PAYLOAD_IDX, + total_payload_length); + + ib_wr = ib_wr->next; + head++; + wqe_count++; + if (head >= qsize) + head = 0; + } + + nesqp->hwqp.rq_head = head; + barrier(); + while (wqe_count) { + counter = min(wqe_count, ((u32)255)); + wqe_count -= counter; + nes_write32(nesdev->regs+NES_WQE_ALLOC, (counter<<24) | nesqp->hwqp.qp_id); + } + + spin_unlock_irqrestore(&nesqp->lock, flags); + + if (err) + *bad_wr = ib_wr; + return err; +} + + +/** + * nes_poll_cq + */ +static int nes_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) +{ + u64 u64temp; + u64 wrid; + /* u64 u64temp; */ + unsigned long flags = 0; + struct nes_vnic *nesvnic = to_nesvnic(ibcq->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_cq *nescq = to_nescq(ibcq); + struct nes_qp *nesqp; + struct nes_hw_cqe cqe; + u32 head; + u32 wq_tail; + u32 cq_size; + u32 cqe_count = 0; + u32 wqe_index; + u32 u32temp; + /* u32 counter; */ + + nes_debug(NES_DBG_CQ, "\n"); + + spin_lock_irqsave(&nescq->lock, flags); + + head = nescq->hw_cq.cq_head; + cq_size = nescq->hw_cq.cq_size; + + while (cqe_count < num_entries) { + if (le32_to_cpu(nescq->hw_cq.cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX]) & + NES_CQE_VALID) { + cqe = nescq->hw_cq.cq_vbase[head]; + nescq->hw_cq.cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX] = 0; + u32temp = le32_to_cpu(cqe.cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX]); + wqe_index = u32temp & + (nesdev->nesadapter->max_qp_wr - 1); + u32temp &= ~(NES_SW_CONTEXT_ALIGN-1); + /* parse CQE, get completion context from WQE (either rq or sq */ + u64temp = (((u64)(le32_to_cpu(cqe.cqe_words[NES_CQE_COMP_COMP_CTX_HIGH_IDX])))<<32) | + ((u64)u32temp); + nesqp = *((struct nes_qp **)&u64temp); + memset(entry, 0, sizeof *entry); + if 
(cqe.cqe_words[NES_CQE_ERROR_CODE_IDX] == 0) { + entry->status = IB_WC_SUCCESS; + } else { + entry->status = IB_WC_WR_FLUSH_ERR; + } + + entry->qp = &nesqp->ibqp; + entry->src_qp = nesqp->hwqp.qp_id; + + if (le32_to_cpu(cqe.cqe_words[NES_CQE_OPCODE_IDX]) & NES_CQE_SQ) { + if (nesqp->skip_lsmm) { + nesqp->skip_lsmm = 0; + wq_tail = nesqp->hwqp.sq_tail++; + } + + /* Working on a SQ Completion*/ + wq_tail = wqe_index; + nesqp->hwqp.sq_tail = (wqe_index+1)&(nesqp->hwqp.sq_size - 1); + wrid = (((u64)(cpu_to_le32((u32)nesqp->hwqp.sq_vbase[wq_tail]. + wqe_words[NES_IWARP_SQ_WQE_COMP_SCRATCH_HIGH_IDX]))) << 32) | + ((u64)(cpu_to_le32((u32)nesqp->hwqp.sq_vbase[wq_tail]. + wqe_words[NES_IWARP_SQ_WQE_COMP_SCRATCH_LOW_IDX]))); + entry->byte_len = le32_to_cpu(nesqp->hwqp.sq_vbase[wq_tail]. + wqe_words[NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX]); + + switch (le32_to_cpu(nesqp->hwqp.sq_vbase[wq_tail]. + wqe_words[NES_IWARP_SQ_WQE_MISC_IDX]) & 0x3f) { + case NES_IWARP_SQ_OP_RDMAW: + nes_debug(NES_DBG_CQ, "Operation = RDMA WRITE.\n"); + entry->opcode = IB_WC_RDMA_WRITE; + break; + case NES_IWARP_SQ_OP_RDMAR: + nes_debug(NES_DBG_CQ, "Operation = RDMA READ.\n"); + entry->opcode = IB_WC_RDMA_READ; + entry->byte_len = le32_to_cpu(nesqp->hwqp.sq_vbase[wq_tail]. 
+ wqe_words[NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX]); + break; + case NES_IWARP_SQ_OP_SENDINV: + case NES_IWARP_SQ_OP_SENDSEINV: + case NES_IWARP_SQ_OP_SEND: + case NES_IWARP_SQ_OP_SENDSE: + nes_debug(NES_DBG_CQ, "Operation = Send.\n"); + entry->opcode = IB_WC_SEND; + break; + } + } else { + /* Working on a RQ Completion*/ + wq_tail = wqe_index; + nesqp->hwqp.rq_tail = (wqe_index+1)&(nesqp->hwqp.rq_size - 1); + entry->byte_len = le32_to_cpu(cqe.cqe_words[NES_CQE_PAYLOAD_LENGTH_IDX]); + wrid = ((u64)(le32_to_cpu(nesqp->hwqp.rq_vbase[wq_tail].wqe_words[NES_IWARP_RQ_WQE_COMP_SCRATCH_LOW_IDX]))) | + ((u64)(le32_to_cpu(nesqp->hwqp.rq_vbase[wq_tail].wqe_words[NES_IWARP_RQ_WQE_COMP_SCRATCH_HIGH_IDX]))<<32); + entry->opcode = IB_WC_RECV; + } + entry->wr_id = wrid; + + if (++head >= cq_size) + head = 0; + cqe_count++; + nescq->polled_completions++; + if ((nescq->polled_completions > (cq_size / 2)) || + (nescq->polled_completions == 255)) { + nes_debug(NES_DBG_CQ, "CQ%u Issuing CQE Allocate since more than half of cqes" + " are pending %u of %u.\n", + nescq->hw_cq.cq_number, nescq->polled_completions, cq_size); + nes_write32(nesdev->regs+NES_CQE_ALLOC, + nescq->hw_cq.cq_number | (nescq->polled_completions << 16)); + nescq->polled_completions = 0; + } + entry++; + } else + break; + } + + if (nescq->polled_completions) { + nes_write32(nesdev->regs+NES_CQE_ALLOC, + nescq->hw_cq.cq_number | (nescq->polled_completions << 16)); + nescq->polled_completions = 0; + } + + nescq->hw_cq.cq_head = head; + nes_debug(NES_DBG_CQ, "Reporting %u completions for CQ%u.\n", + cqe_count, nescq->hw_cq.cq_number); + + spin_unlock_irqrestore(&nescq->lock, flags); + + return cqe_count; +} + + +/** + * nes_req_notify_cq + */ +static int nes_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags) + { + struct nes_vnic *nesvnic = to_nesvnic(ibcq->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_cq *nescq = to_nescq(ibcq); + u32 cq_arm; + + nes_debug(NES_DBG_CQ, "Requesting 
notification for CQ%u.\n", + nescq->hw_cq.cq_number); + + cq_arm = nescq->hw_cq.cq_number; + if ((notify_flags & IB_CQ_SOLICITED_MASK) == IB_CQ_NEXT_COMP) + cq_arm |= NES_CQE_ALLOC_NOTIFY_NEXT; + else if ((notify_flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED) + cq_arm |= NES_CQE_ALLOC_NOTIFY_SE; + else + return -EINVAL; + + nes_write32(nesdev->regs+NES_CQE_ALLOC, cq_arm); + nes_read32(nesdev->regs+NES_CQE_ALLOC); + + return 0; +} + + +/** + * nes_init_ofa_device + */ +struct nes_ib_device *nes_init_ofa_device(struct net_device *netdev) +{ + struct nes_ib_device *nesibdev; + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + + nesibdev = (struct nes_ib_device *)ib_alloc_device(sizeof(struct nes_ib_device)); + if (nesibdev == NULL) { + return NULL; + } + strlcpy(nesibdev->ibdev.name, "nes%d", IB_DEVICE_NAME_MAX); + nesibdev->ibdev.owner = THIS_MODULE; + + nesibdev->ibdev.node_type = RDMA_NODE_RNIC; + memset(&nesibdev->ibdev.node_guid, 0, sizeof(nesibdev->ibdev.node_guid)); + memcpy(&nesibdev->ibdev.node_guid, netdev->dev_addr, 6); + + nesibdev->ibdev.uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_AH) | + (1ull << IB_USER_VERBS_CMD_DESTROY_AH) | + (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_POLL_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_ALLOC_MW) | + (1ull << IB_USER_VERBS_CMD_BIND_MW) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_MW) | + (1ull << 
IB_USER_VERBS_CMD_POST_RECV) | + (1ull << IB_USER_VERBS_CMD_POST_SEND); + + nesibdev->ibdev.phys_port_cnt = 1; + nesibdev->ibdev.num_comp_vectors = 1; + nesibdev->ibdev.dma_device = &nesdev->pcidev->dev; + nesibdev->ibdev.class_dev.dev = &nesdev->pcidev->dev; + nesibdev->ibdev.query_device = nes_query_device; + nesibdev->ibdev.query_port = nes_query_port; + nesibdev->ibdev.modify_port = nes_modify_port; + nesibdev->ibdev.query_pkey = nes_query_pkey; + nesibdev->ibdev.query_gid = nes_query_gid; + nesibdev->ibdev.alloc_ucontext = nes_alloc_ucontext; + nesibdev->ibdev.dealloc_ucontext = nes_dealloc_ucontext; + nesibdev->ibdev.mmap = nes_mmap; + nesibdev->ibdev.alloc_pd = nes_alloc_pd; + nesibdev->ibdev.dealloc_pd = nes_dealloc_pd; + nesibdev->ibdev.create_ah = nes_create_ah; + nesibdev->ibdev.destroy_ah = nes_destroy_ah; + nesibdev->ibdev.create_qp = nes_create_qp; + nesibdev->ibdev.modify_qp = nes_modify_qp; + nesibdev->ibdev.query_qp = nes_query_qp; + nesibdev->ibdev.destroy_qp = nes_destroy_qp; + nesibdev->ibdev.create_cq = nes_create_cq; + nesibdev->ibdev.destroy_cq = nes_destroy_cq; + nesibdev->ibdev.poll_cq = nes_poll_cq; + nesibdev->ibdev.get_dma_mr = nes_get_dma_mr; + nesibdev->ibdev.reg_phys_mr = nes_reg_phys_mr; + nesibdev->ibdev.reg_user_mr = nes_reg_user_mr; + nesibdev->ibdev.dereg_mr = nes_dereg_mr; + nesibdev->ibdev.alloc_mw = nes_alloc_mw; + nesibdev->ibdev.dealloc_mw = nes_dealloc_mw; + nesibdev->ibdev.bind_mw = nes_bind_mw; + + nesibdev->ibdev.alloc_fmr = nes_alloc_fmr; + nesibdev->ibdev.unmap_fmr = nes_unmap_fmr; + nesibdev->ibdev.dealloc_fmr = nes_dealloc_fmr; + nesibdev->ibdev.map_phys_fmr = nes_map_phys_fmr; + + nesibdev->ibdev.attach_mcast = nes_multicast_attach; + nesibdev->ibdev.detach_mcast = nes_multicast_detach; + nesibdev->ibdev.process_mad = nes_process_mad; + + nesibdev->ibdev.req_notify_cq = nes_req_notify_cq; + nesibdev->ibdev.post_send = nes_post_send; + nesibdev->ibdev.post_recv = nes_post_recv; + + nesibdev->ibdev.iwcm = 
kzalloc(sizeof(*nesibdev->ibdev.iwcm), GFP_KERNEL); + if (nesibdev->ibdev.iwcm == NULL) { + ib_dealloc_device(&nesibdev->ibdev); + return NULL; + } + nesibdev->ibdev.iwcm->add_ref = nes_add_ref; + nesibdev->ibdev.iwcm->rem_ref = nes_rem_ref; + nesibdev->ibdev.iwcm->get_qp = nes_get_qp; + nesibdev->ibdev.iwcm->connect = nes_connect; + nesibdev->ibdev.iwcm->accept = nes_accept; + nesibdev->ibdev.iwcm->reject = nes_reject; + nesibdev->ibdev.iwcm->create_listen = nes_create_listen; + nesibdev->ibdev.iwcm->destroy_listen = nes_destroy_listen; + + return nesibdev; +} + + +/** + * nes_destroy_ofa_device + */ +void nes_destroy_ofa_device(struct nes_ib_device *nesibdev) +{ + if (nesibdev == NULL) + return; + + nes_unregister_ofa_device(nesibdev); + + kfree(nesibdev->ibdev.iwcm); + ib_dealloc_device(&nesibdev->ibdev); +} + + +/** + * nes_register_ofa_device + */ +int nes_register_ofa_device(struct nes_ib_device *nesibdev) +{ + struct nes_vnic *nesvnic = nesibdev->nesvnic; + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + int i, ret; + + ret = ib_register_device(&nesvnic->nesibdev->ibdev); + if (ret) { + return ret; + } + + /* Get the resources allocated to this device */ + nesibdev->max_cq = (nesadapter->max_cq-NES_FIRST_QPN) / nesadapter->port_count; + nesibdev->max_mr = nesadapter->max_mr / nesadapter->port_count; + nesibdev->max_qp = (nesadapter->max_qp-NES_FIRST_QPN) / nesadapter->port_count; + nesibdev->max_pd = nesadapter->max_pd / nesadapter->port_count; + + for (i = 0; i < ARRAY_SIZE(nes_class_attributes); ++i) { + ret = class_device_create_file(&nesibdev->ibdev.class_dev, nes_class_attributes[i]); + if (ret) { + while (i > 0) { + i--; + class_device_remove_file(&nesibdev->ibdev.class_dev, + nes_class_attributes[i]); + } + ib_unregister_device(&nesibdev->ibdev); + return ret; + } + } + + nesvnic->of_device_registered = 1; + + return 0; +} + + +/** + * nes_unregister_ofa_device + */ +void 
nes_unregister_ofa_device(struct nes_ib_device *nesibdev) +{ + struct nes_vnic *nesvnic = nesibdev->nesvnic; + int i; + + if (nesibdev == NULL) + return; + + for (i = 0; i < ARRAY_SIZE(nes_class_attributes); ++i) { + class_device_remove_file(&nesibdev->ibdev.class_dev, nes_class_attributes[i]); + } + + if (nesvnic->of_device_registered) { + ib_unregister_device(&nesibdev->ibdev); + } + + nesvnic->of_device_registered = 0; +} diff --git a/drivers/infiniband/hw/nes/nes_verbs.h b/drivers/infiniband/hw/nes/nes_verbs.h new file mode 100644 index 0000000..6c6b4da --- /dev/null +++ b/drivers/infiniband/hw/nes/nes_verbs.h @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2006 - 2008 NetEffect, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef NES_VERBS_H +#define NES_VERBS_H + +struct nes_device; + +#define NES_MAX_USER_DB_REGIONS 4096 +#define NES_MAX_USER_WQ_REGIONS 4096 + +struct nes_ucontext { + struct ib_ucontext ibucontext; + struct nes_device *nesdev; + unsigned long mmap_wq_offset; + unsigned long mmap_cq_offset; /* to be removed */ + int index; /* rnic index (minor) */ + unsigned long allocated_doorbells[BITS_TO_LONGS(NES_MAX_USER_DB_REGIONS)]; + u16 mmap_db_index[NES_MAX_USER_DB_REGIONS]; + u16 first_free_db; + unsigned long allocated_wqs[BITS_TO_LONGS(NES_MAX_USER_WQ_REGIONS)]; + struct nes_qp *mmap_nesqp[NES_MAX_USER_WQ_REGIONS]; + u16 first_free_wq; + struct list_head cq_reg_mem_list; + struct list_head qp_reg_mem_list; + u32 mcrqf; + atomic_t usecnt; +}; + +struct nes_pd { + struct ib_pd ibpd; + u16 pd_id; + atomic_t sqp_count; + u16 mmap_db_index; +}; + +struct nes_mr { + union { + struct ib_mr ibmr; + struct ib_mw ibmw; + struct ib_fmr ibfmr; + }; + struct ib_umem *region; + u16 pbls_used; + u8 mode; + u8 pbl_4k; +}; + +struct nes_hw_pb { + __le32 pa_low; + __le32 pa_high; +}; + +struct nes_vpbl { + dma_addr_t pbl_pbase; + struct nes_hw_pb *pbl_vbase; +}; + +struct nes_root_vpbl { + dma_addr_t pbl_pbase; + struct nes_hw_pb *pbl_vbase; + struct nes_vpbl *leaf_vpbl; +}; + +struct nes_fmr { + struct nes_mr nesmr; + u32 leaf_pbl_cnt; + struct nes_root_vpbl root_vpbl; + struct ib_qp *ib_qp; + int access_rights; + struct ib_fmr_attr attr; +}; + +struct nes_av; + +struct nes_cq { + struct ib_cq ibcq; + struct nes_hw_cq hw_cq; + u32 polled_completions; + u32 cq_mem_size; + spinlock_t lock; + u8 virtual_cq; + u8 pad[3]; +}; + +struct nes_wq { + spinlock_t lock; +}; + +struct iw_cm_id; +struct 
ietf_mpa_frame; + +struct nes_qp { + struct ib_qp ibqp; + void *allocated_buffer; + struct iw_cm_id *cm_id; + struct workqueue_struct *wq; + struct work_struct disconn_work; + struct nes_cq *nesscq; + struct nes_cq *nesrcq; + struct nes_pd *nespd; + void *cm_node; /* handle of the node this QP is associated with */ + struct ietf_mpa_frame *ietf_frame; + dma_addr_t ietf_frame_pbase; + wait_queue_head_t state_waitq; + unsigned long socket; + struct nes_hw_qp hwqp; + struct work_struct work; + struct work_struct ae_work; + enum ib_qp_state ibqp_state; + u32 iwarp_state; + u32 hte_index; + u32 last_aeq; + u32 qp_mem_size; + atomic_t refcount; + atomic_t close_timer_started; + u32 mmap_sq_db_index; + u32 mmap_rq_db_index; + spinlock_t lock; + struct nes_qp_context *nesqp_context; + dma_addr_t nesqp_context_pbase; + void *pbl_vbase; + dma_addr_t pbl_pbase; + struct page *page; + wait_queue_head_t kick_waitq; + u16 in_disconnect; + u16 private_data_len; + u8 active_conn; + u8 skip_lsmm; + u8 user_mode; + u8 hte_added; + u8 hw_iwarp_state; + u8 flush_issued; + u8 hw_tcp_state; + u8 disconn_pending; + u8 destroyed; +}; +#endif /* NES_VERBS_H */ diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index a082466..09f5371 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -680,12 +680,7 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev) neigh = *to_ipoib_neigh(skb->dst->neighbour); - if (ipoib_cm_get(neigh)) { - if (ipoib_cm_up(neigh)) { - ipoib_cm_send(dev, skb, ipoib_cm_get(neigh)); - goto out; - } - } else if (neigh->ah) { + if (neigh->ah) if (unlikely((memcmp(&neigh->dgid.raw, skb->dst->neighbour->ha + 4, sizeof(union ib_gid))) || @@ -706,6 +701,12 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev) goto out; } + if (ipoib_cm_get(neigh)) { + if (ipoib_cm_up(neigh)) { + ipoib_cm_send(dev, skb, ipoib_cm_get(neigh)); + goto out; 
+ } + } else if (neigh->ah) { ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(skb->dst->neighbour->ha)); goto out; } @@ -813,11 +814,9 @@ static void ipoib_neigh_cleanup(struct neighbour *n) struct ipoib_ah *ah = NULL; neigh = *to_ipoib_neigh(n); - if (neigh) { + if (neigh) priv = netdev_priv(neigh->dev); - ipoib_dbg(priv, "neigh_destructor for bonding device: %s\n", - n->dev->name); - } else + else return; ipoib_dbg(priv, "neigh_cleanup for %06x " IPOIB_GID_FMT "\n", diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 195ce7c..fd4a49f 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -204,6 +204,22 @@ out: return ret; } +static int srp_new_cm_id(struct srp_target_port *target) +{ + struct ib_cm_id *new_cm_id; + + new_cm_id = ib_create_cm_id(target->srp_host->dev->dev, + srp_cm_handler, target); + if (IS_ERR(new_cm_id)) + return PTR_ERR(new_cm_id); + + if (target->cm_id) + ib_destroy_cm_id(target->cm_id); + target->cm_id = new_cm_id; + + return 0; +} + static int srp_create_target_ib(struct srp_target_port *target) { struct ib_qp_init_attr *init_attr; @@ -436,6 +452,7 @@ static void srp_remove_work(struct work_struct *work) static int srp_connect_target(struct srp_target_port *target) { + int retries = 3; int ret; ret = srp_lookup_path(target); @@ -468,6 +485,21 @@ static int srp_connect_target(struct srp_target_port *target) case SRP_DLID_REDIRECT: break; + case SRP_STALE_CONN: + /* Our current CM id was stale, and is now in timewait. + * Try to reconnect with a new one. 
+ */ + if (!retries-- || srp_new_cm_id(target)) { + shost_printk(KERN_ERR, target->scsi_host, PFX + "giving up on stale connection\n"); + target->status = -ECONNRESET; + return target->status; + } + + shost_printk(KERN_ERR, target->scsi_host, PFX + "retrying stale connection\n"); + break; + default: return target->status; } @@ -507,7 +539,6 @@ static void srp_reset_req(struct srp_target_port *target, struct srp_request *re static int srp_reconnect_target(struct srp_target_port *target) { - struct ib_cm_id *new_cm_id; struct ib_qp_attr qp_attr; struct srp_request *req, *tmp; struct ib_wc wc; @@ -526,14 +557,9 @@ static int srp_reconnect_target(struct srp_target_port *target) * Now get a new local CM ID so that we avoid confusing the * target in case things are really fouled up. */ - new_cm_id = ib_create_cm_id(target->srp_host->dev->dev, - srp_cm_handler, target); - if (IS_ERR(new_cm_id)) { - ret = PTR_ERR(new_cm_id); + ret = srp_new_cm_id(target); + if (ret) goto err; - } - ib_destroy_cm_id(target->cm_id); - target->cm_id = new_cm_id; qp_attr.qp_state = IB_QPS_RESET; ret = ib_modify_qp(target->qp, &qp_attr, IB_QP_STATE); @@ -1171,6 +1197,11 @@ static void srp_cm_rej_handler(struct ib_cm_id *cm_id, target->status = -ECONNRESET; break; + case IB_CM_REJ_STALE_CONN: + shost_printk(KERN_WARNING, shost, " REJ reason: stale connection\n"); + target->status = SRP_STALE_CONN; + break; + default: shost_printk(KERN_WARNING, shost, " REJ reason 0x%x\n", event->param.rej_rcvd.reason); @@ -1862,11 +1893,9 @@ static ssize_t srp_create_target(struct class_device *class_dev, if (ret) goto err; - target->cm_id = ib_create_cm_id(host->dev->dev, srp_cm_handler, target); - if (IS_ERR(target->cm_id)) { - ret = PTR_ERR(target->cm_id); + ret = srp_new_cm_id(target); + if (ret) goto err_free; - } target->qp_in_error = 0; ret = srp_connect_target(target); diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index 4a3c1f3..cb6eb81 100644 --- 
a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -54,6 +54,7 @@ enum { SRP_PORT_REDIRECT = 1, SRP_DLID_REDIRECT = 2, + SRP_STALE_CONN = 3, SRP_MAX_LUN = 512, SRP_DEF_SG_TABLESIZE = 12, diff --git a/drivers/leds/led-class.c b/drivers/leds/led-class.c index 64c66b3..4a93878 100644 --- a/drivers/leds/led-class.c +++ b/drivers/leds/led-class.c @@ -137,12 +137,14 @@ err_out: EXPORT_SYMBOL_GPL(led_classdev_register); /** - * led_classdev_unregister - unregisters a object of led_properties class. + * __led_classdev_unregister - unregisters a object of led_properties class. * @led_cdev: the led device to unregister + * @suspended: indicates whether system-wide suspend or resume is in progress * * Unregisters a previously registered via led_classdev_register object. */ -void led_classdev_unregister(struct led_classdev *led_cdev) +void __led_classdev_unregister(struct led_classdev *led_cdev, + bool suspended) { device_remove_file(led_cdev->dev, &dev_attr_brightness); #ifdef CONFIG_LEDS_TRIGGERS @@ -153,13 +155,16 @@ void led_classdev_unregister(struct led_classdev *led_cdev) up_write(&led_cdev->trigger_lock); #endif - device_unregister(led_cdev->dev); + if (suspended) + device_pm_schedule_removal(led_cdev->dev); + else + device_unregister(led_cdev->dev); down_write(&leds_list_lock); list_del(&led_cdev->node); up_write(&leds_list_lock); } -EXPORT_SYMBOL_GPL(led_classdev_unregister); +EXPORT_SYMBOL_GPL(__led_classdev_unregister); static int __init leds_init(void) { diff --git a/drivers/macintosh/via-macii.c b/drivers/macintosh/via-macii.c index 01b8eca..6e6dd17 100644 --- a/drivers/macintosh/via-macii.c +++ b/drivers/macintosh/via-macii.c @@ -111,7 +111,7 @@ static enum macii_state { static struct adb_request *current_req; /* first request struct in the queue */ static struct adb_request *last_req; /* last request struct in the queue */ static unsigned char reply_buf[16]; /* storage for autopolled replies */ -static unsigned char *reply_ptr; 
/* next byte in req->data or reply_buf */ +static unsigned char *reply_ptr; /* next byte in reply_buf or req->reply */ static int reading_reply; /* store reply in reply_buf else req->reply */ static int data_index; /* index of the next byte to send from req->data */ static int reply_len; /* number of bytes received in reply_buf or req->reply */ diff --git a/drivers/net/cxgb3/cxgb3_offload.c b/drivers/net/cxgb3/cxgb3_offload.c index d48c396..901c824 100644 --- a/drivers/net/cxgb3/cxgb3_offload.c +++ b/drivers/net/cxgb3/cxgb3_offload.c @@ -1070,9 +1070,7 @@ void *cxgb_alloc_mem(unsigned long size) */ void cxgb_free_mem(void *addr) { - unsigned long p = (unsigned long)addr; - - if (p >= VMALLOC_START && p < VMALLOC_END) + if (is_vmalloc_addr(addr)) vfree(addr); else kfree(addr); diff --git a/drivers/net/mlx4/fw.c b/drivers/net/mlx4/fw.c index 535a446..61dc495 100644 --- a/drivers/net/mlx4/fw.c +++ b/drivers/net/mlx4/fw.c @@ -617,9 +617,6 @@ int mlx4_QUERY_ADAPTER(struct mlx4_dev *dev, struct mlx4_adapter *adapter) int err; #define QUERY_ADAPTER_OUT_SIZE 0x100 -#define QUERY_ADAPTER_VENDOR_ID_OFFSET 0x00 -#define QUERY_ADAPTER_DEVICE_ID_OFFSET 0x04 -#define QUERY_ADAPTER_REVISION_ID_OFFSET 0x08 #define QUERY_ADAPTER_INTA_PIN_OFFSET 0x10 #define QUERY_ADAPTER_VSD_OFFSET 0x20 @@ -633,9 +630,6 @@ int mlx4_QUERY_ADAPTER(struct mlx4_dev *dev, struct mlx4_adapter *adapter) if (err) goto out; - MLX4_GET(adapter->vendor_id, outbox, QUERY_ADAPTER_VENDOR_ID_OFFSET); - MLX4_GET(adapter->device_id, outbox, QUERY_ADAPTER_DEVICE_ID_OFFSET); - MLX4_GET(adapter->revision_id, outbox, QUERY_ADAPTER_REVISION_ID_OFFSET); MLX4_GET(adapter->inta_pin, outbox, QUERY_ADAPTER_INTA_PIN_OFFSET); get_board_id(outbox + QUERY_ADAPTER_VSD_OFFSET / 4, diff --git a/drivers/net/mlx4/fw.h b/drivers/net/mlx4/fw.h index 7e1dd9e..e16dec8 100644 --- a/drivers/net/mlx4/fw.h +++ b/drivers/net/mlx4/fw.h @@ -99,9 +99,6 @@ struct mlx4_dev_cap { }; struct mlx4_adapter { - u32 vendor_id; - u32 device_id; - u32 
revision_id; char board_id[MLX4_BOARD_ID_LEN]; u8 inta_pin; }; diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c index 89b3f0b..08bfc13 100644 --- a/drivers/net/mlx4/main.c +++ b/drivers/net/mlx4/main.c @@ -71,7 +71,7 @@ MODULE_PARM_DESC(msi_x, "attempt to use MSI-X if nonzero"); #endif /* CONFIG_PCI_MSI */ -static const char mlx4_version[] __devinitdata = +static char mlx4_version[] __devinitdata = DRV_NAME ": Mellanox ConnectX core driver v" DRV_VERSION " (" DRV_RELDATE ")\n"; @@ -163,7 +163,7 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) return 0; } -static int __devinit mlx4_load_fw(struct mlx4_dev *dev) +static int mlx4_load_fw(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); int err; @@ -197,8 +197,8 @@ err_free: return err; } -static int __devinit mlx4_init_cmpt_table(struct mlx4_dev *dev, u64 cmpt_base, - int cmpt_entry_sz) +static int mlx4_init_cmpt_table(struct mlx4_dev *dev, u64 cmpt_base, + int cmpt_entry_sz) { struct mlx4_priv *priv = mlx4_priv(dev); int err; @@ -534,7 +534,6 @@ static int mlx4_init_hca(struct mlx4_dev *dev) } priv->eq_table.inta_pin = adapter.inta_pin; - dev->rev_id = adapter.revision_id; memcpy(dev->board_id, adapter.board_id, sizeof dev->board_id); return 0; @@ -688,7 +687,7 @@ err_uar_table_free: return err; } -static void __devinit mlx4_enable_msi_x(struct mlx4_dev *dev) +static void mlx4_enable_msi_x(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); struct msix_entry entries[MLX4_NUM_EQ]; diff --git a/drivers/net/mlx4/mr.c b/drivers/net/mlx4/mr.c index 0c05a10..9c9e308 100644 --- a/drivers/net/mlx4/mr.c +++ b/drivers/net/mlx4/mr.c @@ -122,7 +122,7 @@ static void mlx4_buddy_free(struct mlx4_buddy *buddy, u32 seg, int order) spin_unlock(&buddy->lock); } -static int __devinit mlx4_buddy_init(struct mlx4_buddy *buddy, int max_order) +static int mlx4_buddy_init(struct mlx4_buddy *buddy, int max_order) { int i, s; diff --git a/drivers/net/pcmcia/3c574_cs.c 
b/drivers/net/pcmcia/3c574_cs.c index 36a7ba3..3b78a38 100644 --- a/drivers/net/pcmcia/3c574_cs.c +++ b/drivers/net/pcmcia/3c574_cs.c @@ -230,10 +230,11 @@ static char mii_preamble_required = 0; static int tc574_config(struct pcmcia_device *link); static void tc574_release(struct pcmcia_device *link); -static void mdio_sync(kio_addr_t ioaddr, int bits); -static int mdio_read(kio_addr_t ioaddr, int phy_id, int location); -static void mdio_write(kio_addr_t ioaddr, int phy_id, int location, int value); -static unsigned short read_eeprom(kio_addr_t ioaddr, int index); +static void mdio_sync(unsigned int ioaddr, int bits); +static int mdio_read(unsigned int ioaddr, int phy_id, int location); +static void mdio_write(unsigned int ioaddr, int phy_id, int location, + int value); +static unsigned short read_eeprom(unsigned int ioaddr, int index); static void tc574_wait_for_completion(struct net_device *dev, int cmd); static void tc574_reset(struct net_device *dev); @@ -341,7 +342,7 @@ static int tc574_config(struct pcmcia_device *link) tuple_t tuple; __le16 buf[32]; int last_fn, last_ret, i, j; - kio_addr_t ioaddr; + unsigned int ioaddr; __be16 *phys_addr; char *cardname; __u32 config; @@ -515,7 +516,7 @@ static int tc574_resume(struct pcmcia_device *link) static void dump_status(struct net_device *dev) { - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; EL3WINDOW(1); printk(KERN_INFO " irq status %04x, rx status %04x, tx status " "%02x, tx free %04x\n", inw(ioaddr+EL3_STATUS), @@ -544,7 +545,7 @@ static void tc574_wait_for_completion(struct net_device *dev, int cmd) /* Read a word from the EEPROM using the regular EEPROM access register. Assume that we are in register window zero. 
*/ -static unsigned short read_eeprom(kio_addr_t ioaddr, int index) +static unsigned short read_eeprom(unsigned int ioaddr, int index) { int timer; outw(EEPROM_Read + index, ioaddr + Wn0EepromCmd); @@ -572,9 +573,9 @@ static unsigned short read_eeprom(kio_addr_t ioaddr, int index) /* Generate the preamble required for initial synchronization and a few older transceivers. */ -static void mdio_sync(kio_addr_t ioaddr, int bits) +static void mdio_sync(unsigned int ioaddr, int bits) { - kio_addr_t mdio_addr = ioaddr + Wn4_PhysicalMgmt; + unsigned int mdio_addr = ioaddr + Wn4_PhysicalMgmt; /* Establish sync by sending at least 32 logic ones. */ while (-- bits >= 0) { @@ -583,12 +584,12 @@ static void mdio_sync(kio_addr_t ioaddr, int bits) } } -static int mdio_read(kio_addr_t ioaddr, int phy_id, int location) +static int mdio_read(unsigned int ioaddr, int phy_id, int location) { int i; int read_cmd = (0xf6 << 10) | (phy_id << 5) | location; unsigned int retval = 0; - kio_addr_t mdio_addr = ioaddr + Wn4_PhysicalMgmt; + unsigned int mdio_addr = ioaddr + Wn4_PhysicalMgmt; if (mii_preamble_required) mdio_sync(ioaddr, 32); @@ -608,10 +609,10 @@ static int mdio_read(kio_addr_t ioaddr, int phy_id, int location) return (retval>>1) & 0xffff; } -static void mdio_write(kio_addr_t ioaddr, int phy_id, int location, int value) +static void mdio_write(unsigned int ioaddr, int phy_id, int location, int value) { int write_cmd = 0x50020000 | (phy_id << 23) | (location << 18) | value; - kio_addr_t mdio_addr = ioaddr + Wn4_PhysicalMgmt; + unsigned int mdio_addr = ioaddr + Wn4_PhysicalMgmt; int i; if (mii_preamble_required) @@ -637,7 +638,7 @@ static void tc574_reset(struct net_device *dev) { struct el3_private *lp = netdev_priv(dev); int i; - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; unsigned long flags; tc574_wait_for_completion(dev, TotalReset|0x10); @@ -695,7 +696,7 @@ static void tc574_reset(struct net_device *dev) mdio_write(ioaddr, lp->phys, 4, 
lp->advertising); if (!auto_polarity) { /* works for TDK 78Q2120 series MII's */ - int i = mdio_read(ioaddr, lp->phys, 16) | 0x20; + i = mdio_read(ioaddr, lp->phys, 16) | 0x20; mdio_write(ioaddr, lp->phys, 16, i); } @@ -741,7 +742,7 @@ static int el3_open(struct net_device *dev) static void el3_tx_timeout(struct net_device *dev) { struct el3_private *lp = netdev_priv(dev); - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; printk(KERN_NOTICE "%s: Transmit timed out!\n", dev->name); dump_status(dev); @@ -756,7 +757,7 @@ static void el3_tx_timeout(struct net_device *dev) static void pop_tx_status(struct net_device *dev) { struct el3_private *lp = netdev_priv(dev); - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; int i; /* Clear the Tx status stack. */ @@ -779,7 +780,7 @@ static void pop_tx_status(struct net_device *dev) static int el3_start_xmit(struct sk_buff *skb, struct net_device *dev) { - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; struct el3_private *lp = netdev_priv(dev); unsigned long flags; @@ -813,7 +814,7 @@ static irqreturn_t el3_interrupt(int irq, void *dev_id) { struct net_device *dev = (struct net_device *) dev_id; struct el3_private *lp = netdev_priv(dev); - kio_addr_t ioaddr; + unsigned int ioaddr; unsigned status; int work_budget = max_interrupt_work; int handled = 0; @@ -907,7 +908,7 @@ static void media_check(unsigned long arg) { struct net_device *dev = (struct net_device *) arg; struct el3_private *lp = netdev_priv(dev); - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; unsigned long flags; unsigned short /* cable, */ media, partner; @@ -996,7 +997,7 @@ static struct net_device_stats *el3_get_stats(struct net_device *dev) static void update_stats(struct net_device *dev) { struct el3_private *lp = netdev_priv(dev); - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; u8 rx, tx, up; DEBUG(2, "%s: updating the 
statistics.\n", dev->name); @@ -1033,7 +1034,7 @@ static void update_stats(struct net_device *dev) static int el3_rx(struct net_device *dev, int worklimit) { struct el3_private *lp = netdev_priv(dev); - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; short rx_status; DEBUG(3, "%s: in rx_packet(), status %4.4x, rx_status %4.4x.\n", @@ -1094,7 +1095,7 @@ static const struct ethtool_ops netdev_ethtool_ops = { static int el3_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) { struct el3_private *lp = netdev_priv(dev); - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; u16 *data = (u16 *)&rq->ifr_ifru; int phy = lp->phys & 0x1f; @@ -1148,7 +1149,7 @@ static int el3_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) static void set_rx_mode(struct net_device *dev) { - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; if (dev->flags & IFF_PROMISC) outw(SetRxFilter | RxStation | RxMulticast | RxBroadcast | RxProm, @@ -1161,7 +1162,7 @@ static void set_rx_mode(struct net_device *dev) static int el3_close(struct net_device *dev) { - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; struct el3_private *lp = netdev_priv(dev); struct pcmcia_device *link = lp->p_dev; diff --git a/drivers/net/pcmcia/3c589_cs.c b/drivers/net/pcmcia/3c589_cs.c index e862d14e..1b1abb1 100644 --- a/drivers/net/pcmcia/3c589_cs.c +++ b/drivers/net/pcmcia/3c589_cs.c @@ -145,7 +145,7 @@ DRV_NAME ".c " DRV_VERSION " 2001/10/13 00:08:50 (David Hinds)"; static int tc589_config(struct pcmcia_device *link); static void tc589_release(struct pcmcia_device *link); -static u16 read_eeprom(kio_addr_t ioaddr, int index); +static u16 read_eeprom(unsigned int ioaddr, int index); static void tc589_reset(struct net_device *dev); static void media_check(unsigned long arg); static int el3_config(struct net_device *dev, struct ifmap *map); @@ -254,7 +254,7 @@ static int tc589_config(struct pcmcia_device 
*link) __le16 buf[32]; __be16 *phys_addr; int last_fn, last_ret, i, j, multi = 0, fifo; - kio_addr_t ioaddr; + unsigned int ioaddr; char *ram_split[] = {"5:3", "3:1", "1:1", "3:5"}; DECLARE_MAC_BUF(mac); @@ -403,7 +403,7 @@ static void tc589_wait_for_completion(struct net_device *dev, int cmd) Read a word from the EEPROM using the regular EEPROM access register. Assume that we are in register window zero. */ -static u16 read_eeprom(kio_addr_t ioaddr, int index) +static u16 read_eeprom(unsigned int ioaddr, int index) { int i; outw(EEPROM_READ + index, ioaddr + 10); @@ -421,7 +421,7 @@ static u16 read_eeprom(kio_addr_t ioaddr, int index) static void tc589_set_xcvr(struct net_device *dev, int if_port) { struct el3_private *lp = netdev_priv(dev); - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; EL3WINDOW(0); switch (if_port) { @@ -443,7 +443,7 @@ static void tc589_set_xcvr(struct net_device *dev, int if_port) static void dump_status(struct net_device *dev) { - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; EL3WINDOW(1); printk(KERN_INFO " irq status %04x, rx status %04x, tx status " "%02x tx free %04x\n", inw(ioaddr+EL3_STATUS), @@ -459,7 +459,7 @@ static void dump_status(struct net_device *dev) /* Reset and restore all of the 3c589 registers. 
*/ static void tc589_reset(struct net_device *dev) { - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; int i; EL3WINDOW(0); @@ -567,7 +567,7 @@ static int el3_open(struct net_device *dev) static void el3_tx_timeout(struct net_device *dev) { struct el3_private *lp = netdev_priv(dev); - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; printk(KERN_WARNING "%s: Transmit timed out!\n", dev->name); dump_status(dev); @@ -582,7 +582,7 @@ static void el3_tx_timeout(struct net_device *dev) static void pop_tx_status(struct net_device *dev) { struct el3_private *lp = netdev_priv(dev); - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; int i; /* Clear the Tx status stack. */ @@ -604,7 +604,7 @@ static void pop_tx_status(struct net_device *dev) static int el3_start_xmit(struct sk_buff *skb, struct net_device *dev) { - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; struct el3_private *priv = netdev_priv(dev); unsigned long flags; @@ -641,7 +641,7 @@ static irqreturn_t el3_interrupt(int irq, void *dev_id) { struct net_device *dev = (struct net_device *) dev_id; struct el3_private *lp = netdev_priv(dev); - kio_addr_t ioaddr; + unsigned int ioaddr; __u16 status; int i = 0, handled = 1; @@ -727,7 +727,7 @@ static void media_check(unsigned long arg) { struct net_device *dev = (struct net_device *)(arg); struct el3_private *lp = netdev_priv(dev); - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; u16 media, errs; unsigned long flags; @@ -828,7 +828,7 @@ static struct net_device_stats *el3_get_stats(struct net_device *dev) static void update_stats(struct net_device *dev) { struct el3_private *lp = netdev_priv(dev); - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; DEBUG(2, "%s: updating the statistics.\n", dev->name); /* Turn off statistics updates while reading. 
*/ @@ -855,7 +855,7 @@ static void update_stats(struct net_device *dev) static int el3_rx(struct net_device *dev) { struct el3_private *lp = netdev_priv(dev); - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; int worklimit = 32; short rx_status; @@ -909,7 +909,7 @@ static void set_multicast_list(struct net_device *dev) { struct el3_private *lp = netdev_priv(dev); struct pcmcia_device *link = lp->p_dev; - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; u16 opts = SetRxFilter | RxStation | RxBroadcast; if (!pcmcia_dev_present(link)) return; @@ -924,7 +924,7 @@ static int el3_close(struct net_device *dev) { struct el3_private *lp = netdev_priv(dev); struct pcmcia_device *link = lp->p_dev; - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; DEBUG(1, "%s: shutting down ethercard.\n", dev->name); diff --git a/drivers/net/pcmcia/axnet_cs.c b/drivers/net/pcmcia/axnet_cs.c index 6d342f6..e8a63e4 100644 --- a/drivers/net/pcmcia/axnet_cs.c +++ b/drivers/net/pcmcia/axnet_cs.c @@ -96,8 +96,8 @@ static irqreturn_t ei_irq_wrapper(int irq, void *dev_id); static void ei_watchdog(u_long arg); static void axnet_reset_8390(struct net_device *dev); -static int mdio_read(kio_addr_t addr, int phy_id, int loc); -static void mdio_write(kio_addr_t addr, int phy_id, int loc, int value); +static int mdio_read(unsigned int addr, int phy_id, int loc); +static void mdio_write(unsigned int addr, int phy_id, int loc, int value); static void get_8390_hdr(struct net_device *, struct e8390_pkt_hdr *, int); @@ -203,7 +203,7 @@ static void axnet_detach(struct pcmcia_device *link) static int get_prom(struct pcmcia_device *link) { struct net_device *dev = link->priv; - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; int i, j; /* This is based on drivers/net/ne.c */ @@ -473,7 +473,7 @@ static int axnet_resume(struct pcmcia_device *link) #define MDIO_MASK 0x0f #define MDIO_ENB_IN 0x02 -static void 
mdio_sync(kio_addr_t addr) +static void mdio_sync(unsigned int addr) { int bits; for (bits = 0; bits < 32; bits++) { @@ -482,7 +482,7 @@ static void mdio_sync(kio_addr_t addr) } } -static int mdio_read(kio_addr_t addr, int phy_id, int loc) +static int mdio_read(unsigned int addr, int phy_id, int loc) { u_int cmd = (0xf6<<10)|(phy_id<<5)|loc; int i, retval = 0; @@ -501,7 +501,7 @@ static int mdio_read(kio_addr_t addr, int phy_id, int loc) return (retval>>1) & 0xffff; } -static void mdio_write(kio_addr_t addr, int phy_id, int loc, int value) +static void mdio_write(unsigned int addr, int phy_id, int loc, int value) { u_int cmd = (0x05<<28)|(phy_id<<23)|(loc<<18)|(1<<17)|value; int i; @@ -575,7 +575,7 @@ static int axnet_close(struct net_device *dev) static void axnet_reset_8390(struct net_device *dev) { - kio_addr_t nic_base = dev->base_addr; + unsigned int nic_base = dev->base_addr; int i; ei_status.txing = ei_status.dmaing = 0; @@ -610,8 +610,8 @@ static void ei_watchdog(u_long arg) { struct net_device *dev = (struct net_device *)(arg); axnet_dev_t *info = PRIV(dev); - kio_addr_t nic_base = dev->base_addr; - kio_addr_t mii_addr = nic_base + AXNET_MII_EEP; + unsigned int nic_base = dev->base_addr; + unsigned int mii_addr = nic_base + AXNET_MII_EEP; u_short link; if (!netif_device_present(dev)) goto reschedule; @@ -681,7 +681,7 @@ static int axnet_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) { axnet_dev_t *info = PRIV(dev); u16 *data = (u16 *)&rq->ifr_ifru; - kio_addr_t mii_addr = dev->base_addr + AXNET_MII_EEP; + unsigned int mii_addr = dev->base_addr + AXNET_MII_EEP; switch (cmd) { case SIOCGMIIPHY: data[0] = info->phy_id; @@ -703,7 +703,7 @@ static void get_8390_hdr(struct net_device *dev, struct e8390_pkt_hdr *hdr, int ring_page) { - kio_addr_t nic_base = dev->base_addr; + unsigned int nic_base = dev->base_addr; outb_p(0, nic_base + EN0_RSARLO); /* On page boundary */ outb_p(ring_page, nic_base + EN0_RSARHI); @@ -721,7 +721,7 @@ static void 
get_8390_hdr(struct net_device *dev, static void block_input(struct net_device *dev, int count, struct sk_buff *skb, int ring_offset) { - kio_addr_t nic_base = dev->base_addr; + unsigned int nic_base = dev->base_addr; int xfer_count = count; char *buf = skb->data; @@ -744,7 +744,7 @@ static void block_input(struct net_device *dev, int count, static void block_output(struct net_device *dev, int count, const u_char *buf, const int start_page) { - kio_addr_t nic_base = dev->base_addr; + unsigned int nic_base = dev->base_addr; #ifdef PCMCIA_DEBUG if (ei_debug > 4) @@ -991,7 +991,7 @@ static int ax_open(struct net_device *dev) * * Opposite of ax_open(). Only used when "ifconfig <devname> down" is done. */ -int ax_close(struct net_device *dev) +static int ax_close(struct net_device *dev) { unsigned long flags; @@ -1014,7 +1014,7 @@ int ax_close(struct net_device *dev) * completed (or failed) - i.e. never posted a Tx related interrupt. */ -void ei_tx_timeout(struct net_device *dev) +static void ei_tx_timeout(struct net_device *dev) { long e8390_base = dev->base_addr; struct ei_device *ei_local = (struct ei_device *) netdev_priv(dev); @@ -1087,8 +1087,8 @@ static int ei_start_xmit(struct sk_buff *skb, struct net_device *dev) ei_local->irqlock = 1; - send_length = ETH_ZLEN < length ? length : ETH_ZLEN; - + send_length = max(length, ETH_ZLEN); + /* * We have two Tx slots available for use. Find the first free * slot, and then perform some sanity checks. 
With two Tx bufs, diff --git a/drivers/net/pcmcia/fmvj18x_cs.c b/drivers/net/pcmcia/fmvj18x_cs.c index 949c6df..8f328a0 100644 --- a/drivers/net/pcmcia/fmvj18x_cs.c +++ b/drivers/net/pcmcia/fmvj18x_cs.c @@ -298,7 +298,8 @@ do { last_fn = (fn); if ((last_ret = (ret)) != 0) goto cs_failed; } while (0) static int mfc_try_io_port(struct pcmcia_device *link) { int i, ret; - static const kio_addr_t serial_base[5] = { 0x3f8, 0x2f8, 0x3e8, 0x2e8, 0x0 }; + static const unsigned int serial_base[5] = + { 0x3f8, 0x2f8, 0x3e8, 0x2e8, 0x0 }; for (i = 0; i < 5; i++) { link->io.BasePort2 = serial_base[i]; @@ -316,7 +317,7 @@ static int mfc_try_io_port(struct pcmcia_device *link) static int ungermann_try_io_port(struct pcmcia_device *link) { int ret; - kio_addr_t ioaddr; + unsigned int ioaddr; /* Ungermann-Bass Access/CARD accepts 0x300,0x320,0x340,0x360 0x380,0x3c0 only for ioport. @@ -342,7 +343,7 @@ static int fmvj18x_config(struct pcmcia_device *link) cisparse_t parse; u_short buf[32]; int i, last_fn = 0, last_ret = 0, ret; - kio_addr_t ioaddr; + unsigned int ioaddr; cardtype_t cardtype; char *card_name = "unknown"; u_char *node_id; @@ -610,7 +611,7 @@ static int fmvj18x_setup_mfc(struct pcmcia_device *link) u_char __iomem *base; int i, j; struct net_device *dev = link->priv; - kio_addr_t ioaddr; + unsigned int ioaddr; /* Allocate a small memory window */ req.Attributes = WIN_DATA_WIDTH_8|WIN_MEMORY_TYPE_AM|WIN_ENABLE; @@ -735,7 +736,7 @@ static irqreturn_t fjn_interrupt(int dummy, void *dev_id) { struct net_device *dev = dev_id; local_info_t *lp = netdev_priv(dev); - kio_addr_t ioaddr; + unsigned int ioaddr; unsigned short tx_stat, rx_stat; ioaddr = dev->base_addr; @@ -789,7 +790,7 @@ static irqreturn_t fjn_interrupt(int dummy, void *dev_id) static void fjn_tx_timeout(struct net_device *dev) { struct local_info_t *lp = netdev_priv(dev); - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; printk(KERN_NOTICE "%s: transmit timed out with status %04x, 
%s?\n", dev->name, htons(inw(ioaddr + TX_STATUS)), @@ -819,7 +820,7 @@ static void fjn_tx_timeout(struct net_device *dev) static int fjn_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct local_info_t *lp = netdev_priv(dev); - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; short length = skb->len; if (length < ETH_ZLEN) @@ -892,7 +893,7 @@ static int fjn_start_xmit(struct sk_buff *skb, struct net_device *dev) static void fjn_reset(struct net_device *dev) { struct local_info_t *lp = netdev_priv(dev); - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; int i; DEBUG(4, "fjn_reset(%s) called.\n",dev->name); @@ -971,7 +972,7 @@ static void fjn_reset(struct net_device *dev) static void fjn_rx(struct net_device *dev) { struct local_info_t *lp = netdev_priv(dev); - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; int boguscount = 10; /* 5 -> 10: by agy 19940922 */ DEBUG(4, "%s: in rx_packet(), rx_status %02x.\n", @@ -1125,7 +1126,7 @@ static int fjn_close(struct net_device *dev) { struct local_info_t *lp = netdev_priv(dev); struct pcmcia_device *link = lp->p_dev; - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; DEBUG(4, "fjn_close('%s').\n", dev->name); @@ -1168,7 +1169,7 @@ static struct net_device_stats *fjn_get_stats(struct net_device *dev) static void set_rx_mode(struct net_device *dev) { - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; u_char mc_filter[8]; /* Multicast hash filter */ u_long flags; int i; @@ -1197,8 +1198,7 @@ static void set_rx_mode(struct net_device *dev) outb(1, ioaddr + RX_MODE); /* Ignore almost all multicasts. 
*/ } else { struct dev_mc_list *mclist; - int i; - + memset(mc_filter, 0, sizeof(mc_filter)); for (i = 0, mclist = dev->mc_list; mclist && i < dev->mc_count; i++, mclist = mclist->next) { diff --git a/drivers/net/pcmcia/nmclan_cs.c b/drivers/net/pcmcia/nmclan_cs.c index a355a93..cfcbea9 100644 --- a/drivers/net/pcmcia/nmclan_cs.c +++ b/drivers/net/pcmcia/nmclan_cs.c @@ -518,7 +518,7 @@ mace_read assuming that during normal operation, the MACE is always in bank 0. ---------------------------------------------------------------------------- */ -static int mace_read(mace_private *lp, kio_addr_t ioaddr, int reg) +static int mace_read(mace_private *lp, unsigned int ioaddr, int reg) { int data = 0xFF; unsigned long flags; @@ -545,7 +545,8 @@ mace_write are assuming that during normal operation, the MACE is always in bank 0. ---------------------------------------------------------------------------- */ -static void mace_write(mace_private *lp, kio_addr_t ioaddr, int reg, int data) +static void mace_write(mace_private *lp, unsigned int ioaddr, int reg, + int data) { unsigned long flags; @@ -567,7 +568,7 @@ static void mace_write(mace_private *lp, kio_addr_t ioaddr, int reg, int data) mace_init Resets the MACE chip. 
---------------------------------------------------------------------------- */ -static int mace_init(mace_private *lp, kio_addr_t ioaddr, char *enet_addr) +static int mace_init(mace_private *lp, unsigned int ioaddr, char *enet_addr) { int i; int ct = 0; @@ -657,7 +658,7 @@ static int nmclan_config(struct pcmcia_device *link) tuple_t tuple; u_char buf[64]; int i, last_ret, last_fn; - kio_addr_t ioaddr; + unsigned int ioaddr; DECLARE_MAC_BUF(mac); DEBUG(0, "nmclan_config(0x%p)\n", link); @@ -839,7 +840,7 @@ mace_open ---------------------------------------------------------------------------- */ static int mace_open(struct net_device *dev) { - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; mace_private *lp = netdev_priv(dev); struct pcmcia_device *link = lp->p_dev; @@ -862,7 +863,7 @@ mace_close ---------------------------------------------------------------------------- */ static int mace_close(struct net_device *dev) { - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; mace_private *lp = netdev_priv(dev); struct pcmcia_device *link = lp->p_dev; @@ -935,7 +936,7 @@ static void mace_tx_timeout(struct net_device *dev) static int mace_start_xmit(struct sk_buff *skb, struct net_device *dev) { mace_private *lp = netdev_priv(dev); - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; netif_stop_queue(dev); @@ -996,7 +997,7 @@ static irqreturn_t mace_interrupt(int irq, void *dev_id) { struct net_device *dev = (struct net_device *) dev_id; mace_private *lp = netdev_priv(dev); - kio_addr_t ioaddr; + unsigned int ioaddr; int status; int IntrCnt = MACE_MAX_IR_ITERATIONS; @@ -1140,7 +1141,7 @@ mace_rx static int mace_rx(struct net_device *dev, unsigned char RxCnt) { mace_private *lp = netdev_priv(dev); - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; unsigned char rx_framecnt; unsigned short rx_status; @@ -1302,7 +1303,7 @@ update_stats card's SRAM fast enough. 
If this happens, something is seriously wrong with the hardware. ---------------------------------------------------------------------------- */ -static void update_stats(kio_addr_t ioaddr, struct net_device *dev) +static void update_stats(unsigned int ioaddr, struct net_device *dev) { mace_private *lp = netdev_priv(dev); @@ -1448,7 +1449,7 @@ static void restore_multicast_list(struct net_device *dev) mace_private *lp = netdev_priv(dev); int num_addrs = lp->multicast_num_addrs; int *ladrf = lp->multicast_ladrf; - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; int i; DEBUG(2, "%s: restoring Rx mode to %d addresses.\n", @@ -1540,7 +1541,7 @@ static void set_multicast_list(struct net_device *dev) static void restore_multicast_list(struct net_device *dev) { - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; mace_private *lp = netdev_priv(dev); DEBUG(2, "%s: restoring Rx mode to %d addresses.\n", dev->name, diff --git a/drivers/net/pcmcia/pcnet_cs.c b/drivers/net/pcmcia/pcnet_cs.c index 9ba56aa..6323988 100644 --- a/drivers/net/pcmcia/pcnet_cs.c +++ b/drivers/net/pcmcia/pcnet_cs.c @@ -349,7 +349,7 @@ static hw_info_t *get_hwinfo(struct pcmcia_device *link) static hw_info_t *get_prom(struct pcmcia_device *link) { struct net_device *dev = link->priv; - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; u_char prom[32]; int i, j; @@ -425,7 +425,7 @@ static hw_info_t *get_dl10019(struct pcmcia_device *link) static hw_info_t *get_ax88190(struct pcmcia_device *link) { struct net_device *dev = link->priv; - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; int i, j; /* Not much of a test, but the alternatives are messy */ @@ -521,7 +521,7 @@ static int pcnet_config(struct pcmcia_device *link) int i, last_ret, last_fn, start_pg, stop_pg, cm_offset; int has_shmem = 0; u_short buf[64]; - hw_info_t *hw_info; + hw_info_t *local_hw_info; DECLARE_MAC_BUF(mac); DEBUG(0, 
"pcnet_config(0x%p)\n", link); @@ -590,23 +590,23 @@ static int pcnet_config(struct pcmcia_device *link) dev->if_port = 0; } - hw_info = get_hwinfo(link); - if (hw_info == NULL) - hw_info = get_prom(link); - if (hw_info == NULL) - hw_info = get_dl10019(link); - if (hw_info == NULL) - hw_info = get_ax88190(link); - if (hw_info == NULL) - hw_info = get_hwired(link); - - if (hw_info == NULL) { + local_hw_info = get_hwinfo(link); + if (local_hw_info == NULL) + local_hw_info = get_prom(link); + if (local_hw_info == NULL) + local_hw_info = get_dl10019(link); + if (local_hw_info == NULL) + local_hw_info = get_ax88190(link); + if (local_hw_info == NULL) + local_hw_info = get_hwired(link); + + if (local_hw_info == NULL) { printk(KERN_NOTICE "pcnet_cs: unable to read hardware net" " address for io base %#3lx\n", dev->base_addr); goto failed; } - info->flags = hw_info->flags; + info->flags = local_hw_info->flags; /* Check for user overrides */ info->flags |= (delay_output) ? DELAY_OUTPUT : 0; if ((link->manf_id == MANFID_SOCKET) && @@ -756,7 +756,7 @@ static int pcnet_resume(struct pcmcia_device *link) #define MDIO_DATA_READ 0x10 #define MDIO_MASK 0x0f -static void mdio_sync(kio_addr_t addr) +static void mdio_sync(unsigned int addr) { int bits, mask = inb(addr) & MDIO_MASK; for (bits = 0; bits < 32; bits++) { @@ -765,7 +765,7 @@ static void mdio_sync(kio_addr_t addr) } } -static int mdio_read(kio_addr_t addr, int phy_id, int loc) +static int mdio_read(unsigned int addr, int phy_id, int loc) { u_int cmd = (0x06<<10)|(phy_id<<5)|loc; int i, retval = 0, mask = inb(addr) & MDIO_MASK; @@ -784,7 +784,7 @@ static int mdio_read(kio_addr_t addr, int phy_id, int loc) return (retval>>1) & 0xffff; } -static void mdio_write(kio_addr_t addr, int phy_id, int loc, int value) +static void mdio_write(unsigned int addr, int phy_id, int loc, int value) { u_int cmd = (0x05<<28)|(phy_id<<23)|(loc<<18)|(1<<17)|value; int i, mask = inb(addr) & MDIO_MASK; @@ -818,10 +818,10 @@ static void 
mdio_write(kio_addr_t addr, int phy_id, int loc, int value) #define DL19FDUPLX 0x0400 /* DL10019 Full duplex mode */ -static int read_eeprom(kio_addr_t ioaddr, int location) +static int read_eeprom(unsigned int ioaddr, int location) { int i, retval = 0; - kio_addr_t ee_addr = ioaddr + DLINK_EEPROM; + unsigned int ee_addr = ioaddr + DLINK_EEPROM; int read_cmd = location | (EE_READ_CMD << 8); outb(0, ee_addr); @@ -852,10 +852,10 @@ static int read_eeprom(kio_addr_t ioaddr, int location) In ASIC mode, EE_ADOT is used to output the data to the ASIC. */ -static void write_asic(kio_addr_t ioaddr, int location, short asic_data) +static void write_asic(unsigned int ioaddr, int location, short asic_data) { int i; - kio_addr_t ee_addr = ioaddr + DLINK_EEPROM; + unsigned int ee_addr = ioaddr + DLINK_EEPROM; short dataval; int read_cmd = location | (EE_READ_CMD << 8); @@ -897,7 +897,7 @@ static void write_asic(kio_addr_t ioaddr, int location, short asic_data) static void set_misc_reg(struct net_device *dev) { - kio_addr_t nic_base = dev->base_addr; + unsigned int nic_base = dev->base_addr; pcnet_dev_t *info = PRIV(dev); u_char tmp; @@ -936,7 +936,7 @@ static void set_misc_reg(struct net_device *dev) static void mii_phy_probe(struct net_device *dev) { pcnet_dev_t *info = PRIV(dev); - kio_addr_t mii_addr = dev->base_addr + DLINK_GPIO; + unsigned int mii_addr = dev->base_addr + DLINK_GPIO; int i; u_int tmp, phyid; @@ -1014,7 +1014,7 @@ static int pcnet_close(struct net_device *dev) static void pcnet_reset_8390(struct net_device *dev) { - kio_addr_t nic_base = dev->base_addr; + unsigned int nic_base = dev->base_addr; int i; ei_status.txing = ei_status.dmaing = 0; @@ -1074,8 +1074,8 @@ static void ei_watchdog(u_long arg) { struct net_device *dev = (struct net_device *)arg; pcnet_dev_t *info = PRIV(dev); - kio_addr_t nic_base = dev->base_addr; - kio_addr_t mii_addr = nic_base + DLINK_GPIO; + unsigned int nic_base = dev->base_addr; + unsigned int mii_addr = nic_base + DLINK_GPIO; 
u_short link; if (!netif_device_present(dev)) goto reschedule; @@ -1177,7 +1177,7 @@ static int ei_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) { pcnet_dev_t *info = PRIV(dev); u16 *data = (u16 *)&rq->ifr_ifru; - kio_addr_t mii_addr = dev->base_addr + DLINK_GPIO; + unsigned int mii_addr = dev->base_addr + DLINK_GPIO; switch (cmd) { case SIOCGMIIPHY: data[0] = info->phy_id; @@ -1199,7 +1199,7 @@ static void dma_get_8390_hdr(struct net_device *dev, struct e8390_pkt_hdr *hdr, int ring_page) { - kio_addr_t nic_base = dev->base_addr; + unsigned int nic_base = dev->base_addr; if (ei_status.dmaing) { printk(KERN_NOTICE "%s: DMAing conflict in dma_block_input." @@ -1230,7 +1230,7 @@ static void dma_get_8390_hdr(struct net_device *dev, static void dma_block_input(struct net_device *dev, int count, struct sk_buff *skb, int ring_offset) { - kio_addr_t nic_base = dev->base_addr; + unsigned int nic_base = dev->base_addr; int xfer_count = count; char *buf = skb->data; @@ -1285,7 +1285,7 @@ static void dma_block_input(struct net_device *dev, int count, static void dma_block_output(struct net_device *dev, int count, const u_char *buf, const int start_page) { - kio_addr_t nic_base = dev->base_addr; + unsigned int nic_base = dev->base_addr; pcnet_dev_t *info = PRIV(dev); #ifdef PCMCIA_DEBUG int retries = 0; diff --git a/drivers/net/pcmcia/smc91c92_cs.c b/drivers/net/pcmcia/smc91c92_cs.c index c9868e9..f18eca9 100644 --- a/drivers/net/pcmcia/smc91c92_cs.c +++ b/drivers/net/pcmcia/smc91c92_cs.c @@ -295,7 +295,7 @@ static int s9k_config(struct net_device *dev, struct ifmap *map); static void smc_set_xcvr(struct net_device *dev, int if_port); static void smc_reset(struct net_device *dev); static void media_check(u_long arg); -static void mdio_sync(kio_addr_t addr); +static void mdio_sync(unsigned int addr); static int mdio_read(struct net_device *dev, int phy_id, int loc); static void mdio_write(struct net_device *dev, int phy_id, int loc, int value); static int 
smc_link_ok(struct net_device *dev); @@ -601,8 +601,8 @@ static void mot_config(struct pcmcia_device *link) { struct net_device *dev = link->priv; struct smc_private *smc = netdev_priv(dev); - kio_addr_t ioaddr = dev->base_addr; - kio_addr_t iouart = link->io.BasePort2; + unsigned int ioaddr = dev->base_addr; + unsigned int iouart = link->io.BasePort2; /* Set UART base address and force map with COR bit 1 */ writeb(iouart & 0xff, smc->base + MOT_UART + CISREG_IOBASE_0); @@ -621,7 +621,7 @@ static void mot_config(struct pcmcia_device *link) static int mot_setup(struct pcmcia_device *link) { struct net_device *dev = link->priv; - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; int i, wait, loop; u_int addr; @@ -754,7 +754,7 @@ free_cfg_mem: static int osi_config(struct pcmcia_device *link) { struct net_device *dev = link->priv; - static const kio_addr_t com[4] = { 0x3f8, 0x2f8, 0x3e8, 0x2e8 }; + static const unsigned int com[4] = { 0x3f8, 0x2f8, 0x3e8, 0x2e8 }; int i, j; link->conf.Attributes |= CONF_ENABLE_SPKR; @@ -900,7 +900,7 @@ static int smc91c92_resume(struct pcmcia_device *link) static int check_sig(struct pcmcia_device *link) { struct net_device *dev = link->priv; - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; int width; u_short s; @@ -960,7 +960,7 @@ static int smc91c92_config(struct pcmcia_device *link) struct smc_private *smc = netdev_priv(dev); char *name; int i, j, rev; - kio_addr_t ioaddr; + unsigned int ioaddr; u_long mir; DECLARE_MAC_BUF(mac); @@ -1136,7 +1136,7 @@ static void smc91c92_release(struct pcmcia_device *link) #define MDIO_DATA_WRITE1 (MDIO_DIR_WRITE | MDIO_DATA_OUT) #define MDIO_DATA_READ 0x02 -static void mdio_sync(kio_addr_t addr) +static void mdio_sync(unsigned int addr) { int bits; for (bits = 0; bits < 32; bits++) { @@ -1147,7 +1147,7 @@ static void mdio_sync(kio_addr_t addr) static int mdio_read(struct net_device *dev, int phy_id, int loc) { - kio_addr_t addr = 
dev->base_addr + MGMT; + unsigned int addr = dev->base_addr + MGMT; u_int cmd = (0x06<<10)|(phy_id<<5)|loc; int i, retval = 0; @@ -1167,7 +1167,7 @@ static int mdio_read(struct net_device *dev, int phy_id, int loc) static void mdio_write(struct net_device *dev, int phy_id, int loc, int value) { - kio_addr_t addr = dev->base_addr + MGMT; + unsigned int addr = dev->base_addr + MGMT; u_int cmd = (0x05<<28)|(phy_id<<23)|(loc<<18)|(1<<17)|value; int i; @@ -1193,7 +1193,7 @@ static void mdio_write(struct net_device *dev, int phy_id, int loc, int value) #ifdef PCMCIA_DEBUG static void smc_dump(struct net_device *dev) { - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; u_short i, w, save; save = inw(ioaddr + BANK_SELECT); for (w = 0; w < 4; w++) { @@ -1248,7 +1248,7 @@ static int smc_close(struct net_device *dev) { struct smc_private *smc = netdev_priv(dev); struct pcmcia_device *link = smc->p_dev; - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; DEBUG(0, "%s: smc_close(), status %4.4x.\n", dev->name, inw(ioaddr + BANK_SELECT)); @@ -1285,7 +1285,7 @@ static void smc_hardware_send_packet(struct net_device * dev) { struct smc_private *smc = netdev_priv(dev); struct sk_buff *skb = smc->saved_skb; - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; u_char packet_no; if (!skb) { @@ -1349,7 +1349,7 @@ static void smc_hardware_send_packet(struct net_device * dev) static void smc_tx_timeout(struct net_device *dev) { struct smc_private *smc = netdev_priv(dev); - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; printk(KERN_NOTICE "%s: SMC91c92 transmit timed out, " "Tx_status %2.2x status %4.4x.\n", @@ -1364,7 +1364,7 @@ static void smc_tx_timeout(struct net_device *dev) static int smc_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct smc_private *smc = netdev_priv(dev); - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; u_short 
num_pages; short time_out, ir; unsigned long flags; @@ -1434,7 +1434,7 @@ static int smc_start_xmit(struct sk_buff *skb, struct net_device *dev) static void smc_tx_err(struct net_device * dev) { struct smc_private *smc = netdev_priv(dev); - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; int saved_packet = inw(ioaddr + PNR_ARR) & 0xff; int packet_no = inw(ioaddr + FIFO_PORTS) & 0x7f; int tx_status; @@ -1478,7 +1478,7 @@ static void smc_tx_err(struct net_device * dev) static void smc_eph_irq(struct net_device *dev) { struct smc_private *smc = netdev_priv(dev); - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; u_short card_stats, ephs; SMC_SELECT_BANK(0); @@ -1513,7 +1513,7 @@ static irqreturn_t smc_interrupt(int irq, void *dev_id) { struct net_device *dev = dev_id; struct smc_private *smc = netdev_priv(dev); - kio_addr_t ioaddr; + unsigned int ioaddr; u_short saved_bank, saved_pointer, mask, status; unsigned int handled = 1; char bogus_cnt = INTR_WORK; /* Work we are willing to do. */ @@ -1633,7 +1633,7 @@ irq_done: static void smc_rx(struct net_device *dev) { struct smc_private *smc = netdev_priv(dev); - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; int rx_status; int packet_length; /* Caution: not frame length, rather words to transfer from the chip. 
*/ @@ -1738,7 +1738,7 @@ static void fill_multicast_tbl(int count, struct dev_mc_list *addrs, static void set_rx_mode(struct net_device *dev) { - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; struct smc_private *smc = netdev_priv(dev); u_int multicast_table[ 2 ] = { 0, }; unsigned long flags; @@ -1804,7 +1804,7 @@ static int s9k_config(struct net_device *dev, struct ifmap *map) static void smc_set_xcvr(struct net_device *dev, int if_port) { struct smc_private *smc = netdev_priv(dev); - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; u_short saved_bank; saved_bank = inw(ioaddr + BANK_SELECT); @@ -1827,7 +1827,7 @@ static void smc_set_xcvr(struct net_device *dev, int if_port) static void smc_reset(struct net_device *dev) { - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; struct smc_private *smc = netdev_priv(dev); int i; @@ -1904,7 +1904,7 @@ static void media_check(u_long arg) { struct net_device *dev = (struct net_device *) arg; struct smc_private *smc = netdev_priv(dev); - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; u_short i, media, saved_bank; u_short link; unsigned long flags; @@ -2021,7 +2021,7 @@ reschedule: static int smc_link_ok(struct net_device *dev) { - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; struct smc_private *smc = netdev_priv(dev); if (smc->cfg & CFG_MII_SELECT) { @@ -2035,7 +2035,7 @@ static int smc_link_ok(struct net_device *dev) static int smc_netdev_get_ecmd(struct net_device *dev, struct ethtool_cmd *ecmd) { u16 tmp; - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; ecmd->supported = (SUPPORTED_TP | SUPPORTED_AUI | SUPPORTED_10baseT_Half | SUPPORTED_10baseT_Full); @@ -2057,7 +2057,7 @@ static int smc_netdev_get_ecmd(struct net_device *dev, struct ethtool_cmd *ecmd) static int smc_netdev_set_ecmd(struct net_device *dev, struct ethtool_cmd *ecmd) { u16 tmp; - 
kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; if (ecmd->speed != SPEED_10) return -EINVAL; @@ -2100,7 +2100,7 @@ static void smc_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info static int smc_get_settings(struct net_device *dev, struct ethtool_cmd *ecmd) { struct smc_private *smc = netdev_priv(dev); - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; u16 saved_bank = inw(ioaddr + BANK_SELECT); int ret; @@ -2118,7 +2118,7 @@ static int smc_get_settings(struct net_device *dev, struct ethtool_cmd *ecmd) static int smc_set_settings(struct net_device *dev, struct ethtool_cmd *ecmd) { struct smc_private *smc = netdev_priv(dev); - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; u16 saved_bank = inw(ioaddr + BANK_SELECT); int ret; @@ -2136,7 +2136,7 @@ static int smc_set_settings(struct net_device *dev, struct ethtool_cmd *ecmd) static u32 smc_get_link(struct net_device *dev) { struct smc_private *smc = netdev_priv(dev); - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; u16 saved_bank = inw(ioaddr + BANK_SELECT); u32 ret; @@ -2164,7 +2164,7 @@ static int smc_nway_reset(struct net_device *dev) { struct smc_private *smc = netdev_priv(dev); if (smc->cfg & CFG_MII_SELECT) { - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; u16 saved_bank = inw(ioaddr + BANK_SELECT); int res; @@ -2196,7 +2196,7 @@ static int smc_ioctl (struct net_device *dev, struct ifreq *rq, int cmd) struct mii_ioctl_data *mii = if_mii(rq); int rc = 0; u16 saved_bank; - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; if (!netif_running(dev)) return -EINVAL; diff --git a/drivers/net/pcmcia/xirc2ps_cs.c b/drivers/net/pcmcia/xirc2ps_cs.c index 1f09bea..d041f83 100644 --- a/drivers/net/pcmcia/xirc2ps_cs.c +++ b/drivers/net/pcmcia/xirc2ps_cs.c @@ -273,12 +273,12 @@ INT_MODULE_PARM(lockup_hack, 0); /* anti lockup hack */ static 
unsigned maxrx_bytes = 22000; /* MII management prototypes */ -static void mii_idle(kio_addr_t ioaddr); -static void mii_putbit(kio_addr_t ioaddr, unsigned data); -static int mii_getbit(kio_addr_t ioaddr); -static void mii_wbits(kio_addr_t ioaddr, unsigned data, int len); -static unsigned mii_rd(kio_addr_t ioaddr, u_char phyaddr, u_char phyreg); -static void mii_wr(kio_addr_t ioaddr, u_char phyaddr, u_char phyreg, +static void mii_idle(unsigned int ioaddr); +static void mii_putbit(unsigned int ioaddr, unsigned data); +static int mii_getbit(unsigned int ioaddr); +static void mii_wbits(unsigned int ioaddr, unsigned data, int len); +static unsigned mii_rd(unsigned int ioaddr, u_char phyaddr, u_char phyreg); +static void mii_wr(unsigned int ioaddr, u_char phyaddr, u_char phyreg, unsigned data, int len); /* @@ -403,7 +403,7 @@ next_tuple(struct pcmcia_device *handle, tuple_t *tuple, cisparse_t *parse) static void PrintRegisters(struct net_device *dev) { - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; if (pc_debug > 1) { int i, page; @@ -439,7 +439,7 @@ PrintRegisters(struct net_device *dev) * Turn around for read */ static void -mii_idle(kio_addr_t ioaddr) +mii_idle(unsigned int ioaddr) { PutByte(XIRCREG2_GPR2, 0x04|0); /* drive MDCK low */ udelay(1); @@ -451,7 +451,7 @@ mii_idle(kio_addr_t ioaddr) * Write a bit to MDI/O */ static void -mii_putbit(kio_addr_t ioaddr, unsigned data) +mii_putbit(unsigned int ioaddr, unsigned data) { #if 1 if (data) { @@ -484,7 +484,7 @@ mii_putbit(kio_addr_t ioaddr, unsigned data) * Get a bit from MDI/O */ static int -mii_getbit(kio_addr_t ioaddr) +mii_getbit(unsigned int ioaddr) { unsigned d; @@ -497,7 +497,7 @@ mii_getbit(kio_addr_t ioaddr) } static void -mii_wbits(kio_addr_t ioaddr, unsigned data, int len) +mii_wbits(unsigned int ioaddr, unsigned data, int len) { unsigned m = 1 << (len-1); for (; m; m >>= 1) @@ -505,7 +505,7 @@ mii_wbits(kio_addr_t ioaddr, unsigned data, int len) } static unsigned 
-mii_rd(kio_addr_t ioaddr, u_char phyaddr, u_char phyreg) +mii_rd(unsigned int ioaddr, u_char phyaddr, u_char phyreg) { int i; unsigned data=0, m; @@ -527,7 +527,8 @@ mii_rd(kio_addr_t ioaddr, u_char phyaddr, u_char phyreg) } static void -mii_wr(kio_addr_t ioaddr, u_char phyaddr, u_char phyreg, unsigned data, int len) +mii_wr(unsigned int ioaddr, u_char phyaddr, u_char phyreg, unsigned data, + int len) { int i; @@ -726,7 +727,7 @@ xirc2ps_config(struct pcmcia_device * link) local_info_t *local = netdev_priv(dev); tuple_t tuple; cisparse_t parse; - kio_addr_t ioaddr; + unsigned int ioaddr; int err, i; u_char buf[64]; cistpl_lan_node_id_t *node_id = (cistpl_lan_node_id_t*)parse.funce.data; @@ -1104,7 +1105,7 @@ xirc2ps_interrupt(int irq, void *dev_id) { struct net_device *dev = (struct net_device *)dev_id; local_info_t *lp = netdev_priv(dev); - kio_addr_t ioaddr; + unsigned int ioaddr; u_char saved_page; unsigned bytes_rcvd; unsigned int_status, eth_status, rx_status, tx_status; @@ -1209,7 +1210,7 @@ xirc2ps_interrupt(int irq, void *dev_id) unsigned i; u_long *p = skb_put(skb, pktlen); register u_long a; - kio_addr_t edpreg = ioaddr+XIRCREG_EDP-2; + unsigned int edpreg = ioaddr+XIRCREG_EDP-2; for (i=0; i < len ; i += 4, p++) { a = inl(edpreg); __asm__("rorl $16,%0\n\t" @@ -1346,7 +1347,7 @@ static int do_start_xmit(struct sk_buff *skb, struct net_device *dev) { local_info_t *lp = netdev_priv(dev); - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; int okay; unsigned freespace; unsigned pktlen = skb->len; @@ -1415,7 +1416,7 @@ do_get_stats(struct net_device *dev) static void set_addresses(struct net_device *dev) { - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; local_info_t *lp = netdev_priv(dev); struct dev_mc_list *dmi = dev->mc_list; unsigned char *addr; @@ -1459,7 +1460,7 @@ set_addresses(struct net_device *dev) static void set_multicast_list(struct net_device *dev) { - kio_addr_t ioaddr = dev->base_addr; + 
unsigned int ioaddr = dev->base_addr; SelectPage(0x42); if (dev->flags & IFF_PROMISC) { /* snoop */ @@ -1543,7 +1544,7 @@ static int do_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) { local_info_t *local = netdev_priv(dev); - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; u16 *data = (u16 *)&rq->ifr_ifru; DEBUG(1, "%s: ioctl(%-.6s, %#04x) %04x %04x %04x %04x\n", @@ -1575,7 +1576,7 @@ static void hardreset(struct net_device *dev) { local_info_t *local = netdev_priv(dev); - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; SelectPage(4); udelay(1); @@ -1592,7 +1593,7 @@ static void do_reset(struct net_device *dev, int full) { local_info_t *local = netdev_priv(dev); - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; unsigned value; DEBUG(0, "%s: do_reset(%p,%d)\n", dev? dev->name:"eth?", dev, full); @@ -1753,7 +1754,7 @@ static int init_mii(struct net_device *dev) { local_info_t *local = netdev_priv(dev); - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; unsigned control, status, linkpartner; int i; @@ -1826,7 +1827,7 @@ static void do_powerdown(struct net_device *dev) { - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; DEBUG(0, "do_powerdown(%p)\n", dev); @@ -1838,7 +1839,7 @@ do_powerdown(struct net_device *dev) static int do_stop(struct net_device *dev) { - kio_addr_t ioaddr = dev->base_addr; + unsigned int ioaddr = dev->base_addr; local_info_t *lp = netdev_priv(dev); struct pcmcia_device *link = lp->p_dev; diff --git a/drivers/net/wireless/b43/b43.h b/drivers/net/wireless/b43/b43.h index 32a24f5..08a011f 100644 --- a/drivers/net/wireless/b43/b43.h +++ b/drivers/net/wireless/b43/b43.h @@ -724,6 +724,7 @@ struct b43_wldev { bool short_preamble; /* TRUE, if short preamble is enabled. */ bool short_slot; /* TRUE, if short slot timing is enabled. 
*/ bool radio_hw_enable; /* saved state of radio hardware enabled state */ + bool suspend_in_progress; /* TRUE, if we are in a suspend/resume cycle */ /* PHY/Radio device. */ struct b43_phy phy; diff --git a/drivers/net/wireless/b43/leds.c b/drivers/net/wireless/b43/leds.c index 4b590d8..0908335 100644 --- a/drivers/net/wireless/b43/leds.c +++ b/drivers/net/wireless/b43/leds.c @@ -116,7 +116,10 @@ static void b43_unregister_led(struct b43_led *led) { if (!led->dev) return; - led_classdev_unregister(&led->led_dev); + if (led->dev->suspend_in_progress) + led_classdev_unregister_suspended(&led->led_dev); + else + led_classdev_unregister(&led->led_dev); b43_led_turn_off(led->dev, led->index, led->activelow); led->dev = NULL; } diff --git a/drivers/net/wireless/b43/main.c b/drivers/net/wireless/b43/main.c index 64c154d..ef65c41 100644 --- a/drivers/net/wireless/b43/main.c +++ b/drivers/net/wireless/b43/main.c @@ -38,6 +38,7 @@ #include <linux/wireless.h> #include <linux/workqueue.h> #include <linux/skbuff.h> +#include <linux/io.h> #include <linux/dma-mapping.h> #include <asm/unaligned.h> @@ -2554,10 +2555,10 @@ static int b43_rng_read(struct hwrng *rng, u32 * data) return (sizeof(u16)); } -static void b43_rng_exit(struct b43_wl *wl) +static void b43_rng_exit(struct b43_wl *wl, bool suspended) { if (wl->rng_initialized) - hwrng_unregister(&wl->rng); + __hwrng_unregister(&wl->rng, suspended); } static int b43_rng_init(struct b43_wl *wl) @@ -3417,8 +3418,10 @@ static void b43_wireless_core_exit(struct b43_wldev *dev) macctl |= B43_MACCTL_PSM_JMP0; b43_write32(dev, B43_MMIO_MACCTL, macctl); - b43_leds_exit(dev); - b43_rng_exit(dev->wl); + if (!dev->suspend_in_progress) { + b43_leds_exit(dev); + b43_rng_exit(dev->wl, false); + } b43_dma_free(dev); b43_chip_exit(dev); b43_radio_turn_off(dev, 1); @@ -3534,11 +3537,13 @@ static int b43_wireless_core_init(struct b43_wldev *dev) ssb_bus_powerup(bus, 1); /* Enable dynamic PCTL */ b43_upload_card_macaddress(dev); 
b43_security_init(dev); - b43_rng_init(wl); + if (!dev->suspend_in_progress) + b43_rng_init(wl); b43_set_status(dev, B43_STAT_INITIALIZED); - b43_leds_init(dev); + if (!dev->suspend_in_progress) + b43_leds_init(dev); out: return err; @@ -4135,6 +4140,7 @@ static int b43_suspend(struct ssb_device *dev, pm_message_t state) b43dbg(wl, "Suspending...\n"); mutex_lock(&wl->mutex); + wldev->suspend_in_progress = true; wldev->suspend_init_status = b43_status(wldev); if (wldev->suspend_init_status >= B43_STAT_STARTED) b43_wireless_core_stop(wldev); @@ -4166,15 +4172,17 @@ static int b43_resume(struct ssb_device *dev) if (wldev->suspend_init_status >= B43_STAT_STARTED) { err = b43_wireless_core_start(wldev); if (err) { + b43_leds_exit(wldev); + b43_rng_exit(wldev->wl, true); b43_wireless_core_exit(wldev); b43err(wl, "Resume failed at core start\n"); goto out; } } - mutex_unlock(&wl->mutex); - b43dbg(wl, "Device resumed.\n"); - out: + out: + wldev->suspend_in_progress = false; + mutex_unlock(&wl->mutex); return err; } diff --git a/drivers/net/wireless/ipw2100.c b/drivers/net/wireless/ipw2100.c index 2ab107f..5bf9e00 100644 --- a/drivers/net/wireless/ipw2100.c +++ b/drivers/net/wireless/ipw2100.c @@ -162,7 +162,7 @@ that only one external action is invoked at a time. #include <linux/firmware.h> #include <linux/acpi.h> #include <linux/ctype.h> -#include <linux/latency.h> +#include <linux/pm_qos_params.h> #include "ipw2100.h" @@ -1701,7 +1701,7 @@ static int ipw2100_up(struct ipw2100_priv *priv, int deferred) /* the ipw2100 hardware really doesn't want power management delays * longer than 175usec */ - modify_acceptable_latency("ipw2100", 175); + pm_qos_update_requirement(PM_QOS_CPU_DMA_LATENCY, "ipw2100", 175); /* If the interrupt is enabled, turn it off... 
*/ spin_lock_irqsave(&priv->low_lock, flags); @@ -1856,7 +1856,8 @@ static void ipw2100_down(struct ipw2100_priv *priv) ipw2100_disable_interrupts(priv); spin_unlock_irqrestore(&priv->low_lock, flags); - modify_acceptable_latency("ipw2100", INFINITE_LATENCY); + pm_qos_update_requirement(PM_QOS_CPU_DMA_LATENCY, "ipw2100", + PM_QOS_DEFAULT_VALUE); /* We have to signal any supplicant if we are disassociating */ if (associated) @@ -6554,7 +6555,8 @@ static int __init ipw2100_init(void) if (ret) goto out; - set_acceptable_latency("ipw2100", INFINITE_LATENCY); + pm_qos_add_requirement(PM_QOS_CPU_DMA_LATENCY, "ipw2100", + PM_QOS_DEFAULT_VALUE); #ifdef CONFIG_IPW2100_DEBUG ipw2100_debug_level = debug; ret = driver_create_file(&ipw2100_pci_driver.driver, @@ -6576,7 +6578,7 @@ static void __exit ipw2100_exit(void) &driver_attr_debug_level); #endif pci_unregister_driver(&ipw2100_pci_driver); - remove_acceptable_latency("ipw2100"); + pm_qos_remove_requirement(PM_QOS_CPU_DMA_LATENCY, "ipw2100"); } module_init(ipw2100_init); diff --git a/drivers/net/wireless/netwave_cs.c b/drivers/net/wireless/netwave_cs.c index d2fa079..f479c1a 100644 --- a/drivers/net/wireless/netwave_cs.c +++ b/drivers/net/wireless/netwave_cs.c @@ -195,7 +195,7 @@ static int netwave_pcmcia_config(struct pcmcia_device *arg); /* Runs after card static void netwave_detach(struct pcmcia_device *p_dev); /* Destroy instance */ /* Hardware configuration */ -static void netwave_doreset(kio_addr_t iobase, u_char __iomem *ramBase); +static void netwave_doreset(unsigned int iobase, u_char __iomem *ramBase); static void netwave_reset(struct net_device *dev); /* Misc device stuff */ @@ -309,7 +309,7 @@ static inline void wait_WOC(unsigned int iobase) } static void netwave_snapshot(netwave_private *priv, u_char __iomem *ramBase, - kio_addr_t iobase) { + unsigned int iobase) { u_short resultBuffer; /* if time since last snapshot is > 1 sec. (100 jiffies?) 
then take @@ -340,7 +340,7 @@ static void netwave_snapshot(netwave_private *priv, u_char __iomem *ramBase, static struct iw_statistics *netwave_get_wireless_stats(struct net_device *dev) { unsigned long flags; - kio_addr_t iobase = dev->base_addr; + unsigned int iobase = dev->base_addr; netwave_private *priv = netdev_priv(dev); u_char __iomem *ramBase = priv->ramBase; struct iw_statistics* wstats; @@ -471,7 +471,7 @@ static int netwave_set_nwid(struct net_device *dev, char *extra) { unsigned long flags; - kio_addr_t iobase = dev->base_addr; + unsigned int iobase = dev->base_addr; netwave_private *priv = netdev_priv(dev); u_char __iomem *ramBase = priv->ramBase; @@ -518,7 +518,7 @@ static int netwave_set_scramble(struct net_device *dev, char *key) { unsigned long flags; - kio_addr_t iobase = dev->base_addr; + unsigned int iobase = dev->base_addr; netwave_private *priv = netdev_priv(dev); u_char __iomem *ramBase = priv->ramBase; @@ -621,7 +621,7 @@ static int netwave_get_snap(struct net_device *dev, char *extra) { unsigned long flags; - kio_addr_t iobase = dev->base_addr; + unsigned int iobase = dev->base_addr; netwave_private *priv = netdev_priv(dev); u_char __iomem *ramBase = priv->ramBase; @@ -874,7 +874,7 @@ static int netwave_resume(struct pcmcia_device *link) * * Proper hardware reset of the card. 
*/ -static void netwave_doreset(kio_addr_t ioBase, u_char __iomem *ramBase) +static void netwave_doreset(unsigned int ioBase, u_char __iomem *ramBase) { /* Reset card */ wait_WOC(ioBase); @@ -892,7 +892,7 @@ static void netwave_reset(struct net_device *dev) { /* u_char state; */ netwave_private *priv = netdev_priv(dev); u_char __iomem *ramBase = priv->ramBase; - kio_addr_t iobase = dev->base_addr; + unsigned int iobase = dev->base_addr; DEBUG(0, "netwave_reset: Done with hardware reset\n"); @@ -973,7 +973,7 @@ static int netwave_hw_xmit(unsigned char* data, int len, netwave_private *priv = netdev_priv(dev); u_char __iomem * ramBase = priv->ramBase; - kio_addr_t iobase = dev->base_addr; + unsigned int iobase = dev->base_addr; /* Disable interrupts & save flags */ spin_lock_irqsave(&priv->spinlock, flags); @@ -1065,7 +1065,7 @@ static int netwave_start_xmit(struct sk_buff *skb, struct net_device *dev) { */ static irqreturn_t netwave_interrupt(int irq, void* dev_id) { - kio_addr_t iobase; + unsigned int iobase; u_char __iomem *ramBase; struct net_device *dev = (struct net_device *)dev_id; struct netwave_private *priv = netdev_priv(dev); @@ -1235,7 +1235,7 @@ static int netwave_rx(struct net_device *dev) { netwave_private *priv = netdev_priv(dev); u_char __iomem *ramBase = priv->ramBase; - kio_addr_t iobase = dev->base_addr; + unsigned int iobase = dev->base_addr; u_char rxStatus; struct sk_buff *skb = NULL; unsigned int curBuffer, @@ -1388,7 +1388,7 @@ module_exit(exit_netwave_cs); */ static void set_multicast_list(struct net_device *dev) { - kio_addr_t iobase = dev->base_addr; + unsigned int iobase = dev->base_addr; netwave_private *priv = netdev_priv(dev); u_char __iomem * ramBase = priv->ramBase; u_char rcvMode = 0; diff --git a/drivers/net/wireless/wavelan_cs.c b/drivers/net/wireless/wavelan_cs.c index c2037b2..06eea6a 100644 --- a/drivers/net/wireless/wavelan_cs.c +++ b/drivers/net/wireless/wavelan_cs.c @@ -149,7 +149,7 @@ psa_write(struct net_device * dev, 
net_local *lp = netdev_priv(dev); u_char __iomem *ptr = lp->mem + PSA_ADDR + (o << 1); int count = 0; - kio_addr_t base = dev->base_addr; + unsigned int base = dev->base_addr; /* As there seem to have no flag PSA_BUSY as in the ISA model, we are * oblige to verify this address to know when the PSA is ready... */ volatile u_char __iomem *verify = lp->mem + PSA_ADDR + @@ -708,7 +708,7 @@ static void wl_update_history(wavepoint_history *wavepoint, unsigned char sigqua /* Perform a handover to a new WavePoint */ static void wv_roam_handover(wavepoint_history *wavepoint, net_local *lp) { - kio_addr_t base = lp->dev->base_addr; + unsigned int base = lp->dev->base_addr; mm_t m; unsigned long flags; @@ -821,7 +821,7 @@ wv_82593_cmd(struct net_device * dev, int cmd, int result) { - kio_addr_t base = dev->base_addr; + unsigned int base = dev->base_addr; int status; int wait_completed; long spin; @@ -945,7 +945,7 @@ read_ringbuf(struct net_device * dev, char * buf, int len) { - kio_addr_t base = dev->base_addr; + unsigned int base = dev->base_addr; int ring_ptr = addr; int chunk_len; char * buf_ptr = buf; @@ -1096,7 +1096,7 @@ wv_psa_show(psa_t * p) static void wv_mmc_show(struct net_device * dev) { - kio_addr_t base = dev->base_addr; + unsigned int base = dev->base_addr; net_local * lp = netdev_priv(dev); mmr_t m; @@ -1275,7 +1275,7 @@ wv_packet_info(u_char * p, /* Packet to dump */ static inline void wv_init_info(struct net_device * dev) { - kio_addr_t base = dev->base_addr; + unsigned int base = dev->base_addr; psa_t psa; DECLARE_MAC_BUF(mac); @@ -1294,7 +1294,7 @@ wv_init_info(struct net_device * dev) #ifdef DEBUG_BASIC_SHOW /* Now, let's go for the basic stuff */ - printk(KERN_NOTICE "%s: WaveLAN: port %#lx, irq %d, " + printk(KERN_NOTICE "%s: WaveLAN: port %#x, irq %d, " "hw_addr %s", dev->name, base, dev->irq, print_mac(mac, dev->dev_addr)); @@ -1828,7 +1828,7 @@ static int wavelan_set_nwid(struct net_device *dev, union iwreq_data *wrqu, char *extra) { - kio_addr_t 
base = dev->base_addr; + unsigned int base = dev->base_addr; net_local *lp = netdev_priv(dev); psa_t psa; mm_t m; @@ -1918,7 +1918,7 @@ static int wavelan_set_freq(struct net_device *dev, union iwreq_data *wrqu, char *extra) { - kio_addr_t base = dev->base_addr; + unsigned int base = dev->base_addr; net_local *lp = netdev_priv(dev); unsigned long flags; int ret; @@ -1948,7 +1948,7 @@ static int wavelan_get_freq(struct net_device *dev, union iwreq_data *wrqu, char *extra) { - kio_addr_t base = dev->base_addr; + unsigned int base = dev->base_addr; net_local *lp = netdev_priv(dev); psa_t psa; unsigned long flags; @@ -1994,7 +1994,7 @@ static int wavelan_set_sens(struct net_device *dev, union iwreq_data *wrqu, char *extra) { - kio_addr_t base = dev->base_addr; + unsigned int base = dev->base_addr; net_local *lp = netdev_priv(dev); psa_t psa; unsigned long flags; @@ -2060,7 +2060,7 @@ static int wavelan_set_encode(struct net_device *dev, union iwreq_data *wrqu, char *extra) { - kio_addr_t base = dev->base_addr; + unsigned int base = dev->base_addr; net_local *lp = netdev_priv(dev); unsigned long flags; psa_t psa; @@ -2130,7 +2130,7 @@ static int wavelan_get_encode(struct net_device *dev, union iwreq_data *wrqu, char *extra) { - kio_addr_t base = dev->base_addr; + unsigned int base = dev->base_addr; net_local *lp = netdev_priv(dev); psa_t psa; unsigned long flags; @@ -2349,7 +2349,7 @@ static int wavelan_get_range(struct net_device *dev, union iwreq_data *wrqu, char *extra) { - kio_addr_t base = dev->base_addr; + unsigned int base = dev->base_addr; net_local *lp = netdev_priv(dev); struct iw_range *range = (struct iw_range *) extra; unsigned long flags; @@ -2425,7 +2425,7 @@ static int wavelan_set_qthr(struct net_device *dev, union iwreq_data *wrqu, char *extra) { - kio_addr_t base = dev->base_addr; + unsigned int base = dev->base_addr; net_local *lp = netdev_priv(dev); psa_t psa; unsigned long flags; @@ -2701,7 +2701,7 @@ static const struct iw_handler_def 
wavelan_handler_def = static iw_stats * wavelan_get_wireless_stats(struct net_device * dev) { - kio_addr_t base = dev->base_addr; + unsigned int base = dev->base_addr; net_local * lp = netdev_priv(dev); mmr_t m; iw_stats * wstats; @@ -2764,7 +2764,7 @@ wv_start_of_frame(struct net_device * dev, int rfp, /* end of frame */ int wrap) /* start of buffer */ { - kio_addr_t base = dev->base_addr; + unsigned int base = dev->base_addr; int rp; int len; @@ -2925,7 +2925,7 @@ wv_packet_read(struct net_device * dev, static inline void wv_packet_rcv(struct net_device * dev) { - kio_addr_t base = dev->base_addr; + unsigned int base = dev->base_addr; net_local * lp = netdev_priv(dev); int newrfp; int rp; @@ -3062,7 +3062,7 @@ wv_packet_write(struct net_device * dev, short length) { net_local * lp = netdev_priv(dev); - kio_addr_t base = dev->base_addr; + unsigned int base = dev->base_addr; unsigned long flags; int clen = length; register u_short xmtdata_base = TX_BASE; @@ -3183,7 +3183,7 @@ wavelan_packet_xmit(struct sk_buff * skb, static inline int wv_mmc_init(struct net_device * dev) { - kio_addr_t base = dev->base_addr; + unsigned int base = dev->base_addr; psa_t psa; mmw_t m; int configured; @@ -3377,7 +3377,7 @@ wv_mmc_init(struct net_device * dev) static int wv_ru_stop(struct net_device * dev) { - kio_addr_t base = dev->base_addr; + unsigned int base = dev->base_addr; net_local * lp = netdev_priv(dev); unsigned long flags; int status; @@ -3440,7 +3440,7 @@ wv_ru_stop(struct net_device * dev) static int wv_ru_start(struct net_device * dev) { - kio_addr_t base = dev->base_addr; + unsigned int base = dev->base_addr; net_local * lp = netdev_priv(dev); unsigned long flags; @@ -3528,7 +3528,7 @@ wv_ru_start(struct net_device * dev) static int wv_82593_config(struct net_device * dev) { - kio_addr_t base = dev->base_addr; + unsigned int base = dev->base_addr; net_local * lp = netdev_priv(dev); struct i82593_conf_block cfblk; int ret = TRUE; @@ -3765,7 +3765,7 @@ static int 
wv_hw_config(struct net_device * dev) { net_local * lp = netdev_priv(dev); - kio_addr_t base = dev->base_addr; + unsigned int base = dev->base_addr; unsigned long flags; int ret = FALSE; @@ -4047,7 +4047,7 @@ wavelan_interrupt(int irq, { struct net_device * dev = dev_id; net_local * lp; - kio_addr_t base; + unsigned int base; int status0; u_int tx_status; @@ -4306,7 +4306,7 @@ static void wavelan_watchdog(struct net_device * dev) { net_local * lp = netdev_priv(dev); - kio_addr_t base = dev->base_addr; + unsigned int base = dev->base_addr; unsigned long flags; int aborted = FALSE; @@ -4382,7 +4382,7 @@ wavelan_open(struct net_device * dev) { net_local * lp = netdev_priv(dev); struct pcmcia_device * link = lp->link; - kio_addr_t base = dev->base_addr; + unsigned int base = dev->base_addr; #ifdef DEBUG_CALLBACK_TRACE printk(KERN_DEBUG "%s: ->wavelan_open(dev=0x%x)\n", dev->name, @@ -4436,7 +4436,7 @@ static int wavelan_close(struct net_device * dev) { struct pcmcia_device * link = ((net_local *)netdev_priv(dev))->link; - kio_addr_t base = dev->base_addr; + unsigned int base = dev->base_addr; #ifdef DEBUG_CALLBACK_TRACE printk(KERN_DEBUG "%s: ->wavelan_close(dev=0x%x)\n", dev->name, diff --git a/drivers/nubus/Makefile b/drivers/nubus/Makefile index f5ef03c..21bda20 100644 --- a/drivers/nubus/Makefile +++ b/drivers/nubus/Makefile @@ -4,5 +4,4 @@ obj-y := nubus.o -obj-$(CONFIG_MODULES) += nubus_syms.o obj-$(CONFIG_PROC_FS) += proc.o diff --git a/drivers/nubus/nubus.c b/drivers/nubus/nubus.c index f4076ae..2f047e5 100644 --- a/drivers/nubus/nubus.c +++ b/drivers/nubus/nubus.c @@ -14,6 +14,7 @@ #include <linux/errno.h> #include <linux/init.h> #include <linux/delay.h> +#include <linux/module.h> #include <asm/setup.h> #include <asm/system.h> #include <asm/page.h> @@ -186,6 +187,7 @@ void nubus_get_rsrc_mem(void *dest, const struct nubus_dirent* dirent, len--; } } +EXPORT_SYMBOL(nubus_get_rsrc_mem); void nubus_get_rsrc_str(void *dest, const struct nubus_dirent* dirent, int 
len) @@ -200,6 +202,7 @@ void nubus_get_rsrc_str(void *dest, const struct nubus_dirent* dirent, len--; } } +EXPORT_SYMBOL(nubus_get_rsrc_str); int nubus_get_root_dir(const struct nubus_board* board, struct nubus_dir* dir) @@ -209,6 +212,7 @@ int nubus_get_root_dir(const struct nubus_board* board, dir->mask = board->lanes; return 0; } +EXPORT_SYMBOL(nubus_get_root_dir); /* This is a slyly renamed version of the above */ int nubus_get_func_dir(const struct nubus_dev* dev, @@ -219,6 +223,7 @@ int nubus_get_func_dir(const struct nubus_dev* dev, dir->mask = dev->board->lanes; return 0; } +EXPORT_SYMBOL(nubus_get_func_dir); int nubus_get_board_dir(const struct nubus_board* board, struct nubus_dir* dir) @@ -237,6 +242,7 @@ int nubus_get_board_dir(const struct nubus_board* board, return -1; return 0; } +EXPORT_SYMBOL(nubus_get_board_dir); int nubus_get_subdir(const struct nubus_dirent *ent, struct nubus_dir *dir) @@ -246,6 +252,7 @@ int nubus_get_subdir(const struct nubus_dirent *ent, dir->mask = ent->mask; return 0; } +EXPORT_SYMBOL(nubus_get_subdir); int nubus_readdir(struct nubus_dir *nd, struct nubus_dirent *ent) { @@ -274,12 +281,14 @@ int nubus_readdir(struct nubus_dir *nd, struct nubus_dirent *ent) ent->mask = nd->mask; return 0; } +EXPORT_SYMBOL(nubus_readdir); int nubus_rewinddir(struct nubus_dir* dir) { dir->ptr = dir->base; return 0; } +EXPORT_SYMBOL(nubus_rewinddir); /* Driver interface functions, more or less like in pci.c */ @@ -303,6 +312,7 @@ nubus_find_device(unsigned short category, } return NULL; } +EXPORT_SYMBOL(nubus_find_device); struct nubus_dev* nubus_find_type(unsigned short category, @@ -320,6 +330,7 @@ nubus_find_type(unsigned short category, } return NULL; } +EXPORT_SYMBOL(nubus_find_type); struct nubus_dev* nubus_find_slot(unsigned int slot, @@ -335,6 +346,7 @@ nubus_find_slot(unsigned int slot, } return NULL; } +EXPORT_SYMBOL(nubus_find_slot); int nubus_find_rsrc(struct nubus_dir* dir, unsigned char rsrc_type, @@ -346,6 +358,7 @@ 
nubus_find_rsrc(struct nubus_dir* dir, unsigned char rsrc_type, } return -1; } +EXPORT_SYMBOL(nubus_find_rsrc); /* Initialization functions - decide which slots contain stuff worth looking at, and print out lots and lots of information from the diff --git a/drivers/nubus/nubus_syms.c b/drivers/nubus/nubus_syms.c deleted file mode 100644 index 9204f04..0000000 --- a/drivers/nubus/nubus_syms.c +++ /dev/null @@ -1,28 +0,0 @@ -/* Exported symbols for NuBus services - - (c) 1999 David Huggins-Daines <dhd@debian.org> */ - -#include <linux/module.h> -#include <linux/types.h> -#include <linux/nubus.h> - -#ifdef CONFIG_PROC_FS -EXPORT_SYMBOL(nubus_proc_attach_device); -EXPORT_SYMBOL(nubus_proc_detach_device); -#endif - -MODULE_LICENSE("GPL"); - -EXPORT_SYMBOL(nubus_find_device); -EXPORT_SYMBOL(nubus_find_type); -EXPORT_SYMBOL(nubus_find_slot); -EXPORT_SYMBOL(nubus_get_root_dir); -EXPORT_SYMBOL(nubus_get_board_dir); -EXPORT_SYMBOL(nubus_get_func_dir); -EXPORT_SYMBOL(nubus_readdir); -EXPORT_SYMBOL(nubus_find_rsrc); -EXPORT_SYMBOL(nubus_rewinddir); -EXPORT_SYMBOL(nubus_get_subdir); -EXPORT_SYMBOL(nubus_get_rsrc_mem); -EXPORT_SYMBOL(nubus_get_rsrc_str); - diff --git a/drivers/nubus/proc.c b/drivers/nubus/proc.c index 5271a4a..e07492b 100644 --- a/drivers/nubus/proc.c +++ b/drivers/nubus/proc.c @@ -22,6 +22,8 @@ #include <linux/nubus.h> #include <linux/proc_fs.h> #include <linux/init.h> +#include <linux/module.h> + #include <asm/uaccess.h> #include <asm/byteorder.h> @@ -140,6 +142,7 @@ int nubus_proc_attach_device(struct nubus_dev *dev) return 0; } +EXPORT_SYMBOL(nubus_proc_attach_device); /* FIXME: this is certainly broken! 
*/ int nubus_proc_detach_device(struct nubus_dev *dev) @@ -154,6 +157,7 @@ int nubus_proc_detach_device(struct nubus_dev *dev) } return 0; } +EXPORT_SYMBOL(nubus_proc_detach_device); void __init proc_bus_nubus_add_devices(void) { diff --git a/drivers/parisc/ccio-dma.c b/drivers/parisc/ccio-dma.c index ca52307..d08b284 100644 --- a/drivers/parisc/ccio-dma.c +++ b/drivers/parisc/ccio-dma.c @@ -941,7 +941,7 @@ ccio_map_sg(struct device *dev, struct scatterlist *sglist, int nents, ** w/o this association, we wouldn't have coherent DMA! ** Access to the virtual address is what forces a two pass algorithm. */ - coalesced = iommu_coalesce_chunks(ioc, sglist, nents, ccio_alloc_range); + coalesced = iommu_coalesce_chunks(ioc, dev, sglist, nents, ccio_alloc_range); /* ** Program the I/O Pdir diff --git a/drivers/parisc/iommu-helpers.h b/drivers/parisc/iommu-helpers.h index 0a1f99a..97ba828 100644 --- a/drivers/parisc/iommu-helpers.h +++ b/drivers/parisc/iommu-helpers.h @@ -95,12 +95,14 @@ iommu_fill_pdir(struct ioc *ioc, struct scatterlist *startsg, int nents, */ static inline unsigned int -iommu_coalesce_chunks(struct ioc *ioc, struct scatterlist *startsg, int nents, +iommu_coalesce_chunks(struct ioc *ioc, struct device *dev, + struct scatterlist *startsg, int nents, int (*iommu_alloc_range)(struct ioc *, size_t)) { struct scatterlist *contig_sg; /* contig chunk head */ unsigned long dma_offset, dma_len; /* start/len of DMA stream */ unsigned int n_mappings = 0; + unsigned int max_seg_size = dma_get_max_seg_size(dev); while (nents > 0) { @@ -142,6 +144,9 @@ iommu_coalesce_chunks(struct ioc *ioc, struct scatterlist *startsg, int nents, IOVP_SIZE) > DMA_CHUNK_SIZE)) break; + if (startsg->length + dma_len > max_seg_size) + break; + /* ** Next see if we can append the next chunk (i.e. 
** it must end on one page and begin on another diff --git a/drivers/parisc/sba_iommu.c b/drivers/parisc/sba_iommu.c index e527a0e..d06627c 100644 --- a/drivers/parisc/sba_iommu.c +++ b/drivers/parisc/sba_iommu.c @@ -946,7 +946,7 @@ sba_map_sg(struct device *dev, struct scatterlist *sglist, int nents, ** w/o this association, we wouldn't have coherent DMA! ** Access to the virtual address is what forces a two pass algorithm. */ - coalesced = iommu_coalesce_chunks(ioc, sglist, nents, sba_alloc_range); + coalesced = iommu_coalesce_chunks(ioc, dev, sglist, nents, sba_alloc_range); /* ** Program the I/O Pdir diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 04aac77..ae3df46 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -1451,6 +1451,22 @@ pci_set_consistent_dma_mask(struct pci_dev *dev, u64 mask) } #endif +#ifndef HAVE_ARCH_PCI_SET_DMA_MAX_SEGMENT_SIZE +int pci_set_dma_max_seg_size(struct pci_dev *dev, unsigned int size) +{ + return dma_set_max_seg_size(&dev->dev, size); +} +EXPORT_SYMBOL(pci_set_dma_max_seg_size); +#endif + +#ifndef HAVE_ARCH_PCI_SET_DMA_SEGMENT_BOUNDARY +int pci_set_dma_seg_boundary(struct pci_dev *dev, unsigned long mask) +{ + return dma_set_seg_boundary(&dev->dev, mask); +} +EXPORT_SYMBOL(pci_set_dma_seg_boundary); +#endif + /** * pcix_get_max_mmrbc - get PCI-X maximum designed memory read byte count * @dev: PCI device to query diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 7f5dab3..4d23b9f 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -933,8 +933,12 @@ void pci_device_add(struct pci_dev *dev, struct pci_bus *bus) set_dev_node(&dev->dev, pcibus_to_node(bus)); dev->dev.dma_mask = &dev->dma_mask; + dev->dev.dma_parms = &dev->dma_parms; dev->dev.coherent_dma_mask = 0xffffffffull; + pci_set_dma_max_seg_size(dev, 65536); + pci_set_dma_seg_boundary(dev, 0xffffffff); + /* Fix up broken headers */ pci_fixup_device(pci_fixup_header, dev); diff --git a/drivers/pcmcia/at91_cf.c b/drivers/pcmcia/at91_cf.c index 
eb6abd3..385e145 100644 --- a/drivers/pcmcia/at91_cf.c +++ b/drivers/pcmcia/at91_cf.c @@ -21,9 +21,9 @@ #include <asm/hardware.h> #include <asm/io.h> #include <asm/sizes.h> +#include <asm/gpio.h> #include <asm/arch/board.h> -#include <asm/arch/gpio.h> #include <asm/arch/at91rm9200_mc.h> @@ -56,7 +56,7 @@ struct at91_cf_socket { static inline int at91_cf_present(struct at91_cf_socket *cf) { - return !at91_get_gpio_value(cf->board->det_pin); + return !gpio_get_value(cf->board->det_pin); } /*--------------------------------------------------------------------------*/ @@ -100,9 +100,9 @@ static int at91_cf_get_status(struct pcmcia_socket *s, u_int *sp) int vcc = cf->board->vcc_pin; *sp = SS_DETECT | SS_3VCARD; - if (!rdy || at91_get_gpio_value(rdy)) + if (!rdy || gpio_get_value(rdy)) *sp |= SS_READY; - if (!vcc || at91_get_gpio_value(vcc)) + if (!vcc || gpio_get_value(vcc)) *sp |= SS_POWERON; } else *sp = 0; @@ -121,10 +121,10 @@ at91_cf_set_socket(struct pcmcia_socket *sock, struct socket_state_t *s) if (cf->board->vcc_pin) { switch (s->Vcc) { case 0: - at91_set_gpio_value(cf->board->vcc_pin, 0); + gpio_set_value(cf->board->vcc_pin, 0); break; case 33: - at91_set_gpio_value(cf->board->vcc_pin, 1); + gpio_set_value(cf->board->vcc_pin, 1); break; default: return -EINVAL; @@ -132,7 +132,7 @@ at91_cf_set_socket(struct pcmcia_socket *sock, struct socket_state_t *s) } /* toggle reset if needed */ - at91_set_gpio_value(cf->board->rst_pin, s->flags & SS_RESET); + gpio_set_value(cf->board->rst_pin, s->flags & SS_RESET); pr_debug("%s: Vcc %d, io_irq %d, flags %04x csc %04x\n", driver_name, s->Vcc, s->io_irq, s->flags, s->csc_mask); @@ -239,11 +239,24 @@ static int __init at91_cf_probe(struct platform_device *pdev) platform_set_drvdata(pdev, cf); /* must be a GPIO; ergo must trigger on both edges */ - status = request_irq(board->det_pin, at91_cf_irq, 0, driver_name, cf); + status = gpio_request(board->det_pin, "cf_det"); if (status < 0) goto fail0; + status = 
request_irq(board->det_pin, at91_cf_irq, 0, driver_name, cf); + if (status < 0) + goto fail00; device_init_wakeup(&pdev->dev, 1); + status = gpio_request(board->rst_pin, "cf_rst"); + if (status < 0) + goto fail0a; + + if (board->vcc_pin) { + status = gpio_request(board->vcc_pin, "cf_vcc"); + if (status < 0) + goto fail0b; + } + /* * The card driver will request this irq later as needed. * but it causes lots of "irqNN: nobody cared" messages @@ -251,16 +264,20 @@ static int __init at91_cf_probe(struct platform_device *pdev) * (Note: DK board doesn't wire the IRQ pin...) */ if (board->irq_pin) { + status = gpio_request(board->irq_pin, "cf_irq"); + if (status < 0) + goto fail0c; status = request_irq(board->irq_pin, at91_cf_irq, IRQF_SHARED, driver_name, cf); if (status < 0) - goto fail0a; + goto fail0d; cf->socket.pci_irq = board->irq_pin; } else cf->socket.pci_irq = NR_IRQS + 1; /* pcmcia layer only remaps "real" memory not iospace */ - cf->socket.io_offset = (unsigned long) ioremap(cf->phys_baseaddr + CF_IO_PHYS, SZ_2K); + cf->socket.io_offset = (unsigned long) + ioremap(cf->phys_baseaddr + CF_IO_PHYS, SZ_2K); if (!cf->socket.io_offset) { status = -ENXIO; goto fail1; @@ -296,11 +313,21 @@ fail2: fail1: if (cf->socket.io_offset) iounmap((void __iomem *) cf->socket.io_offset); - if (board->irq_pin) + if (board->irq_pin) { free_irq(board->irq_pin, cf); +fail0d: + gpio_free(board->irq_pin); + } +fail0c: + if (board->vcc_pin) + gpio_free(board->vcc_pin); +fail0b: + gpio_free(board->rst_pin); fail0a: device_init_wakeup(&pdev->dev, 0); free_irq(board->det_pin, cf); +fail00: + gpio_free(board->det_pin); fail0: kfree(cf); return status; @@ -313,13 +340,18 @@ static int __exit at91_cf_remove(struct platform_device *pdev) struct resource *io = cf->socket.io[0].res; pcmcia_unregister_socket(&cf->socket); - if (board->irq_pin) + release_mem_region(io->start, io->end + 1 - io->start); + iounmap((void __iomem *) cf->socket.io_offset); + if (board->irq_pin) { 
free_irq(board->irq_pin, cf); + gpio_free(board->irq_pin); + } + if (board->vcc_pin) + gpio_free(board->vcc_pin); + gpio_free(board->rst_pin); device_init_wakeup(&pdev->dev, 0); free_irq(board->det_pin, cf); - iounmap((void __iomem *) cf->socket.io_offset); - release_mem_region(io->start, io->end + 1 - io->start); - + gpio_free(board->det_pin); kfree(cf); return 0; } diff --git a/drivers/pcmcia/cardbus.c b/drivers/pcmcia/cardbus.c index a1bd763..714baae 100644 --- a/drivers/pcmcia/cardbus.c +++ b/drivers/pcmcia/cardbus.c @@ -143,7 +143,7 @@ int read_cb_mem(struct pcmcia_socket * s, int space, u_int addr, u_int len, void /* Config space? */ if (space == 0) { if (addr + len > 0x100) - goto fail; + goto failput; for (; len; addr++, ptr++, len--) pci_read_config_byte(dev, addr, ptr); return 0; @@ -171,6 +171,8 @@ int read_cb_mem(struct pcmcia_socket * s, int space, u_int addr, u_int len, void memcpy_fromio(ptr, s->cb_cis_virt + addr, len); return 0; +failput: + pci_dev_put(dev); fail: memset(ptr, 0xff, len); return -1; diff --git a/drivers/pcmcia/ds.c b/drivers/pcmcia/ds.c index 15c18f5..5a85871 100644 --- a/drivers/pcmcia/ds.c +++ b/drivers/pcmcia/ds.c @@ -865,11 +865,12 @@ static int pcmcia_load_firmware(struct pcmcia_device *dev, char * filename) ds_dbg(1, "trying to load CIS file %s\n", filename); if (strlen(filename) > 14) { - printk(KERN_WARNING "pcmcia: CIS filename is too long\n"); + printk(KERN_WARNING "pcmcia: CIS filename is too long [%s]\n", + filename); return -EINVAL; } - snprintf(path, 20, "%s", filename); + snprintf(path, sizeof(path), "%s", filename); if (request_firmware(&fw, path, &dev->dev) == 0) { if (fw->size >= CISTPL_MAX_CIS_SIZE) { @@ -1130,8 +1131,6 @@ static int runtime_suspend(struct device *dev) down(&dev->sem); rc = pcmcia_dev_suspend(dev, PMSG_SUSPEND); up(&dev->sem); - if (!rc) - dev->power.power_state.event = PM_EVENT_SUSPEND; return rc; } @@ -1142,8 +1141,6 @@ static void runtime_resume(struct device *dev) down(&dev->sem); rc = 
pcmcia_dev_resume(dev); up(&dev->sem); - if (!rc) - dev->power.power_state.event = PM_EVENT_ON; } /************************ per-device sysfs output ***************************/ @@ -1265,6 +1262,9 @@ static int pcmcia_dev_suspend(struct device * dev, pm_message_t state) struct pcmcia_driver *p_drv = NULL; int ret = 0; + if (p_dev->suspended) + return 0; + ds_dbg(2, "suspending %s\n", dev->bus_id); if (dev->driver) @@ -1301,6 +1301,9 @@ static int pcmcia_dev_resume(struct device * dev) struct pcmcia_driver *p_drv = NULL; int ret = 0; + if (!p_dev->suspended) + return 0; + ds_dbg(2, "resuming %s\n", dev->bus_id); if (dev->driver) diff --git a/drivers/pcmcia/i82092.c b/drivers/pcmcia/i82092.c index df21e2d..7495155 100644 --- a/drivers/pcmcia/i82092.c +++ b/drivers/pcmcia/i82092.c @@ -82,7 +82,7 @@ struct socket_info { 1 = empty socket, 2 = card but not initialized, 3 = operational card */ - kio_addr_t io_base; /* base io address of the socket */ + unsigned int io_base; /* base io address of the socket */ struct pcmcia_socket socket; struct pci_dev *dev; /* The PCI device for the socket */ diff --git a/drivers/pcmcia/i82365.c b/drivers/pcmcia/i82365.c index 839bb1c..32a2ab1 100644 --- a/drivers/pcmcia/i82365.c +++ b/drivers/pcmcia/i82365.c @@ -164,7 +164,7 @@ struct i82365_socket { u_short type, flags; struct pcmcia_socket socket; unsigned int number; - kio_addr_t ioaddr; + unsigned int ioaddr; u_short psock; u_char cs_irq, intr; union { @@ -238,7 +238,7 @@ static u_char i365_get(u_short sock, u_short reg) unsigned long flags; spin_lock_irqsave(&bus_lock,flags); { - kio_addr_t port = socket[sock].ioaddr; + unsigned int port = socket[sock].ioaddr; u_char val; reg = I365_REG(socket[sock].psock, reg); outb(reg, port); val = inb(port+1); @@ -252,7 +252,7 @@ static void i365_set(u_short sock, u_short reg, u_char data) unsigned long flags; spin_lock_irqsave(&bus_lock,flags); { - kio_addr_t port = socket[sock].ioaddr; + unsigned int port = socket[sock].ioaddr; u_char val = 
I365_REG(socket[sock].psock, reg); outb(val, port); outb(data, port+1); spin_unlock_irqrestore(&bus_lock,flags); @@ -588,7 +588,7 @@ static int to_cycles(int ns) /*====================================================================*/ -static int __init identify(kio_addr_t port, u_short sock) +static int __init identify(unsigned int port, u_short sock) { u_char val; int type = -1; @@ -659,7 +659,7 @@ static int __init identify(kio_addr_t port, u_short sock) static int __init is_alive(u_short sock) { u_char stat; - kio_addr_t start, stop; + unsigned int start, stop; stat = i365_get(sock, I365_STATUS); start = i365_get_pair(sock, I365_IO(0)+I365_W_START); @@ -678,7 +678,7 @@ static int __init is_alive(u_short sock) /*====================================================================*/ -static void __init add_socket(kio_addr_t port, int psock, int type) +static void __init add_socket(unsigned int port, int psock, int type) { socket[sockets].ioaddr = port; socket[sockets].psock = psock; @@ -698,7 +698,7 @@ static void __init add_pcic(int ns, int type) base = sockets-ns; if (base == 0) printk("\n"); printk(KERN_INFO " %s", pcic[type].name); - printk(" ISA-to-PCMCIA at port %#lx ofs 0x%02x", + printk(" ISA-to-PCMCIA at port %#x ofs 0x%02x", t->ioaddr, t->psock*0x40); printk(", %d socket%s\n", ns, ((ns > 1) ? 
"s" : "")); @@ -772,7 +772,7 @@ static struct pnp_dev *i82365_pnpdev; static void __init isa_probe(void) { int i, j, sock, k, ns, id; - kio_addr_t port; + unsigned int port; #ifdef CONFIG_PNP struct isapnp_device_id *devid; struct pnp_dev *dev; @@ -1053,7 +1053,7 @@ static int i365_set_io_map(u_short sock, struct pccard_io_map *io) u_char map, ioctl; debug(1, "SetIOMap(%d, %d, %#2.2x, %d ns, " - "%#lx-%#lx)\n", sock, io->map, io->flags, + "%#x-%#x)\n", sock, io->map, io->flags, io->speed, io->start, io->stop); map = io->map; if ((map > 1) || (io->start > 0xffff) || (io->stop > 0xffff) || diff --git a/drivers/pcmcia/m32r_cfc.c b/drivers/pcmcia/m32r_cfc.c index 91da15b..3616da2 100644 --- a/drivers/pcmcia/m32r_cfc.c +++ b/drivers/pcmcia/m32r_cfc.c @@ -58,7 +58,7 @@ typedef struct pcc_socket { u_short type, flags; struct pcmcia_socket socket; unsigned int number; - kio_addr_t ioaddr; + unsigned int ioaddr; u_long mapaddr; u_long base; /* PCC register base */ u_char cs_irq1, cs_irq2, intr; @@ -298,7 +298,8 @@ static int __init is_alive(u_short sock) return 0; } -static void add_pcc_socket(ulong base, int irq, ulong mapaddr, kio_addr_t ioaddr) +static void add_pcc_socket(ulong base, int irq, ulong mapaddr, + unsigned int ioaddr) { pcc_socket_t *t = &socket[pcc_sockets]; @@ -738,7 +739,7 @@ static int __init init_m32r_pcc(void) #else /* CONFIG_PLAT_USRV */ { ulong base, mapaddr; - kio_addr_t ioaddr; + unsigned int ioaddr; for (i = 0 ; i < M32R_MAX_PCC ; i++) { base = (ulong)PLD_CFRSTCR; diff --git a/drivers/pcmcia/m32r_pcc.c b/drivers/pcmcia/m32r_pcc.c index ec4c125..2b42b71 100644 --- a/drivers/pcmcia/m32r_pcc.c +++ b/drivers/pcmcia/m32r_pcc.c @@ -65,7 +65,7 @@ typedef struct pcc_socket { u_short type, flags; struct pcmcia_socket socket; unsigned int number; - kio_addr_t ioaddr; + unsigned int ioaddr; u_long mapaddr; u_long base; /* PCC register base */ u_char cs_irq, intr; @@ -310,7 +310,8 @@ static int __init is_alive(u_short sock) return 0; } -static void 
add_pcc_socket(ulong base, int irq, ulong mapaddr, kio_addr_t ioaddr) +static void add_pcc_socket(ulong base, int irq, ulong mapaddr, + unsigned int ioaddr) { pcc_socket_t *t = &socket[pcc_sockets]; @@ -491,7 +492,7 @@ static int _pcc_set_io_map(u_short sock, struct pccard_io_map *io) u_char map; debug(3, "m32r-pcc: SetIOMap(%d, %d, %#2.2x, %d ns, " - "%#lx-%#lx)\n", sock, io->map, io->flags, + "%#x-%#x)\n", sock, io->map, io->flags, io->speed, io->start, io->stop); map = io->map; diff --git a/drivers/pcmcia/m8xx_pcmcia.c b/drivers/pcmcia/m8xx_pcmcia.c index 4ea426a..ac70d2c 100644 --- a/drivers/pcmcia/m8xx_pcmcia.c +++ b/drivers/pcmcia/m8xx_pcmcia.c @@ -1174,8 +1174,10 @@ static int __init m8xx_probe(struct of_device *ofdev, pcmcia_schlvl = irq_of_parse_and_map(np, 0); hwirq = irq_map[pcmcia_schlvl].hwirq; - if (pcmcia_schlvl < 0) + if (pcmcia_schlvl < 0) { + iounmap(pcmcia); return -EINVAL; + } m8xx_pgcrx[0] = &pcmcia->pcmc_pgcra; m8xx_pgcrx[1] = &pcmcia->pcmc_pgcrb; @@ -1189,6 +1191,7 @@ static int __init m8xx_probe(struct of_device *ofdev, driver_name, socket)) { pcmcia_error("Cannot allocate IRQ %u for SCHLVL!\n", pcmcia_schlvl); + iounmap(pcmcia); return -1; } @@ -1284,6 +1287,7 @@ static int m8xx_remove(struct of_device *ofdev) } for (i = 0; i < PCMCIA_SOCKETS_NO; i++) pcmcia_unregister_socket(&socket[i].socket); + iounmap(pcmcia); free_irq(pcmcia_schlvl, NULL); diff --git a/drivers/pcmcia/pcmcia_resource.c b/drivers/pcmcia/pcmcia_resource.c index 0ce39de..1d128fb 100644 --- a/drivers/pcmcia/pcmcia_resource.c +++ b/drivers/pcmcia/pcmcia_resource.c @@ -65,23 +65,23 @@ extern int ds_pc_debug; * Special stuff for managing IO windows, because they are scarce */ -static int alloc_io_space(struct pcmcia_socket *s, u_int attr, ioaddr_t *base, - ioaddr_t num, u_int lines) +static int alloc_io_space(struct pcmcia_socket *s, u_int attr, + unsigned int *base, unsigned int num, u_int lines) { int i; - kio_addr_t try, align; + unsigned int try, align; align = (*base) ? 
(lines ? 1<<lines : 0) : 1; if (align && (align < num)) { if (*base) { - ds_dbg(s, 0, "odd IO request: num %#x align %#lx\n", + ds_dbg(s, 0, "odd IO request: num %#x align %#x\n", num, align); align = 0; } else while (align && (align < num)) align <<= 1; } if (*base & ~(align-1)) { - ds_dbg(s, 0, "odd IO request: base %#x align %#lx\n", + ds_dbg(s, 0, "odd IO request: base %#x align %#x\n", *base, align); align = 0; } @@ -132,8 +132,8 @@ static int alloc_io_space(struct pcmcia_socket *s, u_int attr, ioaddr_t *base, } /* alloc_io_space */ -static void release_io_space(struct pcmcia_socket *s, ioaddr_t base, - ioaddr_t num) +static void release_io_space(struct pcmcia_socket *s, unsigned int base, + unsigned int num) { int i; diff --git a/drivers/pcmcia/rsrc_nonstatic.c b/drivers/pcmcia/rsrc_nonstatic.c index bfcaad6..a8d1007 100644 --- a/drivers/pcmcia/rsrc_nonstatic.c +++ b/drivers/pcmcia/rsrc_nonstatic.c @@ -186,15 +186,16 @@ static int sub_interval(struct resource_map *map, u_long base, u_long num) ======================================================================*/ #ifdef CONFIG_PCMCIA_PROBE -static void do_io_probe(struct pcmcia_socket *s, kio_addr_t base, kio_addr_t num) +static void do_io_probe(struct pcmcia_socket *s, unsigned int base, + unsigned int num) { struct resource *res; struct socket_data *s_data = s->resource_data; - kio_addr_t i, j, bad; + unsigned int i, j, bad; int any; u_char *b, hole, most; - printk(KERN_INFO "cs: IO port probe %#lx-%#lx:", + printk(KERN_INFO "cs: IO port probe %#x-%#x:", base, base+num-1); /* First, what does a floating port look like? 
*/ @@ -233,7 +234,7 @@ static void do_io_probe(struct pcmcia_socket *s, kio_addr_t base, kio_addr_t num } else { if (bad) { sub_interval(&s_data->io_db, bad, i-bad); - printk(" %#lx-%#lx", bad, i-1); + printk(" %#x-%#x", bad, i-1); bad = 0; } } @@ -244,7 +245,7 @@ static void do_io_probe(struct pcmcia_socket *s, kio_addr_t base, kio_addr_t num return; } else { sub_interval(&s_data->io_db, bad, i-bad); - printk(" %#lx-%#lx", bad, i-1); + printk(" %#x-%#x", bad, i-1); } } diff --git a/drivers/pcmcia/tcic.c b/drivers/pcmcia/tcic.c index 749ac37..5792bd5 100644 --- a/drivers/pcmcia/tcic.c +++ b/drivers/pcmcia/tcic.c @@ -719,7 +719,7 @@ static int tcic_set_io_map(struct pcmcia_socket *sock, struct pccard_io_map *io) u_short base, len, ioctl; debug(1, "SetIOMap(%d, %d, %#2.2x, %d ns, " - "%#lx-%#lx)\n", psock, io->map, io->flags, + "%#x-%#x)\n", psock, io->map, io->flags, io->speed, io->start, io->stop); if ((io->map > 1) || (io->start > 0xffff) || (io->stop > 0xffff) || (io->stop < io->start)) return -EINVAL; diff --git a/drivers/scsi/aacraid/linit.c b/drivers/scsi/aacraid/linit.c index 0e8267c..fb08861 100644 --- a/drivers/scsi/aacraid/linit.c +++ b/drivers/scsi/aacraid/linit.c @@ -449,9 +449,6 @@ static int aac_slave_configure(struct scsi_device *sdev) else if (depth < 2) depth = 2; scsi_adjust_queue_depth(sdev, MSG_ORDERED_TAG, depth); - if (!(((struct aac_dev *)host->hostdata)->adapter_info.options & - AAC_OPT_NEW_COMM)) - blk_queue_max_segment_size(sdev->request_queue, 65536); } else scsi_adjust_queue_depth(sdev, 0, 1); @@ -1133,6 +1130,12 @@ static int __devinit aac_probe_one(struct pci_dev *pdev, if (error < 0) goto out_deinit; + if (!(aac->adapter_info.options & AAC_OPT_NEW_COMM)) { + error = pci_set_dma_max_seg_size(pdev, 65536); + if (error) + goto out_deinit; + } + /* * Lets override negotiations and drop the maximum SG limit to 34 */ diff --git a/drivers/scsi/pcmcia/fdomain_stub.c b/drivers/scsi/pcmcia/fdomain_stub.c index 4b82b20..d8b9935 100644 --- 
a/drivers/scsi/pcmcia/fdomain_stub.c +++ b/drivers/scsi/pcmcia/fdomain_stub.c @@ -130,7 +130,7 @@ static int fdomain_config(struct pcmcia_device *link) cisparse_t parse; int i, last_ret, last_fn; u_char tuple_data[64]; - char str[16]; + char str[22]; struct Scsi_Host *host; DEBUG(0, "fdomain_config(0x%p)\n", link); diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index b12fb31..f243fc30 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1569,6 +1569,7 @@ struct request_queue *__scsi_alloc_queue(struct Scsi_Host *shost, request_fn_proc *request_fn) { struct request_queue *q; + struct device *dev = shost->shost_gendev.parent; q = blk_init_queue(request_fn, NULL); if (!q) @@ -1583,6 +1584,9 @@ struct request_queue *__scsi_alloc_queue(struct Scsi_Host *shost, blk_queue_max_sectors(q, shost->max_sectors); blk_queue_bounce_limit(q, scsi_calculate_bounce_limit(shost)); blk_queue_segment_boundary(q, shost->dma_boundary); + dma_set_seg_boundary(dev, shost->dma_boundary); + + blk_queue_max_segment_size(q, dma_get_max_seg_size(dev)); if (!shost->use_clustering) clear_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags); diff --git a/drivers/serial/8250.c b/drivers/serial/8250.c index f94109c..b8a4bd9 100644 --- a/drivers/serial/8250.c +++ b/drivers/serial/8250.c @@ -2047,7 +2047,7 @@ serial8250_set_termios(struct uart_port *port, struct ktermios *termios, * Oxford Semi 952 rev B workaround */ if (up->bugs & UART_BUG_QUOT && (quot & 0xff) == 0) - quot ++; + quot++; if (up->capabilities & UART_CAP_FIFO && up->port.fifosize > 1) { if (baud < 2400) @@ -2662,16 +2662,17 @@ static int __devinit serial8250_probe(struct platform_device *dev) memset(&port, 0, sizeof(struct uart_port)); for (i = 0; p && p->flags != 0; p++, i++) { - port.iobase = p->iobase; - port.membase = p->membase; - port.irq = p->irq; - port.uartclk = p->uartclk; - port.regshift = p->regshift; - port.iotype = p->iotype; - port.flags = p->flags; - port.mapbase = p->mapbase; - port.hub6 = 
p->hub6; - port.dev = &dev->dev; + port.iobase = p->iobase; + port.membase = p->membase; + port.irq = p->irq; + port.uartclk = p->uartclk; + port.regshift = p->regshift; + port.iotype = p->iotype; + port.flags = p->flags; + port.mapbase = p->mapbase; + port.hub6 = p->hub6; + port.private_data = p->private_data; + port.dev = &dev->dev; if (share_irqs) port.flags |= UPF_SHARE_IRQ; ret = serial8250_register_port(&port); @@ -2812,15 +2813,16 @@ int serial8250_register_port(struct uart_port *port) if (uart) { uart_remove_one_port(&serial8250_reg, &uart->port); - uart->port.iobase = port->iobase; - uart->port.membase = port->membase; - uart->port.irq = port->irq; - uart->port.uartclk = port->uartclk; - uart->port.fifosize = port->fifosize; - uart->port.regshift = port->regshift; - uart->port.iotype = port->iotype; - uart->port.flags = port->flags | UPF_BOOT_AUTOCONF; - uart->port.mapbase = port->mapbase; + uart->port.iobase = port->iobase; + uart->port.membase = port->membase; + uart->port.irq = port->irq; + uart->port.uartclk = port->uartclk; + uart->port.fifosize = port->fifosize; + uart->port.regshift = port->regshift; + uart->port.iotype = port->iotype; + uart->port.flags = port->flags | UPF_BOOT_AUTOCONF; + uart->port.mapbase = port->mapbase; + uart->port.private_data = port->private_data; if (port->dev) uart->port.dev = port->dev; diff --git a/drivers/serial/8250_pci.c b/drivers/serial/8250_pci.c index ceb03c9..0a4ac2b 100644 --- a/drivers/serial/8250_pci.c +++ b/drivers/serial/8250_pci.c @@ -106,6 +106,32 @@ setup_port(struct serial_private *priv, struct uart_port *port, } /* + * ADDI-DATA GmbH communication cards <info@addi-data.com> + */ +static int addidata_apci7800_setup(struct serial_private *priv, + struct pciserial_board *board, + struct uart_port *port, int idx) +{ + unsigned int bar = 0, offset = board->first_offset; + bar = FL_GET_BASE(board->flags); + + if (idx < 2) { + offset += idx * board->uart_offset; + } else if ((idx >= 2) && (idx < 4)) { + bar += 
1; + offset += ((idx - 2) * board->uart_offset); + } else if ((idx >= 4) && (idx < 6)) { + bar += 2; + offset += ((idx - 4) * board->uart_offset); + } else if (idx >= 6) { + bar += 3; + offset += ((idx - 6) * board->uart_offset); + } + + return setup_port(priv, port, bar, offset, board->reg_shift); +} + +/* * AFAVLAB uses a different mixture of BARs and offsets * Not that ugly ;) -- HW */ @@ -752,6 +778,16 @@ pci_default_setup(struct serial_private *priv, struct pciserial_board *board, */ static struct pci_serial_quirk pci_serial_quirks[] = { /* + * ADDI-DATA GmbH communication cards <info@addi-data.com> + */ + { + .vendor = PCI_VENDOR_ID_ADDIDATA_OLD, + .device = PCI_DEVICE_ID_ADDIDATA_APCI7800, + .subvendor = PCI_ANY_ID, + .subdevice = PCI_ANY_ID, + .setup = addidata_apci7800_setup, + }, + /* * AFAVLAB cards - these may be called via parport_serial * It is not clear whether this applies to all products. */ @@ -1179,6 +1215,12 @@ static struct pciserial_board pci_boards[] __devinitdata = { .base_baud = 115200, .uart_offset = 8, }, + [pbn_b0_8_115200] = { + .flags = FL_BASE0, + .num_ports = 8, + .base_baud = 115200, + .uart_offset = 8, + }, [pbn_b0_1_921600] = { .flags = FL_BASE0, @@ -2697,6 +2739,97 @@ static struct pci_device_id serial_pci_tbl[] = { pbn_pasemi_1682M }, /* + * ADDI-DATA GmbH communication cards <info@addi-data.com> + */ + { PCI_VENDOR_ID_ADDIDATA, + PCI_DEVICE_ID_ADDIDATA_APCI7500, + PCI_ANY_ID, + PCI_ANY_ID, + 0, + 0, + pbn_b0_4_115200 }, + + { PCI_VENDOR_ID_ADDIDATA, + PCI_DEVICE_ID_ADDIDATA_APCI7420, + PCI_ANY_ID, + PCI_ANY_ID, + 0, + 0, + pbn_b0_2_115200 }, + + { PCI_VENDOR_ID_ADDIDATA, + PCI_DEVICE_ID_ADDIDATA_APCI7300, + PCI_ANY_ID, + PCI_ANY_ID, + 0, + 0, + pbn_b0_1_115200 }, + + { PCI_VENDOR_ID_ADDIDATA_OLD, + PCI_DEVICE_ID_ADDIDATA_APCI7800, + PCI_ANY_ID, + PCI_ANY_ID, + 0, + 0, + pbn_b1_8_115200 }, + + { PCI_VENDOR_ID_ADDIDATA, + PCI_DEVICE_ID_ADDIDATA_APCI7500_2, + PCI_ANY_ID, + PCI_ANY_ID, + 0, + 0, + pbn_b0_4_115200 }, + + { 
PCI_VENDOR_ID_ADDIDATA, + PCI_DEVICE_ID_ADDIDATA_APCI7420_2, + PCI_ANY_ID, + PCI_ANY_ID, + 0, + 0, + pbn_b0_2_115200 }, + + { PCI_VENDOR_ID_ADDIDATA, + PCI_DEVICE_ID_ADDIDATA_APCI7300_2, + PCI_ANY_ID, + PCI_ANY_ID, + 0, + 0, + pbn_b0_1_115200 }, + + { PCI_VENDOR_ID_ADDIDATA, + PCI_DEVICE_ID_ADDIDATA_APCI7500_3, + PCI_ANY_ID, + PCI_ANY_ID, + 0, + 0, + pbn_b0_4_115200 }, + + { PCI_VENDOR_ID_ADDIDATA, + PCI_DEVICE_ID_ADDIDATA_APCI7420_3, + PCI_ANY_ID, + PCI_ANY_ID, + 0, + 0, + pbn_b0_2_115200 }, + + { PCI_VENDOR_ID_ADDIDATA, + PCI_DEVICE_ID_ADDIDATA_APCI7300_3, + PCI_ANY_ID, + PCI_ANY_ID, + 0, + 0, + pbn_b0_1_115200 }, + + { PCI_VENDOR_ID_ADDIDATA, + PCI_DEVICE_ID_ADDIDATA_APCI7800_3, + PCI_ANY_ID, + PCI_ANY_ID, + 0, + 0, + pbn_b0_8_115200 }, + + /* * These entries match devices with class COMMUNICATION_SERIAL, * COMMUNICATION_MODEM or COMMUNICATION_MULTISERIAL */ diff --git a/drivers/serial/8250_pnp.c b/drivers/serial/8250_pnp.c index 1de098e..6f09cbd 100644 --- a/drivers/serial/8250_pnp.c +++ b/drivers/serial/8250_pnp.c @@ -414,8 +414,9 @@ static int __devinit check_resources(struct pnp_option *option) */ static int __devinit serial_pnp_guess_board(struct pnp_dev *dev, int *flags) { - if (!(check_name(pnp_dev_name(dev)) || (dev->card && check_name(dev->card->name)))) - return -ENODEV; + if (!(check_name(pnp_dev_name(dev)) || + (dev->card && check_name(dev->card->name)))) + return -ENODEV; if (check_resources(dev->independent)) return 0; @@ -452,8 +453,9 @@ serial_pnp_probe(struct pnp_dev *dev, const struct pnp_device_id *dev_id) return -ENODEV; #ifdef SERIAL_DEBUG_PNP - printk("Setup PNP port: port %x, mem 0x%lx, irq %d, type %d\n", - port.iobase, port.mapbase, port.irq, port.iotype); + printk(KERN_DEBUG + "Setup PNP port: port %x, mem 0x%lx, irq %d, type %d\n", + port.iobase, port.mapbase, port.irq, port.iotype); #endif port.flags |= UPF_SKIP_TEST | UPF_BOOT_AUTOCONF; diff --git a/drivers/serial/mcf.c b/drivers/serial/mcf.c index 051fcc2..e76fc72 100644 --- 
a/drivers/serial/mcf.c +++ b/drivers/serial/mcf.c @@ -434,7 +434,7 @@ static struct uart_ops mcf_uart_ops = { static struct mcf_uart mcf_ports[3]; -#define MCF_MAXPORTS (sizeof(mcf_ports) / sizeof(struct mcf_uart)) +#define MCF_MAXPORTS ARRAY_SIZE(mcf_ports) /****************************************************************************/ #if defined(CONFIG_SERIAL_MCF_CONSOLE) diff --git a/drivers/serial/mpsc.c b/drivers/serial/mpsc.c index 4d643c9..cb3a919 100644 --- a/drivers/serial/mpsc.c +++ b/drivers/serial/mpsc.c @@ -612,6 +612,7 @@ static void mpsc_hw_init(struct mpsc_port_info *pi) /* No preamble, 16x divider, low-latency, */ writel(0x04400400, pi->mpsc_base + MPSC_MMCRH); + mpsc_set_baudrate(pi, pi->default_baud); if (pi->mirror_regs) { pi->MPSC_CHR_1_m = 0; diff --git a/drivers/serial/s3c2410.c b/drivers/serial/s3c2410.c index e773c8e..45de193 100644 --- a/drivers/serial/s3c2410.c +++ b/drivers/serial/s3c2410.c @@ -1527,7 +1527,7 @@ static inline void s3c2440_serial_exit(void) #define s3c2440_uart_inf_at NULL #endif /* CONFIG_CPU_S3C2440 */ -#if defined(CONFIG_CPU_S3C2412) || defined(CONFIG_CPU_S3C2413) +#if defined(CONFIG_CPU_S3C2412) static int s3c2412_serial_setsource(struct uart_port *port, struct s3c24xx_uart_clksrc *clk) diff --git a/drivers/serial/serial_core.c b/drivers/serial/serial_core.c index 3bb5d24..276da14 100644 --- a/drivers/serial/serial_core.c +++ b/drivers/serial/serial_core.c @@ -371,7 +371,8 @@ uart_get_baud_rate(struct uart_port *port, struct ktermios *termios, */ termios->c_cflag &= ~CBAUD; if (old) { - termios->c_cflag |= old->c_cflag & CBAUD; + baud = tty_termios_baud_rate(old); + tty_termios_encode_baud_rate(termios, baud, baud); old = NULL; continue; } @@ -380,7 +381,7 @@ uart_get_baud_rate(struct uart_port *port, struct ktermios *termios, * As a last resort, if the quotient is zero, * default to 9600 bps */ - termios->c_cflag |= B9600; + tty_termios_encode_baud_rate(termios, 9600, 9600); } return 0; @@ -1977,6 +1978,7 @@ int 
uart_suspend_port(struct uart_driver *drv, struct uart_port *port) if (state->info && state->info->flags & UIF_INITIALIZED) { const struct uart_ops *ops = port->ops; + int tries; state->info->flags = (state->info->flags & ~UIF_INITIALIZED) | UIF_SUSPENDED; @@ -1990,9 +1992,14 @@ int uart_suspend_port(struct uart_driver *drv, struct uart_port *port) /* * Wait for the transmitter to empty. */ - while (!ops->tx_empty(port)) { + for (tries = 3; !ops->tx_empty(port) && tries; tries--) { msleep(10); } + if (!tries) + printk(KERN_ERR "%s%s%s%d: Unable to drain transmitter\n", + port->dev ? port->dev->bus_id : "", + port->dev ? ": " : "", + drv->dev_name, port->line); ops->shutdown(port); } @@ -2029,8 +2036,6 @@ int uart_resume_port(struct uart_driver *drv, struct uart_port *port) } port->suspended = 0; - uart_change_pm(state, 0); - /* * Re-enable the console device after suspending. */ @@ -2049,6 +2054,7 @@ int uart_resume_port(struct uart_driver *drv, struct uart_port *port) if (state->info && state->info->tty && termios.c_cflag == 0) termios = *state->info->tty->termios; + uart_change_pm(state, 0); port->ops->set_termios(port, &termios, NULL); console_start(port->cons); } @@ -2057,6 +2063,7 @@ int uart_resume_port(struct uart_driver *drv, struct uart_port *port) const struct uart_ops *ops = port->ops; int ret; + uart_change_pm(state, 0); ops->set_mctrl(port, 0); ret = ops->startup(port); if (ret == 0) { @@ -2150,10 +2157,11 @@ uart_configure_port(struct uart_driver *drv, struct uart_state *state, /* * Ensure that the modem control lines are de-activated. 
+ * keep the DTR setting that is set in uart_set_options() * We probably don't need a spinlock around this, but */ spin_lock_irqsave(&port->lock, flags); - port->ops->set_mctrl(port, 0); + port->ops->set_mctrl(port, port->mctrl & TIOCM_DTR); spin_unlock_irqrestore(&port->lock, flags); /* diff --git a/drivers/serial/serial_cs.c b/drivers/serial/serial_cs.c index d8b6600..164d2a4 100644 --- a/drivers/serial/serial_cs.c +++ b/drivers/serial/serial_cs.c @@ -389,7 +389,7 @@ static void serial_detach(struct pcmcia_device *link) /*====================================================================*/ static int setup_serial(struct pcmcia_device *handle, struct serial_info * info, - kio_addr_t iobase, int irq) + unsigned int iobase, int irq) { struct uart_port port; int line; @@ -456,7 +456,7 @@ next_tuple(struct pcmcia_device *handle, tuple_t * tuple, cisparse_t * parse) static int simple_config(struct pcmcia_device *link) { - static const kio_addr_t base[5] = { 0x3f8, 0x2f8, 0x3e8, 0x2e8, 0x0 }; + static const unsigned int base[5] = { 0x3f8, 0x2f8, 0x3e8, 0x2e8, 0x0 }; static const int size_table[2] = { 8, 16 }; struct serial_info *info = link->priv; struct serial_cfg_mem *cfg_mem; @@ -480,7 +480,7 @@ static int simple_config(struct pcmcia_device *link) /* If the card is already configured, look up the port and irq */ i = pcmcia_get_configuration_info(link, &config); if ((i == CS_SUCCESS) && (config.Attributes & CONF_VALID_CLIENT)) { - kio_addr_t port = 0; + unsigned int port = 0; if ((config.BasePort2 != 0) && (config.NumPorts2 == 8)) { port = config.BasePort2; info->slave = 1; diff --git a/fs/buffer.c b/fs/buffer.c index 456c9ab..826baf4 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1798,7 +1798,7 @@ void page_zero_new_buffers(struct page *page, unsigned from, unsigned to) start = max(from, block_start); size = min(to, block_end) - start; - zero_user_page(page, start, size, KM_USER0); + zero_user(page, start, size); set_buffer_uptodate(bh); } @@ -1861,19 +1861,10 @@ 
static int __block_prepare_write(struct inode *inode, struct page *page, mark_buffer_dirty(bh); continue; } - if (block_end > to || block_start < from) { - void *kaddr; - - kaddr = kmap_atomic(page, KM_USER0); - if (block_end > to) - memset(kaddr+to, 0, - block_end-to); - if (block_start < from) - memset(kaddr+block_start, - 0, from-block_start); - flush_dcache_page(page); - kunmap_atomic(kaddr, KM_USER0); - } + if (block_end > to || block_start < from) + zero_user_segments(page, + to, block_end, + block_start, from); continue; } } @@ -2104,8 +2095,7 @@ int block_read_full_page(struct page *page, get_block_t *get_block) SetPageError(page); } if (!buffer_mapped(bh)) { - zero_user_page(page, i * blocksize, blocksize, - KM_USER0); + zero_user(page, i * blocksize, blocksize); if (!err) set_buffer_uptodate(bh); continue; @@ -2218,7 +2208,7 @@ int cont_expand_zero(struct file *file, struct address_space *mapping, &page, &fsdata); if (err) goto out; - zero_user_page(page, zerofrom, len, KM_USER0); + zero_user(page, zerofrom, len); err = pagecache_write_end(file, mapping, curpos, len, len, page, fsdata); if (err < 0) @@ -2245,7 +2235,7 @@ int cont_expand_zero(struct file *file, struct address_space *mapping, &page, &fsdata); if (err) goto out; - zero_user_page(page, zerofrom, len, KM_USER0); + zero_user(page, zerofrom, len); err = pagecache_write_end(file, mapping, curpos, len, len, page, fsdata); if (err < 0) @@ -2422,7 +2412,6 @@ int nobh_write_begin(struct file *file, struct address_space *mapping, unsigned block_in_page; unsigned block_start, block_end; sector_t block_in_file; - char *kaddr; int nr_reads = 0; int ret = 0; int is_mapped_to_disk = 1; @@ -2493,13 +2482,8 @@ int nobh_write_begin(struct file *file, struct address_space *mapping, continue; } if (buffer_new(bh) || !buffer_mapped(bh)) { - kaddr = kmap_atomic(page, KM_USER0); - if (block_start < from) - memset(kaddr+block_start, 0, from-block_start); - if (block_end > to) - memset(kaddr + to, 0, block_end - 
to); - flush_dcache_page(page); - kunmap_atomic(kaddr, KM_USER0); + zero_user_segments(page, block_start, from, + to, block_end); continue; } if (buffer_uptodate(bh)) @@ -2636,7 +2620,7 @@ int nobh_writepage(struct page *page, get_block_t *get_block, * the page size, the remaining memory is zeroed when mapped, and * writes to that region are not written out to the file." */ - zero_user_page(page, offset, PAGE_CACHE_SIZE - offset, KM_USER0); + zero_user_segment(page, offset, PAGE_CACHE_SIZE); out: ret = mpage_writepage(page, get_block, wbc); if (ret == -EAGAIN) @@ -2709,7 +2693,7 @@ has_buffers: if (page_has_buffers(page)) goto has_buffers; } - zero_user_page(page, offset, length, KM_USER0); + zero_user(page, offset, length); set_page_dirty(page); err = 0; @@ -2785,7 +2769,7 @@ int block_truncate_page(struct address_space *mapping, goto unlock; } - zero_user_page(page, offset, length, KM_USER0); + zero_user(page, offset, length); mark_buffer_dirty(bh); err = 0; @@ -2831,7 +2815,7 @@ int block_write_full_page(struct page *page, get_block_t *get_block, * the page size, the remaining memory is zeroed when mapped, and * writes to that region are not written out to the file." 
*/ - zero_user_page(page, offset, PAGE_CACHE_SIZE - offset, KM_USER0); + zero_user_segment(page, offset, PAGE_CACHE_SIZE); return __block_write_full_page(inode, page, get_block, wbc); } @@ -3169,7 +3153,7 @@ static void recalc_bh_state(void) struct buffer_head *alloc_buffer_head(gfp_t gfp_flags) { - struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, + struct buffer_head *ret = kmem_cache_alloc(bh_cachep, set_migrateflags(gfp_flags, __GFP_RECLAIMABLE)); if (ret) { INIT_LIST_HEAD(&ret->b_assoc_buffers); @@ -3257,12 +3241,24 @@ int bh_submit_read(struct buffer_head *bh) } EXPORT_SYMBOL(bh_submit_read); +static void +init_buffer_head(struct kmem_cache *cachep, void *data) +{ + struct buffer_head *bh = data; + + memset(bh, 0, sizeof(*bh)); + INIT_LIST_HEAD(&bh->b_assoc_buffers); +} + void __init buffer_init(void) { int nrpages; - bh_cachep = KMEM_CACHE(buffer_head, - SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD); + bh_cachep = kmem_cache_create("buffer_head", + sizeof(struct buffer_head), 0, + (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| + SLAB_MEM_SPREAD), + init_buffer_head); /* * Limit the bh occupancy to 10% of ZONE_NORMAL diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index d9567ba..47f2621 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1386,7 +1386,7 @@ static int cifs_truncate_page(struct address_space *mapping, loff_t from) if (!page) return -ENOMEM; - zero_user_page(page, offset, PAGE_CACHE_SIZE - offset, KM_USER0); + zero_user_segment(page, offset, PAGE_CACHE_SIZE); unlock_page(page); page_cache_release(page); return rc; diff --git a/fs/compat.c b/fs/compat.c index 5216c3f..69baca5 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -2206,19 +2206,41 @@ asmlinkage long compat_sys_signalfd(int ufd, #ifdef CONFIG_TIMERFD -asmlinkage long compat_sys_timerfd(int ufd, int clockid, int flags, - const struct compat_itimerspec __user *utmr) +asmlinkage long compat_sys_timerfd_settime(int ufd, int flags, + const struct compat_itimerspec __user *utmr, + struct 
compat_itimerspec __user *otmr) { + int error; struct itimerspec t; struct itimerspec __user *ut; if (get_compat_itimerspec(&t, utmr)) return -EFAULT; - ut = compat_alloc_user_space(sizeof(*ut)); - if (copy_to_user(ut, &t, sizeof(t))) + ut = compat_alloc_user_space(2 * sizeof(struct itimerspec)); + if (copy_to_user(&ut[0], &t, sizeof(t))) return -EFAULT; + error = sys_timerfd_settime(ufd, flags, &ut[0], &ut[1]); + if (!error && otmr) + error = (copy_from_user(&t, &ut[1], sizeof(struct itimerspec)) || + put_compat_itimerspec(otmr, &t)) ? -EFAULT: 0; + + return error; +} + +asmlinkage long compat_sys_timerfd_gettime(int ufd, + struct compat_itimerspec __user *otmr) +{ + int error; + struct itimerspec t; + struct itimerspec __user *ut; - return sys_timerfd(ufd, clockid, flags, ut); + ut = compat_alloc_user_space(sizeof(struct itimerspec)); + error = sys_timerfd_gettime(ufd, ut); + if (!error) + error = (copy_from_user(&t, ut, sizeof(struct itimerspec)) || + put_compat_itimerspec(otmr, &t)) ? 
-EFAULT: 0; + + return error; } #endif /* CONFIG_TIMERFD */ diff --git a/fs/direct-io.c b/fs/direct-io.c index acf0da1..9e81add 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -878,8 +878,8 @@ do_holes: page_cache_release(page); goto out; } - zero_user_page(page, block_in_page << blkbits, - 1 << blkbits, KM_USER0); + zero_user(page, block_in_page << blkbits, + 1 << blkbits); dio->block_in_file++; block_in_page++; goto next_block; diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 32c5711..0535412 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -257,8 +257,7 @@ static int fill_zeros_to_end_of_page(struct page *page, unsigned int to) end_byte_in_page = i_size_read(inode) % PAGE_CACHE_SIZE; if (to > end_byte_in_page) end_byte_in_page = to; - zero_user_page(page, end_byte_in_page, - PAGE_CACHE_SIZE - end_byte_in_page, KM_USER0); + zero_user_segment(page, end_byte_in_page, PAGE_CACHE_SIZE); out: return 0; } @@ -307,7 +306,7 @@ static int ecryptfs_prepare_write(struct file *file, struct page *page, */ if ((i_size_read(page->mapping->host) == prev_page_end_size) && (from != 0)) { - zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0); + zero_user(page, 0, PAGE_CACHE_SIZE); } out: return rc; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 81c04ab..a415f42 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -353,7 +353,7 @@ static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq) spin_unlock_irqrestore(&psw->lock, flags); /* Do really wake up now */ - wake_up(wq); + wake_up_nested(wq, 1 + wake_nests); /* Remove the current task from the list */ spin_lock_irqsave(&psw->lock, flags); @@ -760,7 +760,7 @@ static int de_thread(struct task_struct *tsk) */ read_lock(&tasklist_lock); spin_lock_irq(lock); - if (sig->flags & SIGNAL_GROUP_EXIT) { + if (signal_group_exit(sig)) { /* * Another group action in progress, just * return so that the signal is processed. 
@@ -778,6 +778,7 @@ static int de_thread(struct task_struct *tsk) if (unlikely(tsk->group_leader == task_child_reaper(tsk))) task_active_pid_ns(tsk)->child_reaper = tsk; + sig->group_exit_task = tsk; zap_other_threads(tsk); read_unlock(&tasklist_lock); @@ -802,7 +803,6 @@ static int de_thread(struct task_struct *tsk) } sig->notify_count = count; - sig->group_exit_task = tsk; while (atomic_read(&sig->count) > count) { __set_current_state(TASK_UNINTERRUPTIBLE); spin_unlock_irq(lock); @@ -871,15 +871,10 @@ static int de_thread(struct task_struct *tsk) leader->exit_state = EXIT_DEAD; write_unlock_irq(&tasklist_lock); - } + } sig->group_exit_task = NULL; sig->notify_count = 0; - /* - * There may be one thread left which is just exiting, - * but it's safe to stop telling the group to kill themselves. - */ - sig->flags = 0; no_thread_group: exit_itimers(sig); @@ -947,12 +942,13 @@ static void flush_old_files(struct files_struct * files) spin_unlock(&files->file_lock); } -void get_task_comm(char *buf, struct task_struct *tsk) +char *get_task_comm(char *buf, struct task_struct *tsk) { /* buf must be at least sizeof(tsk->comm) in size */ task_lock(tsk); strncpy(buf, tsk->comm, sizeof(tsk->comm)); task_unlock(tsk); + return buf; } void set_task_comm(struct task_struct *tsk, char *buf) @@ -1548,7 +1544,7 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, int err = -EAGAIN; spin_lock_irq(&tsk->sighand->siglock); - if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) { + if (!signal_group_exit(tsk->signal)) { tsk->signal->group_exit_code = exit_code; zap_process(tsk); err = 0; diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 9b162cd..0775354 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1845,7 +1845,7 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page, */ if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) && ext3_should_writeback_data(inode) && PageUptodate(page)) { - zero_user_page(page, offset, length, 
KM_USER0); + zero_user(page, offset, length); set_page_dirty(page); goto unlock; } @@ -1898,7 +1898,7 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page, goto unlock; } - zero_user_page(page, offset, length, KM_USER0); + zero_user(page, offset, length); BUFFER_TRACE(bh, "zeroed end of block"); err = 0; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index bb717cb..05c4145 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1840,7 +1840,7 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page, */ if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode) && PageUptodate(page)) { - zero_user_page(page, offset, length, KM_USER0); + zero_user(page, offset, length); set_page_dirty(page); goto unlock; } @@ -1893,7 +1893,7 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page, goto unlock; } - zero_user_page(page, offset, length, KM_USER0); + zero_user(page, offset, length); BUFFER_TRACE(bh, "zeroed end of block"); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 300324b..0b30640 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -284,7 +284,17 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc) * soon as the queue becomes uncongested. */ inode->i_state |= I_DIRTY_PAGES; - requeue_io(inode); + if (wbc->nr_to_write <= 0) { + /* + * slice used up: queue for next turn + */ + requeue_io(inode); + } else { + /* + * somehow blocked: retry later + */ + redirty_tail(inode); + } } else { /* * Otherwise fully redirty the inode so that @@ -334,9 +344,6 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) WARN_ON(inode->i_state & I_WILL_FREE); if ((wbc->sync_mode != WB_SYNC_ALL) && (inode->i_state & I_SYNC)) { - struct address_space *mapping = inode->i_mapping; - int ret; - /* * We're skipping this inode because it's locked, and we're not * doing writeback-for-data-integrity. 
Move it to s_more_io so @@ -345,15 +352,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * completed a full scan of s_io. */ requeue_io(inode); - - /* - * Even if we don't actually write the inode itself here, - * we can at least start some of the data writeout.. - */ - spin_unlock(&inode_lock); - ret = do_writepages(mapping, wbc); - spin_lock(&inode_lock); - return ret; + return 0; } /* @@ -479,8 +478,12 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc) iput(inode); cond_resched(); spin_lock(&inode_lock); - if (wbc->nr_to_write <= 0) + if (wbc->nr_to_write <= 0) { + wbc->more_io = 1; break; + } + if (!list_empty(&sb->s_more_io)) + wbc->more_io = 1; } return; /* Leave any unwritten inodes on s_io */ } diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index e4effc4..e9456eb 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -932,7 +932,7 @@ static int gfs2_block_truncate_page(struct address_space *mapping) if (!gfs2_is_writeback(ip)) gfs2_trans_add_bh(ip->i_gl, bh, 0); - zero_user_page(page, offset, length, KM_USER0); + zero_user(page, offset, length); unlock: unlock_page(page); diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c index 38dbe99..ac772b6 100644 --- a/fs/gfs2/ops_address.c +++ b/fs/gfs2/ops_address.c @@ -446,7 +446,7 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page) * so we need to supply one here. It doesn't happen often. 
*/ if (unlikely(page->index)) { - zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0); + zero_user(page, 0, PAGE_CACHE_SIZE); return 0; } diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c index 35c1a9f..53fd0a6 100644 --- a/fs/hostfs/hostfs_user.c +++ b/fs/hostfs/hostfs_user.c @@ -285,17 +285,17 @@ int set_attr(const char *file, struct hostfs_iattr *attrs, int fd) return err; times[0].tv_sec = atime_ts.tv_sec; - times[0].tv_usec = atime_ts.tv_nsec * 1000; + times[0].tv_usec = atime_ts.tv_nsec / 1000; times[1].tv_sec = mtime_ts.tv_sec; - times[1].tv_usec = mtime_ts.tv_nsec * 1000; + times[1].tv_usec = mtime_ts.tv_nsec / 1000; if (attrs->ia_valid & HOSTFS_ATTR_ATIME_SET) { times[0].tv_sec = attrs->ia_atime.tv_sec; - times[0].tv_usec = attrs->ia_atime.tv_nsec * 1000; + times[0].tv_usec = attrs->ia_atime.tv_nsec / 1000; } if (attrs->ia_valid & HOSTFS_ATTR_MTIME_SET) { times[1].tv_sec = attrs->ia_mtime.tv_sec; - times[1].tv_usec = attrs->ia_mtime.tv_nsec * 1000; + times[1].tv_usec = attrs->ia_mtime.tv_nsec / 1000; } if (fd >= 0) { diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 09ee07f..3b3cc28 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -768,7 +768,7 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) case Opt_mode: if (match_octal(&args[0], &option)) goto bad_val; - pconfig->mode = option & 0777U; + pconfig->mode = option & 01777U; break; case Opt_size: { @@ -341,13 +341,10 @@ int simple_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to) { if (!PageUptodate(page)) { - if (to - from != PAGE_CACHE_SIZE) { - void *kaddr = kmap_atomic(page, KM_USER0); - memset(kaddr, 0, from); - memset(kaddr + to, 0, PAGE_CACHE_SIZE - to); - flush_dcache_page(page); - kunmap_atomic(kaddr, KM_USER0); - } + if (to - from != PAGE_CACHE_SIZE) + zero_user_segments(page, + 0, from, + to, PAGE_CACHE_SIZE); } return 0; } @@ -276,9 +276,7 @@ do_mpage_readpage(struct bio *bio, struct page *page, 
unsigned nr_pages, } if (first_hole != blocks_per_page) { - zero_user_page(page, first_hole << blkbits, - PAGE_CACHE_SIZE - (first_hole << blkbits), - KM_USER0); + zero_user_segment(page, first_hole << blkbits, PAGE_CACHE_SIZE); if (first_hole == 0) { SetPageUptodate(page); unlock_page(page); @@ -571,8 +569,7 @@ page_is_mapped: if (page->index > end_index || !offset) goto confused; - zero_user_page(page, offset, PAGE_CACHE_SIZE - offset, - KM_USER0); + zero_user_segment(page, offset, PAGE_CACHE_SIZE); } /* diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 8fd6dfb..3d7d963 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -79,7 +79,7 @@ void nfs_readdata_release(void *data) static int nfs_return_empty_page(struct page *page) { - zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0); + zero_user(page, 0, PAGE_CACHE_SIZE); SetPageUptodate(page); unlock_page(page); return 0; @@ -103,10 +103,10 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data) pglen = PAGE_CACHE_SIZE - base; for (;;) { if (remainder <= pglen) { - zero_user_page(*pages, base, remainder, KM_USER0); + zero_user(*pages, base, remainder); break; } - zero_user_page(*pages, base, pglen, KM_USER0); + zero_user(*pages, base, pglen); pages++; remainder -= pglen; pglen = PAGE_CACHE_SIZE; @@ -130,7 +130,7 @@ static int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, return PTR_ERR(new); } if (len < PAGE_CACHE_SIZE) - zero_user_page(page, len, PAGE_CACHE_SIZE - len, KM_USER0); + zero_user_segment(page, len, PAGE_CACHE_SIZE); nfs_list_add_request(new, &one_request); if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE) @@ -532,7 +532,7 @@ readpage_async_filler(void *data, struct page *page) goto out_error; if (len < PAGE_CACHE_SIZE) - zero_user_page(page, len, PAGE_CACHE_SIZE - len, KM_USER0); + zero_user_segment(page, len, PAGE_CACHE_SIZE); nfs_pageio_add_request(desc->pgio, new); return 0; out_error: diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 522efff..b144b19 
100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -665,9 +665,7 @@ zero_page: * then we need to zero any uninitalised data. */ if (req->wb_pgbase == 0 && req->wb_bytes != PAGE_CACHE_SIZE && !PageUptodate(req->wb_page)) - zero_user_page(req->wb_page, req->wb_bytes, - PAGE_CACHE_SIZE - req->wb_bytes, - KM_USER0); + zero_user_segment(req->wb_page, req->wb_bytes, PAGE_CACHE_SIZE); return req; } diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 2192805..d13403e 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -11,8 +11,6 @@ #include <linux/nfsd/nfsd.h> #include <linux/nfsd/export.h> -#define CAP_NFSD_MASK (CAP_FS_MASK|CAP_TO_MASK(CAP_SYS_RESOURCE)) - int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp) { struct exp_flavor_info *f; @@ -69,10 +67,12 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) ret = set_current_groups(cred.cr_group_info); put_group_info(cred.cr_group_info); if ((cred.cr_uid)) { - cap_t(current->cap_effective) &= ~CAP_NFSD_MASK; + current->cap_effective = + cap_drop_nfsd_set(current->cap_effective); } else { - cap_t(current->cap_effective) |= (CAP_NFSD_MASK & - current->cap_permitted); + current->cap_effective = + cap_raise_nfsd_set(current->cap_effective, + current->cap_permitted); } return ret; } diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index ad87cb0..00e9ccd 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -87,13 +87,17 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate) /* Check for the current buffer head overflowing. 
*/ if (unlikely(file_ofs + bh->b_size > init_size)) { int ofs; + void *kaddr; ofs = 0; if (file_ofs < init_size) ofs = init_size - file_ofs; local_irq_save(flags); - zero_user_page(page, bh_offset(bh) + ofs, - bh->b_size - ofs, KM_BIO_SRC_IRQ); + kaddr = kmap_atomic(page, KM_BIO_SRC_IRQ); + memset(kaddr + bh_offset(bh) + ofs, 0, + bh->b_size - ofs); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_BIO_SRC_IRQ); local_irq_restore(flags); } } else { @@ -334,7 +338,7 @@ handle_hole: bh->b_blocknr = -1UL; clear_buffer_mapped(bh); handle_zblock: - zero_user_page(page, i * blocksize, blocksize, KM_USER0); + zero_user(page, i * blocksize, blocksize); if (likely(!err)) set_buffer_uptodate(bh); } while (i++, iblock++, (bh = bh->b_this_page) != head); @@ -410,7 +414,7 @@ retry_readpage: /* Is the page fully outside i_size? (truncate in progress) */ if (unlikely(page->index >= (i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT)) { - zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0); + zero_user(page, 0, PAGE_CACHE_SIZE); ntfs_debug("Read outside i_size - truncated?"); goto done; } @@ -459,7 +463,7 @@ retry_readpage: * ok to ignore the compressed flag here. */ if (unlikely(page->index > 0)) { - zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0); + zero_user(page, 0, PAGE_CACHE_SIZE); goto done; } if (!NInoAttr(ni)) @@ -788,8 +792,7 @@ lock_retry_remap: if (err == -ENOENT || lcn == LCN_ENOENT) { bh->b_blocknr = -1; clear_buffer_dirty(bh); - zero_user_page(page, bh_offset(bh), blocksize, - KM_USER0); + zero_user(page, bh_offset(bh), blocksize); set_buffer_uptodate(bh); err = 0; continue; @@ -1414,8 +1417,7 @@ retry_writepage: if (page->index >= (i_size >> PAGE_CACHE_SHIFT)) { /* The page straddles i_size. */ unsigned int ofs = i_size & ~PAGE_CACHE_MASK; - zero_user_page(page, ofs, PAGE_CACHE_SIZE - ofs, - KM_USER0); + zero_user_segment(page, ofs, PAGE_CACHE_SIZE); } /* Handle mst protected attributes. 
*/ if (NInoMstProtected(ni)) diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c index d1619d0..33ff314 100644 --- a/fs/ntfs/compress.c +++ b/fs/ntfs/compress.c @@ -565,7 +565,7 @@ int ntfs_read_compressed_block(struct page *page) if (xpage >= max_page) { kfree(bhs); kfree(pages); - zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0); + zero_user(page, 0, PAGE_CACHE_SIZE); ntfs_debug("Compressed read outside i_size - truncated?"); SetPageUptodate(page); unlock_page(page); diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 6cd08df..3c5550c 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -607,8 +607,8 @@ do_next_page: ntfs_submit_bh_for_read(bh); *wait_bh++ = bh; } else { - zero_user_page(page, bh_offset(bh), - blocksize, KM_USER0); + zero_user(page, bh_offset(bh), + blocksize); set_buffer_uptodate(bh); } } @@ -683,9 +683,8 @@ map_buffer_cached: ntfs_submit_bh_for_read(bh); *wait_bh++ = bh; } else { - zero_user_page(page, - bh_offset(bh), - blocksize, KM_USER0); + zero_user(page, bh_offset(bh), + blocksize); set_buffer_uptodate(bh); } } @@ -703,8 +702,8 @@ map_buffer_cached: */ if (bh_end <= pos || bh_pos >= end) { if (!buffer_uptodate(bh)) { - zero_user_page(page, bh_offset(bh), - blocksize, KM_USER0); + zero_user(page, bh_offset(bh), + blocksize); set_buffer_uptodate(bh); } mark_buffer_dirty(bh); @@ -743,8 +742,7 @@ map_buffer_cached: if (!buffer_uptodate(bh)) set_buffer_uptodate(bh); } else if (!buffer_uptodate(bh)) { - zero_user_page(page, bh_offset(bh), blocksize, - KM_USER0); + zero_user(page, bh_offset(bh), blocksize); set_buffer_uptodate(bh); } continue; @@ -868,8 +866,8 @@ rl_not_mapped_enoent: if (!buffer_uptodate(bh)) set_buffer_uptodate(bh); } else if (!buffer_uptodate(bh)) { - zero_user_page(page, bh_offset(bh), - blocksize, KM_USER0); + zero_user(page, bh_offset(bh), + blocksize); set_buffer_uptodate(bh); } continue; @@ -1128,8 +1126,8 @@ rl_not_mapped_enoent: if (likely(bh_pos < initialized_size)) ofs = initialized_size - bh_pos; - 
zero_user_page(page, bh_offset(bh) + ofs, - blocksize - ofs, KM_USER0); + zero_user_segment(page, bh_offset(bh) + ofs, + blocksize); } } else /* if (unlikely(!buffer_uptodate(bh))) */ err = -EIO; @@ -1269,8 +1267,8 @@ rl_not_mapped_enoent: if (PageUptodate(page)) set_buffer_uptodate(bh); else { - zero_user_page(page, bh_offset(bh), - blocksize, KM_USER0); + zero_user(page, bh_offset(bh), + blocksize); set_buffer_uptodate(bh); } } @@ -1330,7 +1328,7 @@ err_out: len = PAGE_CACHE_SIZE; if (len > bytes) len = bytes; - zero_user_page(*pages, 0, len, KM_USER0); + zero_user(*pages, 0, len); } goto out; } @@ -1451,7 +1449,7 @@ err_out: len = PAGE_CACHE_SIZE; if (len > bytes) len = bytes; - zero_user_page(*pages, 0, len, KM_USER0); + zero_user(*pages, 0, len); } goto out; } diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h index e38e402..cd0be3f 100644 --- a/fs/ntfs/malloc.h +++ b/fs/ntfs/malloc.h @@ -85,8 +85,7 @@ static inline void *ntfs_malloc_nofs_nofail(unsigned long size) static inline void ntfs_free(void *addr) { - if (likely(((unsigned long)addr < VMALLOC_START) || - ((unsigned long)addr >= VMALLOC_END ))) { + if (!is_vmalloc_addr(addr)) { kfree(addr); /* free_page((unsigned long)addr); */ return; diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 64713e1..447206eb 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5670,7 +5670,7 @@ static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, mlog_errno(ret); if (zero) - zero_user_page(page, from, to - from, KM_USER0); + zero_user_segment(page, from, to); /* * Need to set the buffers we zero'd into uptodate diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index bc7b4cb..8224312 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -307,7 +307,7 @@ static int ocfs2_readpage(struct file *file, struct page *page) * XXX sys_readahead() seems to get that wrong? 
*/ if (start >= i_size_read(inode)) { - zero_user_page(page, 0, PAGE_SIZE, KM_USER0); + zero_user(page, 0, PAGE_SIZE); SetPageUptodate(page); ret = 0; goto out_alloc; @@ -869,7 +869,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, if (block_start >= to) break; - zero_user_page(page, block_start, bh->b_size, KM_USER0); + zero_user(page, block_start, bh->b_size); set_buffer_uptodate(bh); mark_buffer_dirty(bh); @@ -1034,7 +1034,7 @@ static void ocfs2_zero_new_buffers(struct page *page, unsigned from, unsigned to start = max(from, block_start); end = min(to, block_end); - zero_user_page(page, start, end - start, KM_USER0); + zero_user_segment(page, start, end); set_buffer_uptodate(bh); } diff --git a/fs/proc/array.c b/fs/proc/array.c index b380313..6ba2746 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -281,14 +281,23 @@ static inline char *task_sig(struct task_struct *p, char *buffer) return buffer; } +static char *render_cap_t(const char *header, kernel_cap_t *a, char *buffer) +{ + unsigned __capi; + + buffer += sprintf(buffer, "%s", header); + CAP_FOR_EACH_U32(__capi) { + buffer += sprintf(buffer, "%08x", + a->cap[(_LINUX_CAPABILITY_U32S-1) - __capi]); + } + return buffer + sprintf(buffer, "\n"); +} + static inline char *task_cap(struct task_struct *p, char *buffer) { - return buffer + sprintf(buffer, "CapInh:\t%016x\n" - "CapPrm:\t%016x\n" - "CapEff:\t%016x\n", - cap_t(p->cap_inheritable), - cap_t(p->cap_permitted), - cap_t(p->cap_effective)); + buffer = render_cap_t("CapInh:\t", &p->cap_inheritable, buffer); + buffer = render_cap_t("CapPrm:\t", &p->cap_permitted, buffer); + return render_cap_t("CapEff:\t", &p->cap_effective, buffer); } static inline char *task_context_switch_counts(struct task_struct *p, diff --git a/fs/proc/base.c b/fs/proc/base.c index 3353748..c59852b 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -88,10 +88,6 @@ * in /proc for a task before it execs a suid executable. 
*/ - -/* Worst case buffer size needed for holding an integer. */ -#define PROC_NUMBUF 13 - struct pid_entry { char *name; int len; @@ -787,7 +783,7 @@ out_no_task: } #endif -static loff_t mem_lseek(struct file * file, loff_t offset, int orig) +loff_t mem_lseek(struct file *file, loff_t offset, int orig) { switch (orig) { case 0: @@ -935,42 +931,6 @@ static const struct file_operations proc_oom_adjust_operations = { .write = oom_adjust_write, }; -#ifdef CONFIG_MMU -static ssize_t clear_refs_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct task_struct *task; - char buffer[PROC_NUMBUF], *end; - struct mm_struct *mm; - - memset(buffer, 0, sizeof(buffer)); - if (count > sizeof(buffer) - 1) - count = sizeof(buffer) - 1; - if (copy_from_user(buffer, buf, count)) - return -EFAULT; - if (!simple_strtol(buffer, &end, 0)) - return -EINVAL; - if (*end == '\n') - end++; - task = get_proc_task(file->f_path.dentry->d_inode); - if (!task) - return -ESRCH; - mm = get_task_mm(task); - if (mm) { - clear_refs_smap(mm); - mmput(mm); - } - put_task_struct(task); - if (end - buffer == 0) - return -EIO; - return end - buffer; -} - -static struct file_operations proc_clear_refs_operations = { - .write = clear_refs_write, -}; -#endif - #ifdef CONFIG_AUDITSYSCALL #define TMPBUFLEN 21 static ssize_t proc_loginuid_read(struct file * file, char __user * buf, @@ -2289,9 +2249,10 @@ static const struct pid_entry tgid_base_stuff[] = { LNK("exe", exe), REG("mounts", S_IRUGO, mounts), REG("mountstats", S_IRUSR, mountstats), -#ifdef CONFIG_MMU +#ifdef CONFIG_PROC_PAGE_MONITOR REG("clear_refs", S_IWUSR, clear_refs), REG("smaps", S_IRUGO, smaps), + REG("pagemap", S_IRUSR, pagemap), #endif #ifdef CONFIG_SECURITY DIR("attr", S_IRUGO|S_IXUGO, attr_dir), @@ -2360,7 +2321,8 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid) name.len = snprintf(buf, sizeof(buf), "%d", pid); dentry = d_hash_and_lookup(mnt->mnt_root, &name); if (dentry) { 
- shrink_dcache_parent(dentry); + if (!(current->flags & PF_EXITING)) + shrink_dcache_parent(dentry); d_drop(dentry); dput(dentry); } @@ -2617,9 +2579,10 @@ static const struct pid_entry tid_base_stuff[] = { LNK("root", root), LNK("exe", exe), REG("mounts", S_IRUGO, mounts), -#ifdef CONFIG_MMU +#ifdef CONFIG_PROC_PAGE_MONITOR REG("clear_refs", S_IWUSR, clear_refs), REG("smaps", S_IRUGO, smaps), + REG("pagemap", S_IRUSR, pagemap), #endif #ifdef CONFIG_SECURITY DIR("attr", S_IRUGO|S_IXUGO, attr_dir), diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 05b3e90..7d57e80 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -52,15 +52,13 @@ extern int proc_tid_stat(struct task_struct *, char *); extern int proc_tgid_stat(struct task_struct *, char *); extern int proc_pid_status(struct task_struct *, char *); extern int proc_pid_statm(struct task_struct *, char *); +extern loff_t mem_lseek(struct file *file, loff_t offset, int orig); extern const struct file_operations proc_maps_operations; extern const struct file_operations proc_numa_maps_operations; extern const struct file_operations proc_smaps_operations; - -extern const struct file_operations proc_maps_operations; -extern const struct file_operations proc_numa_maps_operations; -extern const struct file_operations proc_smaps_operations; - +extern const struct file_operations proc_clear_refs_operations; +extern const struct file_operations proc_pagemap_operations; void free_proc_entry(struct proc_dir_entry *de); diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 1be7308..7dd26e1 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -325,7 +325,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) if (m == NULL) { if (clear_user(buffer, tsz)) return -EFAULT; - } else if ((start >= VMALLOC_START) && (start < VMALLOC_END)) { + } else if (is_vmalloc_addr((void *)start)) { char * elf_buf; struct vm_struct *m; unsigned long curstart = start; diff --git a/fs/proc/proc_misc.c 
b/fs/proc/proc_misc.c index 3462bfd..51288db 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -46,6 +46,7 @@ #include <linux/vmalloc.h> #include <linux/crash_dump.h> #include <linux/pid_namespace.h> +#include <linux/bootmem.h> #include <asm/uaccess.h> #include <asm/pgtable.h> #include <asm/io.h> @@ -675,6 +676,137 @@ static const struct file_operations proc_sysrq_trigger_operations = { }; #endif +#ifdef CONFIG_PROC_PAGE_MONITOR +#define KPMSIZE sizeof(u64) +#define KPMMASK (KPMSIZE - 1) +/* /proc/kpagecount - an array exposing page counts + * + * Each entry is a u64 representing the corresponding + * physical page count. + */ +static ssize_t kpagecount_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + u64 __user *out = (u64 __user *)buf; + struct page *ppage; + unsigned long src = *ppos; + unsigned long pfn; + ssize_t ret = 0; + u64 pcount; + + pfn = src / KPMSIZE; + count = min_t(size_t, count, (max_pfn * KPMSIZE) - src); + if (src & KPMMASK || count & KPMMASK) + return -EIO; + + while (count > 0) { + ppage = NULL; + if (pfn_valid(pfn)) + ppage = pfn_to_page(pfn); + pfn++; + if (!ppage) + pcount = 0; + else + pcount = atomic_read(&ppage->_count); + + if (put_user(pcount, out++)) { + ret = -EFAULT; + break; + } + + count -= KPMSIZE; + } + + *ppos += (char __user *)out - buf; + if (!ret) + ret = (char __user *)out - buf; + return ret; +} + +static struct file_operations proc_kpagecount_operations = { + .llseek = mem_lseek, + .read = kpagecount_read, +}; + +/* /proc/kpageflags - an array exposing page flags + * + * Each entry is a u64 representing the corresponding + * physical page flags. 
+ */ + +/* These macros are used to decouple internal flags from exported ones */ + +#define KPF_LOCKED 0 +#define KPF_ERROR 1 +#define KPF_REFERENCED 2 +#define KPF_UPTODATE 3 +#define KPF_DIRTY 4 +#define KPF_LRU 5 +#define KPF_ACTIVE 6 +#define KPF_SLAB 7 +#define KPF_WRITEBACK 8 +#define KPF_RECLAIM 9 +#define KPF_BUDDY 10 + +#define kpf_copy_bit(flags, srcpos, dstpos) (((flags >> srcpos) & 1) << dstpos) + +static ssize_t kpageflags_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + u64 __user *out = (u64 __user *)buf; + struct page *ppage; + unsigned long src = *ppos; + unsigned long pfn; + ssize_t ret = 0; + u64 kflags, uflags; + + pfn = src / KPMSIZE; + count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src); + if (src & KPMMASK || count & KPMMASK) + return -EIO; + + while (count > 0) { + ppage = NULL; + if (pfn_valid(pfn)) + ppage = pfn_to_page(pfn); + pfn++; + if (!ppage) + kflags = 0; + else + kflags = ppage->flags; + + uflags = kpf_copy_bit(KPF_LOCKED, PG_locked, kflags) | + kpf_copy_bit(kflags, KPF_ERROR, PG_error) | + kpf_copy_bit(kflags, KPF_REFERENCED, PG_referenced) | + kpf_copy_bit(kflags, KPF_UPTODATE, PG_uptodate) | + kpf_copy_bit(kflags, KPF_DIRTY, PG_dirty) | + kpf_copy_bit(kflags, KPF_LRU, PG_lru) | + kpf_copy_bit(kflags, KPF_ACTIVE, PG_active) | + kpf_copy_bit(kflags, KPF_SLAB, PG_slab) | + kpf_copy_bit(kflags, KPF_WRITEBACK, PG_writeback) | + kpf_copy_bit(kflags, KPF_RECLAIM, PG_reclaim) | + kpf_copy_bit(kflags, KPF_BUDDY, PG_buddy); + + if (put_user(uflags, out++)) { + ret = -EFAULT; + break; + } + + count -= KPMSIZE; + } + + *ppos += (char __user *)out - buf; + if (!ret) + ret = (char __user *)out - buf; + return ret; +} + +static struct file_operations proc_kpageflags_operations = { + .llseek = mem_lseek, + .read = kpageflags_read, +}; +#endif /* CONFIG_PROC_PAGE_MONITOR */ + struct proc_dir_entry *proc_root_kcore; void create_seq_entry(char *name, mode_t mode, const struct file_operations *f) @@ -755,6 
+887,10 @@ void __init proc_misc_init(void) (size_t)high_memory - PAGE_OFFSET + PAGE_SIZE; } #endif +#ifdef CONFIG_PROC_PAGE_MONITOR + create_seq_entry("kpagecount", S_IRUSR, &proc_kpagecount_operations); + create_seq_entry("kpageflags", S_IRUSR, &proc_kpageflags_operations); +#endif #ifdef CONFIG_PROC_VMCORE proc_vmcore = create_proc_entry("vmcore", S_IRUSR, NULL); if (proc_vmcore) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 8043a3e..38338ed 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -5,7 +5,10 @@ #include <linux/highmem.h> #include <linux/ptrace.h> #include <linux/pagemap.h> +#include <linux/ptrace.h> #include <linux/mempolicy.h> +#include <linux/swap.h> +#include <linux/swapops.h> #include <asm/elf.h> #include <asm/uaccess.h> @@ -114,24 +117,124 @@ static void pad_len_spaces(struct seq_file *m, int len) seq_printf(m, "%*c", len, ' '); } -struct mem_size_stats +static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma) { - unsigned long resident; - unsigned long shared_clean; - unsigned long shared_dirty; - unsigned long private_clean; - unsigned long private_dirty; - unsigned long referenced; -}; + if (vma && vma != priv->tail_vma) { + struct mm_struct *mm = vma->vm_mm; + up_read(&mm->mmap_sem); + mmput(mm); + } +} -struct pmd_walker { - struct vm_area_struct *vma; - void *private; - void (*action)(struct vm_area_struct *, pmd_t *, unsigned long, - unsigned long, void *); -}; +static void *m_start(struct seq_file *m, loff_t *pos) +{ + struct proc_maps_private *priv = m->private; + unsigned long last_addr = m->version; + struct mm_struct *mm; + struct vm_area_struct *vma, *tail_vma = NULL; + loff_t l = *pos; + + /* Clear the per syscall fields in priv */ + priv->task = NULL; + priv->tail_vma = NULL; + + /* + * We remember last_addr rather than next_addr to hit with + * mmap_cache most of the time. We have zero last_addr at + * the beginning and also after lseek. 
We will have -1 last_addr + * after the end of the vmas. + */ + + if (last_addr == -1UL) + return NULL; + + priv->task = get_pid_task(priv->pid, PIDTYPE_PID); + if (!priv->task) + return NULL; + + mm = mm_for_maps(priv->task); + if (!mm) + return NULL; + + tail_vma = get_gate_vma(priv->task); + priv->tail_vma = tail_vma; + + /* Start with last addr hint */ + vma = find_vma(mm, last_addr); + if (last_addr && vma) { + vma = vma->vm_next; + goto out; + } + + /* + * Check the vma index is within the range and do + * sequential scan until m_index. + */ + vma = NULL; + if ((unsigned long)l < mm->map_count) { + vma = mm->mmap; + while (l-- && vma) + vma = vma->vm_next; + goto out; + } + + if (l != mm->map_count) + tail_vma = NULL; /* After gate vma */ + +out: + if (vma) + return vma; + + /* End of vmas has been reached */ + m->version = (tail_vma != NULL)? 0: -1UL; + up_read(&mm->mmap_sem); + mmput(mm); + return tail_vma; +} -static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats *mss) +static void *m_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct proc_maps_private *priv = m->private; + struct vm_area_struct *vma = v; + struct vm_area_struct *tail_vma = priv->tail_vma; + + (*pos)++; + if (vma && (vma != tail_vma) && vma->vm_next) + return vma->vm_next; + vma_stop(priv, vma); + return (vma != tail_vma)? 
tail_vma: NULL; +} + +static void m_stop(struct seq_file *m, void *v) +{ + struct proc_maps_private *priv = m->private; + struct vm_area_struct *vma = v; + + vma_stop(priv, vma); + if (priv->task) + put_task_struct(priv->task); +} + +static int do_maps_open(struct inode *inode, struct file *file, + struct seq_operations *ops) +{ + struct proc_maps_private *priv; + int ret = -ENOMEM; + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (priv) { + priv->pid = proc_pid(inode); + ret = seq_open(file, ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = priv; + } else { + kfree(priv); + } + } + return ret; +} + +static int show_map(struct seq_file *m, void *v) { struct proc_maps_private *priv = m->private; struct task_struct *task = priv->task; @@ -191,41 +294,71 @@ static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats } seq_putc(m, '\n'); - if (mss) - seq_printf(m, - "Size: %8lu kB\n" - "Rss: %8lu kB\n" - "Shared_Clean: %8lu kB\n" - "Shared_Dirty: %8lu kB\n" - "Private_Clean: %8lu kB\n" - "Private_Dirty: %8lu kB\n" - "Referenced: %8lu kB\n", - (vma->vm_end - vma->vm_start) >> 10, - mss->resident >> 10, - mss->shared_clean >> 10, - mss->shared_dirty >> 10, - mss->private_clean >> 10, - mss->private_dirty >> 10, - mss->referenced >> 10); - if (m->count < m->size) /* vma is copied successfully */ m->version = (vma != get_gate_vma(task))? 
vma->vm_start: 0; return 0; } -static int show_map(struct seq_file *m, void *v) +static struct seq_operations proc_pid_maps_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_map +}; + +static int maps_open(struct inode *inode, struct file *file) { - return show_map_internal(m, v, NULL); + return do_maps_open(inode, file, &proc_pid_maps_op); } -static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, unsigned long end, - void *private) +const struct file_operations proc_maps_operations = { + .open = maps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +/* + * Proportional Set Size(PSS): my share of RSS. + * + * PSS of a process is the count of pages it has in memory, where each + * page is divided by the number of processes sharing it. So if a + * process has 1000 pages all to itself, and 1000 shared with one other + * process, its PSS will be 1500. + * + * To keep (accumulated) division errors low, we adopt a 64bit + * fixed-point pss counter to minimize division errors. So (pss >> + * PSS_SHIFT) would be the real byte count. + * + * A shift of 12 before division means (assuming 4K page size): + * - 1M 3-user-pages add up to 8KB errors; + * - supports mapcount up to 2^24, or 16M; + * - supports PSS up to 2^52 bytes, or 4PB. 
+ */ +#define PSS_SHIFT 12 + +#ifdef CONFIG_PROC_PAGE_MONITOR +struct mem_size_stats +{ + struct vm_area_struct *vma; + unsigned long resident; + unsigned long shared_clean; + unsigned long shared_dirty; + unsigned long private_clean; + unsigned long private_dirty; + unsigned long referenced; + u64 pss; +}; + +static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, + void *private) { struct mem_size_stats *mss = private; + struct vm_area_struct *vma = mss->vma; pte_t *pte, ptent; spinlock_t *ptl; struct page *page; + int mapcount; pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) { @@ -242,26 +375,88 @@ static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd, /* Accumulate the size in pages that have been accessed. */ if (pte_young(ptent) || PageReferenced(page)) mss->referenced += PAGE_SIZE; - if (page_mapcount(page) >= 2) { + mapcount = page_mapcount(page); + if (mapcount >= 2) { if (pte_dirty(ptent)) mss->shared_dirty += PAGE_SIZE; else mss->shared_clean += PAGE_SIZE; + mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount; } else { if (pte_dirty(ptent)) mss->private_dirty += PAGE_SIZE; else mss->private_clean += PAGE_SIZE; + mss->pss += (PAGE_SIZE << PSS_SHIFT); } } pte_unmap_unlock(pte - 1, ptl); cond_resched(); + return 0; } -static void clear_refs_pte_range(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, unsigned long end, - void *private) +static struct mm_walk smaps_walk = { .pmd_entry = smaps_pte_range }; + +static int show_smap(struct seq_file *m, void *v) { + struct vm_area_struct *vma = v; + struct mem_size_stats mss; + int ret; + + memset(&mss, 0, sizeof mss); + mss.vma = vma; + if (vma->vm_mm && !is_vm_hugetlb_page(vma)) + walk_page_range(vma->vm_mm, vma->vm_start, vma->vm_end, + &smaps_walk, &mss); + + ret = show_map(m, v); + if (ret) + return ret; + + seq_printf(m, + "Size: %8lu kB\n" + "Rss: %8lu kB\n" + "Pss: %8lu kB\n" + "Shared_Clean: %8lu kB\n" + 
"Shared_Dirty: %8lu kB\n" + "Private_Clean: %8lu kB\n" + "Private_Dirty: %8lu kB\n" + "Referenced: %8lu kB\n", + (vma->vm_end - vma->vm_start) >> 10, + mss.resident >> 10, + (unsigned long)(mss.pss >> (10 + PSS_SHIFT)), + mss.shared_clean >> 10, + mss.shared_dirty >> 10, + mss.private_clean >> 10, + mss.private_dirty >> 10, + mss.referenced >> 10); + + return ret; +} + +static struct seq_operations proc_pid_smaps_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_smap +}; + +static int smaps_open(struct inode *inode, struct file *file) +{ + return do_maps_open(inode, file, &proc_pid_smaps_op); +} + +const struct file_operations proc_smaps_operations = { + .open = smaps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, + unsigned long end, void *private) +{ + struct vm_area_struct *vma = private; pte_t *pte, ptent; spinlock_t *ptl; struct page *page; @@ -282,235 +477,248 @@ static void clear_refs_pte_range(struct vm_area_struct *vma, pmd_t *pmd, } pte_unmap_unlock(pte - 1, ptl); cond_resched(); + return 0; } -static inline void walk_pmd_range(struct pmd_walker *walker, pud_t *pud, - unsigned long addr, unsigned long end) +static struct mm_walk clear_refs_walk = { .pmd_entry = clear_refs_pte_range }; + +static ssize_t clear_refs_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) { - pmd_t *pmd; - unsigned long next; + struct task_struct *task; + char buffer[PROC_NUMBUF], *end; + struct mm_struct *mm; + struct vm_area_struct *vma; - for (pmd = pmd_offset(pud, addr); addr != end; - pmd++, addr = next) { - next = pmd_addr_end(addr, end); - if (pmd_none_or_clear_bad(pmd)) - continue; - walker->action(walker->vma, pmd, addr, next, walker->private); + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) + return -EFAULT; + if 
(!simple_strtol(buffer, &end, 0)) + return -EINVAL; + if (*end == '\n') + end++; + task = get_proc_task(file->f_path.dentry->d_inode); + if (!task) + return -ESRCH; + mm = get_task_mm(task); + if (mm) { + down_read(&mm->mmap_sem); + for (vma = mm->mmap; vma; vma = vma->vm_next) + if (!is_vm_hugetlb_page(vma)) + walk_page_range(mm, vma->vm_start, vma->vm_end, + &clear_refs_walk, vma); + flush_tlb_mm(mm); + up_read(&mm->mmap_sem); + mmput(mm); } + put_task_struct(task); + if (end - buffer == 0) + return -EIO; + return end - buffer; } -static inline void walk_pud_range(struct pmd_walker *walker, pgd_t *pgd, - unsigned long addr, unsigned long end) -{ - pud_t *pud; - unsigned long next; +const struct file_operations proc_clear_refs_operations = { + .write = clear_refs_write, +}; - for (pud = pud_offset(pgd, addr); addr != end; - pud++, addr = next) { - next = pud_addr_end(addr, end); - if (pud_none_or_clear_bad(pud)) - continue; - walk_pmd_range(walker, pud, addr, next); +struct pagemapread { + char __user *out, *end; +}; + +#define PM_ENTRY_BYTES sizeof(u64) +#define PM_RESERVED_BITS 3 +#define PM_RESERVED_OFFSET (64 - PM_RESERVED_BITS) +#define PM_RESERVED_MASK (((1LL<<PM_RESERVED_BITS)-1) << PM_RESERVED_OFFSET) +#define PM_SPECIAL(nr) (((nr) << PM_RESERVED_OFFSET) | PM_RESERVED_MASK) +#define PM_NOT_PRESENT PM_SPECIAL(1LL) +#define PM_SWAP PM_SPECIAL(2LL) +#define PM_END_OF_BUFFER 1 + +static int add_to_pagemap(unsigned long addr, u64 pfn, + struct pagemapread *pm) +{ + /* + * Make sure there's room in the buffer for an + * entire entry. Otherwise, only copy part of + * the pfn. 
+ */ + if (pm->out + PM_ENTRY_BYTES >= pm->end) { + if (copy_to_user(pm->out, &pfn, pm->end - pm->out)) + return -EFAULT; + pm->out = pm->end; + return PM_END_OF_BUFFER; } + + if (put_user(pfn, pm->out)) + return -EFAULT; + pm->out += PM_ENTRY_BYTES; + return 0; } -/* - * walk_page_range - walk the page tables of a VMA with a callback - * @vma - VMA to walk - * @action - callback invoked for every bottom-level (PTE) page table - * @private - private data passed to the callback function - * - * Recursively walk the page table for the memory area in a VMA, calling - * a callback for every bottom-level (PTE) page table. - */ -static inline void walk_page_range(struct vm_area_struct *vma, - void (*action)(struct vm_area_struct *, - pmd_t *, unsigned long, - unsigned long, void *), - void *private) +static int pagemap_pte_hole(unsigned long start, unsigned long end, + void *private) { - unsigned long addr = vma->vm_start; - unsigned long end = vma->vm_end; - struct pmd_walker walker = { - .vma = vma, - .private = private, - .action = action, - }; - pgd_t *pgd; - unsigned long next; - - for (pgd = pgd_offset(vma->vm_mm, addr); addr != end; - pgd++, addr = next) { - next = pgd_addr_end(addr, end); - if (pgd_none_or_clear_bad(pgd)) - continue; - walk_pud_range(&walker, pgd, addr, next); + struct pagemapread *pm = private; + unsigned long addr; + int err = 0; + for (addr = start; addr < end; addr += PAGE_SIZE) { + err = add_to_pagemap(addr, PM_NOT_PRESENT, pm); + if (err) + break; } + return err; } -static int show_smap(struct seq_file *m, void *v) +u64 swap_pte_to_pagemap_entry(pte_t pte) { - struct vm_area_struct *vma = v; - struct mem_size_stats mss; - - memset(&mss, 0, sizeof mss); - if (vma->vm_mm && !is_vm_hugetlb_page(vma)) - walk_page_range(vma, smaps_pte_range, &mss); - return show_map_internal(m, v, &mss); + swp_entry_t e = pte_to_swp_entry(pte); + return PM_SWAP | swp_type(e) | (swp_offset(e) << MAX_SWAPFILES_SHIFT); } -void clear_refs_smap(struct mm_struct *mm) 
+static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, + void *private) { - struct vm_area_struct *vma; + struct pagemapread *pm = private; + pte_t *pte; + int err = 0; + + for (; addr != end; addr += PAGE_SIZE) { + u64 pfn = PM_NOT_PRESENT; + pte = pte_offset_map(pmd, addr); + if (is_swap_pte(*pte)) + pfn = swap_pte_to_pagemap_entry(*pte); + else if (pte_present(*pte)) + pfn = pte_pfn(*pte); + /* unmap so we're not in atomic when we copy to userspace */ + pte_unmap(pte); + err = add_to_pagemap(addr, pfn, pm); + if (err) + return err; + } - down_read(&mm->mmap_sem); - for (vma = mm->mmap; vma; vma = vma->vm_next) - if (vma->vm_mm && !is_vm_hugetlb_page(vma)) - walk_page_range(vma, clear_refs_pte_range, NULL); - flush_tlb_mm(mm); - up_read(&mm->mmap_sem); + cond_resched(); + + return err; } -static void *m_start(struct seq_file *m, loff_t *pos) +static struct mm_walk pagemap_walk = { + .pmd_entry = pagemap_pte_range, + .pte_hole = pagemap_pte_hole +}; + +/* + * /proc/pid/pagemap - an array mapping virtual pages to pfns + * + * For each page in the address space, this file contains one 64-bit + * entry representing the corresponding physical page frame number + * (PFN) if the page is present. If there is a swap entry for the + * physical page, then an encoding of the swap file number and the + * page's offset into the swap file are returned. If no page is + * present at all, PM_NOT_PRESENT is returned. This allows determining + * precisely which pages are mapped (or in swap) and comparing mapped + * pages between processes. + * + * Efficient users of this interface will use /proc/pid/maps to + * determine which areas of memory are actually mapped and llseek to + * skip over unmapped regions. 
+ */ +static ssize_t pagemap_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) { - struct proc_maps_private *priv = m->private; - unsigned long last_addr = m->version; + struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); + struct page **pages, *page; + unsigned long uaddr, uend; struct mm_struct *mm; - struct vm_area_struct *vma, *tail_vma = NULL; - loff_t l = *pos; - - /* Clear the per syscall fields in priv */ - priv->task = NULL; - priv->tail_vma = NULL; + struct pagemapread pm; + int pagecount; + int ret = -ESRCH; - /* - * We remember last_addr rather than next_addr to hit with - * mmap_cache most of the time. We have zero last_addr at - * the beginning and also after lseek. We will have -1 last_addr - * after the end of the vmas. - */ + if (!task) + goto out; - if (last_addr == -1UL) - return NULL; + ret = -EACCES; + if (!ptrace_may_attach(task)) + goto out; - priv->task = get_pid_task(priv->pid, PIDTYPE_PID); - if (!priv->task) - return NULL; + ret = -EINVAL; + /* file position must be aligned */ + if (*ppos % PM_ENTRY_BYTES) + goto out; - mm = mm_for_maps(priv->task); + ret = 0; + mm = get_task_mm(task); if (!mm) - return NULL; - - priv->tail_vma = tail_vma = get_gate_vma(priv->task); - - /* Start with last addr hint */ - if (last_addr && (vma = find_vma(mm, last_addr))) { - vma = vma->vm_next; goto out; - } - /* - * Check the vma index is within the range and do - * sequential scan until m_index. 
- */ - vma = NULL; - if ((unsigned long)l < mm->map_count) { - vma = mm->mmap; - while (l-- && vma) - vma = vma->vm_next; - goto out; - } + ret = -ENOMEM; + uaddr = (unsigned long)buf & PAGE_MASK; + uend = (unsigned long)(buf + count); + pagecount = (PAGE_ALIGN(uend) - uaddr) / PAGE_SIZE; + pages = kmalloc(pagecount * sizeof(struct page *), GFP_KERNEL); + if (!pages) + goto out_task; - if (l != mm->map_count) - tail_vma = NULL; /* After gate vma */ + down_read(¤t->mm->mmap_sem); + ret = get_user_pages(current, current->mm, uaddr, pagecount, + 1, 0, pages, NULL); + up_read(¤t->mm->mmap_sem); -out: - if (vma) - return vma; + if (ret < 0) + goto out_free; - /* End of vmas has been reached */ - m->version = (tail_vma != NULL)? 0: -1UL; - up_read(&mm->mmap_sem); - mmput(mm); - return tail_vma; -} + pm.out = buf; + pm.end = buf + count; -static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma) -{ - if (vma && vma != priv->tail_vma) { - struct mm_struct *mm = vma->vm_mm; - up_read(&mm->mmap_sem); - mmput(mm); + if (!ptrace_may_attach(task)) { + ret = -EIO; + } else { + unsigned long src = *ppos; + unsigned long svpfn = src / PM_ENTRY_BYTES; + unsigned long start_vaddr = svpfn << PAGE_SHIFT; + unsigned long end_vaddr = TASK_SIZE_OF(task); + + /* watch out for wraparound */ + if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT) + start_vaddr = end_vaddr; + + /* + * The odds are that this will stop walking way + * before end_vaddr, because the length of the + * user buffer is tracked in "pm", and the walk + * will stop when we hit the end of the buffer. 
+ */ + ret = walk_page_range(mm, start_vaddr, end_vaddr, + &pagemap_walk, &pm); + if (ret == PM_END_OF_BUFFER) + ret = 0; + /* don't need mmap_sem for these, but this looks cleaner */ + *ppos += pm.out - buf; + if (!ret) + ret = pm.out - buf; } -} - -static void *m_next(struct seq_file *m, void *v, loff_t *pos) -{ - struct proc_maps_private *priv = m->private; - struct vm_area_struct *vma = v; - struct vm_area_struct *tail_vma = priv->tail_vma; - - (*pos)++; - if (vma && (vma != tail_vma) && vma->vm_next) - return vma->vm_next; - vma_stop(priv, vma); - return (vma != tail_vma)? tail_vma: NULL; -} - -static void m_stop(struct seq_file *m, void *v) -{ - struct proc_maps_private *priv = m->private; - struct vm_area_struct *vma = v; - vma_stop(priv, vma); - if (priv->task) - put_task_struct(priv->task); -} - -static struct seq_operations proc_pid_maps_op = { - .start = m_start, - .next = m_next, - .stop = m_stop, - .show = show_map -}; - -static struct seq_operations proc_pid_smaps_op = { - .start = m_start, - .next = m_next, - .stop = m_stop, - .show = show_smap -}; - -static int do_maps_open(struct inode *inode, struct file *file, - struct seq_operations *ops) -{ - struct proc_maps_private *priv; - int ret = -ENOMEM; - priv = kzalloc(sizeof(*priv), GFP_KERNEL); - if (priv) { - priv->pid = proc_pid(inode); - ret = seq_open(file, ops); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = priv; - } else { - kfree(priv); - } + for (; pagecount; pagecount--) { + page = pages[pagecount-1]; + if (!PageReserved(page)) + SetPageDirty(page); + page_cache_release(page); } + mmput(mm); +out_free: + kfree(pages); +out_task: + put_task_struct(task); +out: return ret; } -static int maps_open(struct inode *inode, struct file *file) -{ - return do_maps_open(inode, file, &proc_pid_maps_op); -} - -const struct file_operations proc_maps_operations = { - .open = maps_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_private, +const struct 
file_operations proc_pagemap_operations = { + .llseek = mem_lseek, /* borrow this */ + .read = pagemap_read, }; +#endif /* CONFIG_PROC_PAGE_MONITOR */ #ifdef CONFIG_NUMA extern int show_numa_map(struct seq_file *m, void *v); @@ -545,15 +753,3 @@ const struct file_operations proc_numa_maps_operations = { .release = seq_release_private, }; #endif - -static int smaps_open(struct inode *inode, struct file *file) -{ - return do_maps_open(inode, file, &proc_pid_smaps_op); -} - -const struct file_operations proc_smaps_operations = { - .open = smaps_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_private, -}; diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 231fd5c..1953098 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -2143,7 +2143,7 @@ int reiserfs_truncate_file(struct inode *p_s_inode, int update_timestamps) /* if we are not on a block boundary */ if (length) { length = blocksize - length; - zero_user_page(page, offset, length, KM_USER0); + zero_user(page, offset, length); if (buffer_mapped(bh) && bh->b_blocknr != 0) { mark_buffer_dirty(bh); } @@ -2367,7 +2367,7 @@ static int reiserfs_write_full_page(struct page *page, unlock_page(page); return 0; } - zero_user_page(page, last_offset, PAGE_CACHE_SIZE - last_offset, KM_USER0); + zero_user_segment(page, last_offset, PAGE_CACHE_SIZE); } bh = head; block = page->index << (PAGE_CACHE_SHIFT - s->s_blocksize_bits); diff --git a/fs/timerfd.c b/fs/timerfd.c index 61983f3..10c80b5 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -25,13 +25,15 @@ struct timerfd_ctx { struct hrtimer tmr; ktime_t tintv; wait_queue_head_t wqh; + u64 ticks; int expired; + int clockid; }; /* * This gets called when the timer event triggers. We set the "expired" * flag, but we do not re-arm the timer (in case it's necessary, - * tintv.tv64 != 0) until the timer is read. + * tintv.tv64 != 0) until the timer is accessed. 
*/ static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr) { @@ -40,13 +42,24 @@ static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr) spin_lock_irqsave(&ctx->wqh.lock, flags); ctx->expired = 1; + ctx->ticks++; wake_up_locked(&ctx->wqh); spin_unlock_irqrestore(&ctx->wqh.lock, flags); return HRTIMER_NORESTART; } -static void timerfd_setup(struct timerfd_ctx *ctx, int clockid, int flags, +static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx) +{ + ktime_t now, remaining; + + now = ctx->tmr.base->get_time(); + remaining = ktime_sub(ctx->tmr.expires, now); + + return remaining.tv64 < 0 ? ktime_set(0, 0): remaining; +} + +static void timerfd_setup(struct timerfd_ctx *ctx, int flags, const struct itimerspec *ktmr) { enum hrtimer_mode htmode; @@ -57,8 +70,9 @@ static void timerfd_setup(struct timerfd_ctx *ctx, int clockid, int flags, texp = timespec_to_ktime(ktmr->it_value); ctx->expired = 0; + ctx->ticks = 0; ctx->tintv = timespec_to_ktime(ktmr->it_interval); - hrtimer_init(&ctx->tmr, clockid, htmode); + hrtimer_init(&ctx->tmr, ctx->clockid, htmode); ctx->tmr.expires = texp; ctx->tmr.function = timerfd_tmrproc; if (texp.tv64 != 0) @@ -83,7 +97,7 @@ static unsigned int timerfd_poll(struct file *file, poll_table *wait) poll_wait(file, &ctx->wqh, wait); spin_lock_irqsave(&ctx->wqh.lock, flags); - if (ctx->expired) + if (ctx->ticks) events |= POLLIN; spin_unlock_irqrestore(&ctx->wqh.lock, flags); @@ -102,11 +116,11 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count, return -EINVAL; spin_lock_irq(&ctx->wqh.lock); res = -EAGAIN; - if (!ctx->expired && !(file->f_flags & O_NONBLOCK)) { + if (!ctx->ticks && !(file->f_flags & O_NONBLOCK)) { __add_wait_queue(&ctx->wqh, &wait); for (res = 0;;) { set_current_state(TASK_INTERRUPTIBLE); - if (ctx->expired) { + if (ctx->ticks) { res = 0; break; } @@ -121,22 +135,21 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count, __remove_wait_queue(&ctx->wqh, 
&wait); __set_current_state(TASK_RUNNING); } - if (ctx->expired) { - ctx->expired = 0; - if (ctx->tintv.tv64 != 0) { + if (ctx->ticks) { + ticks = ctx->ticks; + if (ctx->expired && ctx->tintv.tv64) { /* * If tintv.tv64 != 0, this is a periodic timer that * needs to be re-armed. We avoid doing it in the timer * callback to avoid DoS attacks specifying a very * short timer period. */ - ticks = (u64) - hrtimer_forward(&ctx->tmr, - hrtimer_cb_get_time(&ctx->tmr), - ctx->tintv); + ticks += hrtimer_forward_now(&ctx->tmr, + ctx->tintv) - 1; hrtimer_restart(&ctx->tmr); - } else - ticks = 1; + } + ctx->expired = 0; + ctx->ticks = 0; } spin_unlock_irq(&ctx->wqh.lock); if (ticks) @@ -150,76 +163,132 @@ static const struct file_operations timerfd_fops = { .read = timerfd_read, }; -asmlinkage long sys_timerfd(int ufd, int clockid, int flags, - const struct itimerspec __user *utmr) +static struct file *timerfd_fget(int fd) +{ + struct file *file; + + file = fget(fd); + if (!file) + return ERR_PTR(-EBADF); + if (file->f_op != &timerfd_fops) { + fput(file); + return ERR_PTR(-EINVAL); + } + + return file; +} + +asmlinkage long sys_timerfd_create(int clockid, int flags) { - int error; + int error, ufd; struct timerfd_ctx *ctx; struct file *file; struct inode *inode; - struct itimerspec ktmr; - - if (copy_from_user(&ktmr, utmr, sizeof(ktmr))) - return -EFAULT; + if (flags) + return -EINVAL; if (clockid != CLOCK_MONOTONIC && clockid != CLOCK_REALTIME) return -EINVAL; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + init_waitqueue_head(&ctx->wqh); + ctx->clockid = clockid; + hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS); + + error = anon_inode_getfd(&ufd, &inode, &file, "[timerfd]", + &timerfd_fops, ctx); + if (error) { + kfree(ctx); + return error; + } + + return ufd; +} + +asmlinkage long sys_timerfd_settime(int ufd, int flags, + const struct itimerspec __user *utmr, + struct itimerspec __user *otmr) +{ + struct file *file; + struct timerfd_ctx 
*ctx; + struct itimerspec ktmr, kotmr; + + if (copy_from_user(&ktmr, utmr, sizeof(ktmr))) + return -EFAULT; + if (!timespec_valid(&ktmr.it_value) || !timespec_valid(&ktmr.it_interval)) return -EINVAL; - if (ufd == -1) { - ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); - if (!ctx) - return -ENOMEM; - - init_waitqueue_head(&ctx->wqh); - - timerfd_setup(ctx, clockid, flags, &ktmr); - - /* - * When we call this, the initialization must be complete, since - * anon_inode_getfd() will install the fd. - */ - error = anon_inode_getfd(&ufd, &inode, &file, "[timerfd]", - &timerfd_fops, ctx); - if (error) - goto err_tmrcancel; - } else { - file = fget(ufd); - if (!file) - return -EBADF; - ctx = file->private_data; - if (file->f_op != &timerfd_fops) { - fput(file); - return -EINVAL; - } - /* - * We need to stop the existing timer before reprogramming - * it to the new values. - */ - for (;;) { - spin_lock_irq(&ctx->wqh.lock); - if (hrtimer_try_to_cancel(&ctx->tmr) >= 0) - break; - spin_unlock_irq(&ctx->wqh.lock); - cpu_relax(); - } - /* - * Re-program the timer to the new value ... - */ - timerfd_setup(ctx, clockid, flags, &ktmr); + file = timerfd_fget(ufd); + if (IS_ERR(file)) + return PTR_ERR(file); + ctx = file->private_data; + /* + * We need to stop the existing timer before reprogramming + * it to the new values. + */ + for (;;) { + spin_lock_irq(&ctx->wqh.lock); + if (hrtimer_try_to_cancel(&ctx->tmr) >= 0) + break; spin_unlock_irq(&ctx->wqh.lock); - fput(file); + cpu_relax(); } - return ufd; + /* + * If the timer is expired and it's periodic, we need to advance it + * because the caller may want to know the previous expiration time. + * We do not update "ticks" and "expired" since the timer will be + * re-programmed again in the following timerfd_setup() call. 
+ */ + if (ctx->expired && ctx->tintv.tv64) + hrtimer_forward_now(&ctx->tmr, ctx->tintv); -err_tmrcancel: - hrtimer_cancel(&ctx->tmr); - kfree(ctx); - return error; + kotmr.it_value = ktime_to_timespec(timerfd_get_remaining(ctx)); + kotmr.it_interval = ktime_to_timespec(ctx->tintv); + + /* + * Re-program the timer to the new value ... + */ + timerfd_setup(ctx, flags, &ktmr); + + spin_unlock_irq(&ctx->wqh.lock); + fput(file); + if (otmr && copy_to_user(otmr, &kotmr, sizeof(kotmr))) + return -EFAULT; + + return 0; +} + +asmlinkage long sys_timerfd_gettime(int ufd, struct itimerspec __user *otmr) +{ + struct file *file; + struct timerfd_ctx *ctx; + struct itimerspec kotmr; + + file = timerfd_fget(ufd); + if (IS_ERR(file)) + return PTR_ERR(file); + ctx = file->private_data; + + spin_lock_irq(&ctx->wqh.lock); + if (ctx->expired && ctx->tintv.tv64) { + ctx->expired = 0; + ctx->ticks += + hrtimer_forward_now(&ctx->tmr, ctx->tintv) - 1; + hrtimer_restart(&ctx->tmr); + } + kotmr.it_value = ktime_to_timespec(timerfd_get_remaining(ctx)); + kotmr.it_interval = ktime_to_timespec(ctx->tintv); + spin_unlock_irq(&ctx->wqh.lock); + fput(file); + + return copy_to_user(otmr, &kotmr, sizeof(kotmr)) ? 
-EFAULT: 0; } @@ -105,6 +105,33 @@ out: EXPORT_SYMBOL_GPL(vfs_setxattr); ssize_t +xattr_getsecurity(struct inode *inode, const char *name, void *value, + size_t size) +{ + void *buffer = NULL; + ssize_t len; + + if (!value || !size) { + len = security_inode_getsecurity(inode, name, &buffer, false); + goto out_noalloc; + } + + len = security_inode_getsecurity(inode, name, &buffer, true); + if (len < 0) + return len; + if (size < len) { + len = -ERANGE; + goto out; + } + memcpy(value, buffer, len); +out: + security_release_secctx(buffer, len); +out_noalloc: + return len; +} +EXPORT_SYMBOL_GPL(xattr_getsecurity); + +ssize_t vfs_getxattr(struct dentry *dentry, char *name, void *value, size_t size) { struct inode *inode = dentry->d_inode; @@ -118,23 +145,23 @@ vfs_getxattr(struct dentry *dentry, char *name, void *value, size_t size) if (error) return error; - if (inode->i_op->getxattr) - error = inode->i_op->getxattr(dentry, name, value, size); - else - error = -EOPNOTSUPP; - if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) { const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; - int ret = security_inode_getsecurity(inode, suffix, value, - size, error); + int ret = xattr_getsecurity(inode, suffix, value, size); /* * Only overwrite the return value if a security module * is actually active. 
*/ - if (ret != -EOPNOTSUPP) - error = ret; + if (ret == -EOPNOTSUPP) + goto nolsm; + return ret; } +nolsm: + if (inode->i_op->getxattr) + error = inode->i_op->getxattr(dentry, name, value, size); + else + error = -EOPNOTSUPP; return error; } diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c index ed2b16d..e040f1c 100644 --- a/fs/xfs/linux-2.6/kmem.c +++ b/fs/xfs/linux-2.6/kmem.c @@ -92,8 +92,7 @@ kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize, void kmem_free(void *ptr, size_t size) { - if (((unsigned long)ptr < VMALLOC_START) || - ((unsigned long)ptr >= VMALLOC_END)) { + if (!is_vmalloc_addr(ptr)) { kfree(ptr); } else { vfree(ptr); diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index a49dd8d..0382c19 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -709,8 +709,7 @@ static inline struct page * mem_to_page( void *addr) { - if (((unsigned long)addr < VMALLOC_START) || - ((unsigned long)addr >= VMALLOC_END)) { + if ((!is_vmalloc_addr(addr))) { return virt_to_page(addr); } else { return vmalloc_to_page(addr); diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index d6a8ddd..6f614f3 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -155,7 +155,7 @@ xfs_iozero( if (status) break; - zero_user_page(page, offset, bytes, KM_USER0); + zero_user(page, offset, bytes); status = pagecache_write_end(NULL, mapping, pos, bytes, bytes, page, fsdata); diff --git a/include/asm-alpha/atomic.h b/include/asm-alpha/atomic.h index f5cb7b8..ca88e54 100644 --- a/include/asm-alpha/atomic.h +++ b/include/asm-alpha/atomic.h @@ -100,7 +100,7 @@ static __inline__ void atomic64_sub(long i, atomic64_t * v) /* * Same as above, but return the result value */ -static __inline__ long atomic_add_return(int i, atomic_t * v) +static inline int atomic_add_return(int i, atomic_t *v) { long temp, result; smp_mb(); diff --git a/include/asm-alpha/pci.h b/include/asm-alpha/pci.h index 
30ee766..d5b10ef 100644 --- a/include/asm-alpha/pci.h +++ b/include/asm-alpha/pci.h @@ -4,6 +4,7 @@ #ifdef __KERNEL__ #include <linux/spinlock.h> +#include <linux/dma-mapping.h> #include <asm/scatterlist.h> #include <asm/machvec.h> diff --git a/include/asm-alpha/pgalloc.h b/include/asm-alpha/pgalloc.h index 471864e..fdbedac 100644 --- a/include/asm-alpha/pgalloc.h +++ b/include/asm-alpha/pgalloc.h @@ -31,7 +31,7 @@ pgd_populate(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmd) extern pgd_t *pgd_alloc(struct mm_struct *mm); static inline void -pgd_free(pgd_t *pgd) +pgd_free(struct mm_struct *mm, pgd_t *pgd) { free_page((unsigned long)pgd); } @@ -44,7 +44,7 @@ pmd_alloc_one(struct mm_struct *mm, unsigned long address) } static inline void -pmd_free(pmd_t *pmd) +pmd_free(struct mm_struct *mm, pmd_t *pmd) { free_page((unsigned long)pmd); } @@ -52,7 +52,7 @@ pmd_free(pmd_t *pmd) extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr); static inline void -pte_free_kernel(pte_t *pte) +pte_free_kernel(struct mm_struct *mm, pte_t *pte) { free_page((unsigned long)pte); } @@ -67,7 +67,7 @@ pte_alloc_one(struct mm_struct *mm, unsigned long addr) } static inline void -pte_free(struct page *page) +pte_free(struct mm_struct *mm, struct page *page) { __free_page(page); } diff --git a/include/asm-alpha/tlb.h b/include/asm-alpha/tlb.h index aa91335..c136365 100644 --- a/include/asm-alpha/tlb.h +++ b/include/asm-alpha/tlb.h @@ -9,7 +9,7 @@ #include <asm-generic/tlb.h> -#define __pte_free_tlb(tlb,pte) pte_free(pte) -#define __pmd_free_tlb(tlb,pmd) pmd_free(pmd) +#define __pte_free_tlb(tlb, pte) pte_free((tlb)->mm, pte) +#define __pmd_free_tlb(tlb, pmd) pmd_free((tlb)->mm, pmd) #endif diff --git a/include/asm-alpha/tlbflush.h b/include/asm-alpha/tlbflush.h index b9e9147..9d87aaa 100644 --- a/include/asm-alpha/tlbflush.h +++ b/include/asm-alpha/tlbflush.h @@ -142,6 +142,10 @@ extern void flush_tlb_range(struct vm_area_struct *, unsigned long, #endif /* CONFIG_SMP */ 
-#define flush_tlb_kernel_range(start, end) flush_tlb_all() +static inline void flush_tlb_kernel_range(unsigned long start, + unsigned long end) +{ + flush_tlb_all(); +} #endif /* _ALPHA_TLBFLUSH_H */ diff --git a/include/asm-alpha/unistd.h b/include/asm-alpha/unistd.h index 29bf2fd..5b5c174 100644 --- a/include/asm-alpha/unistd.h +++ b/include/asm-alpha/unistd.h @@ -442,7 +442,6 @@ #define __ARCH_WANT_OLD_READDIR #define __ARCH_WANT_STAT64 #define __ARCH_WANT_SYS_GETHOSTNAME -#define __ARCH_WANT_SYS_SOCKETCALL #define __ARCH_WANT_SYS_FADVISE64 #define __ARCH_WANT_SYS_GETPGRP #define __ARCH_WANT_SYS_OLD_GETRLIMIT diff --git a/include/asm-arm/arch-pxa/gpio.h b/include/asm-arm/arch-pxa/gpio.h index 9dbc2dc..bdbf5f9 100644 --- a/include/asm-arm/arch-pxa/gpio.h +++ b/include/asm-arm/arch-pxa/gpio.h @@ -28,43 +28,35 @@ #include <asm/irq.h> #include <asm/hardware.h> -static inline int gpio_request(unsigned gpio, const char *label) -{ - return 0; -} +#include <asm-generic/gpio.h> -static inline void gpio_free(unsigned gpio) -{ - return; -} -extern int gpio_direction_input(unsigned gpio); -extern int gpio_direction_output(unsigned gpio, int value); +/* NOTE: some PXAs have fewer on-chip GPIOs (like PXA255, with 85). + * Those cases currently cause holes in the GPIO number space. + */ +#define NR_BUILTIN_GPIO 128 -static inline int __gpio_get_value(unsigned gpio) +static inline int gpio_get_value(unsigned gpio) { - return GPLR(gpio) & GPIO_bit(gpio); + if (__builtin_constant_p(gpio) && (gpio < NR_BUILTIN_GPIO)) + return GPLR(gpio) & GPIO_bit(gpio); + else + return __gpio_get_value(gpio); } -#define gpio_get_value(gpio) \ - (__builtin_constant_p(gpio) ? 
\ - __gpio_get_value(gpio) : \ - pxa_gpio_get_value(gpio)) - -static inline void __gpio_set_value(unsigned gpio, int value) +static inline void gpio_set_value(unsigned gpio, int value) { - if (value) - GPSR(gpio) = GPIO_bit(gpio); - else - GPCR(gpio) = GPIO_bit(gpio); + if (__builtin_constant_p(gpio) && (gpio < NR_BUILTIN_GPIO)) { + if (value) + GPSR(gpio) = GPIO_bit(gpio); + else + GPCR(gpio) = GPIO_bit(gpio); + } else { + __gpio_set_value(gpio, value); + } } -#define gpio_set_value(gpio,value) \ - (__builtin_constant_p(gpio) ? \ - __gpio_set_value(gpio, value) : \ - pxa_gpio_set_value(gpio, value)) - -#include <asm-generic/gpio.h> /* cansleep wrappers */ +#define gpio_cansleep __gpio_cansleep #define gpio_to_irq(gpio) IRQ_GPIO(gpio) #define irq_to_gpio(irq) IRQ_TO_GPIO(irq) diff --git a/include/asm-arm/arch-pxa/pxa-regs.h b/include/asm-arm/arch-pxa/pxa-regs.h index 16ed24d..ac175b4 100644 --- a/include/asm-arm/arch-pxa/pxa-regs.h +++ b/include/asm-arm/arch-pxa/pxa-regs.h @@ -1131,6 +1131,19 @@ * General Purpose I/O */ +#define GPIO0_BASE ((void __iomem *)io_p2v(0x40E00000)) +#define GPIO1_BASE ((void __iomem *)io_p2v(0x40E00004)) +#define GPIO2_BASE ((void __iomem *)io_p2v(0x40E00008)) +#define GPIO3_BASE ((void __iomem *)io_p2v(0x40E00100)) + +#define GPLR_OFFSET 0x00 +#define GPDR_OFFSET 0x0C +#define GPSR_OFFSET 0x18 +#define GPCR_OFFSET 0x24 +#define GRER_OFFSET 0x30 +#define GFER_OFFSET 0x3C +#define GEDR_OFFSET 0x48 + #define GPLR0 __REG(0x40E00000) /* GPIO Pin-Level Register GPIO<31:0> */ #define GPLR1 __REG(0x40E00004) /* GPIO Pin-Level Register GPIO<63:32> */ #define GPLR2 __REG(0x40E00008) /* GPIO Pin-Level Register GPIO<80:64> */ diff --git a/include/asm-arm/pgalloc.h b/include/asm-arm/pgalloc.h index 4d43945..fb6c6e3 100644 --- a/include/asm-arm/pgalloc.h +++ b/include/asm-arm/pgalloc.h @@ -27,14 +27,14 @@ * Since we have only two-level page tables, these are trivial */ #define pmd_alloc_one(mm,addr) ({ BUG(); ((pmd_t *)2); }) -#define pmd_free(pmd) 
do { } while (0) +#define pmd_free(mm, pmd) do { } while (0) #define pgd_populate(mm,pmd,pte) BUG() extern pgd_t *get_pgd_slow(struct mm_struct *mm); -extern void free_pgd_slow(pgd_t *pgd); +extern void free_pgd_slow(struct mm_struct *mm, pgd_t *pgd); #define pgd_alloc(mm) get_pgd_slow(mm) -#define pgd_free(pgd) free_pgd_slow(pgd) +#define pgd_free(mm, pgd) free_pgd_slow(mm, pgd) /* * Allocate one PTE table. @@ -83,7 +83,7 @@ pte_alloc_one(struct mm_struct *mm, unsigned long addr) /* * Free one PTE table. */ -static inline void pte_free_kernel(pte_t *pte) +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { if (pte) { pte -= PTRS_PER_PTE; @@ -91,7 +91,7 @@ static inline void pte_free_kernel(pte_t *pte) } } -static inline void pte_free(struct page *pte) +static inline void pte_free(struct mm_struct *mm, struct page *pte) { __free_page(pte); } diff --git a/include/asm-arm/tlb.h b/include/asm-arm/tlb.h index cb74002..36bd402 100644 --- a/include/asm-arm/tlb.h +++ b/include/asm-arm/tlb.h @@ -85,8 +85,8 @@ tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) } #define tlb_remove_page(tlb,page) free_page_and_swap_cache(page) -#define pte_free_tlb(tlb,ptep) pte_free(ptep) -#define pmd_free_tlb(tlb,pmdp) pmd_free(pmdp) +#define pte_free_tlb(tlb, ptep) pte_free((tlb)->mm, ptep) +#define pmd_free_tlb(tlb, pmdp) pmd_free((tlb)->mm, pmdp) #define tlb_migrate_finish(mm) do { } while (0) diff --git a/include/asm-avr32/arch-at32ap/at32ap700x.h b/include/asm-avr32/arch-at32ap/at32ap700x.h index 99684d6..31e48b0 100644 --- a/include/asm-avr32/arch-at32ap/at32ap700x.h +++ b/include/asm-avr32/arch-at32ap/at32ap700x.h @@ -13,8 +13,6 @@ #define GPIO_PERIPH_A 0 #define GPIO_PERIPH_B 1 -#define NR_GPIO_CONTROLLERS 4 - /* * Pin numbers identifying specific GPIO pins on the chip. 
They can * also be converted to IRQ numbers by passing them through diff --git a/include/asm-avr32/arch-at32ap/gpio.h b/include/asm-avr32/arch-at32ap/gpio.h index af7f953..0180f58 100644 --- a/include/asm-avr32/arch-at32ap/gpio.h +++ b/include/asm-avr32/arch-at32ap/gpio.h @@ -5,20 +5,36 @@ #include <asm/irq.h> -/* Arch-neutral GPIO API */ -int __must_check gpio_request(unsigned int gpio, const char *label); -void gpio_free(unsigned int gpio); +/* Some GPIO chips can manage IRQs; some can't. The exact numbers can + * be changed if needed, but for the moment they're not configurable. + */ +#define ARCH_NR_GPIOS (NR_GPIO_IRQS + 2 * 32) -int gpio_direction_input(unsigned int gpio); -int gpio_direction_output(unsigned int gpio, int value); -int gpio_get_value(unsigned int gpio); -void gpio_set_value(unsigned int gpio, int value); -#include <asm-generic/gpio.h> /* cansleep wrappers */ +/* Arch-neutral GPIO API, supporting both "native" and external GPIOs. */ +#include <asm-generic/gpio.h> + +static inline int gpio_get_value(unsigned int gpio) +{ + return __gpio_get_value(gpio); +} + +static inline void gpio_set_value(unsigned int gpio, int value) +{ + __gpio_set_value(gpio, value); +} + +static inline int gpio_cansleep(unsigned int gpio) +{ + return __gpio_cansleep(gpio); +} + static inline int gpio_to_irq(unsigned int gpio) { - return gpio + GPIO_IRQ_BASE; + if (gpio < NR_GPIO_IRQS) + return gpio + GPIO_IRQ_BASE; + return -EINVAL; } static inline int irq_to_gpio(unsigned int irq) diff --git a/include/asm-avr32/arch-at32ap/irq.h b/include/asm-avr32/arch-at32ap/irq.h index 5adffab..608e350 100644 --- a/include/asm-avr32/arch-at32ap/irq.h +++ b/include/asm-avr32/arch-at32ap/irq.h @@ -3,11 +3,11 @@ #define EIM_IRQ_BASE NR_INTERNAL_IRQS #define NR_EIM_IRQS 32 - #define AT32_EXTINT(n) (EIM_IRQ_BASE + (n)) #define GPIO_IRQ_BASE (EIM_IRQ_BASE + NR_EIM_IRQS) -#define NR_GPIO_IRQS (5 * 32) +#define NR_GPIO_CTLR (5 /*internal*/ + 1 /*external*/) +#define NR_GPIO_IRQS (NR_GPIO_CTLR 
* 32) #define NR_IRQS (GPIO_IRQ_BASE + NR_GPIO_IRQS) diff --git a/include/asm-avr32/pgalloc.h b/include/asm-avr32/pgalloc.h index 0e680f4..b77e364 100644 --- a/include/asm-avr32/pgalloc.h +++ b/include/asm-avr32/pgalloc.h @@ -30,7 +30,7 @@ static __inline__ pgd_t *pgd_alloc(struct mm_struct *mm) return kcalloc(USER_PTRS_PER_PGD, sizeof(pgd_t), GFP_KERNEL); } -static inline void pgd_free(pgd_t *pgd) +static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { kfree(pgd); } @@ -55,12 +55,12 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm, return pte; } -static inline void pte_free_kernel(pte_t *pte) +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { free_page((unsigned long)pte); } -static inline void pte_free(struct page *pte) +static inline void pte_free(struct mm_struct *mm, struct page *pte) { __free_page(pte); } diff --git a/include/asm-cris/bitops.h b/include/asm-cris/bitops.h index e2f49c2..75ea6e0 100644 --- a/include/asm-cris/bitops.h +++ b/include/asm-cris/bitops.h @@ -24,13 +24,6 @@ #include <linux/compiler.h> /* - * Some hacks to defeat gcc over-optimizations.. 
- */ -struct __dummy { unsigned long a[100]; }; -#define ADDR (*(struct __dummy *) addr) -#define CONST_ADDR (*(const struct __dummy *) addr) - -/* * set_bit - Atomically set a bit in memory * @nr: the bit to set * @addr: the address to start counting from diff --git a/include/asm-cris/pgalloc.h b/include/asm-cris/pgalloc.h index deaddfe..8ddd66f 100644 --- a/include/asm-cris/pgalloc.h +++ b/include/asm-cris/pgalloc.h @@ -16,7 +16,7 @@ static inline pgd_t *pgd_alloc (struct mm_struct *mm) return (pgd_t *)get_zeroed_page(GFP_KERNEL); } -static inline void pgd_free (pgd_t *pgd) +static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { free_page((unsigned long)pgd); } @@ -34,12 +34,12 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long add return pte; } -static inline void pte_free_kernel(pte_t *pte) +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { free_page((unsigned long)pte); } -static inline void pte_free(struct page *pte) +static inline void pte_free(struct mm_struct *mm, struct page *pte) { __free_page(pte); } diff --git a/include/asm-frv/dma-mapping.h b/include/asm-frv/dma-mapping.h index bcb2df6..2e8966c 100644 --- a/include/asm-frv/dma-mapping.h +++ b/include/asm-frv/dma-mapping.h @@ -17,16 +17,6 @@ void *dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle void dma_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle); /* - * These macros should be used after a pci_map_sg call has been done - * to get bus addresses of each of the SG entries and their lengths. - * You should only work with the number of sg entries pci_map_sg - * returns, or alternatively stop on the first sg_dma_len(sg) which - * is 0. - */ -#define sg_dma_address(sg) ((sg)->dma_address) -#define sg_dma_len(sg) ((sg)->length) - -/* * Map a single buffer of the indicated size for DMA in streaming mode. * The 32-bit bus address to use is returned. 
* diff --git a/include/asm-frv/page.h b/include/asm-frv/page.h index 213d92f..bd9bd2d 100644 --- a/include/asm-frv/page.h +++ b/include/asm-frv/page.h @@ -76,10 +76,6 @@ extern unsigned long max_pfn; #endif /* __ASSEMBLY__ */ -#ifdef CONFIG_CONTIGUOUS_PAGE_ALLOC -#define WANT_PAGE_VIRTUAL 1 -#endif - #include <asm-generic/memory_model.h> #include <asm-generic/page.h> diff --git a/include/asm-frv/pgalloc.h b/include/asm-frv/pgalloc.h index ce982a6..e89620e 100644 --- a/include/asm-frv/pgalloc.h +++ b/include/asm-frv/pgalloc.h @@ -31,18 +31,18 @@ do { \ */ extern pgd_t *pgd_alloc(struct mm_struct *); -extern void pgd_free(pgd_t *); +extern void pgd_free(struct mm_struct *mm, pgd_t *); extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long); extern struct page *pte_alloc_one(struct mm_struct *, unsigned long); -static inline void pte_free_kernel(pte_t *pte) +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { free_page((unsigned long)pte); } -static inline void pte_free(struct page *pte) +static inline void pte_free(struct mm_struct *mm, struct page *pte) { __free_page(pte); } @@ -55,7 +55,7 @@ static inline void pte_free(struct page *pte) * (In the PAE case we free the pmds as part of the pgd.) */ #define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *) 2); }) -#define pmd_free(x) do { } while (0) +#define pmd_free(mm, x) do { } while (0) #define __pmd_free_tlb(tlb,x) do { } while (0) #endif /* CONFIG_MMU */ diff --git a/include/asm-frv/pgtable.h b/include/asm-frv/pgtable.h index 3c402af..6c0682e 100644 --- a/include/asm-frv/pgtable.h +++ b/include/asm-frv/pgtable.h @@ -226,7 +226,7 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address) * inside the pgd, so has no extra memory associated with it. 
*/ #define pud_alloc_one(mm, address) NULL -#define pud_free(x) do { } while (0) +#define pud_free(mm, x) do { } while (0) #define __pud_free_tlb(tlb, x) do { } while (0) /* diff --git a/include/asm-frv/scatterlist.h b/include/asm-frv/scatterlist.h index 2e7143b..4bca8a2 100644 --- a/include/asm-frv/scatterlist.h +++ b/include/asm-frv/scatterlist.h @@ -31,6 +31,16 @@ struct scatterlist { unsigned int length; }; +/* + * These macros should be used after a pci_map_sg call has been done + * to get bus addresses of each of the SG entries and their lengths. + * You should only work with the number of sg entries pci_map_sg + * returns, or alternatively stop on the first sg_dma_len(sg) which + * is 0. + */ +#define sg_dma_address(sg) ((sg)->dma_address) +#define sg_dma_len(sg) ((sg)->length) + #define ISA_DMA_THRESHOLD (0xffffffffUL) #endif /* !_ASM_SCATTERLIST_H */ diff --git a/include/asm-generic/4level-fixup.h b/include/asm-generic/4level-fixup.h index 7b88d39..9d40e87 100644 --- a/include/asm-generic/4level-fixup.h +++ b/include/asm-generic/4level-fixup.h @@ -28,7 +28,7 @@ #undef pud_free_tlb #define pud_free_tlb(tlb, x) do { } while (0) -#define pud_free(x) do { } while (0) +#define pud_free(mm, x) do { } while (0) #define __pud_free_tlb(tlb, x) do { } while (0) #undef pud_addr_end diff --git a/include/asm-generic/gpio.h b/include/asm-generic/gpio.h index 2d0aab1..f29a502 100644 --- a/include/asm-generic/gpio.h +++ b/include/asm-generic/gpio.h @@ -1,6 +1,102 @@ #ifndef _ASM_GENERIC_GPIO_H #define _ASM_GENERIC_GPIO_H +#ifdef CONFIG_HAVE_GPIO_LIB + +/* Platforms may implement their GPIO interface with library code, + * at a small performance cost for non-inlined operations and some + * extra memory (for code and for per-GPIO table entries). + * + * While the GPIO programming interface defines valid GPIO numbers + * to be in the range 0..MAX_INT, this library restricts them to the + * smaller range 0..ARCH_NR_GPIOS. 
+ */ + +#ifndef ARCH_NR_GPIOS +#define ARCH_NR_GPIOS 256 +#endif + +struct seq_file; + +/** + * struct gpio_chip - abstract a GPIO controller + * @label: for diagnostics + * @direction_input: configures signal "offset" as input, or returns error + * @get: returns value for signal "offset"; for output signals this + * returns either the value actually sensed, or zero + * @direction_output: configures signal "offset" as output, or returns error + * @set: assigns output value for signal "offset" + * @dbg_show: optional routine to show contents in debugfs; default code + * will be used when this is omitted, but custom code can show extra + * state (such as pullup/pulldown configuration). + * @base: identifies the first GPIO number handled by this chip; or, if + * negative during registration, requests dynamic ID allocation. + * @ngpio: the number of GPIOs handled by this controller; the last GPIO + * handled is (base + ngpio - 1). + * @can_sleep: flag must be set iff get()/set() methods sleep, as they + * must while accessing GPIO expander chips over I2C or SPI + * + * A gpio_chip can help platforms abstract various sources of GPIOs so + * they can all be accessed through a common programing interface. + * Example sources would be SOC controllers, FPGAs, multifunction + * chips, dedicated GPIO expanders, and so on. + * + * Each chip controls a number of signals, identified in method calls + * by "offset" values in the range 0..(@ngpio - 1). When those signals + * are referenced through calls like gpio_get_value(gpio), the offset + * is calculated by subtracting @base from the gpio number. 
+ */ +struct gpio_chip { + char *label; + + int (*direction_input)(struct gpio_chip *chip, + unsigned offset); + int (*get)(struct gpio_chip *chip, + unsigned offset); + int (*direction_output)(struct gpio_chip *chip, + unsigned offset, int value); + void (*set)(struct gpio_chip *chip, + unsigned offset, int value); + void (*dbg_show)(struct seq_file *s, + struct gpio_chip *chip); + int base; + u16 ngpio; + unsigned can_sleep:1; +}; + +extern const char *gpiochip_is_requested(struct gpio_chip *chip, + unsigned offset); + +/* add/remove chips */ +extern int gpiochip_add(struct gpio_chip *chip); +extern int __must_check gpiochip_remove(struct gpio_chip *chip); + + +/* Always use the library code for GPIO management calls, + * or when sleeping may be involved. + */ +extern int gpio_request(unsigned gpio, const char *label); +extern void gpio_free(unsigned gpio); + +extern int gpio_direction_input(unsigned gpio); +extern int gpio_direction_output(unsigned gpio, int value); + +extern int gpio_get_value_cansleep(unsigned gpio); +extern void gpio_set_value_cansleep(unsigned gpio, int value); + + +/* A platform's <asm/gpio.h> code may want to inline the I/O calls when + * the GPIO is constant and refers to some always-present controller, + * giving direct access to chip registers and tight bitbanging loops. + */ +extern int __gpio_get_value(unsigned gpio); +extern void __gpio_set_value(unsigned gpio, int value); + +extern int __gpio_cansleep(unsigned gpio); + + +#else + /* platforms that don't directly support access to GPIOs through I2C, SPI, * or other blocking infrastructure can use these wrappers. 
*/ @@ -22,4 +118,6 @@ static inline void gpio_set_value_cansleep(unsigned gpio, int value) gpio_set_value(gpio, value); } +#endif + #endif /* _ASM_GENERIC_GPIO_H */ diff --git a/include/asm-generic/pgtable-nopmd.h b/include/asm-generic/pgtable-nopmd.h index 29ff5d8..087325e 100644 --- a/include/asm-generic/pgtable-nopmd.h +++ b/include/asm-generic/pgtable-nopmd.h @@ -54,7 +54,7 @@ static inline pmd_t * pmd_offset(pud_t * pud, unsigned long address) * inside the pud, so has no extra memory associated with it. */ #define pmd_alloc_one(mm, address) NULL -#define pmd_free(x) do { } while (0) +#define pmd_free(mm, x) do { } while (0) #define __pmd_free_tlb(tlb, x) do { } while (0) #undef pmd_addr_end diff --git a/include/asm-generic/pgtable-nopud.h b/include/asm-generic/pgtable-nopud.h index 5664645..87cf449 100644 --- a/include/asm-generic/pgtable-nopud.h +++ b/include/asm-generic/pgtable-nopud.h @@ -51,7 +51,7 @@ static inline pud_t * pud_offset(pgd_t * pgd, unsigned long address) * inside the pgd, so has no extra memory associated with it. 
*/ #define pud_alloc_one(mm, address) NULL -#define pud_free(x) do { } while (0) +#define pud_free(mm, x) do { } while (0) #define __pud_free_tlb(tlb, x) do { } while (0) #undef pud_addr_end diff --git a/include/asm-ia64/pgalloc.h b/include/asm-ia64/pgalloc.h index 67552ca..556d988 100644 --- a/include/asm-ia64/pgalloc.h +++ b/include/asm-ia64/pgalloc.h @@ -27,7 +27,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) return quicklist_alloc(0, GFP_KERNEL, NULL); } -static inline void pgd_free(pgd_t * pgd) +static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { quicklist_free(0, NULL, pgd); } @@ -44,11 +44,11 @@ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) return quicklist_alloc(0, GFP_KERNEL, NULL); } -static inline void pud_free(pud_t * pud) +static inline void pud_free(struct mm_struct *mm, pud_t *pud) { quicklist_free(0, NULL, pud); } -#define __pud_free_tlb(tlb, pud) pud_free(pud) +#define __pud_free_tlb(tlb, pud) pud_free((tlb)->mm, pud) #endif /* CONFIG_PGTABLE_4 */ static inline void @@ -62,12 +62,12 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) return quicklist_alloc(0, GFP_KERNEL, NULL); } -static inline void pmd_free(pmd_t * pmd) +static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) { quicklist_free(0, NULL, pmd); } -#define __pmd_free_tlb(tlb, pmd) pmd_free(pmd) +#define __pmd_free_tlb(tlb, pmd) pmd_free((tlb)->mm, pmd) static inline void pmd_populate(struct mm_struct *mm, pmd_t * pmd_entry, struct page *pte) @@ -94,12 +94,12 @@ static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, return quicklist_alloc(0, GFP_KERNEL, NULL); } -static inline void pte_free(struct page *pte) +static inline void pte_free(struct mm_struct *mm, struct page *pte) { quicklist_free_page(0, NULL, pte); } -static inline void pte_free_kernel(pte_t * pte) +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { quicklist_free(0, NULL, pte); } @@ -109,6 +109,6 @@ static 
inline void check_pgt_cache(void) quicklist_trim(0, NULL, 25, 16); } -#define __pte_free_tlb(tlb, pte) pte_free(pte) +#define __pte_free_tlb(tlb, pte) pte_free((tlb)->mm, pte) #endif /* _ASM_IA64_PGALLOC_H */ diff --git a/include/asm-ia64/processor.h b/include/asm-ia64/processor.h index be3b0ae..666385b 100644 --- a/include/asm-ia64/processor.h +++ b/include/asm-ia64/processor.h @@ -31,7 +31,8 @@ * each (assuming 8KB page size), for a total of 8TB of user virtual * address space. */ -#define TASK_SIZE (current->thread.task_size) +#define TASK_SIZE_OF(tsk) ((tsk)->thread.task_size) +#define TASK_SIZE TASK_SIZE_OF(current) /* * This decides where the kernel will search for a free chunk of vm diff --git a/include/asm-m32r/irq.h b/include/asm-m32r/irq.h index 2f93f47..242028b 100644 --- a/include/asm-m32r/irq.h +++ b/include/asm-m32r/irq.h @@ -3,7 +3,7 @@ #define _ASM_M32R_IRQ_H -#if defined(CONFIG_PLAT_M32700UT_Alpha) || defined(CONFIG_PLAT_USRV) +#if defined(CONFIG_PLAT_USRV) /* * IRQ definitions for M32700UT * M32700 Chip: 64 interrupts diff --git a/include/asm-m32r/m32700ut/m32700ut_pld.h b/include/asm-m32r/m32700ut/m32700ut_pld.h index d391212..57623be 100644 --- a/include/asm-m32r/m32700ut/m32700ut_pld.h +++ b/include/asm-m32r/m32700ut/m32700ut_pld.h @@ -13,9 +13,7 @@ * this archive for more details. 
*/ -#if defined(CONFIG_PLAT_M32700UT_Alpha) -#define PLD_PLAT_BASE 0x08c00000 -#elif defined(CONFIG_PLAT_M32700UT) || defined(CONFIG_PLAT_USRV) +#if defined(CONFIG_PLAT_M32700UT) || defined(CONFIG_PLAT_USRV) #define PLD_PLAT_BASE 0x04c00000 #else #error "no platform configuration" diff --git a/include/asm-m32r/pgalloc.h b/include/asm-m32r/pgalloc.h index 943ba63..e5921ad 100644 --- a/include/asm-m32r/pgalloc.h +++ b/include/asm-m32r/pgalloc.h @@ -24,7 +24,7 @@ static __inline__ pgd_t *pgd_alloc(struct mm_struct *mm) return pgd; } -static __inline__ void pgd_free(pgd_t *pgd) +static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { free_page((unsigned long)pgd); } @@ -46,17 +46,17 @@ static __inline__ struct page *pte_alloc_one(struct mm_struct *mm, return pte; } -static __inline__ void pte_free_kernel(pte_t *pte) +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { free_page((unsigned long)pte); } -static __inline__ void pte_free(struct page *pte) +static inline void pte_free(struct mm_struct *mm, struct page *pte) { __free_page(pte); } -#define __pte_free_tlb(tlb, pte) pte_free((pte)) +#define __pte_free_tlb(tlb, pte) pte_free((tlb)->mm, (pte)) /* * allocating and freeing a pmd is trivial: the 1-entry pmd is @@ -65,7 +65,7 @@ static __inline__ void pte_free(struct page *pte) */ #define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *)2); }) -#define pmd_free(x) do { } while (0) +#define pmd_free(mm, x) do { } while (0) #define __pmd_free_tlb(tlb, x) do { } while (0) #define pgd_populate(mm, pmd, pte) BUG() diff --git a/include/asm-m68k/macintosh.h b/include/asm-m68k/macintosh.h index 27d11da..28b0f49 100644 --- a/include/asm-m68k/macintosh.h +++ b/include/asm-m68k/macintosh.h @@ -14,8 +14,6 @@ extern void mac_init_IRQ(void); extern int mac_irq_pending(unsigned int); extern void mac_identify(void); extern void mac_report_hardware(void); -extern void mac_debugging_penguin(int); -extern void mac_boom(int); /* * Floppy driver magic hook - probably 
shouldnt be here diff --git a/include/asm-m68k/motorola_pgalloc.h b/include/asm-m68k/motorola_pgalloc.h index 5158412..500ec9b 100644 --- a/include/asm-m68k/motorola_pgalloc.h +++ b/include/asm-m68k/motorola_pgalloc.h @@ -22,7 +22,7 @@ static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long ad return pte; } -static inline void pte_free_kernel(pte_t *pte) +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { cache_page(pte); free_page((unsigned long) pte); @@ -47,7 +47,7 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long add return page; } -static inline void pte_free(struct page *page) +static inline void pte_free(struct mm_struct *mm, struct page *page) { cache_page(kmap(page)); kunmap(page); @@ -67,7 +67,7 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address) return get_pointer_table(); } -static inline int pmd_free(pmd_t *pmd) +static inline int pmd_free(struct mm_struct *mm, pmd_t *pmd) { return free_pointer_table(pmd); } @@ -78,9 +78,9 @@ static inline int __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) } -static inline void pgd_free(pgd_t *pgd) +static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { - pmd_free((pmd_t *)pgd); + pmd_free(mm, (pmd_t *)pgd); } static inline pgd_t *pgd_alloc(struct mm_struct *mm) diff --git a/include/asm-m68k/sun3_pgalloc.h b/include/asm-m68k/sun3_pgalloc.h index fd82411..a5a91e7 100644 --- a/include/asm-m68k/sun3_pgalloc.h +++ b/include/asm-m68k/sun3_pgalloc.h @@ -21,12 +21,12 @@ extern const char bad_pmd_string[]; #define pmd_alloc_one(mm,address) ({ BUG(); ((pmd_t *)2); }) -static inline void pte_free_kernel(pte_t * pte) +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { free_page((unsigned long) pte); } -static inline void pte_free(struct page *page) +static inline void pte_free(struct mm_struct *mm, struct page *page) { __free_page(page); } @@ -72,10 +72,10 @@ static inline void 
pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *p * allocating and freeing a pmd is trivial: the 1-entry pmd is * inside the pgd, so has no extra memory associated with it. */ -#define pmd_free(x) do { } while (0) +#define pmd_free(mm, x) do { } while (0) #define __pmd_free_tlb(tlb, x) do { } while (0) -static inline void pgd_free(pgd_t * pgd) +static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { free_page((unsigned long) pgd); } diff --git a/include/asm-m68knommu/mcfne.h b/include/asm-m68knommu/mcfne.h index c920ccd..431f63a 100644 --- a/include/asm-m68knommu/mcfne.h +++ b/include/asm-m68knommu/mcfne.h @@ -60,17 +60,6 @@ #define NE2000_BYTE volatile unsigned char #endif -#if defined(CONFIG_CFV240) -#define NE2000_ADDR 0x40010000 -#define NE2000_ADDR1 0x40010001 -#define NE2000_ODDOFFSET 0x00000000 -#define NE2000_IRQ 1 -#define NE2000_IRQ_VECTOR 0x19 -#define NE2000_IRQ_PRIORITY 2 -#define NE2000_IRQ_LEVEL 1 -#define NE2000_BYTE volatile unsigned char -#endif - #if defined(CONFIG_M5307C3) #define NE2000_ADDR 0x40000300 #define NE2000_ODDOFFSET 0x00010000 @@ -173,13 +162,8 @@ void ne2000_outsw(unsigned int addr, void *vbuf, unsigned long len); * On most NE2000 implementations on ColdFire boards the chip is * mapped in kinda funny, due to its ISA heritage. 
*/ -#ifdef CONFIG_CFV240 -#define NE2000_PTR(addr) (NE2000_ADDR + ((addr & 0x3f) << 1) + 1) -#define NE2000_DATA_PTR(addr) (NE2000_ADDR + ((addr & 0x3f) << 1)) -#else #define NE2000_PTR(addr) ((addr&0x1)?(NE2000_ODDOFFSET+addr-1):(addr)) #define NE2000_DATA_PTR(addr) (addr) -#endif void ne2000_outb(unsigned int val, unsigned int addr) @@ -285,17 +269,6 @@ void ne2000_irqsetup(int irq) } #endif -#if defined(CONFIG_CFV240) -void ne2000_irqsetup(int irq) -{ - volatile unsigned char *icrp; - - icrp = (volatile unsigned char *) (MCF_MBAR + MCFSIM_ICR1); - *icrp = MCFSIM_ICR_LEVEL1 | MCFSIM_ICR_PRI2 | MCFSIM_ICR_AUTOVEC; - mcf_setimr(mcf_getimr() & ~MCFSIM_IMR_EINT1); -} -#endif - #if defined(CONFIG_M5206e) && defined(CONFIG_NETtel) void ne2000_irqsetup(int irq) { diff --git a/include/asm-m68knommu/mcfsim.h b/include/asm-m68knommu/mcfsim.h index 1074ae7..da3f2ce 100644 --- a/include/asm-m68knommu/mcfsim.h +++ b/include/asm-m68knommu/mcfsim.h @@ -17,9 +17,7 @@ * Include 5204, 5206/e, 5235, 5249, 5270/5271, 5272, 5280/5282, * 5307 or 5407 specific addresses. */ -#if defined(CONFIG_M5204) -#include <asm/m5204sim.h> -#elif defined(CONFIG_M5206) || defined(CONFIG_M5206e) +#if defined(CONFIG_M5206) || defined(CONFIG_M5206e) #include <asm/m5206sim.h> #elif defined(CONFIG_M520x) #include <asm/m520xsim.h> diff --git a/include/asm-m68knommu/mcftimer.h b/include/asm-m68knommu/mcftimer.h index 6f4d796..0f90f6d 100644 --- a/include/asm-m68knommu/mcftimer.h +++ b/include/asm-m68knommu/mcftimer.h @@ -16,7 +16,7 @@ /* * Get address specific defines for this ColdFire member. 
*/ -#if defined(CONFIG_M5204) || defined(CONFIG_M5206) || defined(CONFIG_M5206e) +#if defined(CONFIG_M5206) || defined(CONFIG_M5206e) #define MCFTIMER_BASE1 0x100 /* Base address of TIMER1 */ #define MCFTIMER_BASE2 0x120 /* Base address of TIMER2 */ #elif defined(CONFIG_M5272) diff --git a/include/asm-m68knommu/mcfuart.h b/include/asm-m68knommu/mcfuart.h index 8a7a677..ef22938 100644 --- a/include/asm-m68knommu/mcfuart.h +++ b/include/asm-m68knommu/mcfuart.h @@ -19,7 +19,7 @@ #if defined(CONFIG_M5272) #define MCFUART_BASE1 0x100 /* Base address of UART1 */ #define MCFUART_BASE2 0x140 /* Base address of UART2 */ -#elif defined(CONFIG_M5204) || defined(CONFIG_M5206) || defined(CONFIG_M5206e) +#elif defined(CONFIG_M5206) || defined(CONFIG_M5206e) #if defined(CONFIG_NETtel) #define MCFUART_BASE1 0x180 /* Base address of UART1 */ #define MCFUART_BASE2 0x140 /* Base address of UART2 */ diff --git a/include/asm-m68knommu/system.h b/include/asm-m68knommu/system.h index 15b4c7d..ee2dc07 100644 --- a/include/asm-m68knommu/system.h +++ b/include/asm-m68knommu/system.h @@ -207,23 +207,6 @@ cmpxchg(volatile int *p, int old, int new) } -#ifdef CONFIG_M68332 -#define HARD_RESET_NOW() ({ \ - local_irq_disable(); \ - asm(" \ - movew #0x0000, 0xfffa6a; \ - reset; \ - /*movew #0x1557, 0xfffa44;*/ \ - /*movew #0x0155, 0xfffa46;*/ \ - moveal #0, %a0; \ - movec %a0, %vbr; \ - moveal 0, %sp; \ - moveal 4, %a0; \ - jmp (%a0); \ - "); \ -}) -#endif - #if defined( CONFIG_M68328 ) || defined( CONFIG_M68EZ328 ) || \ defined (CONFIG_M68360) || defined( CONFIG_M68VZ328 ) #define HARD_RESET_NOW() ({ \ diff --git a/include/asm-mips/pgalloc.h b/include/asm-mips/pgalloc.h index 81b7212..c4efece 100644 --- a/include/asm-mips/pgalloc.h +++ b/include/asm-mips/pgalloc.h @@ -58,7 +58,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) return ret; } -static inline void pgd_free(pgd_t *pgd) +static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { free_pages((unsigned long)pgd, PGD_ORDER); } 
@@ -85,12 +85,12 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm, return pte; } -static inline void pte_free_kernel(pte_t *pte) +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { free_pages((unsigned long)pte, PTE_ORDER); } -static inline void pte_free(struct page *pte) +static inline void pte_free(struct mm_struct *mm, struct page *pte) { __free_pages(pte, PTE_ORDER); } @@ -103,7 +103,7 @@ static inline void pte_free(struct page *pte) * allocating and freeing a pmd is trivial: the 1-entry pmd is * inside the pgd, so has no extra memory associated with it. */ -#define pmd_free(x) do { } while (0) +#define pmd_free(mm, x) do { } while (0) #define __pmd_free_tlb(tlb, x) do { } while (0) #endif @@ -120,12 +120,12 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address) return pmd; } -static inline void pmd_free(pmd_t *pmd) +static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) { free_pages((unsigned long)pmd, PMD_ORDER); } -#define __pmd_free_tlb(tlb, x) pmd_free(x) +#define __pmd_free_tlb(tlb, x) pmd_free((tlb)->mm, x) #endif diff --git a/include/asm-mips/processor.h b/include/asm-mips/processor.h index 83bc945..36f42de 100644 --- a/include/asm-mips/processor.h +++ b/include/asm-mips/processor.h @@ -65,6 +65,8 @@ extern unsigned int vced_count, vcei_count; #define TASK_UNMAPPED_BASE \ (test_thread_flag(TIF_32BIT_ADDR) ? \ PAGE_ALIGN(TASK_SIZE32 / 3) : PAGE_ALIGN(TASK_SIZE / 3)) +#define TASK_SIZE_OF(tsk) \ + (test_tsk_thread_flag(tsk, TIF_32BIT_ADDR) ? 
TASK_SIZE32 : TASK_SIZE) #endif #define NUM_FPU_REGS 32 diff --git a/include/asm-parisc/pgalloc.h b/include/asm-parisc/pgalloc.h index 1af1a41..aab66f1 100644 --- a/include/asm-parisc/pgalloc.h +++ b/include/asm-parisc/pgalloc.h @@ -43,7 +43,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) return actual_pgd; } -static inline void pgd_free(pgd_t *pgd) +static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { #ifdef CONFIG_64BIT pgd -= PTRS_PER_PGD; @@ -70,7 +70,7 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address) return pmd; } -static inline void pmd_free(pmd_t *pmd) +static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) { #ifdef CONFIG_64BIT if(pmd_flag(*pmd) & PxD_FLAG_ATTACHED) @@ -91,7 +91,7 @@ static inline void pmd_free(pmd_t *pmd) */ #define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *)2); }) -#define pmd_free(x) do { } while (0) +#define pmd_free(mm, x) do { } while (0) #define pgd_populate(mm, pmd, pte) BUG() #endif @@ -130,12 +130,12 @@ pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr) return pte; } -static inline void pte_free_kernel(pte_t *pte) +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { free_page((unsigned long)pte); } -#define pte_free(page) pte_free_kernel(page_address(page)) +#define pte_free(mm, page) pte_free_kernel(page_address(page)) #define check_pgt_cache() do { } while (0) diff --git a/include/asm-parisc/processor.h b/include/asm-parisc/processor.h index 6b294fb..3bb06e8 100644 --- a/include/asm-parisc/processor.h +++ b/include/asm-parisc/processor.h @@ -32,7 +32,8 @@ #endif #define current_text_addr() ({ void *pc; current_ia(pc); pc; }) -#define TASK_SIZE (current->thread.task_size) +#define TASK_SIZE_OF(tsk) ((tsk)->thread.task_size) +#define TASK_SIZE TASK_SIZE_OF(current) #define TASK_UNMAPPED_BASE (current->thread.map_base) #define DEFAULT_TASK_SIZE32 (0xFFF00000UL) diff --git a/include/asm-parisc/tlb.h b/include/asm-parisc/tlb.h index 
33107a2..383b1db 100644 --- a/include/asm-parisc/tlb.h +++ b/include/asm-parisc/tlb.h @@ -21,7 +21,7 @@ do { if (!(tlb)->fullmm) \ #include <asm-generic/tlb.h> -#define __pmd_free_tlb(tlb, pmd) pmd_free(pmd) -#define __pte_free_tlb(tlb, pte) pte_free(pte) +#define __pmd_free_tlb(tlb, pmd) pmd_free((tlb)->mm, pmd) +#define __pte_free_tlb(tlb, pte) pte_free((tlb)->mm, pte) #endif diff --git a/include/asm-powerpc/iommu.h b/include/asm-powerpc/iommu.h index 7a3cef7..852e15f 100644 --- a/include/asm-powerpc/iommu.h +++ b/include/asm-powerpc/iommu.h @@ -79,19 +79,19 @@ extern void iommu_free_table(struct iommu_table *tbl, const char *node_name); extern struct iommu_table *iommu_init_table(struct iommu_table * tbl, int nid); -extern int iommu_map_sg(struct iommu_table *tbl, struct scatterlist *sglist, +extern int iommu_map_sg(struct device *dev, struct scatterlist *sglist, int nelems, unsigned long mask, enum dma_data_direction direction); extern void iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist, int nelems, enum dma_data_direction direction); -extern void *iommu_alloc_coherent(struct iommu_table *tbl, size_t size, - dma_addr_t *dma_handle, unsigned long mask, - gfp_t flag, int node); +extern void *iommu_alloc_coherent(struct device *dev, struct iommu_table *tbl, + size_t size, dma_addr_t *dma_handle, + unsigned long mask, gfp_t flag, int node); extern void iommu_free_coherent(struct iommu_table *tbl, size_t size, void *vaddr, dma_addr_t dma_handle); -extern dma_addr_t iommu_map_single(struct iommu_table *tbl, void *vaddr, - size_t size, unsigned long mask, +extern dma_addr_t iommu_map_single(struct device *dev, struct iommu_table *tbl, + void *vaddr, size_t size, unsigned long mask, enum dma_data_direction direction); extern void iommu_unmap_single(struct iommu_table *tbl, dma_addr_t dma_handle, size_t size, enum dma_data_direction direction); diff --git a/include/asm-powerpc/nvram.h b/include/asm-powerpc/nvram.h index 4e7059c..efde5ac 100644 --- 
a/include/asm-powerpc/nvram.h +++ b/include/asm-powerpc/nvram.h @@ -58,6 +58,9 @@ struct nvram_header { }; #ifdef __KERNEL__ + +#include <linux/list.h> + struct nvram_partition { struct list_head partition; struct nvram_header header; diff --git a/include/asm-powerpc/pgalloc-32.h b/include/asm-powerpc/pgalloc-32.h index e130743..c162a4c 100644 --- a/include/asm-powerpc/pgalloc-32.h +++ b/include/asm-powerpc/pgalloc-32.h @@ -6,14 +6,14 @@ extern void __bad_pte(pmd_t *pmd); extern pgd_t *pgd_alloc(struct mm_struct *mm); -extern void pgd_free(pgd_t *pgd); +extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); /* * We don't have any real pmd's, and this code never triggers because * the pgd will always be present.. */ /* #define pmd_alloc_one(mm,address) ({ BUG(); ((pmd_t *)2); }) */ -#define pmd_free(x) do { } while (0) +#define pmd_free(mm, x) do { } while (0) #define __pmd_free_tlb(tlb,x) do { } while (0) /* #define pgd_populate(mm, pmd, pte) BUG() */ @@ -31,10 +31,10 @@ extern void pgd_free(pgd_t *pgd); extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr); extern struct page *pte_alloc_one(struct mm_struct *mm, unsigned long addr); -extern void pte_free_kernel(pte_t *pte); -extern void pte_free(struct page *pte); +extern void pte_free_kernel(struct mm_struct *mm, pte_t *pte); +extern void pte_free(struct mm_struct *mm, struct page *pte); -#define __pte_free_tlb(tlb, pte) pte_free((pte)) +#define __pte_free_tlb(tlb, pte) pte_free((tlb)->mm, (pte)) #define check_pgt_cache() do { } while (0) diff --git a/include/asm-powerpc/pgalloc-64.h b/include/asm-powerpc/pgalloc-64.h index 43214c8..5afae85 100644 --- a/include/asm-powerpc/pgalloc-64.h +++ b/include/asm-powerpc/pgalloc-64.h @@ -29,7 +29,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) return kmem_cache_alloc(pgtable_cache[PGD_CACHE_NUM], GFP_KERNEL); } -static inline void pgd_free(pgd_t *pgd) +static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { subpage_prot_free(pgd); 
kmem_cache_free(pgtable_cache[PGD_CACHE_NUM], pgd); @@ -45,7 +45,7 @@ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) GFP_KERNEL|__GFP_REPEAT); } -static inline void pud_free(pud_t *pud) +static inline void pud_free(struct mm_struct *mm, pud_t *pud) { kmem_cache_free(pgtable_cache[PUD_CACHE_NUM], pud); } @@ -81,7 +81,7 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) GFP_KERNEL|__GFP_REPEAT); } -static inline void pmd_free(pmd_t *pmd) +static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) { kmem_cache_free(pgtable_cache[PMD_CACHE_NUM], pmd); } @@ -99,12 +99,12 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm, return pte ? virt_to_page(pte) : NULL; } -static inline void pte_free_kernel(pte_t *pte) +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { free_page((unsigned long)pte); } -static inline void pte_free(struct page *ptepage) +static inline void pte_free(struct mm_struct *mm, struct page *ptepage) { __free_page(ptepage); } diff --git a/include/asm-powerpc/processor.h b/include/asm-powerpc/processor.h index dba7c94..1f4765d 100644 --- a/include/asm-powerpc/processor.h +++ b/include/asm-powerpc/processor.h @@ -99,8 +99,9 @@ extern struct task_struct *last_task_used_spe; */ #define TASK_SIZE_USER32 (0x0000000100000000UL - (1*PAGE_SIZE)) -#define TASK_SIZE (test_thread_flag(TIF_32BIT) ? \ +#define TASK_SIZE_OF(tsk) (test_tsk_thread_flag(tsk, TIF_32BIT) ? \ TASK_SIZE_USER32 : TASK_SIZE_USER64) +#define TASK_SIZE TASK_SIZE_OF(current) /* This decides where the kernel will search for a free chunk of vm * space during mmap's. 
diff --git a/include/asm-powerpc/systbl.h b/include/asm-powerpc/systbl.h index 0c8b0d6..e996521 100644 --- a/include/asm-powerpc/systbl.h +++ b/include/asm-powerpc/systbl.h @@ -309,7 +309,7 @@ SYSCALL_SPU(getcpu) COMPAT_SYS(epoll_pwait) COMPAT_SYS_SPU(utimensat) COMPAT_SYS_SPU(signalfd) -COMPAT_SYS_SPU(timerfd) +SYSCALL(ni_syscall) SYSCALL_SPU(eventfd) COMPAT_SYS_SPU(sync_file_range2) COMPAT_SYS(fallocate) diff --git a/include/asm-ppc/pgalloc.h b/include/asm-ppc/pgalloc.h index 44d88a9..7c39a95 100644 --- a/include/asm-ppc/pgalloc.h +++ b/include/asm-ppc/pgalloc.h @@ -7,14 +7,14 @@ extern void __bad_pte(pmd_t *pmd); extern pgd_t *pgd_alloc(struct mm_struct *mm); -extern void pgd_free(pgd_t *pgd); +extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); /* * We don't have any real pmd's, and this code never triggers because * the pgd will always be present.. */ #define pmd_alloc_one(mm,address) ({ BUG(); ((pmd_t *)2); }) -#define pmd_free(x) do { } while (0) +#define pmd_free(mm, x) do { } while (0) #define __pmd_free_tlb(tlb,x) do { } while (0) #define pgd_populate(mm, pmd, pte) BUG() @@ -32,10 +32,10 @@ extern void pgd_free(pgd_t *pgd); extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr); extern struct page *pte_alloc_one(struct mm_struct *mm, unsigned long addr); -extern void pte_free_kernel(pte_t *pte); -extern void pte_free(struct page *pte); +extern void pte_free_kernel(struct mm_struct *mm, pte_t *pte); +extern void pte_free(struct mm_struct *mm, struct page *pte); -#define __pte_free_tlb(tlb, pte) pte_free((pte)) +#define __pte_free_tlb(tlb, pte) pte_free((tlb)->mm, (pte)) #define check_pgt_cache() do { } while (0) diff --git a/include/asm-s390/pgalloc.h b/include/asm-s390/pgalloc.h index 709dd17..6f6619b 100644 --- a/include/asm-s390/pgalloc.h +++ b/include/asm-s390/pgalloc.h @@ -57,10 +57,10 @@ static inline unsigned long pgd_entry_type(struct mm_struct *mm) } #define pud_alloc_one(mm,address) ({ BUG(); ((pud_t *)2); }) -#define 
pud_free(x) do { } while (0) +#define pud_free(mm, x) do { } while (0) #define pmd_alloc_one(mm,address) ({ BUG(); ((pmd_t *)2); }) -#define pmd_free(x) do { } while (0) +#define pmd_free(mm, x) do { } while (0) #define pgd_populate(mm, pgd, pud) BUG() #define pgd_populate_kernel(mm, pgd, pud) BUG() @@ -76,7 +76,7 @@ static inline unsigned long pgd_entry_type(struct mm_struct *mm) } #define pud_alloc_one(mm,address) ({ BUG(); ((pud_t *)2); }) -#define pud_free(x) do { } while (0) +#define pud_free(mm, x) do { } while (0) static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long vmaddr) { @@ -85,7 +85,7 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long vmaddr) crst_table_init(crst, _SEGMENT_ENTRY_EMPTY); return (pmd_t *) crst; } -#define pmd_free(pmd) crst_table_free((unsigned long *) pmd) +#define pmd_free(mm, pmd) crst_table_free((unsigned long *)pmd) #define pgd_populate(mm, pgd, pud) BUG() #define pgd_populate_kernel(mm, pgd, pud) BUG() @@ -115,7 +115,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) crst_table_init(crst, pgd_entry_type(mm)); return (pgd_t *) crst; } -#define pgd_free(pgd) crst_table_free((unsigned long *) pgd) +#define pgd_free(mm, pgd) crst_table_free((unsigned long *) pgd) static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) @@ -151,9 +151,9 @@ pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *page) #define pte_alloc_one(mm, vmaddr) \ virt_to_page(page_table_alloc(s390_noexec)) -#define pte_free_kernel(pte) \ +#define pte_free_kernel(mm, pte) \ page_table_free((unsigned long *) pte) -#define pte_free(pte) \ +#define pte_free(mm, pte) \ page_table_free((unsigned long *) page_to_phys((struct page *) pte)) #endif /* _S390_PGALLOC_H */ diff --git a/include/asm-s390/processor.h b/include/asm-s390/processor.h index c86b982..4f74460 100644 --- a/include/asm-s390/processor.h +++ b/include/asm-s390/processor.h @@ -70,8 +70,9 @@ extern int get_cpu_capability(unsigned 
int *); #else /* __s390x__ */ -# define TASK_SIZE (test_thread_flag(TIF_31BIT) ? \ +# define TASK_SIZE_OF(tsk) (test_tsk_thread_flag(tsk, TIF_31BIT) ? \ (0x80000000UL) : (0x40000000000UL)) +# define TASK_SIZE TASK_SIZE_OF(current) # define TASK_UNMAPPED_BASE (TASK_SIZE / 2) # define DEFAULT_TASK_SIZE (0x40000000000UL) diff --git a/include/asm-s390/tlb.h b/include/asm-s390/tlb.h index 618693cf..985de2b 100644 --- a/include/asm-s390/tlb.h +++ b/include/asm-s390/tlb.h @@ -65,9 +65,9 @@ static inline void tlb_flush_mmu(struct mmu_gather *tlb, if (!tlb->fullmm && (tlb->nr_ptes > 0 || tlb->nr_pmds < TLB_NR_PTRS)) __tlb_flush_mm(tlb->mm); while (tlb->nr_ptes > 0) - pte_free(tlb->array[--tlb->nr_ptes]); + pte_free(tlb->mm, tlb->array[--tlb->nr_ptes]); while (tlb->nr_pmds < TLB_NR_PTRS) - pmd_free((pmd_t *) tlb->array[tlb->nr_pmds++]); + pmd_free(tlb->mm, (pmd_t *) tlb->array[tlb->nr_pmds++]); } static inline void tlb_finish_mmu(struct mmu_gather *tlb, @@ -102,7 +102,7 @@ static inline void pte_free_tlb(struct mmu_gather *tlb, struct page *page) if (tlb->nr_ptes >= tlb->nr_pmds) tlb_flush_mmu(tlb, 0, 0); } else - pte_free(page); + pte_free(tlb->mm, page); } /* @@ -117,7 +117,7 @@ static inline void pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) if (tlb->nr_ptes >= tlb->nr_pmds) tlb_flush_mmu(tlb, 0, 0); } else - pmd_free(pmd); + pmd_free(tlb->mm, pmd); #endif } diff --git a/include/asm-sh/pgalloc.h b/include/asm-sh/pgalloc.h index 18b613c..59ca16d 100644 --- a/include/asm-sh/pgalloc.h +++ b/include/asm-sh/pgalloc.h @@ -36,7 +36,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) return quicklist_alloc(QUICK_PGD, GFP_KERNEL | __GFP_REPEAT, pgd_ctor); } -static inline void pgd_free(pgd_t *pgd) +static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { quicklist_free(QUICK_PGD, NULL, pgd); } @@ -54,12 +54,12 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm, return pg ? 
virt_to_page(pg) : NULL; } -static inline void pte_free_kernel(pte_t *pte) +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { quicklist_free(QUICK_PT, NULL, pte); } -static inline void pte_free(struct page *pte) +static inline void pte_free(struct mm_struct *mm, struct page *pte) { quicklist_free_page(QUICK_PT, NULL, pte); } @@ -71,7 +71,7 @@ static inline void pte_free(struct page *pte) * inside the pgd, so has no extra memory associated with it. */ -#define pmd_free(x) do { } while (0) +#define pmd_free(mm, x) do { } while (0) #define __pmd_free_tlb(tlb,x) do { } while (0) static inline void check_pgt_cache(void) diff --git a/include/asm-sparc/pgalloc.h b/include/asm-sparc/pgalloc.h index a449cd4..b5fbdd3 100644 --- a/include/asm-sparc/pgalloc.h +++ b/include/asm-sparc/pgalloc.h @@ -32,7 +32,7 @@ BTFIXUPDEF_CALL(pgd_t *, get_pgd_fast, void) BTFIXUPDEF_CALL(void, free_pgd_fast, pgd_t *) #define free_pgd_fast(pgd) BTFIXUP_CALL(free_pgd_fast)(pgd) -#define pgd_free(pgd) free_pgd_fast(pgd) +#define pgd_free(mm, pgd) free_pgd_fast(pgd) #define pgd_alloc(mm) get_pgd_fast() BTFIXUPDEF_CALL(void, pgd_set, pgd_t *, pmd_t *) @@ -45,8 +45,8 @@ BTFIXUPDEF_CALL(pmd_t *, pmd_alloc_one, struct mm_struct *, unsigned long) BTFIXUPDEF_CALL(void, free_pmd_fast, pmd_t *) #define free_pmd_fast(pmd) BTFIXUP_CALL(free_pmd_fast)(pmd) -#define pmd_free(pmd) free_pmd_fast(pmd) -#define __pmd_free_tlb(tlb, pmd) pmd_free(pmd) +#define pmd_free(mm, pmd) free_pmd_fast(pmd) +#define __pmd_free_tlb(tlb, pmd) pmd_free((tlb)->mm, pmd) BTFIXUPDEF_CALL(void, pmd_populate, pmd_t *, struct page *) #define pmd_populate(MM, PMD, PTE) BTFIXUP_CALL(pmd_populate)(PMD, PTE) @@ -59,10 +59,10 @@ BTFIXUPDEF_CALL(pte_t *, pte_alloc_one_kernel, struct mm_struct *, unsigned long #define pte_alloc_one_kernel(mm, addr) BTFIXUP_CALL(pte_alloc_one_kernel)(mm, addr) BTFIXUPDEF_CALL(void, free_pte_fast, pte_t *) -#define pte_free_kernel(pte) BTFIXUP_CALL(free_pte_fast)(pte) +#define 
pte_free_kernel(mm, pte) BTFIXUP_CALL(free_pte_fast)(pte) BTFIXUPDEF_CALL(void, pte_free, struct page *) -#define pte_free(pte) BTFIXUP_CALL(pte_free)(pte) -#define __pte_free_tlb(tlb, pte) pte_free(pte) +#define pte_free(mm, pte) BTFIXUP_CALL(pte_free)(pte) +#define __pte_free_tlb(tlb, pte) pte_free((tlb)->mm, pte) #endif /* _SPARC_PGALLOC_H */ diff --git a/include/asm-sparc64/pgalloc.h b/include/asm-sparc64/pgalloc.h index 5d66b85..b48f73c 100644 --- a/include/asm-sparc64/pgalloc.h +++ b/include/asm-sparc64/pgalloc.h @@ -20,7 +20,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) return quicklist_alloc(0, GFP_KERNEL, NULL); } -static inline void pgd_free(pgd_t *pgd) +static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { quicklist_free(0, NULL, pgd); } @@ -32,7 +32,7 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) return quicklist_alloc(0, GFP_KERNEL, NULL); } -static inline void pmd_free(pmd_t *pmd) +static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) { quicklist_free(0, NULL, pmd); } @@ -50,12 +50,12 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm, return pg ? 
virt_to_page(pg) : NULL; } -static inline void pte_free_kernel(pte_t *pte) +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { quicklist_free(0, NULL, pte); } -static inline void pte_free(struct page *ptepage) +static inline void pte_free(struct mm_struct *mm, struct page *ptepage) { quicklist_free_page(0, NULL, ptepage); } diff --git a/include/asm-sparc64/tlb.h b/include/asm-sparc64/tlb.h index 349d1d3..ec81cde 100644 --- a/include/asm-sparc64/tlb.h +++ b/include/asm-sparc64/tlb.h @@ -100,8 +100,8 @@ static inline void tlb_remove_page(struct mmu_gather *mp, struct page *page) } #define tlb_remove_tlb_entry(mp,ptep,addr) do { } while (0) -#define pte_free_tlb(mp,ptepage) pte_free(ptepage) -#define pmd_free_tlb(mp,pmdp) pmd_free(pmdp) +#define pte_free_tlb(mp, ptepage) pte_free((mp)->mm, ptepage) +#define pmd_free_tlb(mp, pmdp) pmd_free((mp)->mm, pmdp) #define pud_free_tlb(tlb,pudp) __pud_free_tlb(tlb,pudp) #define tlb_migrate_finish(mm) do { } while (0) diff --git a/include/asm-um/a.out.h b/include/asm-um/a.out.h index 9281dd8..f42ff145 100644 --- a/include/asm-um/a.out.h +++ b/include/asm-um/a.out.h @@ -13,11 +13,9 @@ extern unsigned long stacksizelim; -extern unsigned long host_task_size; - #define STACK_ROOM (stacksizelim) -#define STACK_TOP task_size +#define STACK_TOP (TASK_SIZE - 2 * PAGE_SIZE) #define STACK_TOP_MAX STACK_TOP diff --git a/include/asm-um/current.h b/include/asm-um/current.h index 8fd72f6..c2191d9 100644 --- a/include/asm-um/current.h +++ b/include/asm-um/current.h @@ -1,32 +1,13 @@ -/* - * Copyright (C) 2000 Jeff Dike (jdike@karaya.com) +/* + * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) * Licensed under the GPL */ #ifndef __UM_CURRENT_H #define __UM_CURRENT_H -#ifndef __ASSEMBLY__ - -#include "asm/page.h" #include "linux/thread_info.h" #define current (current_thread_info()->task) -/*Backward compatibility - it's used inside arch/um.*/ -#define current_thread current_thread_info() - -#endif /* 
__ASSEMBLY__ */ - #endif - -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/include/asm-um/elf-i386.h b/include/asm-um/elf-i386.h index ca94a13..23d6893 100644 --- a/include/asm-um/elf-i386.h +++ b/include/asm-um/elf-i386.h @@ -1,11 +1,11 @@ /* - * Copyright (C) 2000 - 2003 Jeff Dike (jdike@addtoit.com) + * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) * Licensed under the GPL */ #ifndef __UM_ELF_I386_H #define __UM_ELF_I386_H -#include <linux/sched.h> +#include <asm/user.h> #include "skas.h" #define R_386_NONE 0 @@ -46,7 +46,7 @@ typedef struct user_i387_struct elf_fpregset_t; PT_REGS_EDI(regs) = 0; \ PT_REGS_EBP(regs) = 0; \ PT_REGS_EAX(regs) = 0; \ -} while(0) +} while (0) #define USE_ELF_CORE_DUMP #define ELF_EXEC_PAGESIZE 4096 @@ -74,14 +74,9 @@ typedef struct user_i387_struct elf_fpregset_t; pr_reg[14] = PT_REGS_EFLAGS(regs); \ pr_reg[15] = PT_REGS_SP(regs); \ pr_reg[16] = PT_REGS_SS(regs); \ -} while(0); +} while (0); -static inline int elf_core_copy_fpregs(struct task_struct *t, - elf_fpregset_t *fpu) -{ - int cpu = ((struct thread_info *) t->stack)->cpu; - return save_fp_registers(userspace_pid[cpu], (unsigned long *) fpu); -} +extern int elf_core_copy_fpregs(struct task_struct *t, elf_fpregset_t *fpu); #define ELF_CORE_COPY_FPREGS(t, fpu) elf_core_copy_fpregs(t, fpu) @@ -91,7 +86,7 @@ extern long elf_aux_hwcap; extern char * elf_aux_platform; #define ELF_PLATFORM (elf_aux_platform) -#define SET_PERSONALITY(ex, ibcs2) do ; while(0) +#define SET_PERSONALITY(ex, ibcs2) do { } while (0) extern unsigned long vsyscall_ehdr; extern unsigned long vsyscall_end; @@ -166,14 +161,3 @@ if ( vsyscall_ehdr ) { \ } 
#endif - -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/include/asm-um/elf-x86_64.h b/include/asm-um/elf-x86_64.h index 3c9d543..3b2d522 100644 --- a/include/asm-um/elf-x86_64.h +++ b/include/asm-um/elf-x86_64.h @@ -7,7 +7,6 @@ #ifndef __UM_ELF_X86_64_H #define __UM_ELF_X86_64_H -#include <linux/sched.h> #include <asm/user.h> #include "skas.h" @@ -96,12 +95,7 @@ typedef struct user_i387_struct elf_fpregset_t; (pr_reg)[25] = 0; \ (pr_reg)[26] = 0; -static inline int elf_core_copy_fpregs(struct task_struct *t, - elf_fpregset_t *fpu) -{ - int cpu = current_thread->cpu; - return save_fp_registers(userspace_pid[cpu], (unsigned long *) fpu); -} +extern int elf_core_copy_fpregs(struct task_struct *t, elf_fpregset_t *fpu); #define ELF_CORE_COPY_FPREGS(t, fpu) elf_core_copy_fpregs(t, fpu) diff --git a/include/asm-um/fixmap.h b/include/asm-um/fixmap.h index d352a35..89a87c1 100644 --- a/include/asm-um/fixmap.h +++ b/include/asm-um/fixmap.h @@ -1,9 +1,10 @@ #ifndef __UM_FIXMAP_H #define __UM_FIXMAP_H +#include <asm/system.h> #include <asm/kmap_types.h> #include <asm/archparam.h> -#include <asm/elf.h> +#include <asm/page.h> /* * Here we define all the compile-time 'special' virtual @@ -55,9 +56,8 @@ extern void __set_fixmap (enum fixed_addresses idx, * the start of the fixmap, and leave one page empty * at the top of mem.. 
*/ -extern unsigned long get_kmem_end(void); -#define FIXADDR_TOP (get_kmem_end() - 0x2000) +#define FIXADDR_TOP (CONFIG_TOP_ADDR - 2 * PAGE_SIZE) #define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT) #define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) diff --git a/include/asm-um/ldt.h b/include/asm-um/ldt.h index b2553f3..52af512 100644 --- a/include/asm-um/ldt.h +++ b/include/asm-um/ldt.h @@ -8,7 +8,7 @@ #ifndef __ASM_LDT_H #define __ASM_LDT_H -#include "asm/semaphore.h" +#include <linux/mutex.h> #include "asm/host_ldt.h" extern void ldt_host_info(void); @@ -27,7 +27,7 @@ struct ldt_entry { typedef struct uml_ldt { int entry_count; - struct semaphore semaphore; + struct mutex lock; union { struct ldt_entry * pages[LDT_PAGES_MAX]; struct ldt_entry entries[LDT_DIRECT_ENTRIES]; diff --git a/include/asm-um/linkage.h b/include/asm-um/linkage.h index cdb3024..7dfce37 100644 --- a/include/asm-um/linkage.h +++ b/include/asm-um/linkage.h @@ -3,10 +3,4 @@ #include "asm/arch/linkage.h" - -/* <linux/linkage.h> will pick sane defaults */ -#ifdef CONFIG_GPROF -#undef fastcall -#endif - #endif diff --git a/include/asm-um/mmu_context.h b/include/asm-um/mmu_context.h index 5f3b863..6686fc5 100644 --- a/include/asm-um/mmu_context.h +++ b/include/asm-um/mmu_context.h @@ -6,11 +6,12 @@ #ifndef __UM_MMU_CONTEXT_H #define __UM_MMU_CONTEXT_H -#include <asm-generic/mm_hooks.h> - #include "linux/sched.h" #include "um_mmu.h" +extern void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm); +extern void arch_exit_mmap(struct mm_struct *mm); + #define get_mmu_context(task) do ; while(0) #define activate_context(tsk) do ; while(0) @@ -30,6 +31,8 @@ static inline void activate_mm(struct mm_struct *old, struct mm_struct *new) */ if (old != new && (current->flags & PF_BORROWED_MM)) __switch_mm(&new->context.id); + + arch_dup_mmap(old, new); } static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, diff --git a/include/asm-um/page.h b/include/asm-um/page.h 
index 4b424c7..fe2374d 100644 --- a/include/asm-um/page.h +++ b/include/asm-um/page.h @@ -30,7 +30,7 @@ struct page; #if defined(CONFIG_3_LEVEL_PGTABLES) && !defined(CONFIG_64BIT) typedef struct { unsigned long pte_low, pte_high; } pte_t; -typedef struct { unsigned long long pmd; } pmd_t; +typedef struct { unsigned long pmd; } pmd_t; typedef struct { unsigned long pgd; } pgd_t; #define pte_val(x) ((x).pte_low | ((unsigned long long) (x).pte_high << 32)) @@ -106,8 +106,8 @@ extern unsigned long uml_physmem; #define __pa(virt) to_phys((void *) (unsigned long) (virt)) #define __va(phys) to_virt((unsigned long) (phys)) -#define phys_to_pfn(p) ((p) >> PAGE_SHIFT) -#define pfn_to_phys(pfn) ((pfn) << PAGE_SHIFT) +#define phys_to_pfn(p) ((pfn_t) ((p) >> PAGE_SHIFT)) +#define pfn_to_phys(pfn) ((phys_t) ((pfn) << PAGE_SHIFT)) #define pfn_valid(pfn) ((pfn) < max_mapnr) #define virt_addr_valid(v) pfn_valid(phys_to_pfn(__pa(v))) diff --git a/include/asm-um/param.h b/include/asm-um/param.h index f914e7d..4cd4a22 100644 --- a/include/asm-um/param.h +++ b/include/asm-um/param.h @@ -10,7 +10,7 @@ #define MAXHOSTNAMELEN 64 /* max length of hostname */ #ifdef __KERNEL__ -#define HZ 100 +#define HZ CONFIG_HZ #define USER_HZ 100 /* .. some user interfaces are in "ticks" */ #define CLOCKS_PER_SEC (USER_HZ) /* frequency at which times() counts */ #endif diff --git a/include/asm-um/pgalloc.h b/include/asm-um/pgalloc.h index 1490487..4f3e62b 100644 --- a/include/asm-um/pgalloc.h +++ b/include/asm-um/pgalloc.h @@ -23,17 +23,17 @@ * Allocate and free page tables. 
*/ extern pgd_t *pgd_alloc(struct mm_struct *); -extern void pgd_free(pgd_t *pgd); +extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long); extern struct page *pte_alloc_one(struct mm_struct *, unsigned long); -static inline void pte_free_kernel(pte_t *pte) +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { free_page((unsigned long) pte); } -static inline void pte_free(struct page *pte) +static inline void pte_free(struct mm_struct *mm, struct page *pte) { __free_page(pte); } @@ -42,7 +42,7 @@ static inline void pte_free(struct page *pte) #ifdef CONFIG_3_LEVEL_PGTABLES -static inline void pmd_free(pmd_t *pmd) +static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) { free_page((unsigned long)pmd); } diff --git a/include/asm-um/pgtable-2level.h b/include/asm-um/pgtable-2level.h index 172a75f..f534b73 100644 --- a/include/asm-um/pgtable-2level.h +++ b/include/asm-um/pgtable-2level.h @@ -41,9 +41,6 @@ static inline void pgd_mkuptodate(pgd_t pgd) { } #define pfn_pte(pfn, prot) __pte(pfn_to_phys(pfn) | pgprot_val(prot)) #define pfn_pmd(pfn, prot) __pmd(pfn_to_phys(pfn) | pgprot_val(prot)) -#define pmd_page_vaddr(pmd) \ - ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK)) - /* * Bits 0 through 4 are taken */ diff --git a/include/asm-um/pgtable-3level.h b/include/asm-um/pgtable-3level.h index 3ebafba..0446f45 100644 --- a/include/asm-um/pgtable-3level.h +++ b/include/asm-um/pgtable-3level.h @@ -11,7 +11,11 @@ /* PGDIR_SHIFT determines what a third-level page table entry can map */ +#ifdef CONFIG_64BIT #define PGDIR_SHIFT 30 +#else +#define PGDIR_SHIFT 31 +#endif #define PGDIR_SIZE (1UL << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE-1)) @@ -28,9 +32,15 @@ */ #define PTRS_PER_PTE 512 +#ifdef CONFIG_64BIT #define PTRS_PER_PMD 512 -#define USER_PTRS_PER_PGD ((TASK_SIZE + (PGDIR_SIZE - 1)) / PGDIR_SIZE) #define PTRS_PER_PGD 512 +#else +#define PTRS_PER_PMD 1024 +#define 
PTRS_PER_PGD 1024 +#endif + +#define USER_PTRS_PER_PGD ((TASK_SIZE + (PGDIR_SIZE - 1)) / PGDIR_SIZE) #define FIRST_USER_ADDRESS 0 #define pte_ERROR(e) \ @@ -49,7 +59,12 @@ #define pud_populate(mm, pud, pmd) \ set_pud(pud, __pud(_PAGE_TABLE + __pa(pmd))) +#ifdef CONFIG_64BIT #define set_pud(pudptr, pudval) set_64bit((phys_t *) (pudptr), pud_val(pudval)) +#else +#define set_pud(pudptr, pudval) (*(pudptr) = (pudval)) +#endif + static inline int pgd_newpage(pgd_t pgd) { return(pgd_val(pgd) & _PAGE_NEWPAGE); @@ -57,17 +72,14 @@ static inline int pgd_newpage(pgd_t pgd) static inline void pgd_mkuptodate(pgd_t pgd) { pgd_val(pgd) &= ~_PAGE_NEWPAGE; } +#ifdef CONFIG_64BIT #define set_pmd(pmdptr, pmdval) set_64bit((phys_t *) (pmdptr), pmd_val(pmdval)) +#else +#define set_pmd(pmdptr, pmdval) (*(pmdptr) = (pmdval)) +#endif -static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address) -{ - pmd_t *pmd = (pmd_t *) __get_free_page(GFP_KERNEL); - - if(pmd) - memset(pmd, 0, PAGE_SIZE); - - return pmd; -} +struct mm_struct; +extern pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address); static inline void pud_clear (pud_t *pud) { @@ -75,8 +87,7 @@ static inline void pud_clear (pud_t *pud) } #define pud_page(pud) phys_to_page(pud_val(pud) & PAGE_MASK) -#define pud_page_vaddr(pud) \ - ((struct page *) __va(pud_val(pud) & PAGE_MASK)) +#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PAGE_MASK)) /* Find an entry in the second-level page table.. */ #define pmd_offset(pud, address) ((pmd_t *) pud_page_vaddr(*(pud)) + \ diff --git a/include/asm-um/pgtable.h b/include/asm-um/pgtable.h index 830fc6e..4102b44 100644 --- a/include/asm-um/pgtable.h +++ b/include/asm-um/pgtable.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) + * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) * Copyright 2003 PathScale, Inc. 
* Derived from include/asm-i386/pgtable.h * Licensed under the GPL @@ -8,11 +8,7 @@ #ifndef __UM_PGTABLE_H #define __UM_PGTABLE_H -#include "linux/sched.h" -#include "linux/linkage.h" -#include "asm/processor.h" -#include "asm/page.h" -#include "asm/fixmap.h" +#include <asm/fixmap.h> #define _PAGE_PRESENT 0x001 #define _PAGE_NEWPAGE 0x002 @@ -34,22 +30,11 @@ extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; -extern void *um_virt_to_phys(struct task_struct *task, unsigned long virt, - pte_t *pte_out); - /* zero page used for uninitialized stuff */ extern unsigned long *empty_zero_page; #define pgtable_cache_init() do ; while (0) -/* - * pgd entries used up by user/kernel: - */ - -#define USER_PGD_PTRS (TASK_SIZE >> PGDIR_SHIFT) -#define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS) - -#ifndef __ASSEMBLY__ /* Just any arbitrary offset to the start of the vmalloc VM area: the * current 8MB value just means that there will be a 8MB "hole" after the * physical memory until the kernel virtual memory starts. 
That means that @@ -62,16 +47,12 @@ extern unsigned long end_iomem; #define VMALLOC_OFFSET (__va_space) #define VMALLOC_START ((end_iomem + VMALLOC_OFFSET) & ~(VMALLOC_OFFSET-1)) - #ifdef CONFIG_HIGHMEM # define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE) #else # define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE) #endif -#define REGION_SHIFT (sizeof(pte_t) * 8 - 4) -#define REGION_MASK (((unsigned long) 0xf) << REGION_SHIFT) - #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY) #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) #define _PAGE_CHG_MASK (PAGE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY) @@ -81,11 +62,12 @@ extern unsigned long end_iomem; #define PAGE_COPY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) #define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) #define PAGE_KERNEL __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED) -#define PAGE_KERNEL_RO __pgprot(_PAGE_PRESENT | _PAGE_DIRTY | _PAGE_ACCESSED) /* - * The i386 can't do page protection for execute, and considers that the same are read. - * Also, write permissions imply read permissions. This is the closest we can get.. + * The i386 can't do page protection for execute, and considers that the same + * are read. + * Also, write permissions imply read permissions. This is the closest we can + * get.. */ #define __P000 PAGE_NONE #define __P001 PAGE_READONLY @@ -106,40 +88,16 @@ extern unsigned long end_iomem; #define __S111 PAGE_SHARED /* - * Define this if things work differently on an i386 and an i486: - * it will (on an i486) warn about kernel memory accesses that are - * done without a 'access_ok(VERIFY_WRITE,..)' - */ -#undef TEST_VERIFY_AREA - -/* page table for 0-4MB for everybody */ -extern unsigned long pg0[1024]; - -/* * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc.. 
*/ - #define ZERO_PAGE(vaddr) virt_to_page(empty_zero_page) -/* number of bits that fit into a memory pointer */ -#define BITS_PER_PTR (8*sizeof(unsigned long)) - -/* to align the pointer to a pointer address */ -#define PTR_MASK (~(sizeof(void*)-1)) - -/* sizeof(void*)==1<<SIZEOF_PTR_LOG2 */ -/* 64-bit machines, beware! SRB. */ -#define SIZEOF_PTR_LOG2 3 - -/* to find an entry in a page-table */ -#define PAGE_PTR(address) \ -((unsigned long)(address)>>(PAGE_SHIFT-SIZEOF_PTR_LOG2)&PTR_MASK&~PAGE_MASK) - #define pte_clear(mm,addr,xp) pte_set_val(*(xp), (phys_t) 0, __pgprot(_PAGE_NEWPAGE)) #define pmd_none(x) (!((unsigned long)pmd_val(x) & ~_PAGE_NEWPAGE)) #define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) + #define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT) #define pmd_clear(xp) do { pmd_val(*(xp)) = _PAGE_NEWPAGE; } while (0) @@ -149,14 +107,9 @@ extern unsigned long pg0[1024]; #define pud_newpage(x) (pud_val(x) & _PAGE_NEWPAGE) #define pud_mkuptodate(x) (pud_val(x) &= ~_PAGE_NEWPAGE) -#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) - #define pmd_page(pmd) phys_to_page(pmd_val(pmd) & PAGE_MASK) #define pte_page(x) pfn_to_page(pte_pfn(x)) -#define pte_address(x) (__va(pte_val(x) & PAGE_MASK)) -#define mk_phys(a, r) ((a) + (((unsigned long) r) << REGION_SHIFT)) -#define phys_addr(p) ((p) & ~REGION_MASK) #define pte_present(x) pte_get_bits(x, (_PAGE_PRESENT | _PAGE_PROTNONE)) @@ -309,7 +262,8 @@ static inline void set_pte(pte_t *pteptr, pte_t pteval) #define phys_to_page(phys) pfn_to_page(phys_to_pfn(phys)) #define __virt_to_page(virt) phys_to_page(__pa(virt)) -#define page_to_phys(page) pfn_to_phys(page_to_pfn(page)) +#define page_to_phys(page) pfn_to_phys((pfn_t) page_to_pfn(page)) +#define virt_to_page(addr) __virt_to_page((const unsigned long) addr) #define mk_pte(page, pgprot) \ ({ pte_t pte; \ @@ -325,8 +279,6 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) return pte; } -#define pmd_page_vaddr(pmd) ((unsigned long) 
__va(pmd_val(pmd) & PAGE_MASK)) - /* * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD] * @@ -335,8 +287,6 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) */ #define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1)) -#define pgd_index_k(addr) pgd_index(addr) - /* * pgd_offset() returns a (pgd_t *) * pgd_index() is used get the offset into the pgd page's array of pgd_t's; @@ -355,8 +305,12 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) * this macro returns the index of the entry in the pmd page which would * control the given virtual address */ +#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK)) #define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1)) +#define pmd_page_vaddr(pmd) \ + ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK)) + /* * the pte page can be thought of an array like this: pte_t[PTRS_PER_PTE] * @@ -372,6 +326,9 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) #define pte_unmap(pte) do { } while (0) #define pte_unmap_nested(pte) do { } while (0) +struct mm_struct; +extern pte_t *virt_to_pte(struct mm_struct *mm, unsigned long addr); + #define update_mmu_cache(vma,address,pte) do ; while (0) /* Encode and de-code a swap entry */ @@ -388,29 +345,4 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) #include <asm-generic/pgtable.h> -#include <asm-generic/pgtable-nopud.h> - -#ifdef CONFIG_HIGHMEM -/* Clear a kernel PTE and flush it from the TLB */ -#define kpte_clear_flush(ptep, vaddr) \ -do { \ - pte_clear(&init_mm, vaddr, ptep); \ - __flush_tlb_one(vaddr); \ -} while (0) #endif - -#endif -#endif - -#define virt_to_page(addr) __virt_to_page((const unsigned long) addr) - -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. 
- * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/include/asm-um/processor-generic.h b/include/asm-um/processor-generic.h index 78c0599..b7d9a16 100644 --- a/include/asm-um/processor-generic.h +++ b/include/asm-um/processor-generic.h @@ -11,6 +11,7 @@ struct pt_regs; struct task_struct; #include "asm/ptrace.h" +#include "asm/pgtable.h" #include "registers.h" #include "sysdep/archsetjmp.h" @@ -26,7 +27,6 @@ struct thread_struct { * as of 2.6.11). */ int forking; - int nsyscalls; struct pt_regs regs; int singlestep_syscall; void *fault_addr; @@ -58,7 +58,6 @@ struct thread_struct { #define INIT_THREAD \ { \ .forking = 0, \ - .nsyscalls = 0, \ .regs = EMPTY_REGS, \ .fault_addr = NULL, \ .prev_sched = NULL, \ @@ -68,10 +67,6 @@ struct thread_struct { .request = { 0 } \ } -typedef struct { - unsigned long seg; -} mm_segment_t; - extern struct task_struct *alloc_task_struct(void); static inline void release_thread(struct task_struct *task) @@ -97,9 +92,7 @@ static inline void mm_copy_segments(struct mm_struct *from_mm, /* * User space process size: 3GB (default). */ -extern unsigned long task_size; - -#define TASK_SIZE (task_size) +#define TASK_SIZE (CONFIG_TOP_ADDR & PGDIR_MASK) /* This decides where the kernel will search for a free chunk of vm * space during mmap's. 
@@ -128,6 +121,6 @@ extern struct cpuinfo_um cpu_data[]; #define KSTK_REG(tsk, reg) get_thread_reg(reg, &tsk->thread.switch_buf) -#define get_wchan(p) (0) +extern unsigned long get_wchan(struct task_struct *p); #endif diff --git a/include/asm-um/processor-i386.h b/include/asm-um/processor-i386.h index 595f1c3..a2b7fe1 100644 --- a/include/asm-um/processor-i386.h +++ b/include/asm-um/processor-i386.h @@ -10,7 +10,6 @@ #include "asm/host_ldt.h" #include "asm/segment.h" -extern int host_has_xmm; extern int host_has_cmov; /* include faultinfo structure */ diff --git a/include/asm-um/thread_info.h b/include/asm-um/thread_info.h index 6e5fd5c..356b83e 100644 --- a/include/asm-um/thread_info.h +++ b/include/asm-um/thread_info.h @@ -1,5 +1,5 @@ -/* - * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) +/* + * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) * Licensed under the GPL */ @@ -8,8 +8,9 @@ #ifndef __ASSEMBLY__ -#include <asm/processor.h> #include <asm/types.h> +#include <asm/page.h> +#include <asm/uaccess.h> struct thread_info { struct task_struct *task; /* main task structure */ @@ -75,8 +76,8 @@ static inline struct thread_info *current_thread_info(void) #define TIF_SYSCALL_TRACE 0 /* syscall trace active */ #define TIF_SIGPENDING 1 /* signal pending */ #define TIF_NEED_RESCHED 2 /* rescheduling necessary */ -#define TIF_POLLING_NRFLAG 3 /* true if poll_idle() is polling - * TIF_NEED_RESCHED +#define TIF_POLLING_NRFLAG 3 /* true if poll_idle() is polling + * TIF_NEED_RESCHED */ #define TIF_RESTART_BLOCK 4 #define TIF_MEMDIE 5 diff --git a/include/asm-um/tlb.h b/include/asm-um/tlb.h index c640033..39fc475 100644 --- a/include/asm-um/tlb.h +++ b/include/asm-um/tlb.h @@ -1,6 +1,126 @@ #ifndef __UM_TLB_H #define __UM_TLB_H -#include <asm/arch/tlb.h> +#include <linux/swap.h> +#include <asm/percpu.h> +#include <asm/pgalloc.h> +#include <asm/tlbflush.h> + +#define tlb_start_vma(tlb, vma) do { } while (0) +#define tlb_end_vma(tlb, vma) do { } while 
(0) +#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) + +/* struct mmu_gather is an opaque type used by the mm code for passing around + * any data needed by arch specific code for tlb_remove_page. + */ +struct mmu_gather { + struct mm_struct *mm; + unsigned int need_flush; /* Really unmapped some ptes? */ + unsigned long start; + unsigned long end; + unsigned int fullmm; /* non-zero means full mm flush */ +}; + +/* Users of the generic TLB shootdown code must declare this storage space. */ +DECLARE_PER_CPU(struct mmu_gather, mmu_gathers); + +static inline void __tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, + unsigned long address) +{ + if (tlb->start > address) + tlb->start = address; + if (tlb->end < address + PAGE_SIZE) + tlb->end = address + PAGE_SIZE; +} + +static inline void init_tlb_gather(struct mmu_gather *tlb) +{ + tlb->need_flush = 0; + + tlb->start = TASK_SIZE; + tlb->end = 0; + + if (tlb->fullmm) { + tlb->start = 0; + tlb->end = TASK_SIZE; + } +} + +/* tlb_gather_mmu + * Return a pointer to an initialized struct mmu_gather. + */ +static inline struct mmu_gather * +tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush) +{ + struct mmu_gather *tlb = &get_cpu_var(mmu_gathers); + + tlb->mm = mm; + tlb->fullmm = full_mm_flush; + + init_tlb_gather(tlb); + + return tlb; +} + +extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, + unsigned long end); + +static inline void +tlb_flush_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) +{ + if (!tlb->need_flush) + return; + + flush_tlb_mm_range(tlb->mm, tlb->start, tlb->end); + init_tlb_gather(tlb); +} + +/* tlb_finish_mmu + * Called at the end of the shootdown operation to free up any resources + * that were required. 
+ */ +static inline void +tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) +{ + tlb_flush_mmu(tlb, start, end); + + /* keep the page table cache within bounds */ + check_pgt_cache(); + + put_cpu_var(mmu_gathers); +} + +/* tlb_remove_page + * Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), + * while handling the additional races in SMP caused by other CPUs + * caching valid mappings in their TLBs. + */ +static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) +{ + tlb->need_flush = 1; + free_page_and_swap_cache(page); + return; +} + +/** + * tlb_remove_tlb_entry - remember a pte unmapping for later tlb invalidation. + * + * Record the fact that pte's were really unmapped in ->need_flush, so we can + * later optimise away the tlb invalidate. This helps when userspace is + * unmapping already-unmapped pages, which happens quite a lot. + */ +#define tlb_remove_tlb_entry(tlb, ptep, address) \ + do { \ + tlb->need_flush = 1; \ + __tlb_remove_tlb_entry(tlb, ptep, address); \ + } while (0) + +#define pte_free_tlb(tlb, ptep) __pte_free_tlb(tlb, ptep) + +#define pud_free_tlb(tlb, pudp) __pud_free_tlb(tlb, pudp) + +#define pmd_free_tlb(tlb, pmdp) __pmd_free_tlb(tlb, pmdp) + +#define tlb_migrate_finish(mm) do {} while (0) #endif diff --git a/include/asm-um/uaccess.h b/include/asm-um/uaccess.h index 077032d..b9a895d 100644 --- a/include/asm-um/uaccess.h +++ b/include/asm-um/uaccess.h @@ -6,7 +6,15 @@ #ifndef __UM_UACCESS_H #define __UM_UACCESS_H -#include "linux/sched.h" +#include <asm/errno.h> +#include <asm/processor.h> + +/* thread_info has a mm_segment_t in it, so put the definition up here */ +typedef struct { + unsigned long seg; +} mm_segment_t; + +#include "linux/thread_info.h" #define VERIFY_READ 0 #define VERIFY_WRITE 1 diff --git a/include/asm-x86/bitops_64.h b/include/asm-x86/bitops_64.h index 48adbf5..aaf1519 100644 --- a/include/asm-x86/bitops_64.h +++ b/include/asm-x86/bitops_64.h @@ -37,12
+37,6 @@ static inline long __scanbit(unsigned long val, unsigned long max) ((off)+(__scanbit(~(((*(unsigned long *)addr)) >> (off)),(size)-(off)))) : \ find_next_zero_bit(addr,size,off))) -/* - * Find string of zero bits in a bitmap. -1 when not found. - */ -extern unsigned long -find_next_zero_string(unsigned long *bitmap, long start, long nbits, int len); - static inline void set_bit_string(unsigned long *bitmap, unsigned long i, int len) { @@ -53,16 +47,6 @@ static inline void set_bit_string(unsigned long *bitmap, unsigned long i, } } -static inline void __clear_bit_string(unsigned long *bitmap, unsigned long i, - int len) -{ - unsigned long end = i + len; - while (i < end) { - __clear_bit(i, bitmap); - i++; - } -} - /** * ffz - find first zero in word. * @word: The word to search diff --git a/include/asm-x86/highmem.h b/include/asm-x86/highmem.h index c25cfca..479767c 100644 --- a/include/asm-x86/highmem.h +++ b/include/asm-x86/highmem.h @@ -38,11 +38,6 @@ extern pte_t *pkmap_page_table; * easily, subsequent pte tables have to be allocated in one physical * chunk of RAM. */ -#ifdef CONFIG_X86_PAE -#define LAST_PKMAP 512 -#else -#define LAST_PKMAP 1024 -#endif /* * Ordering is: * @@ -58,7 +53,6 @@ extern pte_t *pkmap_page_table; * VMALLOC_START * high_memory */ -#define PKMAP_BASE ( (FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK ) #define LAST_PKMAP_MASK (LAST_PKMAP-1) #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) diff --git a/include/asm-x86/pgalloc_32.h b/include/asm-x86/pgalloc_32.h index 6c21ef9..bab1271 100644 --- a/include/asm-x86/pgalloc_32.h +++ b/include/asm-x86/pgalloc_32.h @@ -36,17 +36,17 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *p * Allocate and free page tables. 
*/ extern pgd_t *pgd_alloc(struct mm_struct *); -extern void pgd_free(pgd_t *pgd); +extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long); extern struct page *pte_alloc_one(struct mm_struct *, unsigned long); -static inline void pte_free_kernel(pte_t *pte) +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { free_page((unsigned long)pte); } -static inline void pte_free(struct page *pte) +static inline void pte_free(struct mm_struct *mm, struct page *pte) { __free_page(pte); } @@ -63,7 +63,7 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) return (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); } -static inline void pmd_free(pmd_t *pmd) +static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) { BUG_ON((unsigned long)pmd & (PAGE_SIZE-1)); free_page((unsigned long)pmd); diff --git a/include/asm-x86/pgalloc_64.h b/include/asm-x86/pgalloc_64.h index 8bb5646..315314c 100644 --- a/include/asm-x86/pgalloc_64.h +++ b/include/asm-x86/pgalloc_64.h @@ -17,7 +17,7 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *p set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT))); } -static inline void pmd_free(pmd_t *pmd) +static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) { BUG_ON((unsigned long)pmd & (PAGE_SIZE-1)); free_page((unsigned long)pmd); @@ -33,7 +33,7 @@ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) return (pud_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); } -static inline void pud_free (pud_t *pud) +static inline void pud_free(struct mm_struct *mm, pud_t *pud) { BUG_ON((unsigned long)pud & (PAGE_SIZE-1)); free_page((unsigned long)pud); @@ -77,7 +77,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) return pgd; } -static inline void pgd_free(pgd_t *pgd) +static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { BUG_ON((unsigned long)pgd & 
(PAGE_SIZE-1)); pgd_list_del(pgd); @@ -100,13 +100,13 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long add /* Should really implement gc for free page table pages. This could be done with a reference count in struct page. */ -static inline void pte_free_kernel(pte_t *pte) +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { BUG_ON((unsigned long)pte & (PAGE_SIZE-1)); free_page((unsigned long)pte); } -static inline void pte_free(struct page *pte) +static inline void pte_free(struct mm_struct *mm, struct page *pte) { __free_page(pte); } diff --git a/include/asm-x86/pgtable_32.h b/include/asm-x86/pgtable_32.h index 935630d..80dd438 100644 --- a/include/asm-x86/pgtable_32.h +++ b/include/asm-x86/pgtable_32.h @@ -66,6 +66,14 @@ void paging_init(void); #define VMALLOC_OFFSET (8*1024*1024) #define VMALLOC_START (((unsigned long) high_memory + \ 2*VMALLOC_OFFSET-1) & ~(VMALLOC_OFFSET-1)) +#ifdef CONFIG_X86_PAE +#define LAST_PKMAP 512 +#else +#define LAST_PKMAP 1024 +#endif + +#define PKMAP_BASE ((FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK) + #ifdef CONFIG_HIGHMEM # define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE) #else diff --git a/include/asm-x86/unistd_32.h b/include/asm-x86/unistd_32.h index 8d8f9b5..984123a 100644 --- a/include/asm-x86/unistd_32.h +++ b/include/asm-x86/unistd_32.h @@ -327,9 +327,11 @@ #define __NR_epoll_pwait 319 #define __NR_utimensat 320 #define __NR_signalfd 321 -#define __NR_timerfd 322 +#define __NR_timerfd_create 322 #define __NR_eventfd 323 #define __NR_fallocate 324 +#define __NR_timerfd_settime 325 +#define __NR_timerfd_gettime 326 #ifdef __KERNEL__ diff --git a/include/asm-x86/unistd_64.h b/include/asm-x86/unistd_64.h index 5ff4d3e..3883ceb 100644 --- a/include/asm-x86/unistd_64.h +++ b/include/asm-x86/unistd_64.h @@ -629,12 +629,17 @@ __SYSCALL(__NR_utimensat, sys_utimensat) __SYSCALL(__NR_epoll_pwait, sys_epoll_pwait) #define __NR_signalfd 282 __SYSCALL(__NR_signalfd, 
sys_signalfd) -#define __NR_timerfd 283 -__SYSCALL(__NR_timerfd, sys_timerfd) +#define __NR_timerfd_create 283 +__SYSCALL(__NR_timerfd_create, sys_timerfd_create) #define __NR_eventfd 284 __SYSCALL(__NR_eventfd, sys_eventfd) #define __NR_fallocate 285 __SYSCALL(__NR_fallocate, sys_fallocate) +#define __NR_timerfd_settime 286 +__SYSCALL(__NR_timerfd_settime, sys_timerfd_settime) +#define __NR_timerfd_gettime 287 +__SYSCALL(__NR_timerfd_gettime, sys_timerfd_gettime) + #ifndef __NO_STUBS #define __ARCH_WANT_OLD_READDIR diff --git a/include/asm-xtensa/pgalloc.h b/include/asm-xtensa/pgalloc.h index 3e5b565..1d51ba5 100644 --- a/include/asm-xtensa/pgalloc.h +++ b/include/asm-xtensa/pgalloc.h @@ -31,7 +31,7 @@ pgd_alloc(struct mm_struct *mm) return (pgd_t*) __get_free_pages(GFP_KERNEL | __GFP_ZERO, PGD_ORDER); } -static inline void pgd_free(pgd_t *pgd) +static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { free_page((unsigned long)pgd); } @@ -52,12 +52,12 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm, return virt_to_page(pte_alloc_one_kernel(mm, addr)); } -static inline void pte_free_kernel(pte_t *pte) +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { kmem_cache_free(pgtable_cache, pte); } -static inline void pte_free(struct page *page) +static inline void pte_free(struct mm_struct *mm, struct page *page) { kmem_cache_free(pgtable_cache, page_address(page)); } diff --git a/include/asm-xtensa/tlb.h b/include/asm-xtensa/tlb.h index 4830232..31c220f 100644 --- a/include/asm-xtensa/tlb.h +++ b/include/asm-xtensa/tlb.h @@ -42,6 +42,6 @@ #include <asm-generic/tlb.h> -#define __pte_free_tlb(tlb,pte) pte_free(pte) +#define __pte_free_tlb(tlb, pte) pte_free((tlb)->mm, pte) #endif /* _XTENSA_TLB_H */ diff --git a/include/linux/capability.h b/include/linux/capability.h index bb017ed..7d50ff6 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -14,7 +14,6 @@ #define _LINUX_CAPABILITY_H #include <linux/types.h> 
-#include <linux/compiler.h> struct task_struct; @@ -23,13 +22,20 @@ struct task_struct; kernel might be somewhat backwards compatible, but don't bet on it. */ -/* XXX - Note, cap_t, is defined by POSIX to be an "opaque" pointer to +/* Note, cap_t, is defined by POSIX (draft) to be an "opaque" pointer to a set of three capability sets. The transposition of 3*the following structure to such a composite is better handled in a user library since the draft standard requires the use of malloc/free etc.. */ -#define _LINUX_CAPABILITY_VERSION 0x19980330 +#define _LINUX_CAPABILITY_VERSION_1 0x19980330 +#define _LINUX_CAPABILITY_U32S_1 1 + +#define _LINUX_CAPABILITY_VERSION_2 0x20071026 +#define _LINUX_CAPABILITY_U32S_2 2 + +#define _LINUX_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_2 +#define _LINUX_CAPABILITY_U32S _LINUX_CAPABILITY_U32S_2 typedef struct __user_cap_header_struct { __u32 version; @@ -42,41 +48,42 @@ typedef struct __user_cap_data_struct { __u32 inheritable; } __user *cap_user_data_t; + #define XATTR_CAPS_SUFFIX "capability" #define XATTR_NAME_CAPS XATTR_SECURITY_PREFIX XATTR_CAPS_SUFFIX -#define XATTR_CAPS_SZ (3*sizeof(__le32)) #define VFS_CAP_REVISION_MASK 0xFF000000 +#define VFS_CAP_FLAGS_MASK ~VFS_CAP_REVISION_MASK +#define VFS_CAP_FLAGS_EFFECTIVE 0x000001 + #define VFS_CAP_REVISION_1 0x01000000 +#define VFS_CAP_U32_1 1 +#define XATTR_CAPS_SZ_1 (sizeof(__le32)*(1 + 2*VFS_CAP_U32_1)) -#define VFS_CAP_REVISION VFS_CAP_REVISION_1 +#define VFS_CAP_REVISION_2 0x02000000 +#define VFS_CAP_U32_2 2 +#define XATTR_CAPS_SZ_2 (sizeof(__le32)*(1 + 2*VFS_CAP_U32_2)) + +#define XATTR_CAPS_SZ XATTR_CAPS_SZ_2 +#define VFS_CAP_U32 VFS_CAP_U32_2 +#define VFS_CAP_REVISION VFS_CAP_REVISION_2 -#define VFS_CAP_FLAGS_MASK ~VFS_CAP_REVISION_MASK -#define VFS_CAP_FLAGS_EFFECTIVE 0x000001 struct vfs_cap_data { - __u32 magic_etc; /* Little endian */ - __u32 permitted; /* Little endian */ - __u32 inheritable; /* Little endian */ + __le32 magic_etc; /* Little endian */ + struct { + 
__le32 permitted; /* Little endian */ + __le32 inheritable; /* Little endian */ + } data[VFS_CAP_U32]; }; #ifdef __KERNEL__ -/* #define STRICT_CAP_T_TYPECHECKS */ - -#ifdef STRICT_CAP_T_TYPECHECKS - typedef struct kernel_cap_struct { - __u32 cap; + __u32 cap[_LINUX_CAPABILITY_U32S]; } kernel_cap_t; -#else - -typedef __u32 kernel_cap_t; - -#endif - -#define _USER_CAP_HEADER_SIZE (2*sizeof(__u32)) +#define _USER_CAP_HEADER_SIZE (sizeof(struct __user_cap_header_struct)) #define _KERNEL_CAP_T_SIZE (sizeof(kernel_cap_t)) #endif @@ -119,10 +126,6 @@ typedef __u32 kernel_cap_t; #define CAP_FSETID 4 -/* Used to decide between falling back on the old suser() or fsuser(). */ - -#define CAP_FS_MASK 0x1f - /* Overrides the restriction that the real or effective user ID of a process sending a signal must match the real or effective user ID of the process receiving the signal. */ @@ -145,8 +148,14 @@ typedef __u32 kernel_cap_t; ** Linux-specific capabilities **/ -/* Transfer any capability in your permitted set to any pid, - remove any capability in your permitted set from any pid */ +/* Without VFS support for capabilities: + * Transfer any capability in your permitted set to any pid, + * remove any capability in your permitted set from any pid + * With VFS support for capabilities (neither of above, but) + * Add any capability from current's capability bounding set + * to the current process' inheritable set + * Allow taking bits out of capability bounding set + */ #define CAP_SETPCAP 8 @@ -195,7 +204,6 @@ typedef __u32 kernel_cap_t; #define CAP_IPC_OWNER 15 /* Insert and remove kernel modules - modify kernel without limit */ -/* Modify cap_bset */ #define CAP_SYS_MODULE 16 /* Allow ioperm/iopl access */ @@ -307,74 +315,183 @@ typedef __u32 kernel_cap_t; #define CAP_SETFCAP 31 +/* Override MAC access. + The base kernel enforces no MAC policy. 
+ An LSM may enforce a MAC policy, and if it does and it chooses + to implement capability based overrides of that policy, this is + the capability it should use to do so. */ + +#define CAP_MAC_OVERRIDE 32 + +/* Allow MAC configuration or state changes. + The base kernel requires no MAC configuration. + An LSM may enforce a MAC policy, and if it does and it chooses + to implement capability based checks on modifications to that + policy or the data required to maintain it, this is the + capability it should use to do so. */ + +#define CAP_MAC_ADMIN 33 + +#define CAP_LAST_CAP CAP_MAC_ADMIN + +#define cap_valid(x) ((x) >= 0 && (x) <= CAP_LAST_CAP) + +/* + * Bit location of each capability (used by user-space library and kernel) + */ + +#define CAP_TO_INDEX(x) ((x) >> 5) /* 1 << 5 == bits in __u32 */ +#define CAP_TO_MASK(x) (1 << ((x) & 31)) /* mask for indexed __u32 */ + #ifdef __KERNEL__ /* * Internal kernel functions only */ -#ifdef STRICT_CAP_T_TYPECHECKS +#define CAP_FOR_EACH_U32(__capi) \ + for (__capi = 0; __capi < _LINUX_CAPABILITY_U32S; ++__capi) + +# define CAP_FS_MASK_B0 (CAP_TO_MASK(CAP_CHOWN) \ + | CAP_TO_MASK(CAP_DAC_OVERRIDE) \ + | CAP_TO_MASK(CAP_DAC_READ_SEARCH) \ + | CAP_TO_MASK(CAP_FOWNER) \ + | CAP_TO_MASK(CAP_FSETID)) + +# define CAP_FS_MASK_B1 (CAP_TO_MASK(CAP_MAC_OVERRIDE)) + +#if _LINUX_CAPABILITY_U32S != 2 +# error Fix up hand-coded capability macro initializers +#else /* HAND-CODED capability initializers */ + +# define CAP_EMPTY_SET {{ 0, 0 }} +# define CAP_FULL_SET {{ ~0, ~0 }} +# define CAP_INIT_EFF_SET {{ ~CAP_TO_MASK(CAP_SETPCAP), ~0 }} +# define CAP_FS_SET {{ CAP_FS_MASK_B0, CAP_FS_MASK_B1 } } +# define CAP_NFSD_SET {{ CAP_FS_MASK_B0|CAP_TO_MASK(CAP_SYS_RESOURCE), \ + CAP_FS_MASK_B1 } } + +#endif /* _LINUX_CAPABILITY_U32S != 2 */ + +#define CAP_INIT_INH_SET CAP_EMPTY_SET + +# define cap_clear(c) do { (c) = __cap_empty_set; } while (0) +# define cap_set_full(c) do { (c) = __cap_full_set; } while (0) +# define cap_set_init_eff(c) do { (c) 
= __cap_init_eff_set; } while (0) + +#define cap_raise(c, flag) ((c).cap[CAP_TO_INDEX(flag)] |= CAP_TO_MASK(flag)) +#define cap_lower(c, flag) ((c).cap[CAP_TO_INDEX(flag)] &= ~CAP_TO_MASK(flag)) +#define cap_raised(c, flag) ((c).cap[CAP_TO_INDEX(flag)] & CAP_TO_MASK(flag)) + +#define CAP_BOP_ALL(c, a, b, OP) \ +do { \ + unsigned __capi; \ + CAP_FOR_EACH_U32(__capi) { \ + c.cap[__capi] = a.cap[__capi] OP b.cap[__capi]; \ + } \ +} while (0) + +#define CAP_UOP_ALL(c, a, OP) \ +do { \ + unsigned __capi; \ + CAP_FOR_EACH_U32(__capi) { \ + c.cap[__capi] = OP a.cap[__capi]; \ + } \ +} while (0) + +static inline kernel_cap_t cap_combine(const kernel_cap_t a, + const kernel_cap_t b) +{ + kernel_cap_t dest; + CAP_BOP_ALL(dest, a, b, |); + return dest; +} -#define to_cap_t(x) { x } -#define cap_t(x) (x).cap +static inline kernel_cap_t cap_intersect(const kernel_cap_t a, + const kernel_cap_t b) +{ + kernel_cap_t dest; + CAP_BOP_ALL(dest, a, b, &); + return dest; +} -#else +static inline kernel_cap_t cap_drop(const kernel_cap_t a, + const kernel_cap_t drop) +{ + kernel_cap_t dest; + CAP_BOP_ALL(dest, a, drop, &~); + return dest; +} -#define to_cap_t(x) (x) -#define cap_t(x) (x) +static inline kernel_cap_t cap_invert(const kernel_cap_t c) +{ + kernel_cap_t dest; + CAP_UOP_ALL(dest, c, ~); + return dest; +} -#endif +static inline int cap_isclear(const kernel_cap_t a) +{ + unsigned __capi; + CAP_FOR_EACH_U32(__capi) { + if (a.cap[__capi] != 0) + return 0; + } + return 1; +} -#define CAP_EMPTY_SET to_cap_t(0) -#define CAP_FULL_SET to_cap_t(~0) -#define CAP_INIT_EFF_SET to_cap_t(~0 & ~CAP_TO_MASK(CAP_SETPCAP)) -#define CAP_INIT_INH_SET to_cap_t(0) +static inline int cap_issubset(const kernel_cap_t a, const kernel_cap_t set) +{ + kernel_cap_t dest; + dest = cap_drop(a, set); + return cap_isclear(dest); +} -#define CAP_TO_MASK(x) (1 << (x)) -#define cap_raise(c, flag) (cap_t(c) |= CAP_TO_MASK(flag)) -#define cap_lower(c, flag) (cap_t(c) &= ~CAP_TO_MASK(flag)) -#define cap_raised(c, 
flag) (cap_t(c) & CAP_TO_MASK(flag)) +/* Used to decide between falling back on the old suser() or fsuser(). */ -static inline kernel_cap_t cap_combine(kernel_cap_t a, kernel_cap_t b) +static inline int cap_is_fs_cap(int cap) { - kernel_cap_t dest; - cap_t(dest) = cap_t(a) | cap_t(b); - return dest; + const kernel_cap_t __cap_fs_set = CAP_FS_SET; + return !!(CAP_TO_MASK(cap) & __cap_fs_set.cap[CAP_TO_INDEX(cap)]); } -static inline kernel_cap_t cap_intersect(kernel_cap_t a, kernel_cap_t b) +static inline kernel_cap_t cap_drop_fs_set(const kernel_cap_t a) { - kernel_cap_t dest; - cap_t(dest) = cap_t(a) & cap_t(b); - return dest; + const kernel_cap_t __cap_fs_set = CAP_FS_SET; + return cap_drop(a, __cap_fs_set); } -static inline kernel_cap_t cap_drop(kernel_cap_t a, kernel_cap_t drop) +static inline kernel_cap_t cap_raise_fs_set(const kernel_cap_t a, + const kernel_cap_t permitted) { - kernel_cap_t dest; - cap_t(dest) = cap_t(a) & ~cap_t(drop); - return dest; + const kernel_cap_t __cap_fs_set = CAP_FS_SET; + return cap_combine(a, + cap_intersect(permitted, __cap_fs_set)); } -static inline kernel_cap_t cap_invert(kernel_cap_t c) +static inline kernel_cap_t cap_drop_nfsd_set(const kernel_cap_t a) { - kernel_cap_t dest; - cap_t(dest) = ~cap_t(c); - return dest; + const kernel_cap_t __cap_fs_set = CAP_NFSD_SET; + return cap_drop(a, __cap_fs_set); } -#define cap_isclear(c) (!cap_t(c)) -#define cap_issubset(a,set) (!(cap_t(a) & ~cap_t(set))) - -#define cap_clear(c) do { cap_t(c) = 0; } while(0) -#define cap_set_full(c) do { cap_t(c) = ~0; } while(0) -#define cap_mask(c,mask) do { cap_t(c) &= cap_t(mask); } while(0) +static inline kernel_cap_t cap_raise_nfsd_set(const kernel_cap_t a, + const kernel_cap_t permitted) +{ + const kernel_cap_t __cap_nfsd_set = CAP_NFSD_SET; + return cap_combine(a, + cap_intersect(permitted, __cap_nfsd_set)); +} -#define cap_is_fs_cap(c) (CAP_TO_MASK(c) & CAP_FS_MASK) +extern const kernel_cap_t __cap_empty_set; +extern const kernel_cap_t 
__cap_full_set; +extern const kernel_cap_t __cap_init_eff_set; int capable(int cap); int __capable(struct task_struct *t, int cap); +extern long cap_prctl_drop(unsigned long cap); + #endif /* __KERNEL__ */ #endif /* !_LINUX_CAPABILITY_H */ diff --git a/include/linux/compat.h b/include/linux/compat.h index d38655f..ae0a483 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -279,8 +279,11 @@ asmlinkage long compat_sys_utimensat(unsigned int dfd, char __user *filename, asmlinkage long compat_sys_signalfd(int ufd, const compat_sigset_t __user *sigmask, compat_size_t sigsetsize); -asmlinkage long compat_sys_timerfd(int ufd, int clockid, int flags, - const struct compat_itimerspec __user *utmr); +asmlinkage long compat_sys_timerfd_settime(int ufd, int flags, + const struct compat_itimerspec __user *utmr, + struct compat_itimerspec __user *otmr); +asmlinkage long compat_sys_timerfd_gettime(int ufd, + struct compat_itimerspec __user *otmr); #endif /* CONFIG_COMPAT */ #endif /* _LINUX_COMPAT_H */ diff --git a/include/linux/device.h b/include/linux/device.h index 479c0b3..2258d89 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -410,6 +410,15 @@ extern int devres_release_group(struct device *dev, void *id); extern void *devm_kzalloc(struct device *dev, size_t size, gfp_t gfp); extern void devm_kfree(struct device *dev, void *p); +struct device_dma_parameters { + /* + * a low level driver may set these to teach IOMMU code about + * sg limitations. + */ + unsigned int max_segment_size; + unsigned long segment_boundary_mask; +}; + struct device { struct klist klist_children; struct klist_node knode_parent; /* node in sibling list */ @@ -445,6 +454,8 @@ struct device { 64 bit addresses for consistent allocations such descriptors. 
*/ + struct device_dma_parameters *dma_parms; + struct list_head dma_pools; /* dma pools (if dma'ble) */ struct dma_coherent_mem *dma_mem; /* internal for coherent mem diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 4470950..3320307 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -60,6 +60,36 @@ static inline int is_device_dma_capable(struct device *dev) extern u64 dma_get_required_mask(struct device *dev); +static inline unsigned int dma_get_max_seg_size(struct device *dev) +{ + return dev->dma_parms ? dev->dma_parms->max_segment_size : 65536; +} + +static inline unsigned int dma_set_max_seg_size(struct device *dev, + unsigned int size) +{ + if (dev->dma_parms) { + dev->dma_parms->max_segment_size = size; + return 0; + } else + return -EIO; +} + +static inline unsigned long dma_get_seg_boundary(struct device *dev) +{ + return dev->dma_parms ? + dev->dma_parms->segment_boundary_mask : 0xffffffff; +} + +static inline int dma_set_seg_boundary(struct device *dev, unsigned long mask) +{ + if (dev->dma_parms) { + dev->dma_parms->segment_boundary_mask = mask; + return 0; + } else + return -EIO; +} + /* flags for the coherent memory api */ #define DMA_MEMORY_MAP 0x01 #define DMA_MEMORY_IO 0x02 diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 7e93a9a..0c6ce51 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -228,5 +228,7 @@ extern void FASTCALL(free_cold_page(struct page *page)); void page_alloc_init(void); void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp); +void drain_all_pages(void); +void drain_local_pages(void *dummy); #endif /* __LINUX_GFP_H */ diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 1fcb003..7dcbc82 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -68,8 +68,6 @@ static inline void clear_user_highpage(struct page *page, unsigned long vaddr) void *addr = kmap_atomic(page, KM_USER0); clear_user_page(addr, vaddr, page); 
kunmap_atomic(addr, KM_USER0); - /* Make sure this page is cleared on other CPU's too before using it */ - smp_wmb(); } #ifndef __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE @@ -124,28 +122,40 @@ static inline void clear_highpage(struct page *page) kunmap_atomic(kaddr, KM_USER0); } -/* - * Same but also flushes aliased cache contents to RAM. - * - * This must be a macro because KM_USER0 and friends aren't defined if - * !CONFIG_HIGHMEM - */ -#define zero_user_page(page, offset, size, km_type) \ - do { \ - void *kaddr; \ - \ - BUG_ON((offset) + (size) > PAGE_SIZE); \ - \ - kaddr = kmap_atomic(page, km_type); \ - memset((char *)kaddr + (offset), 0, (size)); \ - flush_dcache_page(page); \ - kunmap_atomic(kaddr, (km_type)); \ - } while (0) +static inline void zero_user_segments(struct page *page, + unsigned start1, unsigned end1, + unsigned start2, unsigned end2) +{ + void *kaddr = kmap_atomic(page, KM_USER0); + + BUG_ON(end1 > PAGE_SIZE || end2 > PAGE_SIZE); + + if (end1 > start1) + memset(kaddr + start1, 0, end1 - start1); + + if (end2 > start2) + memset(kaddr + start2, 0, end2 - start2); + + kunmap_atomic(kaddr, KM_USER0); + flush_dcache_page(page); +} + +static inline void zero_user_segment(struct page *page, + unsigned start, unsigned end) +{ + zero_user_segments(page, start, end, 0, 0); +} + +static inline void zero_user(struct page *page, + unsigned start, unsigned size) +{ + zero_user_segments(page, start, start + size, 0, 0); +} static inline void __deprecated memclear_highpage_flush(struct page *page, unsigned int offset, unsigned int size) { - zero_user_page(page, offset, size, KM_USER0); + zero_user(page, offset, size); } #ifndef __HAVE_ARCH_COPY_USER_HIGHPAGE @@ -160,8 +170,6 @@ static inline void copy_user_highpage(struct page *to, struct page *from, copy_user_page(vto, vfrom, vaddr, to); kunmap_atomic(vfrom, KM_USER0); kunmap_atomic(vto, KM_USER1); - /* Make sure this page is cleared on other CPU's too before using it */ - smp_wmb(); } #endif diff --git 
a/include/linux/hrtimer.h b/include/linux/hrtimer.h index f79dcba..8371b66 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -301,9 +301,16 @@ static inline int hrtimer_is_queued(struct hrtimer *timer) } /* Forward a hrtimer so it expires after now: */ -extern unsigned long +extern u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval); +/* Forward a hrtimer so it expires after the hrtimer's current now */ +static inline u64 hrtimer_forward_now(struct hrtimer *timer, + ktime_t interval) +{ + return hrtimer_forward(timer, timer->base->get_time(), interval); +} + /* Precise sleep: */ extern long hrtimer_nanosleep(struct timespec *rqtp, struct timespec *rmtp, @@ -322,9 +329,9 @@ extern void hrtimer_run_pending(void); extern void __init hrtimers_init(void); #if BITS_PER_LONG < 64 -extern unsigned long ktime_divns(const ktime_t kt, s64 div); +extern u64 ktime_divns(const ktime_t kt, s64 div); #else /* BITS_PER_LONG < 64 */ -# define ktime_divns(kt, div) (unsigned long)((kt).tv64 / (div)) +# define ktime_divns(kt, div) (u64)((kt).tv64 / (div)) #endif /* Show pending timers: */ diff --git a/include/linux/hw_random.h b/include/linux/hw_random.h index 85d1191..4213182 100644 --- a/include/linux/hw_random.h +++ b/include/linux/hw_random.h @@ -44,7 +44,15 @@ struct hwrng { /** Register a new Hardware Random Number Generator driver. */ extern int hwrng_register(struct hwrng *rng); /** Unregister a Hardware Random Number Generator driver. 
*/ -extern void hwrng_unregister(struct hwrng *rng); +extern void __hwrng_unregister(struct hwrng *rng, bool suspended); +static inline void hwrng_unregister(struct hwrng *rng) +{ + __hwrng_unregister(rng, false); +} +static inline void hwrng_unregister_suspended(struct hwrng *rng) +{ + __hwrng_unregister(rng, true); +} #endif /* __KERNEL__ */ #endif /* LINUX_HWRANDOM_H_ */ diff --git a/include/linux/i2c/pca9539.h b/include/linux/i2c/pca9539.h new file mode 100644 index 0000000..611d84a --- /dev/null +++ b/include/linux/i2c/pca9539.h @@ -0,0 +1,18 @@ +/* platform data for the PCA9539 16-bit I/O expander driver */ + +struct pca9539_platform_data { + /* number of the first GPIO */ + unsigned gpio_base; + + /* initial polarity inversion setting */ + uint16_t invert; + + void *context; /* param to setup/teardown */ + + int (*setup)(struct i2c_client *client, + unsigned gpio, unsigned ngpio, + void *context); + int (*teardown)(struct i2c_client *client, + unsigned gpio, unsigned ngpio, + void *context); +}; diff --git a/include/linux/i2c/pcf857x.h b/include/linux/i2c/pcf857x.h new file mode 100644 index 0000000..ba8ea6e --- /dev/null +++ b/include/linux/i2c/pcf857x.h @@ -0,0 +1,45 @@ +#ifndef __LINUX_PCF857X_H +#define __LINUX_PCF857X_H + +/** + * struct pcf857x_platform_data - data to set up pcf857x driver + * @gpio_base: number of the chip's first GPIO + * @n_latch: optional bit-inverse of initial register value; if + * you leave this initialized to zero the driver will act + * like the chip was just reset + * @setup: optional callback issued once the GPIOs are valid + * @teardown: optional callback issued before the GPIOs are invalidated + * @context: optional parameter passed to setup() and teardown() + * + * In addition to the I2C_BOARD_INFO() state appropriate to each chip, + * the i2c_board_info used with the pcf875x driver must provide the + * chip "type" ("pcf8574", "pcf8574a", "pcf8575", "pcf8575c") and its + * platform_data (pointer to one of these 
structures) with at least + * the gpio_base value initialized. + * + * The @setup callback may be used with the kind of board-specific glue + * which hands the (now-valid) GPIOs to other drivers, or which puts + * devices in their initial states using these GPIOs. + * + * These GPIO chips are only "quasi-bidirectional"; read the chip specs + * to understand the behavior. They don't have separate registers to + * record which pins are used for input or output, record which output + * values are driven, or provide access to input values. That must be + * inferred by reading the chip's value and knowing the last value written + * to it. If you leave n_latch initialized to zero, that last written + * value is presumed to be all ones (as if the chip were just reset). + */ +struct pcf857x_platform_data { + unsigned gpio_base; + unsigned n_latch; + + int (*setup)(struct i2c_client *client, + int gpio, unsigned ngpio, + void *context); + int (*teardown)(struct i2c_client *client, + int gpio, unsigned ngpio, + void *context); + void *context; +}; + +#endif /* __LINUX_PCF857X_H */ diff --git a/include/linux/init_task.h b/include/linux/init_task.h index f42663e..1f74e1d 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -121,6 +121,18 @@ extern struct group_info init_groups; #else #define INIT_IDS #endif + +#ifdef CONFIG_SECURITY_FILE_CAPABILITIES +/* + * Because of the reduced scope of CAP_SETPCAP when filesystem + * capabilities are in effect, it is safe to allow CAP_SETPCAP to + * be available in the default configuration. + */ +# define CAP_INIT_BSET CAP_FULL_SET +#else +# define CAP_INIT_BSET CAP_INIT_EFF_SET +#endif + /* * INIT_TASK is used to set up the first task table, touch at * your own risk!. 
Base=0, limit=0x1fffff (=2MB) @@ -156,6 +168,7 @@ extern struct group_info init_groups; .cap_effective = CAP_INIT_EFF_SET, \ .cap_inheritable = CAP_INIT_INH_SET, \ .cap_permitted = CAP_FULL_SET, \ + .cap_bset = CAP_INIT_BSET, \ .keep_capabilities = 0, \ .user = INIT_USER, \ .comm = "swapper", \ diff --git a/include/linux/iommu-helper.h b/include/linux/iommu-helper.h new file mode 100644 index 0000000..4dd4c04 --- /dev/null +++ b/include/linux/iommu-helper.h @@ -0,0 +1,7 @@ +extern unsigned long iommu_area_alloc(unsigned long *map, unsigned long size, + unsigned long start, unsigned int nr, + unsigned long shift, + unsigned long boundary_size, + unsigned long align_mask); +extern void iommu_area_free(unsigned long *map, unsigned long start, + unsigned int nr); diff --git a/include/linux/latency.h b/include/linux/latency.h deleted file mode 100644 index c08b52b..0000000 --- a/include/linux/latency.h +++ /dev/null @@ -1,25 +0,0 @@ -/* - * latency.h: Explicit system-wide latency-expectation infrastructure - * - * (C) Copyright 2006 Intel Corporation - * Author: Arjan van de Ven <arjan@linux.intel.com> - * - */ - -#ifndef _INCLUDE_GUARD_LATENCY_H_ -#define _INCLUDE_GUARD_LATENCY_H_ - -#include <linux/notifier.h> - -void set_acceptable_latency(char *identifier, int usecs); -void modify_acceptable_latency(char *identifier, int usecs); -void remove_acceptable_latency(char *identifier); -void synchronize_acceptable_latency(void); -int system_latency_constraint(void); - -int register_latency_notifier(struct notifier_block * nb); -int unregister_latency_notifier(struct notifier_block * nb); - -#define INFINITE_LATENCY 1000000 - -#endif diff --git a/include/linux/leds.h b/include/linux/leds.h index b4130ff..00f89fd 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -54,7 +54,15 @@ struct led_classdev { extern int led_classdev_register(struct device *parent, struct led_classdev *led_cdev); -extern void led_classdev_unregister(struct led_classdev *led_cdev); +extern 
void __led_classdev_unregister(struct led_classdev *led_cdev, bool sus); +static inline void led_classdev_unregister(struct led_classdev *lcd) +{ + __led_classdev_unregister(lcd, false); +} +static inline void led_classdev_unregister_suspended(struct led_classdev *lcd) +{ + __led_classdev_unregister(lcd, true); +} extern void led_classdev_suspend(struct led_classdev *led_cdev); extern void led_classdev_resume(struct led_classdev *led_cdev); diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h index dff9ea3..24b30b9 100644 --- a/include/linux/miscdevice.h +++ b/include/linux/miscdevice.h @@ -43,7 +43,15 @@ struct miscdevice { }; extern int misc_register(struct miscdevice * misc); -extern int misc_deregister(struct miscdevice * misc); +extern int __misc_deregister(struct miscdevice *misc, bool suspended); +static inline int misc_deregister(struct miscdevice *misc) +{ + return __misc_deregister(misc, false); +} +static inline int misc_deregister_suspended(struct miscdevice *misc) +{ + return __misc_deregister(misc, true); +} #define MODULE_ALIAS_MISCDEV(minor) \ MODULE_ALIAS("char-major-" __stringify(MISC_MAJOR) \ diff --git a/include/linux/mm.h b/include/linux/mm.h index 1bba678..89d7c69 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -227,10 +227,22 @@ static inline int put_page_testzero(struct page *page) */ static inline int get_page_unless_zero(struct page *page) { - VM_BUG_ON(PageCompound(page)); + VM_BUG_ON(PageTail(page)); return atomic_inc_not_zero(&page->_count); } +/* Support for virtually mapped pages */ +struct page *vmalloc_to_page(const void *addr); +unsigned long vmalloc_to_pfn(const void *addr); + +/* Determine if an address is within the vmalloc range */ +static inline int is_vmalloc_addr(const void *x) +{ + unsigned long addr = (unsigned long)x; + + return addr >= VMALLOC_START && addr < VMALLOC_END; +} + static inline struct page *compound_head(struct page *page) { if (unlikely(PageTail(page))) @@ -706,6 +718,28 @@ 
unsigned long unmap_vmas(struct mmu_gather **tlb, struct vm_area_struct *start_vma, unsigned long start_addr, unsigned long end_addr, unsigned long *nr_accounted, struct zap_details *); + +/** + * mm_walk - callbacks for walk_page_range + * @pgd_entry: if set, called for each non-empty PGD (top-level) entry + * @pud_entry: if set, called for each non-empty PUD (2nd-level) entry + * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry + * @pte_entry: if set, called for each non-empty PTE (4th-level) entry + * @pte_hole: if set, called for each hole at all levels + * + * (see walk_page_range for more details) + */ +struct mm_walk { + int (*pgd_entry)(pgd_t *, unsigned long, unsigned long, void *); + int (*pud_entry)(pud_t *, unsigned long, unsigned long, void *); + int (*pmd_entry)(pmd_t *, unsigned long, unsigned long, void *); + int (*pte_entry)(pte_t *, unsigned long, unsigned long, void *); + int (*pte_hole)(unsigned long, unsigned long, void *); +}; + +int walk_page_range(const struct mm_struct *, unsigned long addr, + unsigned long end, const struct mm_walk *walk, + void *private); void free_pgd_range(struct mmu_gather **tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *start_vma, @@ -1089,8 +1123,6 @@ static inline unsigned long vma_pages(struct vm_area_struct *vma) pgprot_t vm_get_page_prot(unsigned long vm_flags); struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); -struct page *vmalloc_to_page(void *addr); -unsigned long vmalloc_to_pfn(void *addr); int remap_pfn_range(struct vm_area_struct *, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t); int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 4c4522a..8d8d197 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -113,7 +113,7 @@ 
struct per_cpu_pages { }; struct per_cpu_pageset { - struct per_cpu_pages pcp[2]; /* 0: hot. 1: cold */ + struct per_cpu_pages pcp; #ifdef CONFIG_NUMA s8 expire; #endif diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index e9fddb4..139d49d 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -343,7 +343,8 @@ struct sdio_device_id { __u8 class; /* Standard interface or SDIO_ANY_ID */ __u16 vendor; /* Vendor or SDIO_ANY_ID */ __u16 device; /* Device ID or SDIO_ANY_ID */ - kernel_ulong_t driver_data; /* Data private to the driver */ + kernel_ulong_t driver_data /* Data private to the driver */ + __attribute__((aligned(sizeof(kernel_ulong_t)))); }; /* SSB core, see drivers/ssb/ */ diff --git a/include/linux/nubus.h b/include/linux/nubus.h index cdb3e9b..c435507 100644 --- a/include/linux/nubus.h +++ b/include/linux/nubus.h @@ -132,10 +132,12 @@ enum nubus_drhw { NUBUS_DRHW_RDIUS_DCGX = 0x027C, /* Radius DirectColor/GX */ NUBUS_DRHW_RDIUS_PC8 = 0x0291, /* Radius PrecisionColor 8 */ NUBUS_DRHW_LAPIS_PCS8 = 0x0292, /* Lapis ProColorServer 8 */ - NUBUS_DRHW_RASTER_24LXI = 0x02A0, /* RasterOps 8/24 XLi */ + NUBUS_DRHW_RASTER_24XLI = 0x02A0, /* RasterOps 8/24 XLi */ NUBUS_DRHW_RASTER_PBPGT = 0x02A5, /* RasterOps PaintBoard Prism GT */ NUBUS_DRHW_EMACH_FSX = 0x02AE, /* E-Machines Futura SX */ + NUBUS_DRHW_RASTER_24XLTV = 0x02B7, /* RasterOps 24XLTV */ NUBUS_DRHW_SMAC_THUND24 = 0x02CB, /* SuperMac Thunder/24 */ + NUBUS_DRHW_SMAC_THUNDLGHT = 0x03D9, /* SuperMac ThunderLight */ NUBUS_DRHW_RDIUS_PC24XP = 0x0406, /* Radius PrecisionColor 24Xp */ NUBUS_DRHW_RDIUS_PC24X = 0x040A, /* Radius PrecisionColor 24X */ NUBUS_DRHW_RDIUS_PC8XJ = 0x040B, /* Radius PrecisionColor 8XJ */ diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 209d3a4..bbad43f 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -131,16 +131,52 @@ #define ClearPageReferenced(page) clear_bit(PG_referenced, 
&(page)->flags) #define TestClearPageReferenced(page) test_and_clear_bit(PG_referenced, &(page)->flags) -#define PageUptodate(page) test_bit(PG_uptodate, &(page)->flags) +static inline int PageUptodate(struct page *page) +{ + int ret = test_bit(PG_uptodate, &(page)->flags); + + /* + * Must ensure that the data we read out of the page is loaded + * _after_ we've loaded page->flags to check for PageUptodate. + * We can skip the barrier if the page is not uptodate, because + * we wouldn't be reading anything from it. + * + * See SetPageUptodate() for the other side of the story. + */ + if (ret) + smp_rmb(); + + return ret; +} + +static inline void __SetPageUptodate(struct page *page) +{ + smp_wmb(); + __set_bit(PG_uptodate, &(page)->flags); #ifdef CONFIG_S390 + page_clear_dirty(page); +#endif +} + static inline void SetPageUptodate(struct page *page) { +#ifdef CONFIG_S390 if (!test_and_set_bit(PG_uptodate, &page->flags)) page_clear_dirty(page); -} #else -#define SetPageUptodate(page) set_bit(PG_uptodate, &(page)->flags) + /* + * Memory barrier must be issued before setting the PG_uptodate bit, + * so that all previous stores issued in order to bring the page + * uptodate are actually visible before PageUptodate becomes true. + * + * s390 doesn't need an explicit smp_wmb here because the test and + * set bit already provides full barriers. + */ + smp_wmb(); + set_bit(PG_uptodate, &(page)->flags); #endif +} + #define ClearPageUptodate(page) clear_bit(PG_uptodate, &(page)->flags) #define PageDirty(page) test_bit(PG_dirty, &(page)->flags) diff --git a/include/linux/pci.h b/include/linux/pci.h index cee75c0..7215d3b 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -159,6 +159,8 @@ struct pci_dev { this if your device has broken DMA or supports 64-bit transfers. */ + struct device_dma_parameters dma_parms; + pci_power_t current_state; /* Current operating state. In ACPI-speak, this is D0-D3, D0 being fully functional, and D3 being off. 
*/ @@ -580,6 +582,8 @@ void pci_intx(struct pci_dev *dev, int enable); void pci_msi_off(struct pci_dev *dev); int pci_set_dma_mask(struct pci_dev *dev, u64 mask); int pci_set_consistent_dma_mask(struct pci_dev *dev, u64 mask); +int pci_set_dma_max_seg_size(struct pci_dev *dev, unsigned int size); +int pci_set_dma_seg_boundary(struct pci_dev *dev, unsigned long mask); int pcix_get_max_mmrbc(struct pci_dev *dev); int pcix_get_mmrbc(struct pci_dev *dev); int pcix_set_mmrbc(struct pci_dev *dev, int mmrbc); @@ -822,6 +826,18 @@ static inline int pci_set_dma_mask(struct pci_dev *dev, u64 mask) return -EIO; } +static inline int pci_set_dma_max_seg_size(struct pci_dev *dev, + unsigned int size) +{ + return -EIO; +} + +static inline int pci_set_dma_seg_boundary(struct pci_dev *dev, + unsigned long mask) +{ + return -EIO; +} + static inline int pci_assign_resource(struct pci_dev *dev, int i) { return -EBUSY; diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 41f6f28..39d3283 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2043,6 +2043,23 @@ #define PCI_VENDOR_ID_QUICKNET 0x15e2 #define PCI_DEVICE_ID_QUICKNET_XJ 0x0500 +/* + * ADDI-DATA GmbH communication cards <info@addi-data.com> + */ +#define PCI_VENDOR_ID_ADDIDATA_OLD 0x10E8 +#define PCI_VENDOR_ID_ADDIDATA 0x15B8 +#define PCI_DEVICE_ID_ADDIDATA_APCI7500 0x7000 +#define PCI_DEVICE_ID_ADDIDATA_APCI7420 0x7001 +#define PCI_DEVICE_ID_ADDIDATA_APCI7300 0x7002 +#define PCI_DEVICE_ID_ADDIDATA_APCI7800 0x818E +#define PCI_DEVICE_ID_ADDIDATA_APCI7500_2 0x7009 +#define PCI_DEVICE_ID_ADDIDATA_APCI7420_2 0x700A +#define PCI_DEVICE_ID_ADDIDATA_APCI7300_2 0x700B +#define PCI_DEVICE_ID_ADDIDATA_APCI7500_3 0x700C +#define PCI_DEVICE_ID_ADDIDATA_APCI7420_3 0x700D +#define PCI_DEVICE_ID_ADDIDATA_APCI7300_3 0x700E +#define PCI_DEVICE_ID_ADDIDATA_APCI7800_3 0x700F + #define PCI_VENDOR_ID_PDC 0x15e9 #define PCI_VENDOR_ID_FARSITE 0x1619 diff --git a/include/linux/pm_qos_params.h 
b/include/linux/pm_qos_params.h new file mode 100644 index 0000000..2e4e97b --- /dev/null +++ b/include/linux/pm_qos_params.h @@ -0,0 +1,25 @@ +/* interface for the pm_qos_power infrastructure of the linux kernel. + * + * Mark Gross + */ +#include <linux/list.h> +#include <linux/notifier.h> +#include <linux/miscdevice.h> + +#define PM_QOS_RESERVED 0 +#define PM_QOS_CPU_DMA_LATENCY 1 +#define PM_QOS_NETWORK_LATENCY 2 +#define PM_QOS_NETWORK_THROUGHPUT 3 + +#define PM_QOS_NUM_CLASSES 4 +#define PM_QOS_DEFAULT_VALUE -1 + +int pm_qos_add_requirement(int qos, char *name, s32 value); +int pm_qos_update_requirement(int qos, char *name, s32 new_value); +void pm_qos_remove_requirement(int qos, char *name); + +int pm_qos_requirement(int qos); + +int pm_qos_add_notifier(int qos, struct notifier_block *notifier); +int pm_qos_remove_notifier(int qos, struct notifier_block *notifier); + diff --git a/include/linux/prctl.h b/include/linux/prctl.h index e2eff90..3800639 100644 --- a/include/linux/prctl.h +++ b/include/linux/prctl.h @@ -63,4 +63,8 @@ #define PR_GET_SECCOMP 21 #define PR_SET_SECCOMP 22 +/* Get/set the capability bounding set */ +#define PR_CAPBSET_READ 23 +#define PR_CAPBSET_DROP 24 + #endif /* _LINUX_PRCTL_H */ diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 8f92546..e435515 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -19,6 +19,8 @@ struct completion; */ #define FIRST_PROCESS_ENTRY 256 +/* Worst case buffer size needed for holding an integer. 
*/ +#define PROC_NUMBUF 13 /* * We always define these enumerators @@ -117,7 +119,6 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir); unsigned long task_vsize(struct mm_struct *); int task_statm(struct mm_struct *, int *, int *, int *, int *); char *task_mem(struct mm_struct *, char *); -void clear_refs_smap(struct mm_struct *mm); struct proc_dir_entry *de_get(struct proc_dir_entry *de); void de_put(struct proc_dir_entry *de); diff --git a/include/linux/reboot.h b/include/linux/reboot.h index 85ea63f..b93b541 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -59,8 +59,6 @@ extern void machine_crash_shutdown(struct pt_regs *); * Architecture independent implemenations of sys_reboot commands. */ -extern void kernel_shutdown_prepare(enum system_states state); - extern void kernel_restart(char *cmd); extern void kernel_halt(void); extern void kernel_power_off(void); diff --git a/include/linux/sched.h b/include/linux/sched.h index af6947e..9c13be3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -555,6 +555,13 @@ struct signal_struct { #define SIGNAL_STOP_CONTINUED 0x00000004 /* SIGCONT since WCONTINUED reap */ #define SIGNAL_GROUP_EXIT 0x00000008 /* group exit in progress */ +/* If true, all threads except ->group_exit_task have pending SIGKILL */ +static inline int signal_group_exit(const struct signal_struct *sig) +{ + return (sig->flags & SIGNAL_GROUP_EXIT) || + (sig->group_exit_task != NULL); +} + /* * Some day this will be a full-fledged user tracking system.. 
*/ @@ -1091,7 +1098,7 @@ struct task_struct { uid_t uid,euid,suid,fsuid; gid_t gid,egid,sgid,fsgid; struct group_info *group_info; - kernel_cap_t cap_effective, cap_inheritable, cap_permitted; + kernel_cap_t cap_effective, cap_inheritable, cap_permitted, cap_bset; unsigned keep_capabilities:1; struct user_struct *user; #ifdef CONFIG_KEYS @@ -1770,7 +1777,7 @@ extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned lon struct task_struct *fork_idle(int); extern void set_task_comm(struct task_struct *tsk, char *from); -extern void get_task_comm(char *to, struct task_struct *tsk); +extern char *get_task_comm(char *to, struct task_struct *tsk); #ifdef CONFIG_SMP extern void wait_task_inactive(struct task_struct * p); @@ -2080,6 +2087,10 @@ static inline void migration_init(void) } #endif +#ifndef TASK_SIZE_OF +#define TASK_SIZE_OF(tsk) TASK_SIZE +#endif + #endif /* __KERNEL__ */ #endif diff --git a/include/linux/security.h b/include/linux/security.h index d249742..fe52cde 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -40,11 +40,6 @@ #define ROOTCONTEXT_MNT 0x04 #define DEFCONTEXT_MNT 0x08 -/* - * Bounding set - */ -extern kernel_cap_t cap_bset; - extern unsigned securebits; struct ctl_table; @@ -423,15 +418,12 @@ struct request_sock; * identified by @name for @dentry. * Return 0 if permission is granted. * @inode_getsecurity: - * Copy the extended attribute representation of the security label - * associated with @name for @inode into @buffer. @buffer may be - * NULL to request the size of the buffer required. @size indicates - * the size of @buffer in bytes. Note that @name is the remainder - * of the attribute name after the security. prefix has been removed. - * @err is the return value from the preceding fs getxattr call, - * and can be used by the security module to determine whether it - * should try and canonicalize the attribute value. - * Return number of bytes used/required on success. 
+ * Retrieve a copy of the extended attribute representation of the + * security label associated with @name for @inode via @buffer. Note that + * @name is the remainder of the attribute name after the security prefix + * has been removed. @alloc is used to specify if the call should return a + * value via the buffer or just the value length. Return size of buffer on + * success. * @inode_setsecurity: * Set the security label associated with @name for @inode from the * extended attribute value @value. @size indicates the size of the @@ -1304,7 +1296,7 @@ struct security_operations { int (*inode_removexattr) (struct dentry *dentry, char *name); int (*inode_need_killpriv) (struct dentry *dentry); int (*inode_killpriv) (struct dentry *dentry); - int (*inode_getsecurity)(const struct inode *inode, const char *name, void *buffer, size_t size, int err); + int (*inode_getsecurity)(const struct inode *inode, const char *name, void **buffer, bool alloc); int (*inode_setsecurity)(struct inode *inode, const char *name, const void *value, size_t size, int flags); int (*inode_listsecurity)(struct inode *inode, char *buffer, size_t buffer_size); @@ -1565,7 +1557,7 @@ int security_inode_listxattr(struct dentry *dentry); int security_inode_removexattr(struct dentry *dentry, char *name); int security_inode_need_killpriv(struct dentry *dentry); int security_inode_killpriv(struct dentry *dentry); -int security_inode_getsecurity(const struct inode *inode, const char *name, void *buffer, size_t size, int err); +int security_inode_getsecurity(const struct inode *inode, const char *name, void **buffer, bool alloc); int security_inode_setsecurity(struct inode *inode, const char *name, const void *value, size_t size, int flags); int security_inode_listsecurity(struct inode *inode, char *buffer, size_t buffer_size); int security_file_permission(struct file *file, int mask); @@ -1967,7 +1959,7 @@ static inline int security_inode_killpriv(struct dentry *dentry) return
cap_inode_killpriv(dentry); } -static inline int security_inode_getsecurity(const struct inode *inode, const char *name, void *buffer, size_t size, int err) +static inline int security_inode_getsecurity(const struct inode *inode, const char *name, void **buffer, bool alloc) { return -EOPNOTSUPP; } diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h index afe0f6d..00b65c0 100644 --- a/include/linux/serial_8250.h +++ b/include/linux/serial_8250.h @@ -23,6 +23,7 @@ struct plat_serial8250_port { resource_size_t mapbase; /* resource base */ unsigned int irq; /* interrupt number */ unsigned int uartclk; /* UART clock rate */ + void *private_data; unsigned char regshift; /* register shift */ unsigned char iotype; /* UPIO_* */ unsigned char hub6; diff --git a/include/linux/spi/mcp23s08.h b/include/linux/spi/mcp23s08.h new file mode 100644 index 0000000..835ddf4 --- /dev/null +++ b/include/linux/spi/mcp23s08.h @@ -0,0 +1,24 @@ + +/* FIXME driver should be able to handle all four slaves that + * can be hooked up to each chipselect, as well as IRQs... 
+ */ + +struct mcp23s08_platform_data { + /* four slaves can share one SPI chipselect */ + u8 slave; + + /* number assigned to the first GPIO */ + unsigned base; + + /* pins with pullups */ + u8 pullups; + + void *context; /* param to setup/teardown */ + + int (*setup)(struct spi_device *spi, + int gpio, unsigned ngpio, + void *context); + int (*teardown)(struct spi_device *spi, + int gpio, unsigned ngpio, + void *context); +}; diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 646ce2d..1d7d4c5 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -130,7 +130,6 @@ struct pbe { }; /* mm/page_alloc.c */ -extern void drain_local_pages(void); extern void mark_free_pages(struct zone *zone); /** diff --git a/include/linux/swap.h b/include/linux/swap.h index 4f3838a..353153e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -158,9 +158,6 @@ struct swap_list_t { /* Swap 50% full? Release swapcache more aggressively.. */ #define vm_swap_full() (nr_swap_pages*2 < total_swap_pages) -/* linux/mm/memory.c */ -extern void swapin_readahead(swp_entry_t, unsigned long, struct vm_area_struct *); - /* linux/mm/page_alloc.c */ extern unsigned long totalram_pages; extern unsigned long totalreserve_pages; @@ -223,16 +220,17 @@ extern struct address_space swapper_space; #define total_swapcache_pages swapper_space.nrpages extern void show_swap_cache_info(void); extern int add_to_swap(struct page *, gfp_t); +extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t); extern void __delete_from_swap_cache(struct page *); extern void delete_from_swap_cache(struct page *); -extern int move_to_swap_cache(struct page *, swp_entry_t); -extern int move_from_swap_cache(struct page *, unsigned long, - struct address_space *); extern void free_page_and_swap_cache(struct page *); extern void free_pages_and_swap_cache(struct page **, int); -extern struct page * lookup_swap_cache(swp_entry_t); -extern struct page * read_swap_cache_async(swp_entry_t, 
struct vm_area_struct *vma, - unsigned long addr); +extern struct page *lookup_swap_cache(swp_entry_t); +extern struct page *read_swap_cache_async(swp_entry_t, gfp_t, + struct vm_area_struct *vma, unsigned long addr); +extern struct page *swapin_readahead(swp_entry_t, gfp_t, + struct vm_area_struct *vma, unsigned long addr); + /* linux/mm/swapfile.c */ extern long total_swap_pages; extern unsigned int nr_swapfiles; @@ -306,7 +304,7 @@ static inline void swap_free(swp_entry_t swp) { } -static inline struct page *read_swap_cache_async(swp_entry_t swp, +static inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr) { return NULL; @@ -317,22 +315,12 @@ static inline struct page *lookup_swap_cache(swp_entry_t swp) return NULL; } -static inline int valid_swaphandles(swp_entry_t entry, unsigned long *offset) -{ - return 0; -} - #define can_share_swap_page(p) (page_mapcount(p) == 1) -static inline int move_to_swap_cache(struct page *page, swp_entry_t entry) -{ - return 1; -} - -static inline int move_from_swap_cache(struct page *page, unsigned long index, - struct address_space *mapping) +static inline int add_to_swap_cache(struct page *page, swp_entry_t entry, + gfp_t gfp_mask) { - return 1; + return -1; } static inline void __delete_from_swap_cache(struct page *page) diff --git a/include/linux/swapops.h b/include/linux/swapops.h index ceb6cc5..7bf2d14 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -42,6 +42,12 @@ static inline pgoff_t swp_offset(swp_entry_t entry) return entry.val & SWP_OFFSET_MASK(entry); } +/* check whether a pte points to a swap entry */ +static inline int is_swap_pte(pte_t pte) +{ + return !pte_none(pte) && !pte_present(pte) && !pte_file(pte); +} + /* * Convert the arch-dependent pte representation of a swp_entry_t into an * arch-independent swp_entry_t. 
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 61def7c8..4c2577b 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -607,8 +607,11 @@ asmlinkage long sys_set_robust_list(struct robust_list_head __user *head, size_t len); asmlinkage long sys_getcpu(unsigned __user *cpu, unsigned __user *node, struct getcpu_cache __user *cache); asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemask); -asmlinkage long sys_timerfd(int ufd, int clockid, int flags, - const struct itimerspec __user *utmr); +asmlinkage long sys_timerfd_create(int clockid, int flags); +asmlinkage long sys_timerfd_settime(int ufd, int flags, + const struct itimerspec __user *utmr, + struct itimerspec __user *otmr); +asmlinkage long sys_timerfd_gettime(int ufd, struct itimerspec __user *otmr); asmlinkage long sys_eventfd(unsigned int count); asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index bf4ae4e..571f01d 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -102,7 +102,6 @@ enum KERN_NODENAME=7, KERN_DOMAINNAME=8, - KERN_CAP_BSET=14, /* int: capability bounding set */ KERN_PANIC=15, /* int: panic timeout */ KERN_REALROOTDEV=16, /* real root device to mount after initrd */ @@ -965,8 +964,6 @@ extern int proc_dostring(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); extern int proc_dointvec(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); -extern int proc_dointvec_bset(struct ctl_table *, int, struct file *, - void __user *, size_t *, loff_t *); extern int proc_dointvec_minmax(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); extern int proc_dointvec_jiffies(struct ctl_table *, int, struct file *, diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 89338b4..ce8e7da 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ 
-45,11 +45,11 @@ extern void *vmalloc_32_user(unsigned long size); extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot); extern void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot); -extern void vfree(void *addr); +extern void vfree(const void *addr); extern void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot); -extern void vunmap(void *addr); +extern void vunmap(const void *addr); extern int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, unsigned long pgoff); @@ -71,7 +71,7 @@ extern struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, extern struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, int node, gfp_t gfp_mask); -extern struct vm_struct *remove_vm_area(void *addr); +extern struct vm_struct *remove_vm_area(const void *addr); extern int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages); diff --git a/include/linux/wait.h b/include/linux/wait.h index 1f4fb0a..33a2aa9 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -162,6 +162,22 @@ wait_queue_head_t *FASTCALL(bit_waitqueue(void *, int)); #define wake_up_interruptible_all(x) __wake_up(x, TASK_INTERRUPTIBLE, 0, NULL) #define wake_up_interruptible_sync(x) __wake_up_sync((x), TASK_INTERRUPTIBLE, 1) +#ifdef CONFIG_DEBUG_LOCK_ALLOC +/* + * macro to avoid include hell + */ +#define wake_up_nested(x, s) \ +do { \ + unsigned long flags; \ + \ + spin_lock_irqsave_nested(&(x)->lock, flags, (s)); \ + wake_up_locked(x); \ + spin_unlock_irqrestore(&(x)->lock, flags); \ +} while (0) +#else +#define wake_up_nested(x, s) wake_up(x) +#endif + #define __wait_event(wq, condition) \ do { \ DEFINE_WAIT(__wait); \ diff --git a/include/linux/writeback.h b/include/linux/writeback.h index c6148bb..b7b3362 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -62,6 +62,7 @@ struct writeback_control { unsigned for_reclaim:1; /* Invoked from the 
page allocator */ unsigned for_writepages:1; /* This is a writepages() call */ unsigned range_cyclic:1; /* range_start is cyclic */ + unsigned more_io:1; /* more io to be dispatched */ }; /* @@ -100,6 +101,7 @@ extern int dirty_background_ratio; extern int vm_dirty_ratio; extern int dirty_writeback_interval; extern int dirty_expire_interval; +extern int vm_highmem_is_dirtyable; extern int block_dump; extern int laptop_mode; diff --git a/include/linux/xattr.h b/include/linux/xattr.h index def131a..df6b95d 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -46,6 +46,7 @@ struct xattr_handler { size_t size, int flags); }; +ssize_t xattr_getsecurity(struct inode *, const char *, void *, size_t); ssize_t vfs_getxattr(struct dentry *, char *, void *, size_t); ssize_t vfs_listxattr(struct dentry *d, char *list, size_t size); int vfs_setxattr(struct dentry *, char *, void *, size_t, int); diff --git a/include/net/netlabel.h b/include/net/netlabel.h index b3213c7..0ca67d7 100644 --- a/include/net/netlabel.h +++ b/include/net/netlabel.h @@ -36,6 +36,8 @@ #include <net/netlink.h> #include <asm/atomic.h> +struct cipso_v4_doi; + /* * NetLabel - A management interface for maintaining network packet label * mapping tables for explicit packet labling protocols. 
@@ -103,12 +105,6 @@ struct netlbl_audit { uid_t loginuid; }; -/* Domain mapping definition struct */ -struct netlbl_dom_map; - -/* Domain mapping operations */ -int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info); - /* * LSM security attributes */ @@ -344,6 +340,19 @@ static inline void netlbl_secattr_free(struct netlbl_lsm_secattr *secattr) #ifdef CONFIG_NETLABEL /* + * LSM configuration operations + */ +int netlbl_cfg_map_del(const char *domain, struct netlbl_audit *audit_info); +int netlbl_cfg_unlbl_add_map(const char *domain, + struct netlbl_audit *audit_info); +int netlbl_cfg_cipsov4_add(struct cipso_v4_doi *doi_def, + struct netlbl_audit *audit_info); +int netlbl_cfg_cipsov4_add_map(struct cipso_v4_doi *doi_def, + const char *domain, + struct netlbl_audit *audit_info); +int netlbl_cfg_cipsov4_del(u32 doi, struct netlbl_audit *audit_info); + +/* * LSM security attribute operations */ int netlbl_secattr_catmap_walk(struct netlbl_lsm_secattr_catmap *catmap, @@ -378,6 +387,32 @@ void netlbl_cache_invalidate(void); int netlbl_cache_add(const struct sk_buff *skb, const struct netlbl_lsm_secattr *secattr); #else +static inline int netlbl_cfg_map_del(const char *domain, + struct netlbl_audit *audit_info) +{ + return -ENOSYS; +} +static inline int netlbl_cfg_unlbl_add_map(const char *domain, + struct netlbl_audit *audit_info) +{ + return -ENOSYS; +} +static inline int netlbl_cfg_cipsov4_add(struct cipso_v4_doi *doi_def, + struct netlbl_audit *audit_info) +{ + return -ENOSYS; +} +static inline int netlbl_cfg_cipsov4_add_map(struct cipso_v4_doi *doi_def, + const char *domain, + struct netlbl_audit *audit_info) +{ + return -ENOSYS; +} +static inline int netlbl_cfg_cipsov4_del(u32 doi, + struct netlbl_audit *audit_info) +{ + return -ENOSYS; +} static inline int netlbl_secattr_catmap_walk( struct netlbl_lsm_secattr_catmap *catmap, u32 offset) diff --git a/include/pcmcia/cs.h b/include/pcmcia/cs.h index d5838c3..87a260e 100644 --- 
a/include/pcmcia/cs.h +++ b/include/pcmcia/cs.h @@ -147,11 +147,11 @@ typedef struct config_req_t { /* For RequestIO and ReleaseIO */ typedef struct io_req_t { - ioaddr_t BasePort1; - ioaddr_t NumPorts1; + u_int BasePort1; + u_int NumPorts1; u_int Attributes1; - ioaddr_t BasePort2; - ioaddr_t NumPorts2; + u_int BasePort2; + u_int NumPorts2; u_int Attributes2; u_int IOAddrLines; } io_req_t; diff --git a/include/pcmcia/cs_types.h b/include/pcmcia/cs_types.h index 5f38803..9a6bcc4 100644 --- a/include/pcmcia/cs_types.h +++ b/include/pcmcia/cs_types.h @@ -27,7 +27,6 @@ typedef u_int ioaddr_t; #else typedef u_short ioaddr_t; #endif -typedef unsigned long kio_addr_t; typedef u_short socket_t; typedef u_int event_t; diff --git a/include/pcmcia/ss.h b/include/pcmcia/ss.h index 6e84258..f95dca0 100644 --- a/include/pcmcia/ss.h +++ b/include/pcmcia/ss.h @@ -92,7 +92,7 @@ typedef struct pccard_io_map { u_char map; u_char flags; u_short speed; - kio_addr_t start, stop; + u_int start, stop; } pccard_io_map; typedef struct pccard_mem_map { @@ -155,7 +155,7 @@ extern struct pccard_resource_ops pccard_iodyn_ops; struct pcmcia_socket; typedef struct io_window_t { - kio_addr_t InUse, Config; + u_int InUse, Config; struct resource *res; } io_window_t; @@ -208,7 +208,7 @@ struct pcmcia_socket { u_int features; u_int irq_mask; u_int map_size; - kio_addr_t io_offset; + u_int io_offset; u_char pci_irq; struct pci_dev * cb_dev; diff --git a/init/Kconfig b/init/Kconfig index b2acdeb..87f50df 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -582,7 +582,6 @@ config SIGNALFD config TIMERFD bool "Enable timerfd() system call" if EMBEDDED select ANON_INODES - depends on BROKEN default y help Enable the timerfd() system call that allows to receive timer @@ -657,11 +656,9 @@ config SLOB depends on EMBEDDED bool "SLOB (Simple Allocator)" help - SLOB replaces the SLAB allocator with a drastically simpler - allocator. 
SLOB is more space efficient than SLAB but does not - scale well (single lock for all operations) and is also highly - susceptible to fragmentation. SLUB can accomplish a higher object - density. It is usually better to use SLUB instead of SLOB. + SLOB replaces the stock allocator with a drastically simpler + allocator. SLOB is generally more space efficient but + does not perform as well on large systems. endchoice @@ -679,6 +676,16 @@ config MARKERS source "arch/Kconfig" +config PROC_PAGE_MONITOR + default y + depends on PROC_FS && MMU + bool "Enable /proc page monitoring" if EMBEDDED + help + Various /proc files exist to monitor process memory utilization: + /proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap, + /proc/kpagecount, and /proc/kpageflags. Disabling these + interfaces will reduce the size of the kernel by approximately 4kb. + endmenu # General setup config SLABINFO diff --git a/kernel/Makefile b/kernel/Makefile index db9af70..135a1b9 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -8,8 +8,8 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ - hrtimer.o rwsem.o latency.o nsproxy.o srcu.o \ - utsname.o notifier.o ksysfs.o + hrtimer.o rwsem.o nsproxy.o srcu.o \ + utsname.o notifier.o ksysfs.o pm_qos_params.o obj-$(CONFIG_SYSCTL) += sysctl_check.o obj-$(CONFIG_STACKTRACE) += stacktrace.o diff --git a/kernel/capability.c b/kernel/capability.c index efbd9cd..39e8193 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -22,6 +22,37 @@ static DEFINE_SPINLOCK(task_capability_lock); /* + * Leveraged for setting/resetting capabilities + */ + +const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET; +const kernel_cap_t __cap_full_set = CAP_FULL_SET; +const kernel_cap_t __cap_init_eff_set = CAP_INIT_EFF_SET; + +EXPORT_SYMBOL(__cap_empty_set); +EXPORT_SYMBOL(__cap_full_set); 
+EXPORT_SYMBOL(__cap_init_eff_set); + +/* + * More recent versions of libcap are available from: + * + * http://www.kernel.org/pub/linux/libs/security/linux-privs/ + */ + +static void warn_legacy_capability_use(void) +{ + static int warned; + if (!warned) { + char name[sizeof(current->comm)]; + + printk(KERN_INFO "warning: `%s' uses 32-bit capabilities" + " (legacy support in use)\n", + get_task_comm(name, current)); + warned = 1; + } +} + +/* * For sys_getproccap() and sys_setproccap(), any of the three * capability set pointers may be NULL -- indicating that that set is * uninteresting and/or not to be changed. @@ -42,12 +73,21 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr) pid_t pid; __u32 version; struct task_struct *target; - struct __user_cap_data_struct data; + unsigned tocopy; + kernel_cap_t pE, pI, pP; if (get_user(version, &header->version)) return -EFAULT; - if (version != _LINUX_CAPABILITY_VERSION) { + switch (version) { + case _LINUX_CAPABILITY_VERSION_1: + warn_legacy_capability_use(); + tocopy = _LINUX_CAPABILITY_U32S_1; + break; + case _LINUX_CAPABILITY_VERSION_2: + tocopy = _LINUX_CAPABILITY_U32S_2; + break; + default: if (put_user(_LINUX_CAPABILITY_VERSION, &header->version)) return -EFAULT; return -EINVAL; @@ -71,14 +111,47 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr) } else target = current; - ret = security_capget(target, &data.effective, &data.inheritable, &data.permitted); + ret = security_capget(target, &pE, &pI, &pP); out: read_unlock(&tasklist_lock); spin_unlock(&task_capability_lock); - if (!ret && copy_to_user(dataptr, &data, sizeof data)) - return -EFAULT; + if (!ret) { + struct __user_cap_data_struct kdata[_LINUX_CAPABILITY_U32S]; + unsigned i; + + for (i = 0; i < tocopy; i++) { + kdata[i].effective = pE.cap[i]; + kdata[i].permitted = pP.cap[i]; + kdata[i].inheritable = pI.cap[i]; + } + + /* + * Note, in the case, tocopy < _LINUX_CAPABILITY_U32S, + * we silently drop 
the upper capabilities here. This + * has the effect of making older libcap + * implementations implicitly drop upper capability + * bits when they perform a: capget/modify/capset + * sequence. + * + * This behavior is considered fail-safe + * behavior. Upgrading the application to a newer + * version of libcap will enable access to the newer + * capabilities. + * + * An alternative would be to return an error here + * (-ERANGE), but that causes legacy applications to + * unexpectedly fail; the capget/modify/capset aborts + * before modification is attempted and the application + * fails. + */ + + if (copy_to_user(dataptr, kdata, tocopy + * sizeof(struct __user_cap_data_struct))) { + return -EFAULT; + } + } return ret; } @@ -167,6 +240,8 @@ static inline int cap_set_all(kernel_cap_t *effective, */ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) { + struct __user_cap_data_struct kdata[_LINUX_CAPABILITY_U32S]; + unsigned i, tocopy; kernel_cap_t inheritable, permitted, effective; __u32 version; struct task_struct *target; @@ -176,7 +251,15 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) if (get_user(version, &header->version)) return -EFAULT; - if (version != _LINUX_CAPABILITY_VERSION) { + switch (version) { + case _LINUX_CAPABILITY_VERSION_1: + warn_legacy_capability_use(); + tocopy = _LINUX_CAPABILITY_U32S_1; + break; + case _LINUX_CAPABILITY_VERSION_2: + tocopy = _LINUX_CAPABILITY_U32S_2; + break; + default: if (put_user(_LINUX_CAPABILITY_VERSION, &header->version)) return -EFAULT; return -EINVAL; @@ -188,10 +271,22 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) if (pid && pid != task_pid_vnr(current) && !capable(CAP_SETPCAP)) return -EPERM; - if (copy_from_user(&effective, &data->effective, sizeof(effective)) || - copy_from_user(&inheritable, &data->inheritable, sizeof(inheritable)) || - copy_from_user(&permitted, &data->permitted, sizeof(permitted))) + if
(copy_from_user(&kdata, data, tocopy + * sizeof(struct __user_cap_data_struct))) { return -EFAULT; + } + + for (i = 0; i < tocopy; i++) { + effective.cap[i] = kdata[i].effective; + permitted.cap[i] = kdata[i].permitted; + inheritable.cap[i] = kdata[i].inheritable; + } + while (i < _LINUX_CAPABILITY_U32S) { + effective.cap[i] = 0; + permitted.cap[i] = 0; + inheritable.cap[i] = 0; + i++; + } spin_lock(&task_capability_lock); read_lock(&tasklist_lock); diff --git a/kernel/exit.c b/kernel/exit.c index 9e459fe..9d3d0f0 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1083,11 +1083,12 @@ do_group_exit(int exit_code) struct signal_struct *const sig = current->signal; struct sighand_struct *const sighand = current->sighand; spin_lock_irq(&sighand->siglock); - if (sig->flags & SIGNAL_GROUP_EXIT) + if (signal_group_exit(sig)) /* Another thread got here before we took the lock. */ exit_code = sig->group_exit_code; else { sig->group_exit_code = exit_code; + sig->flags = SIGNAL_GROUP_EXIT; zap_other_threads(current); } spin_unlock_irq(&sighand->siglock); diff --git a/kernel/fork.c b/kernel/fork.c index 05e0b6f..2b55b74 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -325,7 +325,7 @@ static inline int mm_alloc_pgd(struct mm_struct * mm) static inline void mm_free_pgd(struct mm_struct * mm) { - pgd_free(mm->pgd); + pgd_free(mm, mm->pgd); } #else #define dup_mmap(mm, oldmm) (0) @@ -1118,6 +1118,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, #ifdef CONFIG_SECURITY p->security = NULL; #endif + p->cap_bset = current->cap_bset; p->io_context = NULL; p->audit_context = NULL; cgroup_fork(p); @@ -1450,6 +1451,23 @@ long do_fork(unsigned long clone_flags, int trace = 0; long nr; + /* + * We hope to recycle these flags after 2.6.26 + */ + if (unlikely(clone_flags & CLONE_STOPPED)) { + static int __read_mostly count = 100; + + if (count > 0 && printk_ratelimit()) { + char comm[TASK_COMM_LEN]; + + count--; + printk(KERN_INFO "fork(): process `%s' used deprecated 
" + "clone flags 0x%lx\n", + get_task_comm(comm, current), + clone_flags & CLONE_STOPPED); + } + } + if (unlikely(current->ptrace)) { trace = fork_traceflag (clone_flags); if (trace) diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 1069998..668f396 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -306,7 +306,7 @@ EXPORT_SYMBOL_GPL(ktime_sub_ns); /* * Divide a ktime value by a nanosecond value */ -unsigned long ktime_divns(const ktime_t kt, s64 div) +u64 ktime_divns(const ktime_t kt, s64 div) { u64 dclc, inc, dns; int sft = 0; @@ -321,7 +321,7 @@ unsigned long ktime_divns(const ktime_t kt, s64 div) dclc >>= sft; do_div(dclc, (unsigned long) div); - return (unsigned long) dclc; + return dclc; } #endif /* BITS_PER_LONG >= 64 */ @@ -656,10 +656,9 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) * Forward the timer expiry so it will expire in the future. * Returns the number of overruns. */ -unsigned long -hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) +u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) { - unsigned long orun = 1; + u64 orun = 1; ktime_t delta; delta = ktime_sub(now, timer->expires); diff --git a/kernel/latency.c b/kernel/latency.c deleted file mode 100644 index e63fcac..0000000 --- a/kernel/latency.c +++ /dev/null @@ -1,280 +0,0 @@ -/* - * latency.c: Explicit system-wide latency-expectation infrastructure - * - * The purpose of this infrastructure is to allow device drivers to set - * latency constraint they have and to collect and summarize these - * expectations globally. The cummulated result can then be used by - * power management and similar users to make decisions that have - * tradoffs with a latency component. - * - * An example user of this are the x86 C-states; each higher C state saves - * more power, but has a higher exit latency. 
For the idle loop power - * code to make a good decision which C-state to use, information about - * acceptable latencies is required. - * - * An example announcer of latency is an audio driver that knowns it - * will get an interrupt when the hardware has 200 usec of samples - * left in the DMA buffer; in that case the driver can set a latency - * constraint of, say, 150 usec. - * - * Multiple drivers can each announce their maximum accepted latency, - * to keep these appart, a string based identifier is used. - * - * - * (C) Copyright 2006 Intel Corporation - * Author: Arjan van de Ven <arjan@linux.intel.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. - */ - -#include <linux/latency.h> -#include <linux/list.h> -#include <linux/spinlock.h> -#include <linux/slab.h> -#include <linux/module.h> -#include <linux/notifier.h> -#include <linux/jiffies.h> -#include <asm/atomic.h> - -struct latency_info { - struct list_head list; - int usecs; - char *identifier; -}; - -/* - * locking rule: all modifications to current_max_latency and - * latency_list need to be done while holding the latency_lock. - * latency_lock needs to be taken _irqsave. - */ -static atomic_t current_max_latency; -static DEFINE_SPINLOCK(latency_lock); - -static LIST_HEAD(latency_list); -static BLOCKING_NOTIFIER_HEAD(latency_notifier); - -/* - * This function returns the maximum latency allowed, which - * happens to be the minimum of all maximum latencies on the - * list. 
- */ -static int __find_max_latency(void) -{ - int min = INFINITE_LATENCY; - struct latency_info *info; - - list_for_each_entry(info, &latency_list, list) { - if (info->usecs < min) - min = info->usecs; - } - return min; -} - -/** - * set_acceptable_latency - sets the maximum latency acceptable - * @identifier: string that identifies this driver - * @usecs: maximum acceptable latency for this driver - * - * This function informs the kernel that this device(driver) - * can accept at most usecs latency. This setting is used for - * power management and similar tradeoffs. - * - * This function sleeps and can only be called from process - * context. - * Calling this function with an existing identifier is valid - * and will cause the existing latency setting to be changed. - */ -void set_acceptable_latency(char *identifier, int usecs) -{ - struct latency_info *info, *iter; - unsigned long flags; - int found_old = 0; - - info = kzalloc(sizeof(struct latency_info), GFP_KERNEL); - if (!info) - return; - info->usecs = usecs; - info->identifier = kstrdup(identifier, GFP_KERNEL); - if (!info->identifier) - goto free_info; - - spin_lock_irqsave(&latency_lock, flags); - list_for_each_entry(iter, &latency_list, list) { - if (strcmp(iter->identifier, identifier)==0) { - found_old = 1; - iter->usecs = usecs; - break; - } - } - if (!found_old) - list_add(&info->list, &latency_list); - - if (usecs < atomic_read(&current_max_latency)) - atomic_set(&current_max_latency, usecs); - - spin_unlock_irqrestore(&latency_lock, flags); - - blocking_notifier_call_chain(&latency_notifier, - atomic_read(&current_max_latency), NULL); - - /* - * if we inserted the new one, we're done; otherwise there was - * an existing one so we need to free the redundant data - */ - if (!found_old) - return; - - kfree(info->identifier); -free_info: - kfree(info); -} -EXPORT_SYMBOL_GPL(set_acceptable_latency); - -/** - * modify_acceptable_latency - changes the maximum latency acceptable - * @identifier: string that identifies this
driver - * @usecs: maximum acceptable latency for this driver - * - * This function informs the kernel that this device(driver) - * can accept at most usecs latency. This setting is used for - * power management and similar tradeoffs. - * - * This function does not sleep and can be called in any context. - * Trying to use a non-existing identifier silently gets ignored. - * - * Due to the atomic nature of this function, the modified latency - * value will only be used for future decisions; past decisions - * can still lead to longer latencies in the near future. - */ -void modify_acceptable_latency(char *identifier, int usecs) -{ - struct latency_info *iter; - unsigned long flags; - - spin_lock_irqsave(&latency_lock, flags); - list_for_each_entry(iter, &latency_list, list) { - if (strcmp(iter->identifier, identifier) == 0) { - iter->usecs = usecs; - break; - } - } - if (usecs < atomic_read(¤t_max_latency)) - atomic_set(¤t_max_latency, usecs); - spin_unlock_irqrestore(&latency_lock, flags); -} -EXPORT_SYMBOL_GPL(modify_acceptable_latency); - -/** - * remove_acceptable_latency - removes the maximum latency acceptable - * @identifier: string that identifies this driver - * - * This function removes a previously set maximum latency setting - * for the driver and frees up any resources associated with the - * bookkeeping needed for this. - * - * This function does not sleep and can be called in any context. - * Trying to use a non-existing identifier silently gets ignored. 
- */ -void remove_acceptable_latency(char *identifier) -{ - unsigned long flags; - int newmax = 0; - struct latency_info *iter, *temp; - - spin_lock_irqsave(&latency_lock, flags); - - list_for_each_entry_safe(iter, temp, &latency_list, list) { - if (strcmp(iter->identifier, identifier) == 0) { - list_del(&iter->list); - newmax = iter->usecs; - kfree(iter->identifier); - kfree(iter); - break; - } - } - - /* If we just deleted the system wide value, we need to - * recalculate with a full search - */ - if (newmax == atomic_read(¤t_max_latency)) { - newmax = __find_max_latency(); - atomic_set(¤t_max_latency, newmax); - } - spin_unlock_irqrestore(&latency_lock, flags); -} -EXPORT_SYMBOL_GPL(remove_acceptable_latency); - -/** - * system_latency_constraint - queries the system wide latency maximum - * - * This function returns the system wide maximum latency in - * microseconds. - * - * This function does not sleep and can be called in any context. - */ -int system_latency_constraint(void) -{ - return atomic_read(¤t_max_latency); -} -EXPORT_SYMBOL_GPL(system_latency_constraint); - -/** - * synchronize_acceptable_latency - recalculates all latency decisions - * - * This function will cause a callback to various kernel pieces that - * will make those pieces rethink their latency decisions. This implies - * that if there are overlong latencies in hardware state already, those - * latencies get taken right now. When this call completes no overlong - * latency decisions should be active anymore. - * - * Typical usecase of this is after a modify_acceptable_latency() call, - * which in itself is non-blocking and non-synchronizing. - * - * This function blocks and should not be called with locks held. 
- */ - -void synchronize_acceptable_latency(void) -{ - blocking_notifier_call_chain(&latency_notifier, - atomic_read(¤t_max_latency), NULL); -} -EXPORT_SYMBOL_GPL(synchronize_acceptable_latency); - -/* - * Latency notifier: this notifier gets called when a non-atomic new - * latency value gets set. The expectation nof the caller of the - * non-atomic set is that when the call returns, future latencies - * are within bounds, so the functions on the notifier list are - * expected to take the overlong latencies immediately, inside the - * callback, and not make a overlong latency decision anymore. - * - * The callback gets called when the new latency value is made - * active so system_latency_constraint() returns the new latency. - */ -int register_latency_notifier(struct notifier_block * nb) -{ - return blocking_notifier_chain_register(&latency_notifier, nb); -} -EXPORT_SYMBOL_GPL(register_latency_notifier); - -int unregister_latency_notifier(struct notifier_block * nb) -{ - return blocking_notifier_chain_unregister(&latency_notifier, nb); -} -EXPORT_SYMBOL_GPL(unregister_latency_notifier); - -static __init int latency_init(void) -{ - atomic_set(¤t_max_latency, INFINITE_LATENCY); - /* - * we don't want by default to have longer latencies than 2 ticks, - * since that would cause lost ticks - */ - set_acceptable_latency("kernel", 2*1000000/HZ); - return 0; -} - -module_init(latency_init); diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c new file mode 100644 index 0000000..0afe32b --- /dev/null +++ b/kernel/pm_qos_params.c @@ -0,0 +1,425 @@ +/* + * This module exposes the interface to kernel space for specifying + * QoS dependencies. It provides infrastructure for registration of: + * + * Dependents on a QoS value : register requirements + * Watchers of QoS value : get notified when target QoS value changes + * + * This QoS design is best effort based. Dependents register their QoS needs. 
+ * Watchers register to keep track of the current QoS needs of the system. + * + * There are 3 basic classes of QoS parameter: latency, timeout, throughput + * each have defined units: + * latency: usec + * timeout: usec <-- currently not used. + * throughput: kbs (kilo byte / sec) + * + * There are lists of pm_qos_objects each one wrapping requirements, notifiers + * + * User mode requirements on a QOS parameter register themselves to the + * subsystem by opening the device node /dev/... and writing there request to + * the node. As long as the process holds a file handle open to the node the + * client continues to be accounted for. Upon file release the usermode + * requirement is removed and a new qos target is computed. This way when the + * requirement that the application has is cleaned up when closes the file + * pointer or exits the pm_qos_object will get an opportunity to clean up. + * + * mark gross mgross@linux.intel.com + */ + +#include <linux/pm_qos_params.h> +#include <linux/sched.h> +#include <linux/spinlock.h> +#include <linux/slab.h> +#include <linux/time.h> +#include <linux/fs.h> +#include <linux/device.h> +#include <linux/miscdevice.h> +#include <linux/string.h> +#include <linux/platform_device.h> +#include <linux/init.h> + +#include <linux/uaccess.h> + +/* + * locking rule: all changes to target_value or requirements or notifiers lists + * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock + * held, taken with _irqsave. 
One lock to rule them all + */ +struct requirement_list { + struct list_head list; + union { + s32 value; + s32 usec; + s32 kbps; + }; + char *name; +}; + +static s32 max_compare(s32 v1, s32 v2); +static s32 min_compare(s32 v1, s32 v2); + +struct pm_qos_object { + struct requirement_list requirements; + struct blocking_notifier_head *notifiers; + struct miscdevice pm_qos_power_miscdev; + char *name; + s32 default_value; + s32 target_value; + s32 (*comparitor)(s32, s32); +}; + +static struct pm_qos_object null_pm_qos; +static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); +static struct pm_qos_object cpu_dma_pm_qos = { + .requirements = {LIST_HEAD_INIT(cpu_dma_pm_qos.requirements.list)}, + .notifiers = &cpu_dma_lat_notifier, + .name = "cpu_dma_latency", + .default_value = 2000 * USEC_PER_SEC, + .target_value = 2000 * USEC_PER_SEC, + .comparitor = min_compare +}; + +static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); +static struct pm_qos_object network_lat_pm_qos = { + .requirements = {LIST_HEAD_INIT(network_lat_pm_qos.requirements.list)}, + .notifiers = &network_lat_notifier, + .name = "network_latency", + .default_value = 2000 * USEC_PER_SEC, + .target_value = 2000 * USEC_PER_SEC, + .comparitor = min_compare +}; + + +static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); +static struct pm_qos_object network_throughput_pm_qos = { + .requirements = + {LIST_HEAD_INIT(network_throughput_pm_qos.requirements.list)}, + .notifiers = &network_throughput_notifier, + .name = "network_throughput", + .default_value = 0, + .target_value = 0, + .comparitor = max_compare +}; + + +static struct pm_qos_object *pm_qos_array[] = { + &null_pm_qos, + &cpu_dma_pm_qos, + &network_lat_pm_qos, + &network_throughput_pm_qos +}; + +static DEFINE_SPINLOCK(pm_qos_lock); + +static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, + size_t count, loff_t *f_pos); +static int pm_qos_power_open(struct inode *inode, struct file *filp); +static int pm_qos_power_release(struct 
inode *inode, struct file *filp); + +static const struct file_operations pm_qos_power_fops = { + .write = pm_qos_power_write, + .open = pm_qos_power_open, + .release = pm_qos_power_release, +}; + +/* static helper functions */ +static s32 max_compare(s32 v1, s32 v2) +{ + return max(v1, v2); +} + +static s32 min_compare(s32 v1, s32 v2) +{ + return min(v1, v2); +} + + +static void update_target(int target) +{ + s32 extreme_value; + struct requirement_list *node; + unsigned long flags; + int call_notifier = 0; + + spin_lock_irqsave(&pm_qos_lock, flags); + extreme_value = pm_qos_array[target]->default_value; + list_for_each_entry(node, + &pm_qos_array[target]->requirements.list, list) { + extreme_value = pm_qos_array[target]->comparitor( + extreme_value, node->value); + } + if (pm_qos_array[target]->target_value != extreme_value) { + call_notifier = 1; + pm_qos_array[target]->target_value = extreme_value; + pr_debug(KERN_ERR "new target for qos %d is %d\n", target, + pm_qos_array[target]->target_value); + } + spin_unlock_irqrestore(&pm_qos_lock, flags); + + if (call_notifier) + blocking_notifier_call_chain(pm_qos_array[target]->notifiers, + (unsigned long) extreme_value, NULL); +} + +static int register_pm_qos_misc(struct pm_qos_object *qos) +{ + qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR; + qos->pm_qos_power_miscdev.name = qos->name; + qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops; + + return misc_register(&qos->pm_qos_power_miscdev); +} + +static int find_pm_qos_object_by_minor(int minor) +{ + int pm_qos_class; + + for (pm_qos_class = 0; + pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) { + if (minor == + pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor) + return pm_qos_class; + } + return -1; +} + +/** + * pm_qos_requirement - returns current system wide qos expectation + * @pm_qos_class: identification of which qos value is requested + * + * This function returns the current target value in an atomic manner. 
+ */ +int pm_qos_requirement(int pm_qos_class) +{ + int ret_val; + unsigned long flags; + + spin_lock_irqsave(&pm_qos_lock, flags); + ret_val = pm_qos_array[pm_qos_class]->target_value; + spin_unlock_irqrestore(&pm_qos_lock, flags); + + return ret_val; +} +EXPORT_SYMBOL_GPL(pm_qos_requirement); + +/** + * pm_qos_add_requirement - inserts new qos request into the list + * @pm_qos_class: identifies which list of qos request to us + * @name: identifies the request + * @value: defines the qos request + * + * This function inserts a new entry in the pm_qos_class list of requested qos + * performance charactoistics. It recomputes the agregate QoS expectations for + * the pm_qos_class of parrameters. + */ +int pm_qos_add_requirement(int pm_qos_class, char *name, s32 value) +{ + struct requirement_list *dep; + unsigned long flags; + + dep = kzalloc(sizeof(struct requirement_list), GFP_KERNEL); + if (dep) { + if (value == PM_QOS_DEFAULT_VALUE) + dep->value = pm_qos_array[pm_qos_class]->default_value; + else + dep->value = value; + dep->name = kstrdup(name, GFP_KERNEL); + if (!dep->name) + goto cleanup; + + spin_lock_irqsave(&pm_qos_lock, flags); + list_add(&dep->list, + &pm_qos_array[pm_qos_class]->requirements.list); + spin_unlock_irqrestore(&pm_qos_lock, flags); + update_target(pm_qos_class); + + return 0; + } + +cleanup: + kfree(dep); + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(pm_qos_add_requirement); + +/** + * pm_qos_update_requirement - modifies an existing qos request + * @pm_qos_class: identifies which list of qos request to us + * @name: identifies the request + * @value: defines the qos request + * + * Updates an existing qos requierement for the pm_qos_class of parameters along + * with updating the target pm_qos_class value. + * + * If the named request isn't in the lest then no change is made. 
+ */ +int pm_qos_update_requirement(int pm_qos_class, char *name, s32 new_value) +{ + unsigned long flags; + struct requirement_list *node; + int pending_update = 0; + + spin_lock_irqsave(&pm_qos_lock, flags); + list_for_each_entry(node, + &pm_qos_array[pm_qos_class]->requirements.list, list) { + if (strcmp(node->name, name) == 0) { + if (new_value == PM_QOS_DEFAULT_VALUE) + node->value = + pm_qos_array[pm_qos_class]->default_value; + else + node->value = new_value; + pending_update = 1; + break; + } + } + spin_unlock_irqrestore(&pm_qos_lock, flags); + if (pending_update) + update_target(pm_qos_class); + + return 0; +} +EXPORT_SYMBOL_GPL(pm_qos_update_requirement); + +/** + * pm_qos_remove_requirement - modifies an existing qos request + * @pm_qos_class: identifies which list of qos request to us + * @name: identifies the request + * + * Will remove named qos request from pm_qos_class list of parrameters and + * recompute the current target value for the pm_qos_class. + */ +void pm_qos_remove_requirement(int pm_qos_class, char *name) +{ + unsigned long flags; + struct requirement_list *node; + int pending_update = 0; + + spin_lock_irqsave(&pm_qos_lock, flags); + list_for_each_entry(node, + &pm_qos_array[pm_qos_class]->requirements.list, list) { + if (strcmp(node->name, name) == 0) { + kfree(node->name); + list_del(&node->list); + kfree(node); + pending_update = 1; + break; + } + } + spin_unlock_irqrestore(&pm_qos_lock, flags); + if (pending_update) + update_target(pm_qos_class); +} +EXPORT_SYMBOL_GPL(pm_qos_remove_requirement); + +/** + * pm_qos_add_notifier - sets notification entry for changes to target value + * @pm_qos_class: identifies which qos target changes should be notified. + * @notifier: notifier block managed by caller. + * + * will register the notifier into a notification chain that gets called + * uppon changes to the pm_qos_class target value. 
+ */ + int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier) +{ + int retval; + + retval = blocking_notifier_chain_register( + pm_qos_array[pm_qos_class]->notifiers, notifier); + + return retval; +} +EXPORT_SYMBOL_GPL(pm_qos_add_notifier); + +/** + * pm_qos_remove_notifier - deletes notification entry from chain. + * @pm_qos_class: identifies which qos target changes are notified. + * @notifier: notifier block to be removed. + * + * will remove the notifier from the notification chain that gets called + * uppon changes to the pm_qos_class target value. + */ +int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier) +{ + int retval; + + retval = blocking_notifier_chain_unregister( + pm_qos_array[pm_qos_class]->notifiers, notifier); + + return retval; +} +EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); + +#define PID_NAME_LEN sizeof("process_1234567890") +static char name[PID_NAME_LEN]; + +static int pm_qos_power_open(struct inode *inode, struct file *filp) +{ + int ret; + long pm_qos_class; + + pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); + if (pm_qos_class >= 0) { + filp->private_data = (void *)pm_qos_class; + sprintf(name, "process_%d", current->pid); + ret = pm_qos_add_requirement(pm_qos_class, name, + PM_QOS_DEFAULT_VALUE); + if (ret >= 0) + return 0; + } + + return -EPERM; +} + +static int pm_qos_power_release(struct inode *inode, struct file *filp) +{ + int pm_qos_class; + + pm_qos_class = (long)filp->private_data; + sprintf(name, "process_%d", current->pid); + pm_qos_remove_requirement(pm_qos_class, name); + + return 0; +} + +static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, + size_t count, loff_t *f_pos) +{ + s32 value; + int pm_qos_class; + + pm_qos_class = (long)filp->private_data; + if (count != sizeof(s32)) + return -EINVAL; + if (copy_from_user(&value, buf, sizeof(s32))) + return -EFAULT; + sprintf(name, "process_%d", current->pid); + pm_qos_update_requirement(pm_qos_class, 
name, value); + + return sizeof(s32); +} + + +static int __init pm_qos_power_init(void) +{ + int ret = 0; + + ret = register_pm_qos_misc(&cpu_dma_pm_qos); + if (ret < 0) { + printk(KERN_ERR "pm_qos_param: cpu_dma_latency setup failed\n"); + return ret; + } + ret = register_pm_qos_misc(&network_lat_pm_qos); + if (ret < 0) { + printk(KERN_ERR "pm_qos_param: network_latency setup failed\n"); + return ret; + } + ret = register_pm_qos_misc(&network_throughput_pm_qos); + if (ret < 0) + printk(KERN_ERR + "pm_qos_param: network_throughput setup failed\n"); + + return ret; +} + +late_initcall(pm_qos_power_init); diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 36d563f..122d5c7 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -256,8 +256,9 @@ static void schedule_next_timer(struct k_itimer *timr) if (timr->it.real.interval.tv64 == 0) return; - timr->it_overrun += hrtimer_forward(timer, timer->base->get_time(), - timr->it.real.interval); + timr->it_overrun += (unsigned int) hrtimer_forward(timer, + timer->base->get_time(), + timr->it.real.interval); timr->it_overrun_last = timr->it_overrun; timr->it_overrun = -1; @@ -386,7 +387,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) now = ktime_add(now, kj); } #endif - timr->it_overrun += + timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, timr->it.real.interval); ret = HRTIMER_RESTART; @@ -662,7 +663,7 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) */ if (iv.tv64 && (timr->it_requeue_pending & REQUEUE_PENDING || (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) - timr->it_overrun += hrtimer_forward(timer, now, iv); + timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv); remaining = ktime_sub(timer->expires, now); /* Return 0 only, when the timer is expired and not pending */ diff --git a/kernel/power/disk.c b/kernel/power/disk.c index d09da08..859a8e5 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ 
-26,7 +26,7 @@ static int noresume = 0; -char resume_file[256] = CONFIG_PM_STD_PARTITION; +static char resume_file[256] = CONFIG_PM_STD_PARTITION; dev_t swsusp_resume_device; sector_t swsusp_resume_block; @@ -185,7 +185,7 @@ static void platform_restore_cleanup(int platform_mode) * reappears in this routine after a restore. */ -int create_image(int platform_mode) +static int create_image(int platform_mode) { int error; diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index f6a5df9..95250d7 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1203,7 +1203,7 @@ asmlinkage int swsusp_save(void) printk(KERN_INFO "PM: Creating hibernation image: \n"); - drain_local_pages(); + drain_local_pages(NULL); nr_pages = count_data_pages(); nr_highmem = count_highmem_pages(); printk(KERN_INFO "PM: Need to copy %u pages\n", nr_pages + nr_highmem); @@ -1221,7 +1221,7 @@ asmlinkage int swsusp_save(void) /* During allocating of suspend pagedir, new cold pages may appear. * Kill them. */ - drain_local_pages(); + drain_local_pages(NULL); copy_data_pages(©_bm, &orig_bm); /* diff --git a/kernel/signal.c b/kernel/signal.c index 4333b6d..6a5f97c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -911,27 +911,6 @@ __group_complete_signal(int sig, struct task_struct *p) } while_each_thread(p, t); return; } - - /* - * There will be a core dump. We make all threads other - * than the chosen one go into a group stop so that nothing - * happens until it gets scheduled, takes the signal off - * the shared queue, and does the core dump. This is a - * little more complicated than strictly necessary, but it - * keeps the signal state that winds up in the core dump - * unchanged from the death state, e.g. which thread had - * the core-dump signal unblocked. 
- */ - rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); - rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending); - p->signal->group_stop_count = 0; - p->signal->group_exit_task = t; - p = t; - do { - p->signal->group_stop_count++; - signal_wake_up(t, t == p); - } while_each_thread(p, t); - return; } /* @@ -978,7 +957,6 @@ void zap_other_threads(struct task_struct *p) { struct task_struct *t; - p->signal->flags = SIGNAL_GROUP_EXIT; p->signal->group_stop_count = 0; for (t = next_thread(p); t != p; t = next_thread(t)) { @@ -1709,9 +1687,6 @@ static int do_signal_stop(int signr) struct signal_struct *sig = current->signal; int stop_count; - if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED)) - return 0; - if (sig->group_stop_count > 0) { /* * There is a group stop in progress. We don't need to @@ -1719,12 +1694,15 @@ static int do_signal_stop(int signr) */ stop_count = --sig->group_stop_count; } else { + struct task_struct *t; + + if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) || + unlikely(sig->group_exit_task)) + return 0; /* * There is no group stop already in progress. * We must initiate one now. */ - struct task_struct *t; - sig->group_exit_code = signr; stop_count = 0; @@ -1752,47 +1730,6 @@ static int do_signal_stop(int signr) return 1; } -/* - * Do appropriate magic when group_stop_count > 0. - * We return nonzero if we stopped, after releasing the siglock. - * We return zero if we still hold the siglock and should look - * for another signal without checking group_stop_count again. - */ -static int handle_group_stop(void) -{ - int stop_count; - - if (current->signal->group_exit_task == current) { - /* - * Group stop is so we can do a core dump, - * We are the initiating thread, so get on with it. - */ - current->signal->group_exit_task = NULL; - return 0; - } - - if (current->signal->flags & SIGNAL_GROUP_EXIT) - /* - * Group stop is so another thread can do a core dump, - * or else we are racing against a death signal. 
- * Just punt the stop so we can get the next signal. - */ - return 0; - - /* - * There is a group stop in progress. We stop - * without any associated signal being in our queue. - */ - stop_count = --current->signal->group_stop_count; - if (stop_count == 0) - current->signal->flags = SIGNAL_STOP_STOPPED; - current->exit_code = current->signal->group_exit_code; - set_current_state(TASK_STOPPED); - spin_unlock_irq(¤t->sighand->siglock); - finish_stop(stop_count); - return 1; -} - int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, struct pt_regs *regs, void *cookie) { @@ -1807,7 +1744,7 @@ relock: struct k_sigaction *ka; if (unlikely(current->signal->group_stop_count > 0) && - handle_group_stop()) + do_signal_stop(0)) goto relock; signr = dequeue_signal(current, mask, info); diff --git a/kernel/sys.c b/kernel/sys.c index d1fe71e..53de35f 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -315,7 +315,7 @@ static void kernel_kexec(void) #endif } -void kernel_shutdown_prepare(enum system_states state) +static void kernel_shutdown_prepare(enum system_states state) { blocking_notifier_call_chain(&reboot_notifier_list, (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); @@ -1637,7 +1637,7 @@ asmlinkage long sys_umask(int mask) mask = xchg(¤t->fs->umask, mask & S_IRWXUGO); return mask; } - + asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) { @@ -1742,6 +1742,17 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, error = prctl_set_seccomp(arg2); break; + case PR_CAPBSET_READ: + if (!cap_valid(arg2)) + return -EINVAL; + return !!cap_raised(current->cap_bset, arg2); + case PR_CAPBSET_DROP: +#ifdef CONFIG_SECURITY_FILE_CAPABILITIES + return cap_prctl_drop(arg2); +#else + return -EINVAL; +#endif + default: error = -EINVAL; break; diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index beee5b3..5b9b467 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -154,7 
+154,10 @@ cond_syscall(sys_ioprio_get); /* New file descriptors */ cond_syscall(sys_signalfd); -cond_syscall(sys_timerfd); cond_syscall(compat_sys_signalfd); -cond_syscall(compat_sys_timerfd); +cond_syscall(sys_timerfd_create); +cond_syscall(sys_timerfd_settime); +cond_syscall(sys_timerfd_gettime); +cond_syscall(compat_sys_timerfd_settime); +cond_syscall(compat_sys_timerfd_gettime); cond_syscall(sys_eventfd); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7cb1ac3..5e2ad5b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -84,8 +84,11 @@ extern int sysctl_stat_interval; extern int latencytop_enabled; /* Constants used for minimum and maximum */ -#ifdef CONFIG_DETECT_SOFTLOCKUP +#if defined(CONFIG_DETECT_SOFTLOCKUP) || defined(CONFIG_HIGHMEM) static int one = 1; +#endif + +#ifdef CONFIG_DETECT_SOFTLOCKUP static int sixty = 60; #endif @@ -416,15 +419,6 @@ static struct ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif -#ifdef CONFIG_SECURITY_CAPABILITIES - { - .procname = "cap-bound", - .data = &cap_bset, - .maxlen = sizeof(kernel_cap_t), - .mode = 0600, - .proc_handler = &proc_dointvec_bset, - }, -#endif /* def CONFIG_SECURITY_CAPABILITIES */ #ifdef CONFIG_BLK_DEV_INITRD { .ctl_name = KERN_REALROOTDEV, @@ -1150,6 +1144,19 @@ static struct ctl_table vm_table[] = { .extra1 = &zero, }, #endif +#ifdef CONFIG_HIGHMEM + { + .ctl_name = CTL_UNNUMBERED, + .procname = "highmem_is_dirtyable", + .data = &vm_highmem_is_dirtyable, + .maxlen = sizeof(vm_highmem_is_dirtyable), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &one, + }, +#endif /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt @@ -2080,26 +2087,6 @@ static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp, return 0; } -#ifdef CONFIG_SECURITY_CAPABILITIES -/* - * init may raise the set. 
- */ - -int proc_dointvec_bset(struct ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int op; - - if (write && !capable(CAP_SYS_MODULE)) { - return -EPERM; - } - - op = is_global_init(current) ? OP_SET : OP_AND; - return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, - do_proc_dointvec_bset_conv,&op); -} -#endif /* def CONFIG_SECURITY_CAPABILITIES */ - /* * Taint values can only be increased */ @@ -2513,12 +2500,6 @@ int proc_dointvec(struct ctl_table *table, int write, struct file *filp, return -ENOSYS; } -int proc_dointvec_bset(struct ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c index c3206fa..006365b 100644 --- a/kernel/sysctl_check.c +++ b/kernel/sysctl_check.c @@ -37,10 +37,6 @@ static struct trans_ctl_table trans_kern_table[] = { { KERN_NODENAME, "hostname" }, { KERN_DOMAINNAME, "domainname" }, -#ifdef CONFIG_SECURITY_CAPABILITIES - { KERN_CAP_BSET, "cap-bound" }, -#endif /* def CONFIG_SECURITY_CAPABILITIES */ - { KERN_PANIC, "panic" }, { KERN_REALROOTDEV, "real-root-dev" }, @@ -1498,9 +1494,6 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) (table->strategy == sysctl_ms_jiffies) || (table->proc_handler == proc_dostring) || (table->proc_handler == proc_dointvec) || -#ifdef CONFIG_SECURITY_CAPABILITIES - (table->proc_handler == proc_dointvec_bset) || -#endif /* def CONFIG_SECURITY_CAPABILITIES */ (table->proc_handler == proc_dointvec_minmax) || (table->proc_handler == proc_dointvec_jiffies) || (table->proc_handler == proc_dointvec_userhz_jiffies) || diff --git a/lib/Makefile b/lib/Makefile index 543f2502..a18062e4 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -65,6 +65,7 @@ obj-$(CONFIG_SMP) += 
pcounter.o obj-$(CONFIG_AUDIT_GENERIC) += audit.o obj-$(CONFIG_SWIOTLB) += swiotlb.o +obj-$(CONFIG_IOMMU_HELPER) += iommu-helper.o obj-$(CONFIG_FAULT_INJECTION) += fault-inject.o lib-$(CONFIG_GENERIC_BUG) += bug.o diff --git a/lib/iommu-helper.c b/lib/iommu-helper.c new file mode 100644 index 0000000..495575a --- /dev/null +++ b/lib/iommu-helper.c @@ -0,0 +1,80 @@ +/* + * IOMMU helper functions for the free area management + */ + +#include <linux/module.h> +#include <linux/bitops.h> + +static unsigned long find_next_zero_area(unsigned long *map, + unsigned long size, + unsigned long start, + unsigned int nr, + unsigned long align_mask) +{ + unsigned long index, end, i; +again: + index = find_next_zero_bit(map, size, start); + + /* Align allocation */ + index = (index + align_mask) & ~align_mask; + + end = index + nr; + if (end >= size) + return -1; + for (i = index; i < end; i++) { + if (test_bit(i, map)) { + start = i+1; + goto again; + } + } + return index; +} + +static inline void set_bit_area(unsigned long *map, unsigned long i, + int len) +{ + unsigned long end = i + len; + while (i < end) { + __set_bit(i, map); + i++; + } +} + +static inline int is_span_boundary(unsigned int index, unsigned int nr, + unsigned long shift, + unsigned long boundary_size) +{ + shift = (shift + index) & (boundary_size - 1); + return shift + nr > boundary_size; +} + +unsigned long iommu_area_alloc(unsigned long *map, unsigned long size, + unsigned long start, unsigned int nr, + unsigned long shift, unsigned long boundary_size, + unsigned long align_mask) +{ + unsigned long index; +again: + index = find_next_zero_area(map, size, start, nr, align_mask); + if (index != -1) { + if (is_span_boundary(index, nr, shift, boundary_size)) { + /* we could do more effectively */ + start = index + 1; + goto again; + } + set_bit_area(map, index, nr); + } + return index; +} +EXPORT_SYMBOL(iommu_area_alloc); + +void iommu_area_free(unsigned long *map, unsigned long start, unsigned int nr) +{ + 
unsigned long end = start + nr; + + while (start < end) { + __clear_bit(start, map); + start++; + } +} +EXPORT_SYMBOL(iommu_area_free); diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 48c250f..65f0e75 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -95,14 +95,17 @@ static inline gfp_t root_gfp_mask(struct radix_tree_root *root) static struct radix_tree_node * radix_tree_node_alloc(struct radix_tree_root *root) { - struct radix_tree_node *ret; + struct radix_tree_node *ret = NULL; gfp_t gfp_mask = root_gfp_mask(root); - ret = kmem_cache_alloc(radix_tree_node_cachep, - set_migrateflags(gfp_mask, __GFP_RECLAIMABLE)); - if (ret == NULL && !(gfp_mask & __GFP_WAIT)) { + if (!(gfp_mask & __GFP_WAIT)) { struct radix_tree_preload *rtp; + /* + * Provided the caller has preloaded here, we will always + * succeed in getting a node here (and never reach + * kmem_cache_alloc) + */ rtp = &__get_cpu_var(radix_tree_preloads); if (rtp->nr) { ret = rtp->nodes[rtp->nr - 1]; @@ -110,6 +113,10 @@ radix_tree_node_alloc(struct radix_tree_root *root) rtp->nr--; } } + if (ret == NULL) + ret = kmem_cache_alloc(radix_tree_node_cachep, + set_migrateflags(gfp_mask, __GFP_RECLAIMABLE)); + BUG_ON(radix_tree_is_indirect_ptr(ret)); return ret; } diff --git a/lib/swiotlb.c b/lib/swiotlb.c index 1a8050a..4bb5a11 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -282,6 +282,15 @@ address_needs_mapping(struct device *hwdev, dma_addr_t addr) return (addr & ~mask) != 0; } +static inline unsigned int is_span_boundary(unsigned int index, + unsigned int nslots, + unsigned long offset_slots, + unsigned long max_slots) +{ + unsigned long offset = (offset_slots + index) & (max_slots - 1); + return offset + nslots > max_slots; +} + /* * Allocates bounce buffer and returns its kernel virtual address. 
*/ @@ -292,6 +301,16 @@ map_single(struct device *hwdev, char *buffer, size_t size, int dir) char *dma_addr; unsigned int nslots, stride, index, wrap; int i; + unsigned long start_dma_addr; + unsigned long mask; + unsigned long offset_slots; + unsigned long max_slots; + + mask = dma_get_seg_boundary(hwdev); + start_dma_addr = virt_to_bus(io_tlb_start) & mask; + + offset_slots = ALIGN(start_dma_addr, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; + max_slots = ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; /* * For mappings greater than a page, we limit the stride (and @@ -311,10 +330,17 @@ map_single(struct device *hwdev, char *buffer, size_t size, int dir) */ spin_lock_irqsave(&io_tlb_lock, flags); { - wrap = index = ALIGN(io_tlb_index, stride); - + index = ALIGN(io_tlb_index, stride); if (index >= io_tlb_nslabs) - wrap = index = 0; + index = 0; + + while (is_span_boundary(index, nslots, offset_slots, + max_slots)) { + index += stride; + if (index >= io_tlb_nslabs) + index = 0; + } + wrap = index; do { /* @@ -341,9 +367,12 @@ map_single(struct device *hwdev, char *buffer, size_t size, int dir) goto found; } - index += stride; - if (index >= io_tlb_nslabs) - index = 0; + do { + index += stride; + if (index >= io_tlb_nslabs) + index = 0; + } while (is_span_boundary(index, nslots, offset_slots, + max_slots)); } while (index != wrap); spin_unlock_irqrestore(&io_tlb_lock, flags); diff --git a/mm/Makefile b/mm/Makefile index 5c0b0ea..44e2528 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -13,6 +13,7 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ page_isolation.o $(mmu-y) +obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o obj-$(CONFIG_HUGETLBFS) += hugetlb.o diff --git a/mm/fadvise.c b/mm/fadvise.c index 0df4c89..3c0f1e9 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -49,9 +49,21 @@ asmlinkage long 
sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) goto out; } - if (mapping->a_ops->get_xip_page) - /* no bad return value, but ignore advice */ + if (mapping->a_ops->get_xip_page) { + switch (advice) { + case POSIX_FADV_NORMAL: + case POSIX_FADV_RANDOM: + case POSIX_FADV_SEQUENTIAL: + case POSIX_FADV_WILLNEED: + case POSIX_FADV_NOREUSE: + case POSIX_FADV_DONTNEED: + /* no bad return value, but ignore advice */ + break; + default: + ret = -EINVAL; + } goto out; + } /* Careful about overflows. Len == 0 means "as much as possible" */ endbyte = offset + len; diff --git a/mm/filemap.c b/mm/filemap.c index 76bea88..81fb9bf 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -65,7 +65,6 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, * ->private_lock (__free_pte->__set_page_dirty_buffers) * ->swap_lock (exclusive_swap_page, others) * ->mapping->tree_lock - * ->zone.lock * * ->i_mutex * ->i_mmap_lock (truncate->unmap_mapping_range) @@ -528,7 +527,7 @@ static inline void wake_up_page(struct page *page, int bit) __wake_up_bit(page_waitqueue(page), &page->flags, bit); } -void fastcall wait_on_page_bit(struct page *page, int bit_nr) +void wait_on_page_bit(struct page *page, int bit_nr) { DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); @@ -552,7 +551,7 @@ EXPORT_SYMBOL(wait_on_page_bit); * the clear_bit and the read of the waitqueue (to avoid SMP races with a * parallel wait_on_page_locked()). */ -void fastcall unlock_page(struct page *page) +void unlock_page(struct page *page) { smp_mb__before_clear_bit(); if (!TestClearPageLocked(page)) @@ -586,7 +585,7 @@ EXPORT_SYMBOL(end_page_writeback); * chances are that on the second loop, the block layer's plug list is empty, * so sync_page() will then return in state TASK_UNINTERRUPTIBLE. 
*/ -void fastcall __lock_page(struct page *page) +void __lock_page(struct page *page) { DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); @@ -607,7 +606,7 @@ int fastcall __lock_page_killable(struct page *page) * Variant of lock_page that does not require the caller to hold a reference * on the page's mapping. */ -void fastcall __lock_page_nosync(struct page *page) +void __lock_page_nosync(struct page *page) { DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); __wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock, @@ -1277,7 +1276,7 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count) * This adds the requested page to the page cache if it isn't already there, * and schedules an I/O to read in its contents from disk. */ -static int fastcall page_cache_read(struct file * file, pgoff_t offset) +static int page_cache_read(struct file *file, pgoff_t offset) { struct address_space *mapping = file->f_mapping; struct page *page; diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index f874ae8..0420a02 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -431,7 +431,7 @@ xip_truncate_page(struct address_space *mapping, loff_t from) else return PTR_ERR(page); } - zero_user_page(page, offset, length, KM_USER0); + zero_user(page, offset, length); return 0; } EXPORT_SYMBOL_GPL(xip_truncate_page); diff --git a/mm/fremap.c b/mm/fremap.c index 14bd3bf..69a37c2 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -190,10 +190,13 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, */ if (mapping_cap_account_dirty(mapping)) { unsigned long addr; + struct file *file = vma->vm_file; flags &= MAP_NONBLOCK; - addr = mmap_region(vma->vm_file, start, size, + get_file(file); + addr = mmap_region(file, start, size, flags, vma->vm_flags, pgoff, 1); + fput(file); if (IS_ERR_VALUE(addr)) { err = addr; } else { diff --git a/mm/highmem.c b/mm/highmem.c index 7a967bc..35d4773 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -163,7 +163,7 @@ 
start: return vaddr; } -void fastcall *kmap_high(struct page *page) +void *kmap_high(struct page *page) { unsigned long vaddr; @@ -185,7 +185,7 @@ void fastcall *kmap_high(struct page *page) EXPORT_SYMBOL(kmap_high); -void fastcall kunmap_high(struct page *page) +void kunmap_high(struct page *page) { unsigned long vaddr; unsigned long nr; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index db861d8..1a56420 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -813,6 +813,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, spin_unlock(&mm->page_table_lock); copy_huge_page(new_page, old_page, address, vma); + __SetPageUptodate(new_page); spin_lock(&mm->page_table_lock); ptep = huge_pte_offset(mm, address & HPAGE_MASK); @@ -858,6 +859,7 @@ retry: goto out; } clear_huge_page(page, address); + __SetPageUptodate(page); if (vma->vm_flags & VM_SHARED) { int err; diff --git a/mm/internal.h b/mm/internal.h index 953f941..5a9a620 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -24,7 +24,7 @@ static inline void set_page_count(struct page *page, int v) */ static inline void set_page_refcounted(struct page *page) { - VM_BUG_ON(PageCompound(page) && PageTail(page)); + VM_BUG_ON(PageTail(page)); VM_BUG_ON(atomic_read(&page->_count)); set_page_count(page, 1); } @@ -34,7 +34,7 @@ static inline void __put_page(struct page *page) atomic_dec(&page->_count); } -extern void fastcall __init __free_pages_bootmem(struct page *page, +extern void __init __free_pages_bootmem(struct page *page, unsigned int order); /* diff --git a/mm/memory.c b/mm/memory.c index d902d0e..7bb7072 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -305,7 +305,7 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) spin_lock(&mm->page_table_lock); if (pmd_present(*pmd)) { /* Another has populated it */ pte_lock_deinit(new); - pte_free(new); + pte_free(mm, new); } else { mm->nr_ptes++; inc_zone_page_state(new, NR_PAGETABLE); @@ -323,7 +323,7 @@ int __pte_alloc_kernel(pmd_t *pmd, 
unsigned long address) spin_lock(&init_mm.page_table_lock); if (pmd_present(*pmd)) /* Another has populated it */ - pte_free_kernel(new); + pte_free_kernel(&init_mm, new); else pmd_populate_kernel(&init_mm, pmd, new); spin_unlock(&init_mm.page_table_lock); @@ -1109,7 +1109,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, } EXPORT_SYMBOL(get_user_pages); -pte_t * fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) +pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, + spinlock_t **ptl) { pgd_t * pgd = pgd_offset(mm, addr); pud_t * pud = pud_alloc(mm, pgd, addr); @@ -1517,10 +1518,8 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo memset(kaddr, 0, PAGE_SIZE); kunmap_atomic(kaddr, KM_USER0); flush_dcache_page(dst); - return; - - } - copy_user_highpage(dst, src, va, vma); + } else + copy_user_highpage(dst, src, va, vma); } /* @@ -1629,6 +1628,7 @@ gotten: if (!new_page) goto oom; cow_user_page(new_page, old_page, address, vma); + __SetPageUptodate(new_page); /* * Re-check the pte - we dropped the lock @@ -1909,50 +1909,49 @@ EXPORT_SYMBOL(unmap_mapping_range); */ int vmtruncate(struct inode * inode, loff_t offset) { - struct address_space *mapping = inode->i_mapping; - unsigned long limit; + if (inode->i_size < offset) { + unsigned long limit; - if (inode->i_size < offset) - goto do_expand; - /* - * truncation of in-use swapfiles is disallowed - it would cause - * subsequent swapout to scribble on the now-freed blocks. 
- */ - if (IS_SWAPFILE(inode)) - goto out_busy; - i_size_write(inode, offset); + limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + if (limit != RLIM_INFINITY && offset > limit) + goto out_sig; + if (offset > inode->i_sb->s_maxbytes) + goto out_big; + i_size_write(inode, offset); + } else { + struct address_space *mapping = inode->i_mapping; + + /* + * truncation of in-use swapfiles is disallowed - it would + * cause subsequent swapout to scribble on the now-freed + * blocks. + */ + if (IS_SWAPFILE(inode)) + return -ETXTBSY; + i_size_write(inode, offset); + + /* + * unmap_mapping_range is called twice, first simply for + * efficiency so that truncate_inode_pages does fewer + * single-page unmaps. However after this first call, and + * before truncate_inode_pages finishes, it is possible for + * private pages to be COWed, which remain after + * truncate_inode_pages finishes, hence the second + * unmap_mapping_range call must be made for correctness. + */ + unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); + truncate_inode_pages(mapping, offset); + unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); + } - /* - * unmap_mapping_range is called twice, first simply for efficiency - * so that truncate_inode_pages does fewer single-page unmaps. However - * after this first call, and before truncate_inode_pages finishes, - * it is possible for private pages to be COWed, which remain after - * truncate_inode_pages finishes, hence the second unmap_mapping_range - * call must be made for correctness. 
- */ - unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); - truncate_inode_pages(mapping, offset); - unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); - goto out_truncate; - -do_expand: - limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; - if (limit != RLIM_INFINITY && offset > limit) - goto out_sig; - if (offset > inode->i_sb->s_maxbytes) - goto out_big; - i_size_write(inode, offset); - -out_truncate: if (inode->i_op && inode->i_op->truncate) inode->i_op->truncate(inode); return 0; + out_sig: send_sig(SIGXFSZ, current, 0); out_big: return -EFBIG; -out_busy: - return -ETXTBSY; } EXPORT_SYMBOL(vmtruncate); @@ -1980,67 +1979,6 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) return 0; } -/** - * swapin_readahead - swap in pages in hope we need them soon - * @entry: swap entry of this memory - * @addr: address to start - * @vma: user vma this addresses belong to - * - * Primitive swap readahead code. We simply read an aligned block of - * (1 << page_cluster) entries in the swap area. This method is chosen - * because it doesn't cost us any seek time. We also make sure to queue - * the 'original' request together with the readahead ones... - * - * This has been extended to use the NUMA policies from the mm triggering - * the readahead. - * - * Caller must hold down_read on the vma->vm_mm if vma is not NULL. - */ -void swapin_readahead(swp_entry_t entry, unsigned long addr,struct vm_area_struct *vma) -{ -#ifdef CONFIG_NUMA - struct vm_area_struct *next_vma = vma ? vma->vm_next : NULL; -#endif - int i, num; - struct page *new_page; - unsigned long offset; - - /* - * Get the number of handles we should do readahead io to. 
- */ - num = valid_swaphandles(entry, &offset); - for (i = 0; i < num; offset++, i++) { - /* Ok, do the async read-ahead now */ - new_page = read_swap_cache_async(swp_entry(swp_type(entry), - offset), vma, addr); - if (!new_page) - break; - page_cache_release(new_page); -#ifdef CONFIG_NUMA - /* - * Find the next applicable VMA for the NUMA policy. - */ - addr += PAGE_SIZE; - if (addr == 0) - vma = NULL; - if (vma) { - if (addr >= vma->vm_end) { - vma = next_vma; - next_vma = vma ? vma->vm_next : NULL; - } - if (vma && addr < vma->vm_start) - vma = NULL; - } else { - if (next_vma && addr >= next_vma->vm_start) { - vma = next_vma; - next_vma = vma->vm_next; - } - } -#endif - } - lru_add_drain(); /* Push any new pages onto the LRU now */ -} - /* * We enter with non-exclusive mmap_sem (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. @@ -2068,8 +2006,8 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, page = lookup_swap_cache(entry); if (!page) { grab_swap_token(); /* Contend for token _before_ read-in */ - swapin_readahead(entry, address, vma); - page = read_swap_cache_async(entry, vma, address); + page = swapin_readahead(entry, + GFP_HIGHUSER_MOVABLE, vma, address); if (!page) { /* * Back out if somebody else faulted in this pte @@ -2163,6 +2101,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, page = alloc_zeroed_user_highpage_movable(vma, address); if (!page) goto oom; + __SetPageUptodate(page); entry = mk_pte(page, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); @@ -2263,6 +2202,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, goto out; } copy_user_highpage(page, vmf.page, address, vma); + __SetPageUptodate(page); } else { /* * If the page will be shareable, see if the backing @@ -2563,7 +2503,7 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) spin_lock(&mm->page_table_lock); if 
(pgd_present(*pgd)) /* Another has populated it */ - pud_free(new); + pud_free(mm, new); else pgd_populate(mm, pgd, new); spin_unlock(&mm->page_table_lock); @@ -2585,12 +2525,12 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) spin_lock(&mm->page_table_lock); #ifndef __ARCH_HAS_4LEVEL_HACK if (pud_present(*pud)) /* Another has populated it */ - pmd_free(new); + pmd_free(mm, new); else pud_populate(mm, pud, new); #else if (pgd_present(*pud)) /* Another has populated it */ - pmd_free(new); + pmd_free(mm, new); else pgd_populate(mm, pud, new); #endif /* __ARCH_HAS_4LEVEL_HACK */ @@ -2618,46 +2558,6 @@ int make_pages_present(unsigned long addr, unsigned long end) return ret == len ? 0 : -1; } -/* - * Map a vmalloc()-space virtual address to the physical page. - */ -struct page * vmalloc_to_page(void * vmalloc_addr) -{ - unsigned long addr = (unsigned long) vmalloc_addr; - struct page *page = NULL; - pgd_t *pgd = pgd_offset_k(addr); - pud_t *pud; - pmd_t *pmd; - pte_t *ptep, pte; - - if (!pgd_none(*pgd)) { - pud = pud_offset(pgd, addr); - if (!pud_none(*pud)) { - pmd = pmd_offset(pud, addr); - if (!pmd_none(*pmd)) { - ptep = pte_offset_map(pmd, addr); - pte = *ptep; - if (pte_present(pte)) - page = pte_page(pte); - pte_unmap(ptep); - } - } - } - return page; -} - -EXPORT_SYMBOL(vmalloc_to_page); - -/* - * Map a vmalloc()-space virtual address to the physical page frame number. 
- */ -unsigned long vmalloc_to_pfn(void * vmalloc_addr) -{ - return page_to_pfn(vmalloc_to_page(vmalloc_addr)); -} - -EXPORT_SYMBOL(vmalloc_to_pfn); - #if !defined(__HAVE_ARCH_GATE_AREA) #if defined(AT_SYSINFO_EHDR) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9512a54..7469c50 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -481,8 +481,6 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) return offlined; } -extern void drain_all_local_pages(void); - int offline_pages(unsigned long start_pfn, unsigned long end_pfn, unsigned long timeout) { @@ -540,7 +538,7 @@ repeat: lru_add_drain_all(); flush_scheduled_work(); cond_resched(); - drain_all_local_pages(); + drain_all_pages(); } pfn = scan_lru_pages(start_pfn, end_pfn); @@ -563,7 +561,7 @@ repeat: flush_scheduled_work(); yield(); /* drain pcp pages , this is synchrouns. */ - drain_all_local_pages(); + drain_all_pages(); /* check again */ offlined_pages = check_pages_isolated(start_pfn, end_pfn); if (offlined_pages < 0) { diff --git a/mm/migrate.c b/mm/migrate.c index 6a207e8..857a987 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -115,11 +115,6 @@ int putback_lru_pages(struct list_head *l) return count; } -static inline int is_swap_pte(pte_t pte) -{ - return !pte_none(pte) && !pte_present(pte) && !pte_file(pte); -} - /* * Restore a potential migration pte to a working pte entry */ @@ -645,15 +640,33 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, rcu_read_lock(); rcu_locked = 1; } + /* - * This is a corner case handling. - * When a new swap-cache is read into, it is linked to LRU - * and treated as swapcache but has no rmap yet. - * Calling try_to_unmap() against a page->mapping==NULL page is - * BUG. So handle it here. + * Corner case handling: + * 1. When a new swap-cache page is read into, it is added to the LRU + * and treated as swapcache but it has no rmap yet. 
+ * Calling try_to_unmap() against a page->mapping==NULL page will + * trigger a BUG. So handle it here. + * 2. An orphaned page (see truncate_complete_page) might have + * fs-private metadata. The page can be picked up due to memory + * offlining. Everywhere else except page reclaim, the page is + * invisible to the vm, so the page can not be migrated. So try to + * free the metadata, so the page can be freed. */ - if (!page->mapping) + if (!page->mapping) { + if (!PageAnon(page) && PagePrivate(page)) { + /* + * Go direct to try_to_free_buffers() here because + * a) that's what try_to_release_page() would do anyway + * b) we may be under rcu_read_lock() here, so we can't + * use GFP_KERNEL which is what try_to_release_page() + * needs to be effective. + */ + try_to_free_buffers(page); + } goto rcu_unlock; + } + /* Establish migration ptes or remove ptes */ try_to_unmap(page, 1); @@ -36,6 +36,10 @@ #define arch_mmap_check(addr, len, flags) (0) #endif +#ifndef arch_rebalance_pgtables +#define arch_rebalance_pgtables(addr, len) (addr) +#endif + static void unmap_region(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, unsigned long start, unsigned long end); @@ -1424,7 +1428,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, if (addr & ~PAGE_MASK) return -EINVAL; - return addr; + return arch_rebalance_pgtables(addr, len); } EXPORT_SYMBOL(get_unmapped_area); @@ -10,6 +10,7 @@ * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com> * Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org> * Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com> + * Copyright (c) 2007 Paul Mundt <lethal@linux-sh.org> */ #include <linux/module.h> @@ -167,7 +168,7 @@ EXPORT_SYMBOL(get_user_pages); DEFINE_RWLOCK(vmlist_lock); struct vm_struct *vmlist; -void vfree(void *addr) +void vfree(const void *addr) { kfree(addr); } @@ -183,13 +184,33 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) } 
EXPORT_SYMBOL(__vmalloc); -struct page * vmalloc_to_page(void *addr) +void *vmalloc_user(unsigned long size) +{ + void *ret; + + ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, + PAGE_KERNEL); + if (ret) { + struct vm_area_struct *vma; + + down_write(&current->mm->mmap_sem); + vma = find_vma(current->mm, (unsigned long)ret); + if (vma) + vma->vm_flags |= VM_USERMAP; + up_write(&current->mm->mmap_sem); + } + + return ret; +} +EXPORT_SYMBOL(vmalloc_user); + +struct page *vmalloc_to_page(const void *addr) { return virt_to_page(addr); } EXPORT_SYMBOL(vmalloc_to_page); -unsigned long vmalloc_to_pfn(void *addr) +unsigned long vmalloc_to_pfn(const void *addr) { return page_to_pfn(virt_to_page(addr)); } @@ -253,10 +274,17 @@ EXPORT_SYMBOL(vmalloc_32); * * The resulting memory area is 32bit addressable and zeroed so it can be * mapped to userspace without leaking data. + * + * VM_USERMAP is set on the corresponding VMA so that subsequent calls to + * remap_vmalloc_range() are permissible. */ void *vmalloc_32_user(unsigned long size) { - return __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); + /* + * We'll have to sort out the ZONE_DMA bits for 64-bit, + * but for now this can simply use vmalloc_user() directly. 
+ */ + return vmalloc_user(size); } EXPORT_SYMBOL(vmalloc_32_user); @@ -267,7 +295,7 @@ void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_ } EXPORT_SYMBOL(vmap); -void vunmap(void *addr) +void vunmap(const void *addr) { BUG(); } @@ -1216,6 +1244,21 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long from, } EXPORT_SYMBOL(remap_pfn_range); +int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, + unsigned long pgoff) +{ + unsigned int size = vma->vm_end - vma->vm_start; + + if (!(vma->vm_flags & VM_USERMAP)) + return -EINVAL; + + vma->vm_start = (unsigned long)(addr + (pgoff << PAGE_SHIFT)); + vma->vm_end = vma->vm_start + size; + + return 0; +} +EXPORT_SYMBOL(remap_vmalloc_range); + void swap_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) { } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 96473b4..c1850bf 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -125,8 +125,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) * Superuser processes are usually more important, so we make it * less likely that we kill those. */ - if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_ADMIN) || - p->uid == 0 || p->euid == 0) + if (__capable(p, CAP_SYS_ADMIN) || __capable(p, CAP_SYS_RESOURCE)) points /= 4; /* @@ -135,7 +134,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) * tend to only have this flag set on applications they think * of as important. 
*/ - if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO)) + if (__capable(p, CAP_SYS_RAWIO)) points /= 4; /* diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 3d3848f..5e00f17 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -69,6 +69,12 @@ static inline long sync_writeback_pages(void) int dirty_background_ratio = 5; /* + * free highmem will not be subtracted from the total free memory + * for calculating free ratios if vm_highmem_is_dirtyable is true + */ +int vm_highmem_is_dirtyable; + +/* * The generator of dirty data starts writeback at this percentage */ int vm_dirty_ratio = 10; @@ -219,7 +225,7 @@ static inline void task_dirties_fraction(struct task_struct *tsk, * * dirty -= (dirty/8) * p_{t} */ -void task_dirty_limit(struct task_struct *tsk, long *pdirty) +static void task_dirty_limit(struct task_struct *tsk, long *pdirty) { long numerator, denominator; long dirty = *pdirty; @@ -287,7 +293,10 @@ static unsigned long determine_dirtyable_memory(void) x = global_page_state(NR_FREE_PAGES) + global_page_state(NR_INACTIVE) + global_page_state(NR_ACTIVE); - x -= highmem_dirtyable_memory(x); + + if (!vm_highmem_is_dirtyable) + x -= highmem_dirtyable_memory(x); + return x + 1; /* Ensure that we never return 0 */ } @@ -558,6 +567,7 @@ static void background_writeout(unsigned long _min_pages) global_page_state(NR_UNSTABLE_NFS) < background_thresh && min_pages <= 0) break; + wbc.more_io = 0; wbc.encountered_congestion = 0; wbc.nr_to_write = MAX_WRITEBACK_PAGES; wbc.pages_skipped = 0; @@ -565,8 +575,9 @@ static void background_writeout(unsigned long _min_pages) min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { /* Wrote less than expected */ - congestion_wait(WRITE, HZ/10); - if (!wbc.encountered_congestion) + if (wbc.encountered_congestion || wbc.more_io) + congestion_wait(WRITE, HZ/10); + else break; } } @@ -631,11 +642,12 @@ static void wb_kupdate(unsigned long arg) 
global_page_state(NR_UNSTABLE_NFS) + (inodes_stat.nr_inodes - inodes_stat.nr_unused); while (nr_to_write > 0) { + wbc.more_io = 0; wbc.encountered_congestion = 0; wbc.nr_to_write = MAX_WRITEBACK_PAGES; writeback_inodes(&wbc); if (wbc.nr_to_write > 0) { - if (wbc.encountered_congestion) + if (wbc.encountered_congestion || wbc.more_io) congestion_wait(WRITE, HZ/10); else break; /* All the old data is written */ @@ -1064,7 +1076,7 @@ static int __set_page_dirty(struct page *page) return 0; } -int fastcall set_page_dirty(struct page *page) +int set_page_dirty(struct page *page) { int ret = __set_page_dirty(page); if (ret) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b2838c2..37576b8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -537,7 +537,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) /* * permit the bootmem allocator to evade page validation on high-order frees */ -void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order) +void __init __free_pages_bootmem(struct page *page, unsigned int order) { if (order == 0) { __ClearPageReserved(page); @@ -890,31 +890,51 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) } #endif -static void __drain_pages(unsigned int cpu) +/* + * Drain pages of the indicated processor. + * + * The processor must either be the current processor and the + * thread pinned to the current processor or a processor that + * is not online. 
+ */ +static void drain_pages(unsigned int cpu) { unsigned long flags; struct zone *zone; - int i; for_each_zone(zone) { struct per_cpu_pageset *pset; + struct per_cpu_pages *pcp; if (!populated_zone(zone)) continue; pset = zone_pcp(zone, cpu); - for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { - struct per_cpu_pages *pcp; - - pcp = &pset->pcp[i]; - local_irq_save(flags); - free_pages_bulk(zone, pcp->count, &pcp->list, 0); - pcp->count = 0; - local_irq_restore(flags); - } + + pcp = &pset->pcp; + local_irq_save(flags); + free_pages_bulk(zone, pcp->count, &pcp->list, 0); + pcp->count = 0; + local_irq_restore(flags); } } +/* + * Spill all of this CPU's per-cpu pages back into the buddy allocator. + */ +void drain_local_pages(void *arg) +{ + drain_pages(smp_processor_id()); +} + +/* + * Spill all the per-cpu pages from all CPUs back into the buddy allocator + */ +void drain_all_pages(void) +{ + on_each_cpu(drain_local_pages, NULL, 0, 1); +} + #ifdef CONFIG_HIBERNATION void mark_free_pages(struct zone *zone) @@ -952,40 +972,9 @@ void mark_free_pages(struct zone *zone) #endif /* CONFIG_PM */ /* - * Spill all of this CPU's per-cpu pages back into the buddy allocator. 
- */ -void drain_local_pages(void) -{ - unsigned long flags; - - local_irq_save(flags); - __drain_pages(smp_processor_id()); - local_irq_restore(flags); -} - -void smp_drain_local_pages(void *arg) -{ - drain_local_pages(); -} - -/* - * Spill all the per-cpu pages from all CPUs back into the buddy allocator - */ -void drain_all_local_pages(void) -{ - unsigned long flags; - - local_irq_save(flags); - __drain_pages(smp_processor_id()); - local_irq_restore(flags); - - smp_call_function(smp_drain_local_pages, NULL, 0, 1); -} - -/* * Free a 0-order page */ -static void fastcall free_hot_cold_page(struct page *page, int cold) +static void free_hot_cold_page(struct page *page, int cold) { struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; @@ -1001,10 +990,13 @@ static void fastcall free_hot_cold_page(struct page *page, int cold) arch_free_page(page, 0); kernel_map_pages(page, 1, 0); - pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; + pcp = &zone_pcp(zone, get_cpu())->pcp; local_irq_save(flags); __count_vm_event(PGFREE); - list_add(&page->lru, &pcp->list); + if (cold) + list_add_tail(&page->lru, &pcp->list); + else + list_add(&page->lru, &pcp->list); set_page_private(page, get_pageblock_migratetype(page)); pcp->count++; if (pcp->count >= pcp->high) { @@ -1015,12 +1007,12 @@ static void fastcall free_hot_cold_page(struct page *page, int cold) put_cpu(); } -void fastcall free_hot_page(struct page *page) +void free_hot_page(struct page *page) { free_hot_cold_page(page, 0); } -void fastcall free_cold_page(struct page *page) +void free_cold_page(struct page *page) { free_hot_cold_page(page, 1); } @@ -1062,7 +1054,7 @@ again: if (likely(order == 0)) { struct per_cpu_pages *pcp; - pcp = &zone_pcp(zone, cpu)->pcp[cold]; + pcp = &zone_pcp(zone, cpu)->pcp; local_irq_save(flags); if (!pcp->count) { pcp->count = rmqueue_bulk(zone, 0, @@ -1072,9 +1064,15 @@ again: } /* Find a page of the appropriate migrate type */ - list_for_each_entry(page, &pcp->list, lru) - if 
(page_private(page) == migratetype) - break; + if (cold) { + list_for_each_entry_reverse(page, &pcp->list, lru) + if (page_private(page) == migratetype) + break; + } else { + list_for_each_entry(page, &pcp->list, lru) + if (page_private(page) == migratetype) + break; + } /* Allocate more to the pcp list if necessary */ if (unlikely(&page->lru == &pcp->list)) { @@ -1569,7 +1567,7 @@ nofail_alloc: cond_resched(); if (order != 0) - drain_all_local_pages(); + drain_all_pages(); if (likely(did_some_progress)) { page = get_page_from_freelist(gfp_mask, order, @@ -1643,7 +1641,7 @@ EXPORT_SYMBOL(__alloc_pages); /* * Common helper functions. */ -fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) +unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) { struct page * page; page = alloc_pages(gfp_mask, order); @@ -1654,7 +1652,7 @@ fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) EXPORT_SYMBOL(__get_free_pages); -fastcall unsigned long get_zeroed_page(gfp_t gfp_mask) +unsigned long get_zeroed_page(gfp_t gfp_mask) { struct page * page; @@ -1680,7 +1678,7 @@ void __pagevec_free(struct pagevec *pvec) free_hot_cold_page(pvec->pages[i], pvec->cold); } -fastcall void __free_pages(struct page *page, unsigned int order) +void __free_pages(struct page *page, unsigned int order) { if (put_page_testzero(page)) { if (order == 0) @@ -1692,7 +1690,7 @@ fastcall void __free_pages(struct page *page, unsigned int order) EXPORT_SYMBOL(__free_pages); -fastcall void free_pages(unsigned long addr, unsigned int order) +void free_pages(unsigned long addr, unsigned int order) { if (addr != 0) { VM_BUG_ON(!virt_addr_valid((void *)addr)); @@ -1801,12 +1799,9 @@ void show_free_areas(void) pageset = zone_pcp(zone, cpu); - printk("CPU %4d: Hot: hi:%5d, btch:%4d usd:%4d " - "Cold: hi:%5d, btch:%4d usd:%4d\n", - cpu, pageset->pcp[0].high, - pageset->pcp[0].batch, pageset->pcp[0].count, - pageset->pcp[1].high, pageset->pcp[1].batch, - 
pageset->pcp[1].count); + printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", + cpu, pageset->pcp.high, + pageset->pcp.batch, pageset->pcp.count); } } @@ -1879,6 +1874,8 @@ void show_free_areas(void) printk("= %lukB\n", K(total)); } + printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES)); + show_swap_cache_info(); } @@ -2551,8 +2548,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, } } -static void __meminit zone_init_free_lists(struct pglist_data *pgdat, - struct zone *zone, unsigned long size) +static void __meminit zone_init_free_lists(struct zone *zone) { int order, t; for_each_migratetype_order(order, t) { @@ -2604,17 +2600,11 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) memset(p, 0, sizeof(*p)); - pcp = &p->pcp[0]; /* hot */ + pcp = &p->pcp; pcp->count = 0; pcp->high = 6 * batch; pcp->batch = max(1UL, 1 * batch); INIT_LIST_HEAD(&pcp->list); - - pcp = &p->pcp[1]; /* cold*/ - pcp->count = 0; - pcp->high = 2 * batch; - pcp->batch = max(1UL, batch/2); - INIT_LIST_HEAD(&pcp->list); } /* @@ -2627,7 +2617,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p, { struct per_cpu_pages *pcp; - pcp = &p->pcp[0]; /* hot list */ + pcp = &p->pcp; pcp->high = high; pcp->batch = max(1UL, high/4); if ((high/4) > (PAGE_SHIFT * 8)) @@ -2831,7 +2821,7 @@ __meminit int init_currently_empty_zone(struct zone *zone, memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); - zone_init_free_lists(pgdat, zone, zone->spanned_pages); + zone_init_free_lists(zone); return 0; } @@ -3978,10 +3968,23 @@ static int page_alloc_cpu_notify(struct notifier_block *self, int cpu = (unsigned long)hcpu; if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { - local_irq_disable(); - __drain_pages(cpu); + drain_pages(cpu); + + /* + * Spill the event counters of the dead processor + * into the current processors event counters. + * This artificially elevates the count of the current + * processor. 
+ */ vm_events_fold_cpu(cpu); - local_irq_enable(); + + /* + * Zero the differential counters of the dead processor + * so that the vm statistics are consistent. + * + * This is only okay since the processor is dead and cannot + * race with what we are doing. + */ refresh_cpu_vm_stats(cpu); } return NOTIFY_OK; @@ -4480,7 +4483,7 @@ int set_migratetype_isolate(struct page *page) out: spin_unlock_irqrestore(&zone->lock, flags); if (!ret) - drain_all_local_pages(); + drain_all_pages(); return ret; } diff --git a/mm/page_io.c b/mm/page_io.c index 3b97f68..065c448 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -126,7 +126,7 @@ int swap_readpage(struct file *file, struct page *page) int ret = 0; BUG_ON(!PageLocked(page)); - ClearPageUptodate(page); + BUG_ON(PageUptodate(page)); bio = get_swap_bio(GFP_KERNEL, page_private(page), page, end_swap_bio_read); if (bio == NULL) { diff --git a/mm/pagewalk.c b/mm/pagewalk.c new file mode 100644 index 0000000..b4f27d2 --- /dev/null +++ b/mm/pagewalk.c @@ -0,0 +1,131 @@ +#include <linux/mm.h> +#include <linux/highmem.h> +#include <linux/sched.h> + +static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, + const struct mm_walk *walk, void *private) +{ + pte_t *pte; + int err = 0; + + pte = pte_offset_map(pmd, addr); + do { + err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, private); + if (err) + break; + } while (pte++, addr += PAGE_SIZE, addr != end); + + pte_unmap(pte); + return err; +} + +static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, + const struct mm_walk *walk, void *private) +{ + pmd_t *pmd; + unsigned long next; + int err = 0; + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (pmd_none_or_clear_bad(pmd)) { + if (walk->pte_hole) + err = walk->pte_hole(addr, next, private); + if (err) + break; + continue; + } + if (walk->pmd_entry) + err = walk->pmd_entry(pmd, addr, next, private); + if (!err && walk->pte_entry) + err = walk_pte_range(pmd, addr, 
next, walk, private); + if (err) + break; + } while (pmd++, addr = next, addr != end); + + return err; +} + +static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end, + const struct mm_walk *walk, void *private) +{ + pud_t *pud; + unsigned long next; + int err = 0; + + pud = pud_offset(pgd, addr); + do { + next = pud_addr_end(addr, end); + if (pud_none_or_clear_bad(pud)) { + if (walk->pte_hole) + err = walk->pte_hole(addr, next, private); + if (err) + break; + continue; + } + if (walk->pud_entry) + err = walk->pud_entry(pud, addr, next, private); + if (!err && (walk->pmd_entry || walk->pte_entry)) + err = walk_pmd_range(pud, addr, next, walk, private); + if (err) + break; + } while (pud++, addr = next, addr != end); + + return err; +} + +/** + * walk_page_range - walk a memory map's page tables with a callback + * @mm - memory map to walk + * @addr - starting address + * @end - ending address + * @walk - set of callbacks to invoke for each level of the tree + * @private - private data passed to the callback function + * + * Recursively walk the page table for the memory area in a VMA, + * calling supplied callbacks. Callbacks are called in-order (first + * PGD, first PUD, first PMD, first PTE, second PTE... second PMD, + * etc.). If lower-level callbacks are omitted, walking depth is reduced. + * + * Each callback receives an entry pointer, the start and end of the + * associated range, and a caller-supplied private data pointer. + * + * No locks are taken, but the bottom level iterator will map PTE + * directories from highmem if necessary. + * + * If any callback returns a non-zero value, the walk is aborted and + * the return value is propagated back to the caller. Otherwise 0 is returned. 
+ */ +int walk_page_range(const struct mm_struct *mm, + unsigned long addr, unsigned long end, + const struct mm_walk *walk, void *private) +{ + pgd_t *pgd; + unsigned long next; + int err = 0; + + if (addr >= end) + return err; + + pgd = pgd_offset(mm, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) { + if (walk->pte_hole) + err = walk->pte_hole(addr, next, private); + if (err) + break; + continue; + } + if (walk->pgd_entry) + err = walk->pgd_entry(pgd, addr, next, private); + if (!err && + (walk->pud_entry || walk->pmd_entry || walk->pte_entry)) + err = walk_pud_range(pgd, addr, next, walk, private); + if (err) + break; + } while (pgd++, addr = next, addr != end); + + return err; +} @@ -36,7 +36,6 @@ * mapping->tree_lock (widely used, in set_page_dirty, * in arch-dependent flush_dcache_mmap_lock, * within inode_lock in __sync_single_inode) - * zone->lock (within radix tree node alloc) */ #include <linux/mm.h> @@ -284,7 +283,10 @@ static int page_referenced_one(struct page *page, if (!pte) goto out; - if (ptep_clear_flush_young(vma, address, pte)) + if (vma->vm_flags & VM_LOCKED) { + referenced++; + *mapcount = 1; /* break early from loop */ + } else if (ptep_clear_flush_young(vma, address, pte)) referenced++; /* Pretend the page is referenced if the task has the @@ -78,11 +78,10 @@ /* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */ enum sgp_type { - SGP_QUICK, /* don't try more than file page cache lookup */ SGP_READ, /* don't exceed i_size, don't allocate page */ SGP_CACHE, /* don't exceed i_size, may allocate page */ + SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */ SGP_WRITE, /* may exceed i_size, may allocate page */ - SGP_FAULT, /* same as SGP_CACHE, return with page locked */ }; static int shmem_getpage(struct inode *inode, unsigned long idx, @@ -194,7 +193,7 @@ static struct backing_dev_info shmem_backing_dev_info __read_mostly = { }; static LIST_HEAD(shmem_swaplist); -static 
DEFINE_SPINLOCK(shmem_swaplist_lock); +static DEFINE_MUTEX(shmem_swaplist_mutex); static void shmem_free_blocks(struct inode *inode, long pages) { @@ -207,6 +206,31 @@ static void shmem_free_blocks(struct inode *inode, long pages) } } +static int shmem_reserve_inode(struct super_block *sb) +{ + struct shmem_sb_info *sbinfo = SHMEM_SB(sb); + if (sbinfo->max_inodes) { + spin_lock(&sbinfo->stat_lock); + if (!sbinfo->free_inodes) { + spin_unlock(&sbinfo->stat_lock); + return -ENOSPC; + } + sbinfo->free_inodes--; + spin_unlock(&sbinfo->stat_lock); + } + return 0; +} + +static void shmem_free_inode(struct super_block *sb) +{ + struct shmem_sb_info *sbinfo = SHMEM_SB(sb); + if (sbinfo->max_inodes) { + spin_lock(&sbinfo->stat_lock); + sbinfo->free_inodes++; + spin_unlock(&sbinfo->stat_lock); + } +} + /* * shmem_recalc_inode - recalculate the size of an inode * @@ -731,6 +755,8 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) (void) shmem_getpage(inode, attr->ia_size>>PAGE_CACHE_SHIFT, &page, SGP_READ, NULL); + if (page) + unlock_page(page); } /* * Reset SHMEM_PAGEIN flag so that shmem_truncate can @@ -762,7 +788,6 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) static void shmem_delete_inode(struct inode *inode) { - struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); struct shmem_inode_info *info = SHMEM_I(inode); if (inode->i_op->truncate == shmem_truncate) { @@ -771,17 +796,13 @@ static void shmem_delete_inode(struct inode *inode) inode->i_size = 0; shmem_truncate(inode); if (!list_empty(&info->swaplist)) { - spin_lock(&shmem_swaplist_lock); + mutex_lock(&shmem_swaplist_mutex); list_del_init(&info->swaplist); - spin_unlock(&shmem_swaplist_lock); + mutex_unlock(&shmem_swaplist_mutex); } } BUG_ON(inode->i_blocks); - if (sbinfo->max_inodes) { - spin_lock(&sbinfo->stat_lock); - sbinfo->free_inodes++; - spin_unlock(&sbinfo->stat_lock); - } + shmem_free_inode(inode->i_sb); clear_inode(inode); } @@ -807,19 +828,22 @@ 
static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s struct page *subdir; swp_entry_t *ptr; int offset; + int error; idx = 0; ptr = info->i_direct; spin_lock(&info->lock); + if (!info->swapped) { + list_del_init(&info->swaplist); + goto lost2; + } limit = info->next_index; size = limit; if (size > SHMEM_NR_DIRECT) size = SHMEM_NR_DIRECT; offset = shmem_find_swp(entry, ptr, ptr+size); - if (offset >= 0) { - shmem_swp_balance_unmap(); + if (offset >= 0) goto found; - } if (!info->i_indirect) goto lost2; @@ -829,6 +853,14 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) { if (unlikely(idx == stage)) { shmem_dir_unmap(dir-1); + if (cond_resched_lock(&info->lock)) { + /* check it has not been truncated */ + if (limit > info->next_index) { + limit = info->next_index; + if (idx >= limit) + goto lost2; + } + } dir = shmem_dir_map(info->i_indirect) + ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE; while (!*dir) { @@ -849,11 +881,11 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s if (size > ENTRIES_PER_PAGE) size = ENTRIES_PER_PAGE; offset = shmem_find_swp(entry, ptr, ptr+size); + shmem_swp_unmap(ptr); if (offset >= 0) { shmem_dir_unmap(dir); goto found; } - shmem_swp_unmap(ptr); } } lost1: @@ -863,19 +895,63 @@ lost2: return 0; found: idx += offset; - inode = &info->vfs_inode; - if (move_from_swap_cache(page, idx, inode->i_mapping) == 0) { - info->flags |= SHMEM_PAGEIN; - shmem_swp_set(info, ptr + offset, 0); - } - shmem_swp_unmap(ptr); + inode = igrab(&info->vfs_inode); spin_unlock(&info->lock); + /* - * Decrement swap count even when the entry is left behind: - * try_to_unuse will skip over mms, then reincrement count. + * Move _head_ to start search for next from here. 
+ * But be careful: shmem_delete_inode checks list_empty without taking + * mutex, and there's an instant in list_move_tail when info->swaplist + * would appear empty, if it were the only one on shmem_swaplist. We + * could avoid doing it if inode NULL; or use this minor optimization. */ - swap_free(entry); - return 1; + if (shmem_swaplist.next != &info->swaplist) + list_move_tail(&shmem_swaplist, &info->swaplist); + mutex_unlock(&shmem_swaplist_mutex); + + error = 1; + if (!inode) + goto out; + error = radix_tree_preload(GFP_KERNEL); + if (error) + goto out; + error = 1; + + spin_lock(&info->lock); + ptr = shmem_swp_entry(info, idx, NULL); + if (ptr && ptr->val == entry.val) + error = add_to_page_cache(page, inode->i_mapping, + idx, GFP_NOWAIT); + if (error == -EEXIST) { + struct page *filepage = find_get_page(inode->i_mapping, idx); + error = 1; + if (filepage) { + /* + * There might be a more uptodate page coming down + * from a stacked writepage: forget our swappage if so. + */ + if (PageUptodate(filepage)) + error = 0; + page_cache_release(filepage); + } + } + if (!error) { + delete_from_swap_cache(page); + set_page_dirty(page); + info->flags |= SHMEM_PAGEIN; + shmem_swp_set(info, ptr, 0); + swap_free(entry); + error = 1; /* not an error, but entry was found */ + } + if (ptr) + shmem_swp_unmap(ptr); + spin_unlock(&info->lock); + radix_tree_preload_end(); +out: + unlock_page(page); + page_cache_release(page); + iput(inode); /* allows for NULL */ + return error; } /* @@ -887,20 +963,16 @@ int shmem_unuse(swp_entry_t entry, struct page *page) struct shmem_inode_info *info; int found = 0; - spin_lock(&shmem_swaplist_lock); + mutex_lock(&shmem_swaplist_mutex); list_for_each_safe(p, next, &shmem_swaplist) { info = list_entry(p, struct shmem_inode_info, swaplist); - if (!info->swapped) - list_del_init(&info->swaplist); - else if (shmem_unuse_inode(info, entry, page)) { - /* move head to start search for next from here */ - list_move_tail(&shmem_swaplist, 
&info->swaplist); - found = 1; - break; - } + found = shmem_unuse_inode(info, entry, page); + cond_resched(); + if (found) + goto out; } - spin_unlock(&shmem_swaplist_lock); - return found; + mutex_unlock(&shmem_swaplist_mutex); +out: return found; /* 0 or 1 or -ENOMEM */ } /* @@ -915,54 +987,65 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) struct inode *inode; BUG_ON(!PageLocked(page)); - /* - * shmem_backing_dev_info's capabilities prevent regular writeback or - * sync from ever calling shmem_writepage; but a stacking filesystem - * may use the ->writepage of its underlying filesystem, in which case - * we want to do nothing when that underlying filesystem is tmpfs - * (writing out to swap is useful as a response to memory pressure, but - * of no use to stabilize the data) - just redirty the page, unlock it - * and claim success in this case. AOP_WRITEPAGE_ACTIVATE, and the - * page_mapped check below, must be avoided unless we're in reclaim. - */ - if (!wbc->for_reclaim) { - set_page_dirty(page); - unlock_page(page); - return 0; - } - BUG_ON(page_mapped(page)); - mapping = page->mapping; index = page->index; inode = mapping->host; info = SHMEM_I(inode); if (info->flags & VM_LOCKED) goto redirty; - swap = get_swap_page(); - if (!swap.val) + if (!total_swap_pages) goto redirty; + /* + * shmem_backing_dev_info's capabilities prevent regular writeback or + * sync from ever calling shmem_writepage; but a stacking filesystem + * may use the ->writepage of its underlying filesystem, in which case + * tmpfs should write out to swap only in response to memory pressure, + * and not for pdflush or sync. However, in those cases, we do still + * want to check if there's a redundant swappage to be discarded. 
+ */ + if (wbc->for_reclaim) + swap = get_swap_page(); + else + swap.val = 0; + spin_lock(&info->lock); - shmem_recalc_inode(inode); if (index >= info->next_index) { BUG_ON(!(info->flags & SHMEM_TRUNCATE)); goto unlock; } entry = shmem_swp_entry(info, index, NULL); - BUG_ON(!entry); - BUG_ON(entry->val); + if (entry->val) { + /* + * The more uptodate page coming down from a stacked + * writepage should replace our old swappage. + */ + free_swap_and_cache(*entry); + shmem_swp_set(info, entry, 0); + } + shmem_recalc_inode(inode); - if (move_to_swap_cache(page, swap) == 0) { + if (swap.val && add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { + remove_from_page_cache(page); shmem_swp_set(info, entry, swap.val); shmem_swp_unmap(entry); + if (list_empty(&info->swaplist)) + inode = igrab(inode); + else + inode = NULL; spin_unlock(&info->lock); - if (list_empty(&info->swaplist)) { - spin_lock(&shmem_swaplist_lock); + swap_duplicate(swap); + BUG_ON(page_mapped(page)); + page_cache_release(page); /* pagecache ref */ + set_page_dirty(page); + unlock_page(page); + if (inode) { + mutex_lock(&shmem_swaplist_mutex); /* move instead of add in case we're racing */ list_move_tail(&info->swaplist, &shmem_swaplist); - spin_unlock(&shmem_swaplist_lock); + mutex_unlock(&shmem_swaplist_mutex); + iput(inode); } - unlock_page(page); return 0; } @@ -972,7 +1055,10 @@ unlock: swap_free(swap); redirty: set_page_dirty(page); - return AOP_WRITEPAGE_ACTIVATE; /* Return with the page locked */ + if (wbc->for_reclaim) + return AOP_WRITEPAGE_ACTIVATE; /* Return with page locked */ + unlock_page(page); + return 0; } #ifdef CONFIG_NUMA @@ -1025,53 +1111,33 @@ out: return err; } -static struct page *shmem_swapin_async(struct shared_policy *p, - swp_entry_t entry, unsigned long idx) +static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, + struct shmem_inode_info *info, unsigned long idx) { - struct page *page; struct vm_area_struct pvma; + struct page *page; /* Create a pseudo vma that just 
contains the policy */ - memset(&pvma, 0, sizeof(struct vm_area_struct)); - pvma.vm_end = PAGE_SIZE; + pvma.vm_start = 0; pvma.vm_pgoff = idx; - pvma.vm_policy = mpol_shared_policy_lookup(p, idx); - page = read_swap_cache_async(entry, &pvma, 0); + pvma.vm_ops = NULL; + pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); + page = swapin_readahead(entry, gfp, &pvma, 0); mpol_free(pvma.vm_policy); return page; } -static struct page *shmem_swapin(struct shmem_inode_info *info, - swp_entry_t entry, unsigned long idx) -{ - struct shared_policy *p = &info->policy; - int i, num; - struct page *page; - unsigned long offset; - - num = valid_swaphandles(entry, &offset); - for (i = 0; i < num; offset++, i++) { - page = shmem_swapin_async(p, - swp_entry(swp_type(entry), offset), idx); - if (!page) - break; - page_cache_release(page); - } - lru_add_drain(); /* Push any new pages onto the LRU now */ - return shmem_swapin_async(p, entry, idx); -} - -static struct page * -shmem_alloc_page(gfp_t gfp, struct shmem_inode_info *info, - unsigned long idx) +static struct page *shmem_alloc_page(gfp_t gfp, + struct shmem_inode_info *info, unsigned long idx) { struct vm_area_struct pvma; struct page *page; - memset(&pvma, 0, sizeof(struct vm_area_struct)); - pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); + /* Create a pseudo vma that just contains the policy */ + pvma.vm_start = 0; pvma.vm_pgoff = idx; - pvma.vm_end = PAGE_SIZE; + pvma.vm_ops = NULL; + pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); page = alloc_page_vma(gfp, &pvma, 0); mpol_free(pvma.vm_policy); return page; @@ -1083,15 +1149,14 @@ static inline int shmem_parse_mpol(char *value, int *policy, return 1; } -static inline struct page * -shmem_swapin(struct shmem_inode_info *info,swp_entry_t entry,unsigned long idx) +static inline struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, + struct shmem_inode_info *info, unsigned long idx) { - swapin_readahead(entry, 0, NULL); - return 
read_swap_cache_async(entry, NULL, 0); + return swapin_readahead(entry, gfp, NULL, 0); } -static inline struct page * -shmem_alloc_page(gfp_t gfp,struct shmem_inode_info *info, unsigned long idx) +static inline struct page *shmem_alloc_page(gfp_t gfp, + struct shmem_inode_info *info, unsigned long idx) { return alloc_page(gfp); } @@ -1114,6 +1179,7 @@ static int shmem_getpage(struct inode *inode, unsigned long idx, struct page *swappage; swp_entry_t *entry; swp_entry_t swap; + gfp_t gfp; int error; if (idx >= SHMEM_MAX_INDEX) @@ -1126,7 +1192,7 @@ static int shmem_getpage(struct inode *inode, unsigned long idx, * Normally, filepage is NULL on entry, and either found * uptodate immediately, or allocated and zeroed, or read * in under swappage, which is then assigned to filepage. - * But shmem_readpage and shmem_write_begin pass in a locked + * But shmem_readpage (required for splice) passes in a locked * filepage, which may be found not uptodate by other callers * too, and may need to be copied from the swappage read in. */ @@ -1136,8 +1202,17 @@ repeat: if (filepage && PageUptodate(filepage)) goto done; error = 0; - if (sgp == SGP_QUICK) - goto failed; + gfp = mapping_gfp_mask(mapping); + if (!filepage) { + /* + * Try to preload while we can wait, to not make a habit of + * draining atomic reserves; but don't latch on to this cpu. 
+ */ + error = radix_tree_preload(gfp & ~__GFP_HIGHMEM); + if (error) + goto failed; + radix_tree_preload_end(); + } spin_lock(&info->lock); shmem_recalc_inode(inode); @@ -1160,7 +1235,7 @@ repeat: *type |= VM_FAULT_MAJOR; } spin_unlock(&info->lock); - swappage = shmem_swapin(info, swap, idx); + swappage = shmem_swapin(swap, gfp, info, idx); if (!swappage) { spin_lock(&info->lock); entry = shmem_swp_alloc(info, idx, sgp); @@ -1218,23 +1293,21 @@ repeat: SetPageUptodate(filepage); set_page_dirty(filepage); swap_free(swap); - } else if (!(error = move_from_swap_cache( - swappage, idx, mapping))) { + } else if (!(error = add_to_page_cache( + swappage, mapping, idx, GFP_NOWAIT))) { info->flags |= SHMEM_PAGEIN; shmem_swp_set(info, entry, 0); shmem_swp_unmap(entry); + delete_from_swap_cache(swappage); spin_unlock(&info->lock); filepage = swappage; + set_page_dirty(filepage); swap_free(swap); } else { shmem_swp_unmap(entry); spin_unlock(&info->lock); unlock_page(swappage); page_cache_release(swappage); - if (error == -ENOMEM) { - /* let kswapd refresh zone for GFP_ATOMICs */ - congestion_wait(WRITE, HZ/50); - } goto repeat; } } else if (sgp == SGP_READ && !filepage) { @@ -1272,9 +1345,7 @@ repeat: if (!filepage) { spin_unlock(&info->lock); - filepage = shmem_alloc_page(mapping_gfp_mask(mapping), - info, - idx); + filepage = shmem_alloc_page(gfp, info, idx); if (!filepage) { shmem_unacct_blocks(info->flags, 1); shmem_free_blocks(inode, 1); @@ -1291,7 +1362,7 @@ repeat: shmem_swp_unmap(entry); } if (error || swap.val || 0 != add_to_page_cache_lru( - filepage, mapping, idx, GFP_ATOMIC)) { + filepage, mapping, idx, GFP_NOWAIT)) { spin_unlock(&info->lock); page_cache_release(filepage); shmem_unacct_blocks(info->flags, 1); @@ -1309,14 +1380,11 @@ repeat: clear_highpage(filepage); flush_dcache_page(filepage); SetPageUptodate(filepage); + if (sgp == SGP_DIRTY) + set_page_dirty(filepage); } done: - if (*pagep != filepage) { - *pagep = filepage; - if (sgp != SGP_FAULT) - 
unlock_page(filepage); - - } + *pagep = filepage; return 0; failed: @@ -1336,7 +1404,7 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) return VM_FAULT_SIGBUS; - error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_FAULT, &ret); + error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); if (error) return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); @@ -1399,15 +1467,8 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) struct shmem_inode_info *info; struct shmem_sb_info *sbinfo = SHMEM_SB(sb); - if (sbinfo->max_inodes) { - spin_lock(&sbinfo->stat_lock); - if (!sbinfo->free_inodes) { - spin_unlock(&sbinfo->stat_lock); - return NULL; - } - sbinfo->free_inodes--; - spin_unlock(&sbinfo->stat_lock); - } + if (shmem_reserve_inode(sb)) + return NULL; inode = new_inode(sb); if (inode) { @@ -1451,11 +1512,8 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) NULL); break; } - } else if (sbinfo->max_inodes) { - spin_lock(&sbinfo->stat_lock); - sbinfo->free_inodes++; - spin_unlock(&sbinfo->stat_lock); - } + } else + shmem_free_inode(sb); return inode; } @@ -1494,123 +1552,30 @@ shmem_write_end(struct file *file, struct address_space *mapping, { struct inode *inode = mapping->host; + if (pos + copied > inode->i_size) + i_size_write(inode, pos + copied); + + unlock_page(page); set_page_dirty(page); page_cache_release(page); - if (pos+copied > inode->i_size) - i_size_write(inode, pos+copied); - return copied; } -static ssize_t -shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) -{ - struct inode *inode = file->f_path.dentry->d_inode; - loff_t pos; - unsigned long written; - ssize_t err; - - if ((ssize_t) count < 0) - return -EINVAL; - - if (!access_ok(VERIFY_READ, buf, count)) - return -EFAULT; - - mutex_lock(&inode->i_mutex); - - pos = *ppos; - written = 0; - - err = generic_write_checks(file, 
&pos, &count, 0); - if (err || !count) - goto out; - - err = remove_suid(file->f_path.dentry); - if (err) - goto out; - - inode->i_ctime = inode->i_mtime = CURRENT_TIME; - - do { - struct page *page = NULL; - unsigned long bytes, index, offset; - char *kaddr; - int left; - - offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ - index = pos >> PAGE_CACHE_SHIFT; - bytes = PAGE_CACHE_SIZE - offset; - if (bytes > count) - bytes = count; - - /* - * We don't hold page lock across copy from user - - * what would it guard against? - so no deadlock here. - * But it still may be a good idea to prefault below. - */ - - err = shmem_getpage(inode, index, &page, SGP_WRITE, NULL); - if (err) - break; - - left = bytes; - if (PageHighMem(page)) { - volatile unsigned char dummy; - __get_user(dummy, buf); - __get_user(dummy, buf + bytes - 1); - - kaddr = kmap_atomic(page, KM_USER0); - left = __copy_from_user_inatomic(kaddr + offset, - buf, bytes); - kunmap_atomic(kaddr, KM_USER0); - } - if (left) { - kaddr = kmap(page); - left = __copy_from_user(kaddr + offset, buf, bytes); - kunmap(page); - } - - written += bytes; - count -= bytes; - pos += bytes; - buf += bytes; - if (pos > inode->i_size) - i_size_write(inode, pos); - - flush_dcache_page(page); - set_page_dirty(page); - mark_page_accessed(page); - page_cache_release(page); - - if (left) { - pos -= left; - written -= left; - err = -EFAULT; - break; - } - - /* - * Our dirty pages are not counted in nr_dirty, - * and we do not attempt to balance dirty pages. 
- */ - - cond_resched(); - } while (count); - - *ppos = pos; - if (written) - err = written; -out: - mutex_unlock(&inode->i_mutex); - return err; -} - static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor) { struct inode *inode = filp->f_path.dentry->d_inode; struct address_space *mapping = inode->i_mapping; unsigned long index, offset; + enum sgp_type sgp = SGP_READ; + + /* + * Might this read be for a stacking filesystem? Then when reading + * holes of a sparse file, we actually need to allocate those pages, + * and even mark them dirty, so it cannot exceed the max_blocks limit. + */ + if (segment_eq(get_fs(), KERNEL_DS)) + sgp = SGP_DIRTY; index = *ppos >> PAGE_CACHE_SHIFT; offset = *ppos & ~PAGE_CACHE_MASK; @@ -1629,12 +1594,14 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_ break; } - desc->error = shmem_getpage(inode, index, &page, SGP_READ, NULL); + desc->error = shmem_getpage(inode, index, &page, sgp, NULL); if (desc->error) { if (desc->error == -EINVAL) desc->error = 0; break; } + if (page) + unlock_page(page); /* * We must evaluate after, since reads (unlike writes) @@ -1798,22 +1765,16 @@ static int shmem_create(struct inode *dir, struct dentry *dentry, int mode, static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { struct inode *inode = old_dentry->d_inode; - struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + int ret; /* * No ordinary (disk based) filesystem counts links as inodes; * but each new link needs a new dentry, pinning lowmem, and * tmpfs dentries cannot be pruned until they are unlinked. 
*/ - if (sbinfo->max_inodes) { - spin_lock(&sbinfo->stat_lock); - if (!sbinfo->free_inodes) { - spin_unlock(&sbinfo->stat_lock); - return -ENOSPC; - } - sbinfo->free_inodes--; - spin_unlock(&sbinfo->stat_lock); - } + ret = shmem_reserve_inode(inode->i_sb); + if (ret) + goto out; dir->i_size += BOGO_DIRENT_SIZE; inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; @@ -1821,21 +1782,16 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr atomic_inc(&inode->i_count); /* New dentry reference */ dget(dentry); /* Extra pinning count for the created dentry */ d_instantiate(dentry, inode); - return 0; +out: + return ret; } static int shmem_unlink(struct inode *dir, struct dentry *dentry) { struct inode *inode = dentry->d_inode; - if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) { - struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); - if (sbinfo->max_inodes) { - spin_lock(&sbinfo->stat_lock); - sbinfo->free_inodes++; - spin_unlock(&sbinfo->stat_lock); - } - } + if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) + shmem_free_inode(inode->i_sb); dir->i_size -= BOGO_DIRENT_SIZE; inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; @@ -1924,6 +1880,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s iput(inode); return error; } + unlock_page(page); inode->i_op = &shmem_symlink_inode_operations; kaddr = kmap_atomic(page, KM_USER0); memcpy(kaddr, symname, len); @@ -1951,6 +1908,8 @@ static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd) struct page *page = NULL; int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL); nd_set_link(nd, res ? 
ERR_PTR(res) : kmap(page)); + if (page) + unlock_page(page); return page; } @@ -1996,8 +1955,7 @@ static int shmem_xattr_security_get(struct inode *inode, const char *name, { if (strcmp(name, "") == 0) return -EINVAL; - return security_inode_getsecurity(inode, name, buffer, size, - -EOPNOTSUPP); + return xattr_getsecurity(inode, name, buffer, size); } static int shmem_xattr_security_set(struct inode *inode, const char *name, @@ -2138,7 +2096,7 @@ static int shmem_parse_options(char *options, int *mode, uid_t *uid, } if (*rest) goto bad_val; - *blocks = size >> PAGE_CACHE_SHIFT; + *blocks = DIV_ROUND_UP(size, PAGE_CACHE_SIZE); } else if (!strcmp(this_char,"nr_blocks")) { *blocks = memparse(value,&rest); if (*rest) @@ -2375,7 +2333,8 @@ static const struct file_operations shmem_file_operations = { #ifdef CONFIG_TMPFS .llseek = generic_file_llseek, .read = shmem_file_read, - .write = shmem_file_write, + .write = do_sync_write, + .aio_write = generic_file_aio_write, .fsync = simple_sync_file, .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, @@ -12,10 +12,17 @@ * allocator is as little as 2 bytes, however typically most architectures * will require 4 bytes on 32-bit and 8 bytes on 64-bit. * - * The slob heap is a linked list of pages from alloc_pages(), and - * within each page, there is a singly-linked list of free blocks (slob_t). - * The heap is grown on demand and allocation from the heap is currently - * first-fit. + * The slob heap is a set of linked list of pages from alloc_pages(), + * and within each page, there is a singly-linked list of free blocks + * (slob_t). The heap is grown on demand. To reduce fragmentation, + * heap pages are segregated into three lists, with objects less than + * 256 bytes, objects less than 1024 bytes, and all other objects. + * + * Allocation from heap involves first searching for a page with + * sufficient free blocks (using a next-fit-like approach) followed by + * a first-fit scan of the page. 
Deallocation inserts objects back + * into the free list in address order, so this is effectively an + * address-ordered first fit. * * Above this is an implementation of kmalloc/kfree. Blocks returned * from kmalloc are prepended with a 4-byte header with the kmalloc size. @@ -110,9 +117,13 @@ static inline void free_slob_page(struct slob_page *sp) } /* - * All (partially) free slob pages go on this list. + * All partially free slob pages go on these lists. */ -static LIST_HEAD(free_slob_pages); +#define SLOB_BREAK1 256 +#define SLOB_BREAK2 1024 +static LIST_HEAD(free_slob_small); +static LIST_HEAD(free_slob_medium); +static LIST_HEAD(free_slob_large); /* * slob_page: True for all slob pages (false for bigblock pages) @@ -140,9 +151,9 @@ static inline int slob_page_free(struct slob_page *sp) return test_bit(PG_private, &sp->flags); } -static inline void set_slob_page_free(struct slob_page *sp) +static void set_slob_page_free(struct slob_page *sp, struct list_head *list) { - list_add(&sp->list, &free_slob_pages); + list_add(&sp->list, list); __set_bit(PG_private, &sp->flags); } @@ -294,12 +305,20 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) { struct slob_page *sp; struct list_head *prev; + struct list_head *slob_list; slob_t *b = NULL; unsigned long flags; + if (size < SLOB_BREAK1) + slob_list = &free_slob_small; + else if (size < SLOB_BREAK2) + slob_list = &free_slob_medium; + else + slob_list = &free_slob_large; + spin_lock_irqsave(&slob_lock, flags); /* Iterate through each partially free page, try to find room */ - list_for_each_entry(sp, &free_slob_pages, list) { + list_for_each_entry(sp, slob_list, list) { #ifdef CONFIG_NUMA /* * If there's a node specification, search for a partial @@ -321,9 +340,9 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) /* Improve fragment distribution and reduce our average * search time by starting our next search here. 
(see * Knuth vol 1, sec 2.5, pg 449) */ - if (prev != free_slob_pages.prev && - free_slob_pages.next != prev->next) - list_move_tail(&free_slob_pages, prev->next); + if (prev != slob_list->prev && + slob_list->next != prev->next) + list_move_tail(slob_list, prev->next); break; } spin_unlock_irqrestore(&slob_lock, flags); @@ -341,7 +360,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) sp->free = b; INIT_LIST_HEAD(&sp->list); set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE)); - set_slob_page_free(sp); + set_slob_page_free(sp, slob_list); b = slob_page_alloc(sp, size, align); BUG_ON(!b); spin_unlock_irqrestore(&slob_lock, flags); @@ -387,7 +406,7 @@ static void slob_free(void *block, int size) set_slob(b, units, (void *)((unsigned long)(b + SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK)); - set_slob_page_free(sp); + set_slob_page_free(sp, &free_slob_small); goto out; } @@ -398,6 +417,10 @@ static void slob_free(void *block, int size) sp->units += units; if (b < sp->free) { + if (b + units == sp->free) { + units += slob_units(sp->free); + sp->free = slob_next(sp->free); + } set_slob(b, units, sp->free); sp->free = b; } else { diff --git a/mm/sparse.c b/mm/sparse.c index a2183cb..f6a43c0 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -237,7 +237,7 @@ static unsigned long *__kmalloc_section_usemap(void) } #endif /* CONFIG_MEMORY_HOTPLUG */ -static unsigned long *sparse_early_usemap_alloc(unsigned long pnum) +static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum) { unsigned long *usemap; struct mem_section *ms = __nr_to_section(pnum); @@ -353,17 +353,9 @@ static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid, return __kmalloc_section_memmap(nr_pages); } -static int vaddr_in_vmalloc_area(void *addr) -{ - if (addr >= (void *)VMALLOC_START && - addr < (void *)VMALLOC_END) - return 1; - return 0; -} - static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) { - if 
(vaddr_in_vmalloc_area(memmap)) + if (is_vmalloc_addr(memmap)) vfree(memmap); else free_pages((unsigned long)memmap, @@ -41,7 +41,7 @@ static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs) = { 0, }; * This path almost never happens for VM activity - pages are normally * freed via pagevecs. But it gets used by networking. */ -static void fastcall __page_cache_release(struct page *page) +static void __page_cache_release(struct page *page) { if (PageLRU(page)) { unsigned long flags; @@ -165,7 +165,7 @@ int rotate_reclaimable_page(struct page *page) /* * FIXME: speed this up? */ -void fastcall activate_page(struct page *page) +void activate_page(struct page *page) { struct zone *zone = page_zone(page); @@ -186,7 +186,7 @@ void fastcall activate_page(struct page *page) * inactive,referenced -> active,unreferenced * active,unreferenced -> active,referenced */ -void fastcall mark_page_accessed(struct page *page) +void mark_page_accessed(struct page *page) { if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) { activate_page(page); @@ -202,7 +202,7 @@ EXPORT_SYMBOL(mark_page_accessed); * lru_cache_add: add a page to the page lists * @page: the page to add */ -void fastcall lru_cache_add(struct page *page) +void lru_cache_add(struct page *page) { struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); @@ -212,7 +212,7 @@ void fastcall lru_cache_add(struct page *page) put_cpu_var(lru_add_pvecs); } -void fastcall lru_cache_add_active(struct page *page) +void lru_cache_add_active(struct page *page) { struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs); diff --git a/mm/swap_state.c b/mm/swap_state.c index b526356..ec42f01 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -10,6 +10,7 @@ #include <linux/mm.h> #include <linux/kernel_stat.h> #include <linux/swap.h> +#include <linux/swapops.h> #include <linux/init.h> #include <linux/pagemap.h> #include <linux/buffer_head.h> @@ -51,26 +52,22 @@ static struct { unsigned long del_total; unsigned long find_success; 
unsigned long find_total; - unsigned long noent_race; - unsigned long exist_race; } swap_cache_info; void show_swap_cache_info(void) { - printk("Swap cache: add %lu, delete %lu, find %lu/%lu, race %lu+%lu\n", + printk("Swap cache: add %lu, delete %lu, find %lu/%lu\n", swap_cache_info.add_total, swap_cache_info.del_total, - swap_cache_info.find_success, swap_cache_info.find_total, - swap_cache_info.noent_race, swap_cache_info.exist_race); + swap_cache_info.find_success, swap_cache_info.find_total); printk("Free swap = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10)); printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); } /* - * __add_to_swap_cache resembles add_to_page_cache on swapper_space, + * add_to_swap_cache resembles add_to_page_cache on swapper_space, * but sets SwapCache flag and private instead of mapping and index. */ -static int __add_to_swap_cache(struct page *page, swp_entry_t entry, - gfp_t gfp_mask) +int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) { int error; @@ -88,6 +85,7 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry, set_page_private(page, entry.val); total_swapcache_pages++; __inc_zone_page_state(page, NR_FILE_PAGES); + INC_CACHE_INFO(add_total); } write_unlock_irq(&swapper_space.tree_lock); radix_tree_preload_end(); @@ -95,31 +93,6 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry, return error; } -static int add_to_swap_cache(struct page *page, swp_entry_t entry) -{ - int error; - - BUG_ON(PageLocked(page)); - if (!swap_duplicate(entry)) { - INC_CACHE_INFO(noent_race); - return -ENOENT; - } - SetPageLocked(page); - error = __add_to_swap_cache(page, entry, GFP_KERNEL); - /* - * Anon pages are already on the LRU, we don't run lru_cache_add here. 
- */ - if (error) { - ClearPageLocked(page); - swap_free(entry); - if (error == -EEXIST) - INC_CACHE_INFO(exist_race); - return error; - } - INC_CACHE_INFO(add_total); - return 0; -} - /* * This must be called only on pages that have * been verified to be in the swap cache. @@ -152,6 +125,7 @@ int add_to_swap(struct page * page, gfp_t gfp_mask) int err; BUG_ON(!PageLocked(page)); + BUG_ON(!PageUptodate(page)); for (;;) { entry = get_swap_page(); @@ -169,18 +143,15 @@ int add_to_swap(struct page * page, gfp_t gfp_mask) /* * Add it to the swap cache and mark it dirty */ - err = __add_to_swap_cache(page, entry, + err = add_to_swap_cache(page, entry, gfp_mask|__GFP_NOMEMALLOC|__GFP_NOWARN); switch (err) { case 0: /* Success */ - SetPageUptodate(page); SetPageDirty(page); - INC_CACHE_INFO(add_total); return 1; case -EEXIST: /* Raced with "speculative" read_swap_cache_async */ - INC_CACHE_INFO(exist_race); swap_free(entry); continue; default: @@ -211,40 +182,6 @@ void delete_from_swap_cache(struct page *page) page_cache_release(page); } -/* - * Strange swizzling function only for use by shmem_writepage - */ -int move_to_swap_cache(struct page *page, swp_entry_t entry) -{ - int err = __add_to_swap_cache(page, entry, GFP_ATOMIC); - if (!err) { - remove_from_page_cache(page); - page_cache_release(page); /* pagecache ref */ - if (!swap_duplicate(entry)) - BUG(); - SetPageDirty(page); - INC_CACHE_INFO(add_total); - } else if (err == -EEXIST) - INC_CACHE_INFO(exist_race); - return err; -} - -/* - * Strange swizzling function for shmem_getpage (and shmem_unuse) - */ -int move_from_swap_cache(struct page *page, unsigned long index, - struct address_space *mapping) -{ - int err = add_to_page_cache(page, mapping, index, GFP_ATOMIC); - if (!err) { - delete_from_swap_cache(page); - /* shift page from clean_pages to dirty_pages list */ - ClearPageDirty(page); - set_page_dirty(page); - } - return err; -} - /* * If we are the only user, then try to free up the swap cache. 
* @@ -317,7 +254,7 @@ struct page * lookup_swap_cache(swp_entry_t entry) * A failure return means that either the page allocation failed or that * the swap entry is no longer in use. */ -struct page *read_swap_cache_async(swp_entry_t entry, +struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr) { struct page *found_page, *new_page = NULL; @@ -337,23 +274,27 @@ struct page *read_swap_cache_async(swp_entry_t entry, * Get a new page to read into from swap. */ if (!new_page) { - new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, - vma, addr); + new_page = alloc_page_vma(gfp_mask, vma, addr); if (!new_page) break; /* Out of memory */ } /* + * Swap entry may have been freed since our caller observed it. + */ + if (!swap_duplicate(entry)) + break; + + /* * Associate the page with swap entry in the swap cache. - * May fail (-ENOENT) if swap entry has been freed since - * our caller observed it. May fail (-EEXIST) if there - * is already a page associated with this entry in the - * swap cache: added by a racing read_swap_cache_async, - * or by try_to_swap_out (or shmem_writepage) re-using - * the just freed swap entry for an existing page. + * May fail (-EEXIST) if there is already a page associated + * with this entry in the swap cache: added by a racing + * read_swap_cache_async, or add_to_swap or shmem_writepage + * re-using the just freed swap entry for an existing page. * May fail (-ENOMEM) if radix-tree node allocation failed. */ - err = add_to_swap_cache(new_page, entry); + SetPageLocked(new_page); + err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); if (!err) { /* * Initiate read into locked page and return. 
@@ -362,9 +303,57 @@ struct page *read_swap_cache_async(swp_entry_t entry, swap_readpage(NULL, new_page); return new_page; } - } while (err != -ENOENT && err != -ENOMEM); + ClearPageLocked(new_page); + swap_free(entry); + } while (err != -ENOMEM); if (new_page) page_cache_release(new_page); return found_page; } + +/** + * swapin_readahead - swap in pages in hope we need them soon + * @entry: swap entry of this memory + * @vma: user vma this address belongs to + * @addr: target address for mempolicy + * + * Returns the struct page for entry and addr, after queueing swapin. + * + * Primitive swap readahead code. We simply read an aligned block of + * (1 << page_cluster) entries in the swap area. This method is chosen + * because it doesn't cost us any seek time. We also make sure to queue + * the 'original' request together with the readahead ones... + * + * This has been extended to use the NUMA policies from the mm triggering + * the readahead. + * + * Caller must hold down_read on the vma->vm_mm if vma is not NULL. + */ +struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, + struct vm_area_struct *vma, unsigned long addr) +{ + int nr_pages; + struct page *page; + unsigned long offset; + unsigned long end_offset; + + /* + * Get starting offset for readaround, and number of pages to read. + * Adjust starting address by readbehind (for NUMA interleave case)? + * No, it's very unlikely that swap layout would follow vma layout, + * more likely that neighbouring swap pages came from the same node: + * so use the same "addr" to choose the same node for each swap read. 
+ */ + nr_pages = valid_swaphandles(entry, &offset); + for (end_offset = offset + nr_pages; offset < end_offset; offset++) { + /* Ok, do the async read-ahead now */ + page = read_swap_cache_async(swp_entry(swp_type(entry), offset), + gfp_mask, vma, addr); + if (!page) + break; + page_cache_release(page); + } + lru_add_drain(); /* Push any new pages onto the LRU now */ + return read_swap_cache_async(entry, gfp_mask, vma, addr); +} diff --git a/mm/swapfile.c b/mm/swapfile.c index f071648..eade24d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -506,9 +506,19 @@ unsigned int count_swap_pages(int type, int free) * just let do_wp_page work it out if a write is requested later - to * force COW, vm_page_prot omits write permission from any private vma. */ -static void unuse_pte(struct vm_area_struct *vma, pte_t *pte, +static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, swp_entry_t entry, struct page *page) { + spinlock_t *ptl; + pte_t *pte; + int found = 1; + + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { + found = 0; + goto out; + } + inc_mm_counter(vma->vm_mm, anon_rss); get_page(page); set_pte_at(vma->vm_mm, addr, pte, @@ -520,6 +530,9 @@ static void unuse_pte(struct vm_area_struct *vma, pte_t *pte, * immediately swapped out again after swapon. */ activate_page(page); +out: + pte_unmap_unlock(pte, ptl); + return found; } static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, @@ -528,22 +541,33 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, { pte_t swp_pte = swp_entry_to_pte(entry); pte_t *pte; - spinlock_t *ptl; int found = 0; - pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + /* + * We don't actually need pte lock while scanning for swp_pte: since + * we hold page lock and mmap_sem, swp_pte cannot be inserted into the + * page table while we're scanning; though it could get zapped, and on + * some architectures (e.g. 
x86_32 with PAE) we might catch a glimpse + * of unmatched parts which look like swp_pte, so unuse_pte must + * recheck under pte lock. Scanning without pte lock lets it be + * preemptible whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE. + */ + pte = pte_offset_map(pmd, addr); do { /* * swapoff spends a _lot_ of time in this loop! * Test inline before going to call unuse_pte. */ if (unlikely(pte_same(*pte, swp_pte))) { - unuse_pte(vma, pte++, addr, entry, page); - found = 1; - break; + pte_unmap(pte); + found = unuse_pte(vma, pmd, addr, entry, page); + if (found) + goto out; + pte = pte_offset_map(pmd, addr); } } while (pte++, addr += PAGE_SIZE, addr != end); - pte_unmap_unlock(pte - 1, ptl); + pte_unmap(pte - 1); +out: return found; } @@ -730,7 +754,8 @@ static int try_to_unuse(unsigned int type) */ swap_map = &si->swap_map[i]; entry = swp_entry(type, i); - page = read_swap_cache_async(entry, NULL, 0); + page = read_swap_cache_async(entry, + GFP_HIGHUSER_MOVABLE, NULL, 0); if (!page) { /* * Either swap_duplicate() failed because entry @@ -789,7 +814,7 @@ static int try_to_unuse(unsigned int type) atomic_inc(&new_start_mm->mm_users); atomic_inc(&prev_mm->mm_users); spin_lock(&mmlist_lock); - while (*swap_map > 1 && !retval && + while (*swap_map > 1 && !retval && !shmem && (p = p->next) != &start_mm->mmlist) { mm = list_entry(p, struct mm_struct, mmlist); if (!atomic_inc_not_zero(&mm->mm_users)) @@ -821,6 +846,13 @@ static int try_to_unuse(unsigned int type) mmput(start_mm); start_mm = new_start_mm; } + if (shmem) { + /* page has already been unlocked and released */ + if (shmem > 0) + continue; + retval = shmem; + break; + } if (retval) { unlock_page(page); page_cache_release(page); @@ -859,12 +891,6 @@ static int try_to_unuse(unsigned int type) * read from disk into another page. Splitting into two * pages would be incorrect if swap supported "shared * private" pages, but they are handled by tmpfs files. 
- * - * Note shmem_unuse already deleted a swappage from - * the swap cache, unless the move to filepage failed: - * in which case it left swappage in cache, lowered its - * swap count to pass quickly through the loops above, - * and now we must reincrement count to try again later. */ if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) { struct writeback_control wbc = { @@ -875,12 +901,8 @@ static int try_to_unuse(unsigned int type) lock_page(page); wait_on_page_writeback(page); } - if (PageSwapCache(page)) { - if (shmem) - swap_duplicate(entry); - else - delete_from_swap_cache(page); - } + if (PageSwapCache(page)) + delete_from_swap_cache(page); /* * So we could skip searching mms once swap count went @@ -1768,31 +1790,48 @@ get_swap_info_struct(unsigned type) */ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) { + struct swap_info_struct *si; int our_page_cluster = page_cluster; - int ret = 0, i = 1 << our_page_cluster; - unsigned long toff; - struct swap_info_struct *swapdev = swp_type(entry) + swap_info; + pgoff_t target, toff; + pgoff_t base, end; + int nr_pages = 0; if (!our_page_cluster) /* no readahead */ return 0; - toff = (swp_offset(entry) >> our_page_cluster) << our_page_cluster; - if (!toff) /* first page is swap header */ - toff++, i--; - *offset = toff; + + si = &swap_info[swp_type(entry)]; + target = swp_offset(entry); + base = (target >> our_page_cluster) << our_page_cluster; + end = base + (1 << our_page_cluster); + if (!base) /* first page is swap header */ + base++; spin_lock(&swap_lock); - do { - /* Don't read-ahead past the end of the swap area */ - if (toff >= swapdev->max) + if (end > si->max) /* don't go beyond end of map */ + end = si->max; + + /* Count contiguous allocated slots above our target */ + for (toff = target; ++toff < end; nr_pages++) { + /* Don't read in free or bad pages */ + if (!si->swap_map[toff]) + break; + if (si->swap_map[toff] == SWAP_MAP_BAD) break; + } + /* Count contiguous allocated slots 
below our target */ + for (toff = target; --toff >= base; nr_pages++) { /* Don't read in free or bad pages */ - if (!swapdev->swap_map[toff]) + if (!si->swap_map[toff]) break; - if (swapdev->swap_map[toff] == SWAP_MAP_BAD) + if (si->swap_map[toff] == SWAP_MAP_BAD) break; - toff++; - ret++; - } while (--i); + } spin_unlock(&swap_lock); - return ret; + + /* + * Indicate starting offset, and return number of pages to get: + * if only 1, say 0, since there's then no readahead to be done. + */ + *offset = ++toff; + return nr_pages? ++nr_pages: 0; } diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c index d436a9c..7020836 100644 --- a/mm/tiny-shmem.c +++ b/mm/tiny-shmem.c @@ -121,18 +121,6 @@ int shmem_unuse(swp_entry_t entry, struct page *page) return 0; } -#if 0 -int shmem_mmap(struct file *file, struct vm_area_struct *vma) -{ - file_accessed(file); -#ifndef CONFIG_MMU - return ramfs_nommu_mmap(file, vma); -#else - return 0; -#endif -} -#endif /* 0 */ - #ifndef CONFIG_MMU unsigned long shmem_get_unmapped_area(struct file *file, unsigned long addr, diff --git a/mm/truncate.c b/mm/truncate.c index c3123b0..c35c49e 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -48,7 +48,7 @@ void do_invalidatepage(struct page *page, unsigned long offset) static inline void truncate_partial_page(struct page *page, unsigned partial) { - zero_user_page(page, partial, PAGE_CACHE_SIZE - partial, KM_USER0); + zero_user_segment(page, partial, PAGE_CACHE_SIZE); if (PagePrivate(page)) do_invalidatepage(page, partial); } @@ -84,7 +84,7 @@ EXPORT_SYMBOL(cancel_dirty_page); /* * If truncate cannot remove the fs-private metadata from the page, the page - * becomes anonymous. It will be left on the LRU and may even be mapped into + * becomes orphaned. It will be left on the LRU and may even be mapped into * user pagetables if we're racing with filemap_fault(). 
* * We need to bale out if page->mapping is no longer equal to the original @@ -98,11 +98,11 @@ truncate_complete_page(struct address_space *mapping, struct page *page) if (page->mapping != mapping) return; - cancel_dirty_page(page, PAGE_CACHE_SIZE); - if (PagePrivate(page)) do_invalidatepage(page, 0); + cancel_dirty_page(page, PAGE_CACHE_SIZE); + remove_from_page_cache(page); ClearPageUptodate(page); ClearPageMappedToDisk(page); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index af77e17..0536dde 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -166,6 +166,44 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) } EXPORT_SYMBOL_GPL(map_vm_area); +/* + * Map a vmalloc()-space virtual address to the physical page. + */ +struct page *vmalloc_to_page(const void *vmalloc_addr) +{ + unsigned long addr = (unsigned long) vmalloc_addr; + struct page *page = NULL; + pgd_t *pgd = pgd_offset_k(addr); + pud_t *pud; + pmd_t *pmd; + pte_t *ptep, pte; + + if (!pgd_none(*pgd)) { + pud = pud_offset(pgd, addr); + if (!pud_none(*pud)) { + pmd = pmd_offset(pud, addr); + if (!pmd_none(*pmd)) { + ptep = pte_offset_map(pmd, addr); + pte = *ptep; + if (pte_present(pte)) + page = pte_page(pte); + pte_unmap(ptep); + } + } + } + return page; +} +EXPORT_SYMBOL(vmalloc_to_page); + +/* + * Map a vmalloc()-space virtual address to the physical page frame number. 
+ */ +unsigned long vmalloc_to_pfn(const void *vmalloc_addr) +{ + return page_to_pfn(vmalloc_to_page(vmalloc_addr)); +} +EXPORT_SYMBOL(vmalloc_to_pfn); + static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start, unsigned long end, int node, gfp_t gfp_mask) @@ -216,6 +254,10 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long fl if (addr > end - size) goto out; } + if ((size + addr) < addr) + goto out; + if (addr > end - size) + goto out; found: area->next = *p; @@ -268,7 +310,7 @@ struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, } /* Caller must hold vmlist_lock */ -static struct vm_struct *__find_vm_area(void *addr) +static struct vm_struct *__find_vm_area(const void *addr) { struct vm_struct *tmp; @@ -281,7 +323,7 @@ static struct vm_struct *__find_vm_area(void *addr) } /* Caller must hold vmlist_lock */ -static struct vm_struct *__remove_vm_area(void *addr) +static struct vm_struct *__remove_vm_area(const void *addr) { struct vm_struct **p, *tmp; @@ -310,7 +352,7 @@ found: * This function returns the found VM area, but using it is NOT safe * on SMP machines, except for its size or flags. */ -struct vm_struct *remove_vm_area(void *addr) +struct vm_struct *remove_vm_area(const void *addr) { struct vm_struct *v; write_lock(&vmlist_lock); @@ -319,7 +361,7 @@ struct vm_struct *remove_vm_area(void *addr) return v; } -static void __vunmap(void *addr, int deallocate_pages) +static void __vunmap(const void *addr, int deallocate_pages) { struct vm_struct *area; @@ -346,8 +388,10 @@ static void __vunmap(void *addr, int deallocate_pages) int i; for (i = 0; i < area->nr_pages; i++) { - BUG_ON(!area->pages[i]); - __free_page(area->pages[i]); + struct page *page = area->pages[i]; + + BUG_ON(!page); + __free_page(page); } if (area->flags & VM_VPAGES) @@ -370,7 +414,7 @@ static void __vunmap(void *addr, int deallocate_pages) * * Must not be called in interrupt context. 
*/ -void vfree(void *addr) +void vfree(const void *addr) { BUG_ON(in_interrupt()); __vunmap(addr, 1); @@ -386,7 +430,7 @@ EXPORT_SYMBOL(vfree); * * Must not be called in interrupt context. */ -void vunmap(void *addr) +void vunmap(const void *addr) { BUG_ON(in_interrupt()); __vunmap(addr, 0); @@ -423,8 +467,8 @@ void *vmap(struct page **pages, unsigned int count, } EXPORT_SYMBOL(vmap); -void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, - pgprot_t prot, int node) +static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, + pgprot_t prot, int node) { struct page **pages; unsigned int nr_pages, array_size, i; @@ -451,15 +495,19 @@ void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, } for (i = 0; i < area->nr_pages; i++) { + struct page *page; + if (node < 0) - area->pages[i] = alloc_page(gfp_mask); + page = alloc_page(gfp_mask); else - area->pages[i] = alloc_pages_node(node, gfp_mask, 0); - if (unlikely(!area->pages[i])) { + page = alloc_pages_node(node, gfp_mask, 0); + + if (unlikely(!page)) { /* Successfully allocated i pages, free them in __vunmap() */ area->nr_pages = i; goto fail; } + area->pages[i] = page; } if (map_vm_area(area, prot, &pages)) diff --git a/mm/vmstat.c b/mm/vmstat.c index e8d846f..422d960 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -21,21 +21,14 @@ EXPORT_PER_CPU_SYMBOL(vm_event_states); static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask) { - int cpu = 0; + int cpu; int i; memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long)); - cpu = first_cpu(*cpumask); - while (cpu < NR_CPUS) { + for_each_cpu_mask(cpu, *cpumask) { struct vm_event_state *this = &per_cpu(vm_event_states, cpu); - cpu = next_cpu(cpu, *cpumask); - - if (cpu < NR_CPUS) - prefetch(&per_cpu(vm_event_states, cpu)); - - for (i = 0; i < NR_VM_EVENT_ITEMS; i++) ret[i] += this->event[i]; } @@ -284,6 +277,10 @@ EXPORT_SYMBOL(dec_zone_page_state); /* * Update the zone counters for one cpu. 
* + * The cpu specified must be either the current cpu or a processor that + * is not online. If it is the current cpu then the execution thread must + * be pinned to the current cpu. + * * Note that refresh_cpu_vm_stats strives to only access * node local memory. The per cpu pagesets on remote zones are placed * in the memory local to the processor using that pageset. So the @@ -299,7 +296,7 @@ void refresh_cpu_vm_stats(int cpu) { struct zone *zone; int i; - unsigned long flags; + int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; for_each_zone(zone) { struct per_cpu_pageset *p; @@ -311,15 +308,19 @@ void refresh_cpu_vm_stats(int cpu) for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) if (p->vm_stat_diff[i]) { + unsigned long flags; + int v; + local_irq_save(flags); - zone_page_state_add(p->vm_stat_diff[i], - zone, i); + v = p->vm_stat_diff[i]; p->vm_stat_diff[i] = 0; + local_irq_restore(flags); + atomic_long_add(v, &zone->vm_stat[i]); + global_diff[i] += v; #ifdef CONFIG_NUMA /* 3 seconds idle till flush */ p->expire = 3; #endif - local_irq_restore(flags); } #ifdef CONFIG_NUMA /* @@ -329,7 +330,7 @@ void refresh_cpu_vm_stats(int cpu) * Check if there are pages remaining in this pageset * if not then there is nothing to expire. 
*/ - if (!p->expire || (!p->pcp[0].count && !p->pcp[1].count)) + if (!p->expire || !p->pcp.count) continue; /* @@ -344,13 +345,14 @@ void refresh_cpu_vm_stats(int cpu) if (p->expire) continue; - if (p->pcp[0].count) - drain_zone_pages(zone, p->pcp + 0); - - if (p->pcp[1].count) - drain_zone_pages(zone, p->pcp + 1); + if (p->pcp.count) + drain_zone_pages(zone, &p->pcp); #endif } + + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + if (global_diff[i]) + atomic_long_add(global_diff[i], &vm_stat[i]); } #endif @@ -681,20 +683,17 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n pagesets"); for_each_online_cpu(i) { struct per_cpu_pageset *pageset; - int j; pageset = zone_pcp(zone, i); - for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { - seq_printf(m, - "\n cpu: %i pcp: %i" - "\n count: %i" - "\n high: %i" - "\n batch: %i", - i, j, - pageset->pcp[j].count, - pageset->pcp[j].high, - pageset->pcp[j].batch); - } + seq_printf(m, + "\n cpu: %i" + "\n count: %i" + "\n high: %i" + "\n batch: %i", + i, + pageset->pcp.count, + pageset->pcp.high, + pageset->pcp.batch); #ifdef CONFIG_SMP seq_printf(m, "\n vm stats threshold: %d", pageset->stat_threshold); diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index a224106..8cd357f 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -547,8 +547,8 @@ int cipso_v4_doi_remove(u32 doi, rcu_read_lock(); list_for_each_entry_rcu(dom_iter, &doi_def->dom_list, list) if (dom_iter->valid) - netlbl_domhsh_remove(dom_iter->domain, - audit_info); + netlbl_cfg_map_del(dom_iter->domain, + audit_info); rcu_read_unlock(); cipso_v4_cache_invalidate(); call_rcu(&doi_def->rcu, callback); diff --git a/net/netlabel/netlabel_cipso_v4.c b/net/netlabel/netlabel_cipso_v4.c index becf91a..c7ad64d 100644 --- a/net/netlabel/netlabel_cipso_v4.c +++ b/net/netlabel/netlabel_cipso_v4.c @@ -90,7 +90,7 @@ static const struct nla_policy netlbl_cipsov4_genl_policy[NLBL_CIPSOV4_A_MAX + 1 * safely. 
* */ -static void netlbl_cipsov4_doi_free(struct rcu_head *entry) +void netlbl_cipsov4_doi_free(struct rcu_head *entry) { struct cipso_v4_doi *ptr; diff --git a/net/netlabel/netlabel_cipso_v4.h b/net/netlabel/netlabel_cipso_v4.h index f03cf9b..220cb9d 100644 --- a/net/netlabel/netlabel_cipso_v4.h +++ b/net/netlabel/netlabel_cipso_v4.h @@ -163,4 +163,7 @@ enum { /* NetLabel protocol functions */ int netlbl_cipsov4_genl_init(void); +/* Free the memory associated with a CIPSOv4 DOI definition */ +void netlbl_cipsov4_doi_free(struct rcu_head *entry); + #endif diff --git a/net/netlabel/netlabel_domainhash.h b/net/netlabel/netlabel_domainhash.h index 3689956..8220990 100644 --- a/net/netlabel/netlabel_domainhash.h +++ b/net/netlabel/netlabel_domainhash.h @@ -61,6 +61,7 @@ int netlbl_domhsh_add(struct netlbl_dom_map *entry, struct netlbl_audit *audit_info); int netlbl_domhsh_add_default(struct netlbl_dom_map *entry, struct netlbl_audit *audit_info); +int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info); int netlbl_domhsh_remove_default(struct netlbl_audit *audit_info); struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain); int netlbl_domhsh_walk(u32 *skip_bkt, diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c index c69e3e1..39793a1 100644 --- a/net/netlabel/netlabel_kapi.c +++ b/net/netlabel/netlabel_kapi.c @@ -30,6 +30,7 @@ #include <linux/init.h> #include <linux/types.h> +#include <linux/audit.h> #include <net/ip.h> #include <net/netlabel.h> #include <net/cipso_ipv4.h> @@ -38,10 +39,186 @@ #include "netlabel_domainhash.h" #include "netlabel_unlabeled.h" +#include "netlabel_cipso_v4.h" #include "netlabel_user.h" #include "netlabel_mgmt.h" /* + * Configuration Functions + */ + +/** + * netlbl_cfg_map_del - Remove a NetLabel/LSM domain mapping + * @domain: the domain mapping to remove + * @audit_info: NetLabel audit information + * + * Description: + * Removes a NetLabel/LSM domain mapping. 
A @domain value of NULL causes the + * default domain mapping to be removed. Returns zero on success, negative + * values on failure. + * + */ +int netlbl_cfg_map_del(const char *domain, struct netlbl_audit *audit_info) +{ + return netlbl_domhsh_remove(domain, audit_info); +} + +/** + * netlbl_cfg_unlbl_add_map - Add an unlabeled NetLabel/LSM domain mapping + * @domain: the domain mapping to add + * @audit_info: NetLabel audit information + * + * Description: + * Adds a new unlabeled NetLabel/LSM domain mapping. A @domain value of NULL + * causes a new default domain mapping to be added. Returns zero on success, + * negative values on failure. + * + */ +int netlbl_cfg_unlbl_add_map(const char *domain, + struct netlbl_audit *audit_info) +{ + int ret_val = -ENOMEM; + struct netlbl_dom_map *entry; + + entry = kzalloc(sizeof(*entry), GFP_ATOMIC); + if (entry == NULL) + goto cfg_unlbl_add_map_failure; + if (domain != NULL) { + entry->domain = kstrdup(domain, GFP_ATOMIC); + if (entry->domain == NULL) + goto cfg_unlbl_add_map_failure; + } + entry->type = NETLBL_NLTYPE_UNLABELED; + + ret_val = netlbl_domhsh_add(entry, audit_info); + if (ret_val != 0) + goto cfg_unlbl_add_map_failure; + + return 0; + +cfg_unlbl_add_map_failure: + if (entry != NULL) + kfree(entry->domain); + kfree(entry); + return ret_val; +} + +/** + * netlbl_cfg_cipsov4_add - Add a new CIPSOv4 DOI definition + * @doi_def: the DOI definition + * @audit_info: NetLabel audit information + * + * Description: + * Add a new CIPSOv4 DOI definition to the NetLabel subsystem. Returns zero on + * success, negative values on failure. 
+ * + */ +int netlbl_cfg_cipsov4_add(struct cipso_v4_doi *doi_def, + struct netlbl_audit *audit_info) +{ + int ret_val; + const char *type_str; + struct audit_buffer *audit_buf; + + ret_val = cipso_v4_doi_add(doi_def); + + audit_buf = netlbl_audit_start_common(AUDIT_MAC_CIPSOV4_ADD, + audit_info); + if (audit_buf != NULL) { + switch (doi_def->type) { + case CIPSO_V4_MAP_STD: + type_str = "std"; + break; + case CIPSO_V4_MAP_PASS: + type_str = "pass"; + break; + default: + type_str = "(unknown)"; + } + audit_log_format(audit_buf, + " cipso_doi=%u cipso_type=%s res=%u", + doi_def->doi, + type_str, + ret_val == 0 ? 1 : 0); + audit_log_end(audit_buf); + } + + return ret_val; +} + +/** + * netlbl_cfg_cipsov4_add_map - Add a new CIPSOv4 DOI definition and mapping + * @doi_def: the DOI definition + * @domain: the domain mapping to add + * @audit_info: NetLabel audit information + * + * Description: + * Add a new CIPSOv4 DOI definition and NetLabel/LSM domain mapping for this + * new DOI definition to the NetLabel subsystem. A @domain value of NULL adds + * a new default domain mapping. Returns zero on success, negative values on + * failure. + * + */ +int netlbl_cfg_cipsov4_add_map(struct cipso_v4_doi *doi_def, + const char *domain, + struct netlbl_audit *audit_info) +{ + int ret_val = -ENOMEM; + struct netlbl_dom_map *entry; + + entry = kzalloc(sizeof(*entry), GFP_ATOMIC); + if (entry == NULL) + goto cfg_cipsov4_add_map_failure; + if (domain != NULL) { + entry->domain = kstrdup(domain, GFP_ATOMIC); + if (entry->domain == NULL) + goto cfg_cipsov4_add_map_failure; + } + entry->type = NETLBL_NLTYPE_CIPSOV4; + entry->type_def.cipsov4 = doi_def; + + /* Grab a RCU read lock here so nothing happens to the doi_def variable + * between adding it to the CIPSOv4 protocol engine and adding a + * domain mapping for it. 
*/ + + rcu_read_lock(); + ret_val = netlbl_cfg_cipsov4_add(doi_def, audit_info); + if (ret_val != 0) + goto cfg_cipsov4_add_map_failure_unlock; + ret_val = netlbl_domhsh_add(entry, audit_info); + if (ret_val != 0) + goto cfg_cipsov4_add_map_failure_remove_doi; + rcu_read_unlock(); + + return 0; + +cfg_cipsov4_add_map_failure_remove_doi: + cipso_v4_doi_remove(doi_def->doi, audit_info, netlbl_cipsov4_doi_free); +cfg_cipsov4_add_map_failure_unlock: + rcu_read_unlock(); +cfg_cipsov4_add_map_failure: + if (entry != NULL) + kfree(entry->domain); + kfree(entry); + return ret_val; +} + +/** + * netlbl_cfg_cipsov4_del - Remove an existing CIPSOv4 DOI definition + * @doi: the CIPSO DOI value + * @audit_info: NetLabel audit information + * + * Description: + * Removes an existing CIPSOv4 DOI definition from the NetLabel subsystem. + * Returns zero on success, negative values on failure. + * + */ +int netlbl_cfg_cipsov4_del(u32 doi, struct netlbl_audit *audit_info) +{ + return cipso_v4_doi_remove(doi, audit_info, netlbl_cipsov4_doi_free); +} + +/* * Security Attribute Functions */ diff --git a/security/Kconfig b/security/Kconfig index 389e151..25ffe1b 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -105,6 +105,7 @@ config SECURITY_ROOTPLUG If you are unsure how to answer this question, answer N. source security/selinux/Kconfig +source security/smack/Kconfig endmenu diff --git a/security/Makefile b/security/Makefile index ef87df2..9e8b025 100644 --- a/security/Makefile +++ b/security/Makefile @@ -4,6 +4,7 @@ obj-$(CONFIG_KEYS) += keys/ subdir-$(CONFIG_SECURITY_SELINUX) += selinux +subdir-$(CONFIG_SECURITY_SMACK) += smack # if we don't select a security model, use the default capabilities ifneq ($(CONFIG_SECURITY),y) @@ -14,5 +15,6 @@ endif obj-$(CONFIG_SECURITY) += security.o dummy.o inode.o # Must precede capability.o in order to stack properly.
obj-$(CONFIG_SECURITY_SELINUX) += selinux/built-in.o +obj-$(CONFIG_SECURITY_SMACK) += commoncap.o smack/built-in.o obj-$(CONFIG_SECURITY_CAPABILITIES) += commoncap.o capability.o obj-$(CONFIG_SECURITY_ROOTPLUG) += commoncap.o root_plug.o diff --git a/security/commoncap.c b/security/commoncap.c index ea61bc7..5aba826 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -1,4 +1,4 @@ -/* Common capabilities, needed by capability.o and root_plug.o +/* Common capabilities, needed by capability.o and root_plug.o * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -25,20 +25,6 @@ #include <linux/mount.h> #include <linux/sched.h> -#ifdef CONFIG_SECURITY_FILE_CAPABILITIES -/* - * Because of the reduced scope of CAP_SETPCAP when filesystem - * capabilities are in effect, it is safe to allow this capability to - * be available in the default configuration. - */ -# define CAP_INIT_BSET CAP_FULL_SET -#else /* ie. ndef CONFIG_SECURITY_FILE_CAPABILITIES */ -# define CAP_INIT_BSET CAP_INIT_EFF_SET -#endif /* def CONFIG_SECURITY_FILE_CAPABILITIES */ - -kernel_cap_t cap_bset = CAP_INIT_BSET; /* systemwide capability bound */ -EXPORT_SYMBOL(cap_bset); - /* Global security state */ unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */ @@ -93,9 +79,9 @@ int cap_capget (struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted) { /* Derived from kernel/capability.c:sys_capget. 
*/ - *effective = cap_t (target->cap_effective); - *inheritable = cap_t (target->cap_inheritable); - *permitted = cap_t (target->cap_permitted); + *effective = target->cap_effective; + *inheritable = target->cap_inheritable; + *permitted = target->cap_permitted; return 0; } @@ -140,6 +126,12 @@ int cap_capset_check (struct task_struct *target, kernel_cap_t *effective, /* incapable of using this inheritable set */ return -EPERM; } + if (!cap_issubset(*inheritable, + cap_combine(target->cap_inheritable, + current->cap_bset))) { + /* no new pI capabilities outside bounding set */ + return -EPERM; + } /* verify restrictions on target's new Permitted set */ if (!cap_issubset (*permitted, @@ -198,28 +190,50 @@ int cap_inode_killpriv(struct dentry *dentry) } static inline int cap_from_disk(struct vfs_cap_data *caps, - struct linux_binprm *bprm, - int size) + struct linux_binprm *bprm, unsigned size) { __u32 magic_etc; + unsigned tocopy, i; - if (size != XATTR_CAPS_SZ) + if (size < sizeof(magic_etc)) return -EINVAL; magic_etc = le32_to_cpu(caps->magic_etc); switch ((magic_etc & VFS_CAP_REVISION_MASK)) { - case VFS_CAP_REVISION: - if (magic_etc & VFS_CAP_FLAGS_EFFECTIVE) - bprm->cap_effective = true; - else - bprm->cap_effective = false; - bprm->cap_permitted = to_cap_t(le32_to_cpu(caps->permitted)); - bprm->cap_inheritable = to_cap_t(le32_to_cpu(caps->inheritable)); - return 0; + case VFS_CAP_REVISION_1: + if (size != XATTR_CAPS_SZ_1) + return -EINVAL; + tocopy = VFS_CAP_U32_1; + break; + case VFS_CAP_REVISION_2: + if (size != XATTR_CAPS_SZ_2) + return -EINVAL; + tocopy = VFS_CAP_U32_2; + break; default: return -EINVAL; } + + if (magic_etc & VFS_CAP_FLAGS_EFFECTIVE) { + bprm->cap_effective = true; + } else { + bprm->cap_effective = false; + } + + for (i = 0; i < tocopy; ++i) { + bprm->cap_permitted.cap[i] = + le32_to_cpu(caps->data[i].permitted); + bprm->cap_inheritable.cap[i] = + le32_to_cpu(caps->data[i].inheritable); + } + while (i < VFS_CAP_U32) { + 
bprm->cap_permitted.cap[i] = 0; + bprm->cap_inheritable.cap[i] = 0; + i++; + } + + return 0; } /* Locate any VFS capabilities: */ @@ -227,7 +241,7 @@ static int get_file_caps(struct linux_binprm *bprm) { struct dentry *dentry; int rc = 0; - struct vfs_cap_data incaps; + struct vfs_cap_data vcaps; struct inode *inode; if (bprm->file->f_vfsmnt->mnt_flags & MNT_NOSUID) { @@ -240,14 +254,8 @@ static int get_file_caps(struct linux_binprm *bprm) if (!inode->i_op || !inode->i_op->getxattr) goto out; - rc = inode->i_op->getxattr(dentry, XATTR_NAME_CAPS, NULL, 0); - if (rc > 0) { - if (rc == XATTR_CAPS_SZ) - rc = inode->i_op->getxattr(dentry, XATTR_NAME_CAPS, - &incaps, XATTR_CAPS_SZ); - else - rc = -EINVAL; - } + rc = inode->i_op->getxattr(dentry, XATTR_NAME_CAPS, &vcaps, + XATTR_CAPS_SZ); if (rc == -ENODATA || rc == -EOPNOTSUPP) { /* no data, that's ok */ rc = 0; @@ -256,7 +264,7 @@ static int get_file_caps(struct linux_binprm *bprm) if (rc < 0) goto out; - rc = cap_from_disk(&incaps, bprm, rc); + rc = cap_from_disk(&vcaps, bprm, rc); if (rc) printk(KERN_NOTICE "%s: cap_from_disk returned %d for %s\n", __FUNCTION__, rc, bprm->filename); @@ -321,10 +329,11 @@ void cap_bprm_apply_creds (struct linux_binprm *bprm, int unsafe) /* Derived from fs/exec.c:compute_creds. 
*/ kernel_cap_t new_permitted, working; - new_permitted = cap_intersect (bprm->cap_permitted, cap_bset); - working = cap_intersect (bprm->cap_inheritable, + new_permitted = cap_intersect(bprm->cap_permitted, + current->cap_bset); + working = cap_intersect(bprm->cap_inheritable, current->cap_inheritable); - new_permitted = cap_combine (new_permitted, working); + new_permitted = cap_combine(new_permitted, working); if (bprm->e_uid != current->uid || bprm->e_gid != current->gid || !cap_issubset (new_permitted, current->cap_permitted)) { @@ -351,8 +360,10 @@ void cap_bprm_apply_creds (struct linux_binprm *bprm, int unsafe) * capability rules */ if (!is_global_init(current)) { current->cap_permitted = new_permitted; - current->cap_effective = bprm->cap_effective ? - new_permitted : 0; + if (bprm->cap_effective) + current->cap_effective = new_permitted; + else + cap_clear(current->cap_effective); } /* AUD: Audit candidate if current->cap_effective is set */ @@ -474,13 +485,15 @@ int cap_task_post_setuid (uid_t old_ruid, uid_t old_euid, uid_t old_suid, if (!issecure (SECURE_NO_SETUID_FIXUP)) { if (old_fsuid == 0 && current->fsuid != 0) { - cap_t (current->cap_effective) &= - ~CAP_FS_MASK; + current->cap_effective = + cap_drop_fs_set( + current->cap_effective); } if (old_fsuid != 0 && current->fsuid == 0) { - cap_t (current->cap_effective) |= - (cap_t (current->cap_permitted) & - CAP_FS_MASK); + current->cap_effective = + cap_raise_fs_set( + current->cap_effective, + current->cap_permitted); } } break; @@ -561,6 +574,23 @@ int cap_task_kill(struct task_struct *p, struct siginfo *info, return -EPERM; } + +/* + * called from kernel/sys.c for prctl(PR_CAPBSET_DROP) + * done without task_capability_lock() because it introduces + * no new races - i.e. only another task doing capget() on + * this task could get inconsistent info. There can be no + * racing writer because a task can only change its own caps.
+ */ +long cap_prctl_drop(unsigned long cap) +{ + if (!capable(CAP_SETPCAP)) + return -EPERM; + if (!cap_valid(cap)) + return -EINVAL; + cap_lower(current->cap_bset, cap); + return 0; +} #else int cap_task_setscheduler (struct task_struct *p, int policy, struct sched_param *lp) @@ -584,9 +614,9 @@ int cap_task_kill(struct task_struct *p, struct siginfo *info, void cap_task_reparent_to_init (struct task_struct *p) { - p->cap_effective = CAP_INIT_EFF_SET; - p->cap_inheritable = CAP_INIT_INH_SET; - p->cap_permitted = CAP_FULL_SET; + cap_set_init_eff(p->cap_effective); + cap_clear(p->cap_inheritable); + cap_set_full(p->cap_permitted); p->keep_capabilities = 0; return; } diff --git a/security/dummy.c b/security/dummy.c index 48d4b0a..649326b 100644 --- a/security/dummy.c +++ b/security/dummy.c @@ -36,14 +36,19 @@ static int dummy_ptrace (struct task_struct *parent, struct task_struct *child) static int dummy_capget (struct task_struct *target, kernel_cap_t * effective, kernel_cap_t * inheritable, kernel_cap_t * permitted) { - *effective = *inheritable = *permitted = 0; if (target->euid == 0) { - *permitted |= (~0 & ~CAP_FS_MASK); - *effective |= (~0 & ~CAP_TO_MASK(CAP_SETPCAP) & ~CAP_FS_MASK); + cap_set_full(*permitted); + cap_set_init_eff(*effective); + } else { + cap_clear(*permitted); + cap_clear(*effective); } - if (target->fsuid == 0) { - *permitted |= CAP_FS_MASK; - *effective |= CAP_FS_MASK; + + cap_clear(*inheritable); + + if (target->fsuid != 0) { + *permitted = cap_drop_fs_set(*permitted); + *effective = cap_drop_fs_set(*effective); } return 0; } @@ -402,7 +407,7 @@ static int dummy_inode_killpriv(struct dentry *dentry) return 0; } -static int dummy_inode_getsecurity(const struct inode *inode, const char *name, void *buffer, size_t size, int err) +static int dummy_inode_getsecurity(const struct inode *inode, const char *name, void **buffer, bool alloc) { return -EOPNOTSUPP; } diff --git a/security/security.c b/security/security.c index ca475ca..b6c57a6 100644 
--- a/security/security.c +++ b/security/security.c @@ -493,11 +493,11 @@ int security_inode_killpriv(struct dentry *dentry) return security_ops->inode_killpriv(dentry); } -int security_inode_getsecurity(const struct inode *inode, const char *name, void *buffer, size_t size, int err) +int security_inode_getsecurity(const struct inode *inode, const char *name, void **buffer, bool alloc) { if (unlikely(IS_PRIVATE(inode))) return 0; - return security_ops->inode_getsecurity(inode, name, buffer, size, err); + return security_ops->inode_getsecurity(inode, name, buffer, alloc); } int security_inode_setsecurity(struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index be6de0b..e5ed075 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -136,32 +136,6 @@ static DEFINE_SPINLOCK(sb_security_lock); static struct kmem_cache *sel_inode_cache; -/* Return security context for a given sid or just the context - length if the buffer is null or length is 0 */ -static int selinux_getsecurity(u32 sid, void *buffer, size_t size) -{ - char *context; - unsigned len; - int rc; - - rc = security_sid_to_context(sid, &context, &len); - if (rc) - return rc; - - if (!buffer || !size) - goto getsecurity_exit; - - if (size < len) { - len = -ERANGE; - goto getsecurity_exit; - } - memcpy(buffer, context, len); - -getsecurity_exit: - kfree(context); - return len; -} - /** * selinux_secmark_enabled - Check to see if SECMARK is currently enabled * @@ -2675,14 +2649,27 @@ static int selinux_inode_removexattr (struct dentry *dentry, char *name) * * Permission check is handled by selinux_inode_getxattr hook. 
*/ -static int selinux_inode_getsecurity(const struct inode *inode, const char *name, void *buffer, size_t size, int err) +static int selinux_inode_getsecurity(const struct inode *inode, const char *name, void **buffer, bool alloc) { + u32 size; + int error; + char *context = NULL; struct inode_security_struct *isec = inode->i_security; if (strcmp(name, XATTR_SELINUX_SUFFIX)) return -EOPNOTSUPP; - return selinux_getsecurity(isec->sid, buffer, size); + error = security_sid_to_context(isec->sid, &context, &size); + if (error) + return error; + error = size; + if (alloc) { + *buffer = context; + goto out_nofree; + } + kfree(context); +out_nofree: + return error; } static int selinux_inode_setsecurity(struct inode *inode, const char *name, diff --git a/security/smack/Kconfig b/security/smack/Kconfig new file mode 100644 index 0000000..603b087 --- /dev/null +++ b/security/smack/Kconfig @@ -0,0 +1,10 @@ +config SECURITY_SMACK + bool "Simplified Mandatory Access Control Kernel Support" + depends on NETLABEL && SECURITY_NETWORK + default n + help + This selects the Simplified Mandatory Access Control Kernel. + Smack is useful for sensitivity, integrity, and a variety + of other mandatory security schemes. + If you are unsure how to answer this question, answer N. 
+ diff --git a/security/smack/Makefile b/security/smack/Makefile new file mode 100644 index 0000000..67a63aa --- /dev/null +++ b/security/smack/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the SMACK LSM +# + +obj-$(CONFIG_SECURITY_SMACK) := smack.o + +smack-y := smack_lsm.o smack_access.o smackfs.o diff --git a/security/smack/smack.h b/security/smack/smack.h new file mode 100644 index 0000000..a21a0e9 --- /dev/null +++ b/security/smack/smack.h @@ -0,0 +1,220 @@ +/* + * Copyright (C) 2007 Casey Schaufler <casey@schaufler-ca.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, version 2. + * + * Author: + * Casey Schaufler <casey@schaufler-ca.com> + * + */ + +#ifndef _SECURITY_SMACK_H +#define _SECURITY_SMACK_H + +#include <linux/capability.h> +#include <linux/spinlock.h> +#include <net/netlabel.h> + +/* + * Why 23? CIPSO is constrained to 30, so a 32 byte buffer is + * bigger than can be used, and 24 is the next lower multiple + * of 8, and there are too many issues if there isn't space set + * aside for the terminating null byte. + */ +#define SMK_MAXLEN 23 +#define SMK_LABELLEN (SMK_MAXLEN+1) + +/* + * How many kinds of access are there? + * Here's your answer. 
+ */ +#define SMK_ACCESSDASH '-' +#define SMK_ACCESSLOW "rwxa" +#define SMK_ACCESSKINDS (sizeof(SMK_ACCESSLOW) - 1) + +struct superblock_smack { + char *smk_root; + char *smk_floor; + char *smk_hat; + char *smk_default; + int smk_initialized; + spinlock_t smk_sblock; /* for initialization */ +}; + +struct socket_smack { + char *smk_out; /* outbound label */ + char *smk_in; /* inbound label */ + char smk_packet[SMK_LABELLEN]; /* TCP peer label */ +}; + +/* + * Inode smack data + */ +struct inode_smack { + char *smk_inode; /* label of the fso */ + struct mutex smk_lock; /* initialization lock */ + int smk_flags; /* smack inode flags */ +}; + +#define SMK_INODE_INSTANT 0x01 /* inode is instantiated */ + +/* + * A label access rule. + */ +struct smack_rule { + char *smk_subject; + char *smk_object; + int smk_access; +}; + +/* + * An entry in the table of permitted label accesses. + */ +struct smk_list_entry { + struct smk_list_entry *smk_next; + struct smack_rule smk_rule; +}; + +/* + * An entry in the table mapping smack values to + * CIPSO level/category-set values. + */ +struct smack_cipso { + int smk_level; + char smk_catset[SMK_LABELLEN]; +}; + +/* + * This is the repository for labels seen so that it is + * not necessary to keep allocating tiny chunks of memory + * and so that they can be shared. + * + * Labels are never modified in place. Anytime a label + * is imported (e.g. xattrset on a file) the list is checked + * for it and it is added if it doesn't exist. The address + * is passed out in either case. Entries are added, but + * never deleted. + * + * Since labels are hanging around anyway it doesn't + * hurt to maintain a secid for those awkward situations + * where kernel components that ought to use LSM independent + * interfaces don't. The secid should go away when all of + * these components have been repaired. + * + * If there is a cipso value associated with the label it + * gets stored here, too.
This will most likely be rare as + * the cipso direct mapping is used internally. + */ +struct smack_known { + struct smack_known *smk_next; + char smk_known[SMK_LABELLEN]; + u32 smk_secid; + struct smack_cipso *smk_cipso; + spinlock_t smk_cipsolock; /* for changing cipso map */ +}; + +/* + * Mount options + */ +#define SMK_FSDEFAULT "smackfsdef=" +#define SMK_FSFLOOR "smackfsfloor=" +#define SMK_FSHAT "smackfshat=" +#define SMK_FSROOT "smackfsroot=" + +/* + * xattr names + */ +#define XATTR_SMACK_SUFFIX "SMACK64" +#define XATTR_SMACK_IPIN "SMACK64IPIN" +#define XATTR_SMACK_IPOUT "SMACK64IPOUT" +#define XATTR_NAME_SMACK XATTR_SECURITY_PREFIX XATTR_SMACK_SUFFIX +#define XATTR_NAME_SMACKIPIN XATTR_SECURITY_PREFIX XATTR_SMACK_IPIN +#define XATTR_NAME_SMACKIPOUT XATTR_SECURITY_PREFIX XATTR_SMACK_IPOUT + +/* + * smackfs magic number + */ +#define SMACK_MAGIC 0x43415d53 /* "SMAC" */ + +/* + * A limit on the number of entries in the lists + * makes some of the list administration easier. + */ +#define SMACK_LIST_MAX 10000 + +/* + * CIPSO defaults.
+ */ +#define SMACK_CIPSO_DOI_DEFAULT 3 /* Historical */ +#define SMACK_CIPSO_DIRECT_DEFAULT 250 /* Arbitrary */ +#define SMACK_CIPSO_MAXCATVAL 63 /* Bigger gets harder */ +#define SMACK_CIPSO_MAXLEVEL 255 /* CIPSO 2.2 standard */ +#define SMACK_CIPSO_MAXCATNUM 239 /* CIPSO 2.2 standard */ + +/* + * Just to make the common cases easier to deal with + */ +#define MAY_ANY (MAY_READ | MAY_WRITE | MAY_APPEND | MAY_EXEC) +#define MAY_ANYREAD (MAY_READ | MAY_EXEC) +#define MAY_ANYWRITE (MAY_WRITE | MAY_APPEND) +#define MAY_READWRITE (MAY_READ | MAY_WRITE) +#define MAY_NOT 0 + +/* + * These functions are in smack_lsm.c + */ +struct inode_smack *new_inode_smack(char *); + +/* + * These functions are in smack_access.c + */ +int smk_access(char *, char *, int); +int smk_curacc(char *, u32); +int smack_to_cipso(const char *, struct smack_cipso *); +void smack_from_cipso(u32, char *, char *); +char *smack_from_secid(const u32); +char *smk_import(const char *, int); +struct smack_known *smk_import_entry(const char *, int); +u32 smack_to_secid(const char *); + +/* + * Shared data. + */ +extern int smack_cipso_direct; +extern int smack_net_nltype; +extern char *smack_net_ambient; + +extern struct smack_known *smack_known; +extern struct smack_known smack_known_floor; +extern struct smack_known smack_known_hat; +extern struct smack_known smack_known_huh; +extern struct smack_known smack_known_invalid; +extern struct smack_known smack_known_star; +extern struct smack_known smack_known_unset; + +extern struct smk_list_entry *smack_list; + +/* + * Strictly for CIPSO level manipulation. + * Set the category bit number in a smack label sized buffer. + */ +static inline void smack_catset_bit(int cat, char *catsetp) +{ + if (cat > SMK_LABELLEN * 8) + return; + + catsetp[(cat - 1) / 8] |= 0x80 >> ((cat - 1) % 8); +} + +/* + * Present a pointer to the smack label in an inode blob.
+ */ +static inline char *smk_of_inode(const struct inode *isp) +{ + struct inode_smack *sip = isp->i_security; + return sip->smk_inode; +} + +#endif /* _SECURITY_SMACK_H */ diff --git a/security/smack/smack_access.c b/security/smack/smack_access.c new file mode 100644 index 0000000..f6b5f6e --- /dev/null +++ b/security/smack/smack_access.c @@ -0,0 +1,356 @@ +/* + * Copyright (C) 2007 Casey Schaufler <casey@schaufler-ca.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, version 2. + * + * Author: + * Casey Schaufler <casey@schaufler-ca.com> + * + */ + +#include <linux/types.h> +#include <linux/fs.h> +#include <linux/sched.h> +#include "smack.h" + +struct smack_known smack_known_unset = { + .smk_next = NULL, + .smk_known = "UNSET", + .smk_secid = 1, + .smk_cipso = NULL, +}; + +struct smack_known smack_known_huh = { + .smk_next = &smack_known_unset, + .smk_known = "?", + .smk_secid = 2, + .smk_cipso = NULL, +}; + +struct smack_known smack_known_hat = { + .smk_next = &smack_known_huh, + .smk_known = "^", + .smk_secid = 3, + .smk_cipso = NULL, +}; + +struct smack_known smack_known_star = { + .smk_next = &smack_known_hat, + .smk_known = "*", + .smk_secid = 4, + .smk_cipso = NULL, +}; + +struct smack_known smack_known_floor = { + .smk_next = &smack_known_star, + .smk_known = "_", + .smk_secid = 5, + .smk_cipso = NULL, +}; + +struct smack_known smack_known_invalid = { + .smk_next = &smack_known_floor, + .smk_known = "", + .smk_secid = 6, + .smk_cipso = NULL, +}; + +struct smack_known *smack_known = &smack_known_invalid; + +/* + * The initial value needs to be bigger than any of the + * known values above. 
+ */ +static u32 smack_next_secid = 10; + +/** + * smk_access - determine if a subject has a specific access to an object + * @subject_label: a pointer to the subject's Smack label + * @object_label: a pointer to the object's Smack label + * @request: the access requested, in "MAY" format + * + * This function looks up the subject/object pair in the + * access rule list and returns 0 if the access is permitted, + * non zero otherwise. + * + * Even though Smack labels are usually shared on smack_list + * labels that come in off the network can't be imported + * and added to the list for locking reasons. + * + * Therefore, it is necessary to check the contents of the labels, + * not just the pointer values. Of course, in most cases the labels + * will be on the list, so checking the pointers may be a worthwhile + * optimization. + */ +int smk_access(char *subject_label, char *object_label, int request) +{ + u32 may = MAY_NOT; + struct smk_list_entry *sp; + struct smack_rule *srp; + + /* + * Hardcoded comparisons. + * + * A star subject can't access any object. + */ + if (subject_label == smack_known_star.smk_known || + strcmp(subject_label, smack_known_star.smk_known) == 0) + return -EACCES; + /* + * A star object can be accessed by any subject. + */ + if (object_label == smack_known_star.smk_known || + strcmp(object_label, smack_known_star.smk_known) == 0) + return 0; + /* + * An object can be accessed in any way by a subject + * with the same label. + */ + if (subject_label == object_label || + strcmp(subject_label, object_label) == 0) + return 0; + /* + * A hat subject can read any object. + * A floor object can be read by any subject. 
+ */ + if ((request & MAY_ANYREAD) == request) { + if (object_label == smack_known_floor.smk_known || + strcmp(object_label, smack_known_floor.smk_known) == 0) + return 0; + if (subject_label == smack_known_hat.smk_known || + strcmp(subject_label, smack_known_hat.smk_known) == 0) + return 0; + } + /* + * Beyond here an explicit relationship is required. + * If the requested access is contained in the available + * access (e.g. read is included in readwrite) it's + * good. + */ + for (sp = smack_list; sp != NULL; sp = sp->smk_next) { + srp = &sp->smk_rule; + + if (srp->smk_subject == subject_label || + strcmp(srp->smk_subject, subject_label) == 0) { + if (srp->smk_object == object_label || + strcmp(srp->smk_object, object_label) == 0) { + may = srp->smk_access; + break; + } + } + } + /* + * This is a bit map operation. + */ + if ((request & may) == request) + return 0; + + return -EACCES; +} + +/** + * smk_curacc - determine if current has a specific access to an object + * @obj_label: a pointer to the object's Smack label + * @mode: the access requested, in "MAY" format + * + * This function checks the current subject label/object label pair + * in the access rule list and returns 0 if the access is permitted, + * non zero otherwise. It allows that current may have the capability + * to override the rules. + */ +int smk_curacc(char *obj_label, u32 mode) +{ + int rc; + + rc = smk_access(current->security, obj_label, mode); + if (rc == 0) + return 0; + + if (capable(CAP_MAC_OVERRIDE)) + return 0; + + return rc; +} + +static DEFINE_MUTEX(smack_known_lock); + +/** + * smk_import_entry - import a label, return the list entry + * @string: a text string that might be a Smack label + * @len: the maximum size, or zero if it is NUL terminated. + * + * Returns a pointer to the entry in the label list that + * matches the passed string, adding it if necessary.
+ */ +struct smack_known *smk_import_entry(const char *string, int len) +{ + struct smack_known *skp; + char smack[SMK_LABELLEN]; + int found; + int i; + + if (len <= 0 || len > SMK_MAXLEN) + len = SMK_MAXLEN; + + for (i = 0, found = 0; i < SMK_LABELLEN; i++) { + if (found) + smack[i] = '\0'; + else if (i >= len || string[i] > '~' || string[i] <= ' ' || + string[i] == '/') { + smack[i] = '\0'; + found = 1; + } else + smack[i] = string[i]; + } + + if (smack[0] == '\0') + return NULL; + + mutex_lock(&smack_known_lock); + + for (skp = smack_known; skp != NULL; skp = skp->smk_next) + if (strncmp(skp->smk_known, smack, SMK_MAXLEN) == 0) + break; + + if (skp == NULL) { + skp = kzalloc(sizeof(struct smack_known), GFP_KERNEL); + if (skp != NULL) { + skp->smk_next = smack_known; + strncpy(skp->smk_known, smack, SMK_MAXLEN); + skp->smk_secid = smack_next_secid++; + skp->smk_cipso = NULL; + spin_lock_init(&skp->smk_cipsolock); + /* + * Make sure that the entry is actually + * filled before putting it on the list. + */ + smp_mb(); + smack_known = skp; + } + } + + mutex_unlock(&smack_known_lock); + + return skp; +} + +/** + * smk_import - import a smack label + * @string: a text string that might be a Smack label + * @len: the maximum size, or zero if it is NUL terminated. + * + * Returns a pointer to the label in the label list that + * matches the passed string, adding it if necessary. + */ +char *smk_import(const char *string, int len) +{ + struct smack_known *skp; + + skp = smk_import_entry(string, len); + if (skp == NULL) + return NULL; + return skp->smk_known; +} + +/** + * smack_from_secid - find the Smack label associated with a secid + * @secid: an integer that might be associated with a Smack label + * + * Returns a pointer to the appropriate Smack label if there is one, + * otherwise a pointer to the invalid Smack label.
+ */ +char *smack_from_secid(const u32 secid) +{ + struct smack_known *skp; + + for (skp = smack_known; skp != NULL; skp = skp->smk_next) + if (skp->smk_secid == secid) + return skp->smk_known; + + /* + * If we got this far someone asked for the translation + * of a secid that is not on the list. + */ + return smack_known_invalid.smk_known; +} + +/** + * smack_to_secid - find the secid associated with a Smack label + * @smack: the Smack label + * + * Returns the appropriate secid if there is one, + * otherwise 0 + */ +u32 smack_to_secid(const char *smack) +{ + struct smack_known *skp; + + for (skp = smack_known; skp != NULL; skp = skp->smk_next) + if (strncmp(skp->smk_known, smack, SMK_MAXLEN) == 0) + return skp->smk_secid; + return 0; +} + +/** + * smack_from_cipso - find the Smack label associated with a CIPSO option + * @level: Bell & LaPadula level from the network + * @cp: Bell & LaPadula categories from the network + * @result: where to put the Smack value + * + * This is a simple lookup in the label table. + * + * This is an odd duck as far as smack handling goes in that + * it sends back a copy of the smack label rather than a pointer + * to the master list. This is done because it is possible for + * a foreign host to send a smack label that is new to this + * machine and hence not on the list. That would not be an + * issue except that adding an entry to the master list can't + * be done at that point. 
+ */ +void smack_from_cipso(u32 level, char *cp, char *result) +{ + struct smack_known *kp; + char *final = NULL; + + for (kp = smack_known; final == NULL && kp != NULL; kp = kp->smk_next) { + if (kp->smk_cipso == NULL) + continue; + + spin_lock_bh(&kp->smk_cipsolock); + + if (kp->smk_cipso->smk_level == level && + memcmp(kp->smk_cipso->smk_catset, cp, SMK_LABELLEN) == 0) + final = kp->smk_known; + + spin_unlock_bh(&kp->smk_cipsolock); + } + if (final == NULL) + final = smack_known_huh.smk_known; + strncpy(result, final, SMK_MAXLEN); + return; +} + +/** + * smack_to_cipso - find the CIPSO option to go with a Smack label + * @smack: a pointer to the smack label in question + * @cp: where to put the result + * + * Returns zero if a value is available, non-zero otherwise. + */ +int smack_to_cipso(const char *smack, struct smack_cipso *cp) +{ + struct smack_known *kp; + + for (kp = smack_known; kp != NULL; kp = kp->smk_next) + if (kp->smk_known == smack || + strcmp(kp->smk_known, smack) == 0) + break; + + if (kp == NULL || kp->smk_cipso == NULL) + return -ENOENT; + + memcpy(cp, kp->smk_cipso, sizeof(struct smack_cipso)); + return 0; +} diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c new file mode 100644 index 0000000..1c11e42 --- /dev/null +++ b/security/smack/smack_lsm.c @@ -0,0 +1,2518 @@ +/* + * Simplified MAC Kernel (smack) security module + * + * This file contains the smack hook function implementations. + * + * Author: + * Casey Schaufler <casey@schaufler-ca.com> + * + * Copyright (C) 2007 Casey Schaufler <casey@schaufler-ca.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2, + * as published by the Free Software Foundation. 
+ */ + +#include <linux/xattr.h> +#include <linux/pagemap.h> +#include <linux/mount.h> +#include <linux/stat.h> +#include <linux/ext2_fs.h> +#include <linux/kd.h> +#include <asm/ioctls.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/mutex.h> +#include <linux/pipe_fs_i.h> +#include <net/netlabel.h> +#include <net/cipso_ipv4.h> + +#include "smack.h" + +/* + * I hope these are the hokeyist lines of code in the module. Casey. + */ +#define DEVPTS_SUPER_MAGIC 0x1cd1 +#define SOCKFS_MAGIC 0x534F434B +#define TMPFS_MAGIC 0x01021994 + +/** + * smk_fetch - Fetch the smack label from a file. + * @ip: a pointer to the inode + * @dp: a pointer to the dentry + * + * Returns a pointer to the master list entry for the Smack label + * or NULL if there was no label to fetch. + */ +static char *smk_fetch(struct inode *ip, struct dentry *dp) +{ + int rc; + char in[SMK_LABELLEN]; + + if (ip->i_op->getxattr == NULL) + return NULL; + + rc = ip->i_op->getxattr(dp, XATTR_NAME_SMACK, in, SMK_LABELLEN); + if (rc < 0) + return NULL; + + return smk_import(in, rc); +} + +/** + * new_inode_smack - allocate an inode security blob + * @smack: a pointer to the Smack label to use in the blob + * + * Returns the new blob or NULL if there's no memory available + */ +struct inode_smack *new_inode_smack(char *smack) +{ + struct inode_smack *isp; + + isp = kzalloc(sizeof(struct inode_smack), GFP_KERNEL); + if (isp == NULL) + return NULL; + + isp->smk_inode = smack; + isp->smk_flags = 0; + mutex_init(&isp->smk_lock); + + return isp; +} + +/* + * LSM hooks. + * We he, that is fun! + */ + +/** + * smack_ptrace - Smack approval on ptrace + * @ptp: parent task pointer + * @ctp: child task pointer + * + * Returns 0 if access is OK, an error code otherwise + * + * Do the capability checks, and require read and write. 
+ */
+static int smack_ptrace(struct task_struct *ptp, struct task_struct *ctp)
+{
+	int rc;
+
+	rc = cap_ptrace(ptp, ctp);
+	if (rc != 0)
+		return rc;
+
+	rc = smk_access(ptp->security, ctp->security, MAY_READWRITE);
+	if (rc != 0 && __capable(ptp, CAP_MAC_OVERRIDE))
+		return 0;
+
+	return rc;
+}
+
+/**
+ * smack_syslog - Smack approval on syslog
+ * @type: message type
+ *
+ * Require that the task has the floor label, unless it holds
+ * CAP_MAC_OVERRIDE. The capability check from cap_syslog() is
+ * applied first.
+ *
+ * Returns 0 on success, error code otherwise.
+ */
+static int smack_syslog(int type)
+{
+	int rc;
+	char *sp = current->security;
+
+	rc = cap_syslog(type);
+	if (rc != 0)
+		return rc;
+
+	if (capable(CAP_MAC_OVERRIDE))
+		return 0;
+
+	if (sp != smack_known_floor.smk_known)
+		rc = -EACCES;
+
+	return rc;
+}
+
+
+/*
+ * Superblock Hooks.
+ */
+
+/**
+ * smack_sb_alloc_security - allocate a superblock blob
+ * @sb: the superblock getting the blob
+ *
+ * The root, default and floor labels of the new blob start out as
+ * the floor label and the hat label starts out as the hat label.
+ * The blob is marked as not yet initialized.
+ *
+ * Returns 0 on success or -ENOMEM on error.
+ */
+static int smack_sb_alloc_security(struct super_block *sb)
+{
+	struct superblock_smack *sbsp;
+
+	sbsp = kzalloc(sizeof(struct superblock_smack), GFP_KERNEL);
+
+	if (sbsp == NULL)
+		return -ENOMEM;
+
+	sbsp->smk_root = smack_known_floor.smk_known;
+	sbsp->smk_default = smack_known_floor.smk_known;
+	sbsp->smk_floor = smack_known_floor.smk_known;
+	sbsp->smk_hat = smack_known_hat.smk_known;
+	sbsp->smk_initialized = 0;
+	spin_lock_init(&sbsp->smk_sblock);
+
+	sb->s_security = sbsp;
+
+	return 0;
+}
+
+/**
+ * smack_sb_free_security - free a superblock blob
+ * @sb: the superblock whose blob is freed
+ *
+ * Frees the blob and clears the pointer in the superblock.
+ */
+static void smack_sb_free_security(struct super_block *sb)
+{
+	kfree(sb->s_security);
+	sb->s_security = NULL;
+}
+
+/**
+ * smack_sb_copy_data - copy mount options data for processing
+ * @type: file system type
+ * @orig: the original option string, rewritten without the Smack options
+ * @smackopts: where the Smack specific options are collected
+ *
+ * Returns 0 on success or -ENOMEM on error.
+ *
+ * Copy the Smack specific mount options out of the mount
+ * options list.
+ */
+static int smack_sb_copy_data(struct file_system_type *type, void *orig,
+			      void *smackopts)
+{
+	char *cp, *commap, *otheropts, *dp;
+
+	/* Binary mount data: just copy */
+	if (type->fs_flags & FS_BINARY_MOUNTDATA) {
+		copy_page(smackopts, orig);
+		return 0;
+	}
+
+	otheropts = (char *)get_zeroed_page(GFP_KERNEL);
+	if (otheropts == NULL)
+		return -ENOMEM;
+
+	for (cp = orig, commap = orig; commap != NULL; cp = commap + 1) {
+		/*
+		 * An option belongs to Smack when it starts with one of
+		 * the Smack option names. This is the same prefix test
+		 * smack_sb_kern_mount() applies, and unlike a strstr()
+		 * comparison it never scans past the prefix.
+		 */
+		if (strncmp(cp, SMK_FSDEFAULT, strlen(SMK_FSDEFAULT)) == 0 ||
+		    strncmp(cp, SMK_FSFLOOR, strlen(SMK_FSFLOOR)) == 0 ||
+		    strncmp(cp, SMK_FSHAT, strlen(SMK_FSHAT)) == 0 ||
+		    strncmp(cp, SMK_FSROOT, strlen(SMK_FSROOT)) == 0)
+			dp = smackopts;
+		else
+			dp = otheropts;
+
+		/* Terminate the current option so it can be appended. */
+		commap = strchr(cp, ',');
+		if (commap != NULL)
+			*commap = '\0';
+
+		/* Options already in the destination are comma separated. */
+		if (*dp != '\0')
+			strcat(dp, ",");
+		strcat(dp, cp);
+	}
+
+	/* Hand the non-Smack options back to the caller in place. */
+	strcpy(orig, otheropts);
+	free_page((unsigned long)otheropts);
+
+	return 0;
+}
+
+/**
+ * smack_sb_kern_mount - Smack specific mount processing
+ * @sb: the file system superblock
+ * @data: the smack mount options
+ *
+ * Returns 0 on success, an error code on failure
+ */
+static int smack_sb_kern_mount(struct super_block *sb, void *data)
+{
+	struct dentry *root = sb->s_root;
+	struct inode *inode = root->d_inode;
+	struct superblock_smack *sp = sb->s_security;
+	struct inode_smack *isp;
+	char *op;
+	char *commap;
+	char *nsp;
+
+	spin_lock(&sp->smk_sblock);
+	if (sp->smk_initialized != 0) {
+		spin_unlock(&sp->smk_sblock);
+		return 0;
+	}
+	sp->smk_initialized = 1;
+	spin_unlock(&sp->smk_sblock);
+
+	for (op = data; op != NULL; op = commap) {
+		commap = strchr(op, ',');
+		if (commap != NULL)
+			*commap++ = '\0';
+
+		if (strncmp(op, SMK_FSHAT, strlen(SMK_FSHAT)) == 0) {
+			op += strlen(SMK_FSHAT);
+			nsp = smk_import(op, 0);
+			if (nsp != NULL)
+				sp->smk_hat = nsp;
+		} else if (strncmp(op, SMK_FSFLOOR, strlen(SMK_FSFLOOR)) == 0) {
+			op += strlen(SMK_FSFLOOR);
+			nsp = smk_import(op, 0);
+			if (nsp != NULL)
+				sp->smk_floor = nsp;
+		} else if
(strncmp(op, SMK_FSDEFAULT,
+				   strlen(SMK_FSDEFAULT)) == 0) {
+			op += strlen(SMK_FSDEFAULT);
+			nsp = smk_import(op, 0);
+			if (nsp != NULL)
+				sp->smk_default = nsp;
+		} else if (strncmp(op, SMK_FSROOT, strlen(SMK_FSROOT)) == 0) {
+			op += strlen(SMK_FSROOT);
+			nsp = smk_import(op, 0);
+			if (nsp != NULL)
+				sp->smk_root = nsp;
+		}
+	}
+
+	/*
+	 * Initialize the root inode. If it has no blob yet one is
+	 * allocated here; failing to get one fails the mount rather
+	 * than leaving the root inode without a label.
+	 */
+	isp = inode->i_security;
+	if (isp == NULL) {
+		isp = new_inode_smack(sp->smk_root);
+		if (isp == NULL)
+			return -ENOMEM;
+		inode->i_security = isp;
+	} else {
+		isp->smk_inode = sp->smk_root;
+	}
+
+	return 0;
+}
+
+/**
+ * smack_sb_statfs - Smack check on statfs
+ * @dentry: identifies the file system in question
+ *
+ * Returns 0 if current can read the floor of the filesystem,
+ * an error code otherwise
+ */
+static int smack_sb_statfs(struct dentry *dentry)
+{
+	struct superblock_smack *sbp = dentry->d_sb->s_security;
+
+	return smk_curacc(sbp->smk_floor, MAY_READ);
+}
+
+/**
+ * smack_sb_mount - Smack check for mounting
+ * @dev_name: unused
+ * @nd: mount point
+ * @type: unused
+ * @flags: unused
+ * @data: unused
+ *
+ * Returns 0 if current can write the floor of the filesystem
+ * being mounted on, an error code otherwise.
+ */
+static int smack_sb_mount(char *dev_name, struct nameidata *nd,
+			  char *type, unsigned long flags, void *data)
+{
+	struct superblock_smack *sbp = nd->mnt->mnt_sb->s_security;
+
+	return smk_curacc(sbp->smk_floor, MAY_WRITE);
+}
+
+/**
+ * smack_sb_umount - Smack check for unmounting
+ * @mnt: file system to unmount
+ * @flags: unused
+ *
+ * Returns 0 if current can write the floor of the filesystem
+ * being unmounted, an error code otherwise.
+ */
+static int smack_sb_umount(struct vfsmount *mnt, int flags)
+{
+	struct superblock_smack *sbp;
+
+	sbp = mnt->mnt_sb->s_security;
+
+	return smk_curacc(sbp->smk_floor, MAY_WRITE);
+}
+
+/*
+ * Inode hooks
+ */
+
+/**
+ * smack_inode_alloc_security - allocate an inode blob
+ * @inode: the inode in need of a blob
+ *
+ * The new blob carries the label of the current task.
+ *
+ * Returns 0 if it gets a blob, -ENOMEM otherwise
+ */
+static int smack_inode_alloc_security(struct inode *inode)
+{
+	inode->i_security = new_inode_smack(current->security);
+	if (inode->i_security == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+/**
+ * smack_inode_free_security - free an inode blob
+ * @inode: the inode with a blob
+ *
+ * Frees the blob and clears the blob pointer in inode
+ */
+static void smack_inode_free_security(struct inode *inode)
+{
+	kfree(inode->i_security);
+	inode->i_security = NULL;
+}
+
+/**
+ * smack_inode_init_security - copy out the smack from an inode
+ * @inode: the inode
+ * @dir: unused
+ * @name: where to put the attribute name, may be NULL
+ * @value: where to put the attribute value, may be NULL
+ * @len: where to put the length of the attribute, may be NULL
+ *
+ * The name and value handed back are fresh copies allocated with
+ * kstrdup(). On failure nothing allocated here is left behind.
+ *
+ * Returns 0 if it all works out, -ENOMEM if there's no memory
+ */
+static int smack_inode_init_security(struct inode *inode, struct inode *dir,
+				     char **name, void **value, size_t *len)
+{
+	char *isp = smk_of_inode(inode);
+
+	if (name) {
+		*name = kstrdup(XATTR_SMACK_SUFFIX, GFP_KERNEL);
+		if (*name == NULL)
+			return -ENOMEM;
+	}
+
+	if (value) {
+		*value = kstrdup(isp, GFP_KERNEL);
+		if (*value == NULL) {
+			/*
+			 * Don't leak the name copied above when the
+			 * value cannot be copied.
+			 */
+			if (name) {
+				kfree(*name);
+				*name = NULL;
+			}
+			return -ENOMEM;
+		}
+	}
+
+	if (len)
+		*len = strlen(isp) + 1;
+
+	return 0;
+}
+
+/**
+ * smack_inode_link - Smack check on link
+ * @old_dentry: the existing object
+ * @dir: unused
+ * @new_dentry: the new object
+ *
+ * Returns 0 if access is permitted, an error code otherwise
+ */
+static int smack_inode_link(struct dentry *old_dentry, struct inode *dir,
+			    struct dentry *new_dentry)
+{
+	int rc;
+	char *isp;
+
+	isp = smk_of_inode(old_dentry->d_inode);
+	rc = smk_curacc(isp, MAY_WRITE);
+
+	if
(rc == 0 && new_dentry->d_inode != NULL) {
+		isp = smk_of_inode(new_dentry->d_inode);
+		rc = smk_curacc(isp, MAY_WRITE);
+	}
+
+	return rc;
+}
+
+/**
+ * smack_inode_unlink - Smack check on inode deletion
+ * @dir: containing directory object
+ * @dentry: file to unlink
+ *
+ * Returns 0 if current can write the containing directory
+ * and the object, error code otherwise
+ */
+static int smack_inode_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *ip = dentry->d_inode;
+	int rc;
+
+	/*
+	 * You need write access to the thing you're unlinking
+	 */
+	rc = smk_curacc(smk_of_inode(ip), MAY_WRITE);
+	if (rc == 0)
+		/*
+		 * You also need write access to the containing directory
+		 */
+		rc = smk_curacc(smk_of_inode(dir), MAY_WRITE);
+
+	return rc;
+}
+
+/**
+ * smack_inode_rmdir - Smack check on directory deletion
+ * @dir: containing directory object
+ * @dentry: directory to unlink
+ *
+ * Returns 0 if current can write the containing directory
+ * and the directory, error code otherwise
+ */
+static int smack_inode_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	int rc;
+
+	/*
+	 * You need write access to the thing you're removing
+	 */
+	rc = smk_curacc(smk_of_inode(dentry->d_inode), MAY_WRITE);
+	if (rc == 0)
+		/*
+		 * You also need write access to the containing directory
+		 */
+		rc = smk_curacc(smk_of_inode(dir), MAY_WRITE);
+
+	return rc;
+}
+
+/**
+ * smack_inode_rename - Smack check on rename
+ * @old_inode: the old directory, unused
+ * @old_dentry: the object being renamed
+ * @new_inode: the new directory, unused
+ * @new_dentry: the new name, which may refer to an existing object
+ *
+ * Read and write access is required on the object being renamed
+ * and, if the new name already refers to an object, on that
+ * object as well.
+ *
+ * Returns 0 if access is permitted, an error code otherwise
+ */
+static int smack_inode_rename(struct inode *old_inode,
+			      struct dentry *old_dentry,
+			      struct inode *new_inode,
+			      struct dentry *new_dentry)
+{
+	int rc;
+	char *isp;
+
+	isp = smk_of_inode(old_dentry->d_inode);
+	rc = smk_curacc(isp, MAY_READWRITE);
+
+	if (rc == 0 && new_dentry->d_inode != NULL) {
+		isp = smk_of_inode(new_dentry->d_inode);
+		rc = smk_curacc(isp, MAY_READWRITE);
+	}
+
+	return rc;
+}
+
+/**
+ * smack_inode_permission - Smack version of permission()
+ * @inode: the inode in question
+ * @mask: the access requested
+ * @nd: unused
+ *
+ * This is the important Smack hook. A @mask of zero is an
+ * existence test and is always allowed.
+ *
+ * Returns 0 if access is permitted, an error code otherwise
+ */
+static int smack_inode_permission(struct inode *inode, int mask,
+				  struct nameidata *nd)
+{
+	/*
+	 * No permission to check. Existence test. Yup, it's there.
+	 */
+	if (mask == 0)
+		return 0;
+
+	return smk_curacc(smk_of_inode(inode), mask);
+}
+
+/**
+ * smack_inode_setattr - Smack check for setting attributes
+ * @dentry: the object
+ * @iattr: the attributes being set; ATTR_FORCE in ia_valid skips the check
+ *
+ * Write access to the object is required unless the change is forced.
+ *
+ * Returns 0 if access is permitted, an error code otherwise
+ */
+static int smack_inode_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+	/*
+	 * Need to allow for clearing the setuid bit.
+	 */
+	if (iattr->ia_valid & ATTR_FORCE)
+		return 0;
+
+	return smk_curacc(smk_of_inode(dentry->d_inode), MAY_WRITE);
+}
+
+/**
+ * smack_inode_getattr - Smack check for getting attributes
+ * @mnt: unused
+ * @dentry: the object
+ *
+ * Read access to the object is required.
+ *
+ * Returns 0 if access is permitted, an error code otherwise
+ */
+static int smack_inode_getattr(struct vfsmount *mnt, struct dentry *dentry)
+{
+	return smk_curacc(smk_of_inode(dentry->d_inode), MAY_READ);
+}
+
+/**
+ * smack_inode_setxattr - Smack check for setting xattrs
+ * @dentry: the object
+ * @name: name of the attribute
+ * @value: unused
+ * @size: unused
+ * @flags: unused
+ *
+ * This protects the Smack attribute explicitly.
+ *
+ * Returns 0 if access is permitted, an error code otherwise
+ */
+static int smack_inode_setxattr(struct dentry *dentry, char *name,
+				void *value, size_t size, int flags)
+{
+	if (!capable(CAP_MAC_ADMIN)) {
+		if (strcmp(name, XATTR_NAME_SMACK) == 0 ||
+		    strcmp(name, XATTR_NAME_SMACKIPIN) == 0 ||
+		    strcmp(name, XATTR_NAME_SMACKIPOUT) == 0)
+			return -EPERM;
+	}
+
+	return smk_curacc(smk_of_inode(dentry->d_inode), MAY_WRITE);
+}
+
+/**
+ * smack_inode_post_setxattr - Apply the Smack update approved above
+ * @dentry: object
+ * @name: attribute name
+ * @value: attribute value
+ * @size: attribute size
+ * @flags: unused
+ *
+ * Set the pointer in the inode blob to the entry found
+ * in the master label list. Attributes other than
+ * XATTR_NAME_SMACK and values of SMK_LABELLEN bytes or more
+ * are ignored. A value that cannot be imported leaves the
+ * inode carrying the invalid label.
+ */
+static void smack_inode_post_setxattr(struct dentry *dentry, char *name,
+				      void *value, size_t size, int flags)
+{
+	struct inode_smack *isp;
+	char *nsp;
+
+	/*
+	 * Not SMACK
+	 */
+	if (strcmp(name, XATTR_NAME_SMACK))
+		return;
+
+	if (size >= SMK_LABELLEN)
+		return;
+
+	isp = dentry->d_inode->i_security;
+
+	/*
+	 * No locking is done here. This is a pointer
+	 * assignment.
+	 */
+	nsp = smk_import(value, size);
+	if (nsp != NULL)
+		isp->smk_inode = nsp;
+	else
+		isp->smk_inode = smack_known_invalid.smk_known;
+
+	return;
+}
+
+/**
+ * smack_inode_getxattr - Smack check on getxattr
+ * @dentry: the object
+ * @name: unused
+ *
+ * Returns 0 if access is permitted, an error code otherwise
+ */
+static int smack_inode_getxattr(struct dentry *dentry, char *name)
+{
+	return smk_curacc(smk_of_inode(dentry->d_inode), MAY_READ);
+}
+
+/**
+ * smack_inode_removexattr - Smack check on removexattr
+ * @dentry: the object
+ * @name: name of the attribute
+ *
+ * Removing any of the Smack attributes requires CAP_MAC_ADMIN.
+ * The set of protected names is the one smack_inode_setxattr()
+ * protects, so an attribute that cannot be set without privilege
+ * cannot be removed without it either.
+ *
+ * Returns 0 if access is permitted, an error code otherwise
+ */
+static int smack_inode_removexattr(struct dentry *dentry, char *name)
+{
+	if (!capable(CAP_MAC_ADMIN)) {
+		if (strcmp(name, XATTR_NAME_SMACK) == 0 ||
+		    strcmp(name, XATTR_NAME_SMACKIPIN) == 0 ||
+		    strcmp(name, XATTR_NAME_SMACKIPOUT) == 0)
+			return -EPERM;
+	}
+
+	return smk_curacc(smk_of_inode(dentry->d_inode), MAY_WRITE);
+}
+
+/**
+ * smack_inode_getsecurity - get smack xattrs
+ * @inode: the object
+ * @name: attribute name
+ * @buffer: where to put a pointer to the result
+ * @alloc: unused
+ *
+ * The pointer stored in @buffer is the label pointer held by the
+ * inode or socket blob; nothing is allocated or copied here.
+ *
+ * Returns the size of the attribute, including the terminating
+ * NUL, or -EOPNOTSUPP if the attribute is not one Smack keeps for
+ * this kind of object.
+ */
+static int smack_inode_getsecurity(const struct inode *inode,
+				   const char *name, void **buffer,
+				   bool alloc)
+{
+	struct socket_smack *ssp;
+	struct socket *sock;
+	struct super_block *sbp;
+	struct inode *ip = (struct inode *)inode;
+	char *isp;
+
+	if (strcmp(name, XATTR_SMACK_SUFFIX) == 0) {
+		isp = smk_of_inode(inode);
+		*buffer = isp;
+		return strlen(isp) + 1;
+	}
+
+	/*
+	 * The rest of the Smack xattrs are only on sockets.
+	 */
+	sbp = ip->i_sb;
+	if (sbp->s_magic != SOCKFS_MAGIC)
+		return -EOPNOTSUPP;
+
+	sock = SOCKET_I(ip);
+	if (sock == NULL)
+		return -EOPNOTSUPP;
+
+	ssp = sock->sk->sk_security;
+
+	if (strcmp(name, XATTR_SMACK_IPIN) == 0)
+		isp = ssp->smk_in;
+	else if (strcmp(name, XATTR_SMACK_IPOUT) == 0)
+		isp = ssp->smk_out;
+	else
+		return -EOPNOTSUPP;
+
+	*buffer = isp;
+	return strlen(isp) + 1;
+}
+
+
+/**
+ * smack_inode_listsecurity - list the Smack attributes
+ * @inode: the object
+ * @buffer: where they go, may be NULL to ask for the size only
+ * @buffer_size: size of buffer
+ *
+ * Attribute name lists are sequences of NUL terminated names, so
+ * the terminator is part of the entry. The name is copied only
+ * if @buffer is there and big enough.
+ *
+ * Returns the number of bytes the entry needs, whether or not it
+ * was copied, so that callers can size a buffer or notice that
+ * theirs was too small.
+ */
+static int smack_inode_listsecurity(struct inode *inode, char *buffer,
+				    size_t buffer_size)
+{
+	int len = strlen(XATTR_NAME_SMACK) + 1;
+
+	if (buffer != NULL && len <= buffer_size)
+		memcpy(buffer, XATTR_NAME_SMACK, len);
+
+	return len;
+}
+
+/*
+ * File Hooks
+ */
+
+/**
+ * smack_file_permission - Smack check on file operations
+ * @file: unused
+ * @mask: unused
+ *
+ * Returns 0
+ *
+ * Should access checks be done on each read or write?
+ * UNICOS and SELinux say yes.
+ * Trusted Solaris, Trusted Irix, and just about everyone else says no.
+ *
+ * I'll say no for now. Smack does not do the frequent
+ * label changing that SELinux does.
+ */
+static int smack_file_permission(struct file *file, int mask)
+{
+	return 0;
+}
+
+/**
+ * smack_file_alloc_security - assign a file security blob
+ * @file: the object
+ *
+ * The security blob for a file is a pointer to the master
+ * label list, so no allocation is done.
+ *
+ * Returns 0
+ */
+static int smack_file_alloc_security(struct file *file)
+{
+	file->f_security = current->security;
+	return 0;
+}
+
+/**
+ * smack_file_free_security - clear a file security blob
+ * @file: the object
+ *
+ * The security blob for a file is a pointer to the master
+ * label list, so no memory is freed.
+ */
+static void smack_file_free_security(struct file *file)
+{
+	file->f_security = NULL;
+}
+
+/**
+ * smack_file_ioctl - Smack check on ioctls
+ * @file: the object
+ * @cmd: what to do
+ * @arg: unused
+ *
+ * Relies heavily on the correct use of the ioctl command conventions.
+ * Write access is checked when the direction bits of @cmd include
+ * _IOC_WRITE and read access when they include _IOC_READ. A command
+ * with neither bit is allowed.
+ *
+ * Returns 0 if allowed, error code otherwise
+ */
+static int smack_file_ioctl(struct file *file, unsigned int cmd,
+			    unsigned long arg)
+{
+	int rc = 0;
+
+	if (_IOC_DIR(cmd) & _IOC_WRITE)
+		rc = smk_curacc(file->f_security, MAY_WRITE);
+
+	if (rc == 0 && (_IOC_DIR(cmd) & _IOC_READ))
+		rc = smk_curacc(file->f_security, MAY_READ);
+
+	return rc;
+}
+
+/**
+ * smack_file_lock - Smack check on file locking
+ * @file: the object
+ * @cmd: unused
+ *
+ * Returns 0 if current has write access, error code otherwise
+ */
+static int smack_file_lock(struct file *file, unsigned int cmd)
+{
+	return smk_curacc(file->f_security, MAY_WRITE);
+}
+
+/**
+ * smack_file_fcntl - Smack check on fcntl
+ * @file: the object
+ * @cmd: what action to check
+ * @arg: unused
+ *
+ * The commands that query state need read access, the commands
+ * that set state need write access, and any command not listed
+ * needs both.
+ *
+ * Returns 0 if current has access, error code otherwise
+ */
+static int smack_file_fcntl(struct file *file, unsigned int cmd,
+			    unsigned long arg)
+{
+	int rc;
+
+	switch (cmd) {
+	case F_DUPFD:
+	case F_GETFD:
+	case F_GETFL:
+	case F_GETLK:
+	case F_GETOWN:
+	case F_GETSIG:
+		rc = smk_curacc(file->f_security, MAY_READ);
+		break;
+	case F_SETFD:
+	case F_SETFL:
+	case F_SETLK:
+	case F_SETLKW:
+	case F_SETOWN:
+	case F_SETSIG:
+		rc = smk_curacc(file->f_security, MAY_WRITE);
+		break;
+	default:
+		rc = smk_curacc(file->f_security, MAY_READWRITE);
+	}
+
+	return rc;
+}
+
+/**
+ * smack_file_set_fowner - set the file security blob value
+ * @file: object in question
+ *
+ * Returns 0
+ * Further research may be required on this one.
+ */
+static int smack_file_set_fowner(struct file *file)
+{
+	file->f_security = current->security;
+	return 0;
+}
+
+/**
+ * smack_file_send_sigiotask - Smack on sigio
+ * @tsk: The target task
+ * @fown: the object the signal comes from
+ * @signum: unused
+ *
+ * Allow a privileged task to get signals even if it shouldn't:
+ * a target task with CAP_MAC_OVERRIDE gets the signal even when
+ * the Smack check fails.
+ *
+ * Returns 0 if a subject with the object's smack could
+ * write to the task, an error code otherwise.
+ */
+static int smack_file_send_sigiotask(struct task_struct *tsk,
+				     struct fown_struct *fown, int signum)
+{
+	struct file *file;
+	int rc;
+
+	/*
+	 * struct fown_struct is never outside the context of a struct file
+	 */
+	file = container_of(fown, struct file, f_owner);
+	rc = smk_access(file->f_security, tsk->security, MAY_WRITE);
+	if (rc != 0 && __capable(tsk, CAP_MAC_OVERRIDE))
+		return 0;
+	return rc;
+}
+
+/**
+ * smack_file_receive - Smack file receive check
+ * @file: the object
+ *
+ * The access asked for follows the mode the file was opened
+ * with: read for FMODE_READ, write for FMODE_WRITE, or both.
+ *
+ * Returns 0 if current has access, error code otherwise
+ */
+static int smack_file_receive(struct file *file)
+{
+	int may = 0;
+
+	/*
+	 * This code relies on bitmasks.
+	 */
+	if (file->f_mode & FMODE_READ)
+		may = MAY_READ;
+	if (file->f_mode & FMODE_WRITE)
+		may |= MAY_WRITE;
+
+	return smk_curacc(file->f_security, may);
+}
+
+/*
+ * Task hooks
+ */
+
+/**
+ * smack_task_alloc_security - "allocate" a task blob
+ * @tsk: the task in need of a blob
+ *
+ * Smack isn't using copies of blobs. Everyone
+ * points to an immutable list. No alloc required.
+ * No data copy required. The new task simply gets the
+ * label pointer of the current task.
+ *
+ * Always returns 0
+ */
+static int smack_task_alloc_security(struct task_struct *tsk)
+{
+	tsk->security = current->security;
+
+	return 0;
+}
+
+/**
+ * smack_task_free_security - "free" a task blob
+ * @task: the task with the blob
+ *
+ * Smack isn't using copies of blobs. Everyone
+ * points to an immutable list. The blobs never go away.
+ * There is no leak here.
+ */
+static void smack_task_free_security(struct task_struct *task)
+{
+	task->security = NULL;
+}
+
+/**
+ * smack_task_setpgid - Smack check on setting pgid
+ * @p: the task object
+ * @pgid: unused
+ *
+ * Return 0 if write access is permitted, error code otherwise
+ */
+static int smack_task_setpgid(struct task_struct *p, pid_t pgid)
+{
+	return smk_curacc(p->security, MAY_WRITE);
+}
+
+/**
+ * smack_task_getpgid - Smack access check for getpgid
+ * @p: the object task
+ *
+ * Returns 0 if current can read the object task, error code otherwise
+ */
+static int smack_task_getpgid(struct task_struct *p)
+{
+	return smk_curacc(p->security, MAY_READ);
+}
+
+/**
+ * smack_task_getsid - Smack access check for getsid
+ * @p: the object task
+ *
+ * Returns 0 if current can read the object task, error code otherwise
+ */
+static int smack_task_getsid(struct task_struct *p)
+{
+	return smk_curacc(p->security, MAY_READ);
+}
+
+/**
+ * smack_task_getsecid - get the secid of the task
+ * @p: the object task
+ * @secid: where to put the result
+ *
+ * Sets the secid to contain a u32 version of the smack label.
+ */
+static void smack_task_getsecid(struct task_struct *p, u32 *secid)
+{
+	*secid = smack_to_secid(p->security);
+}
+
+/**
+ * smack_task_setnice - Smack check on setting nice
+ * @p: the task object
+ * @nice: unused
+ *
+ * Return 0 if write access is permitted
+ */
+static int smack_task_setnice(struct task_struct *p, int nice)
+{
+	return smk_curacc(p->security, MAY_WRITE);
+}
+
+/**
+ * smack_task_setioprio - Smack check on setting ioprio
+ * @p: the task object
+ * @ioprio: unused
+ *
+ * Return 0 if write access is permitted
+ */
+static int smack_task_setioprio(struct task_struct *p, int ioprio)
+{
+	return smk_curacc(p->security, MAY_WRITE);
+}
+
+/**
+ * smack_task_getioprio - Smack check on reading ioprio
+ * @p: the task object
+ *
+ * Return 0 if read access is permitted
+ */
+static int smack_task_getioprio(struct task_struct *p)
+{
+	return smk_curacc(p->security, MAY_READ);
+}
+
+/**
+ * smack_task_setscheduler - Smack check on setting scheduler
+ * @p: the task object
+ * @policy: unused
+ * @lp: unused
+ *
+ * Return 0 if write access is permitted
+ */
+static int smack_task_setscheduler(struct task_struct *p, int policy,
+				   struct sched_param *lp)
+{
+	return smk_curacc(p->security, MAY_WRITE);
+}
+
+/**
+ * smack_task_getscheduler - Smack check on reading scheduler
+ * @p: the task object
+ *
+ * Return 0 if read access is permitted
+ */
+static int smack_task_getscheduler(struct task_struct *p)
+{
+	return smk_curacc(p->security, MAY_READ);
+}
+
+/**
+ * smack_task_movememory - Smack check on moving memory
+ * @p: the task object
+ *
+ * Return 0 if write access is permitted
+ */
+static int smack_task_movememory(struct task_struct *p)
+{
+	return smk_curacc(p->security, MAY_WRITE);
+}
+
+/**
+ * smack_task_kill - Smack check on signal delivery
+ * @p: the task object
+ * @info: signal information; signals sent by the kernel are let through
+ * @sig: unused
+ * @secid: identifies the smack to use in lieu of current's
+ *
+ * Return 0 if write access is permitted
+ *
+ * The secid behavior is an
artifact of an SELinux hack
+ * in the USB code. Someday it may go away.
+ */
+static int smack_task_kill(struct task_struct *p, struct siginfo *info,
+			   int sig, u32 secid)
+{
+	/*
+	 * Special cases where signals really ought to go through
+	 * in spite of policy. Stephen Smalley suggests it may
+	 * make sense to change the caller so that it doesn't
+	 * bother with the LSM hook in these cases.
+	 */
+	if (info != SEND_SIG_NOINFO &&
+	    (is_si_special(info) || SI_FROMKERNEL(info)))
+		return 0;
+	/*
+	 * Sending a signal requires that the sender
+	 * can write the receiver.
+	 */
+	if (secid == 0)
+		return smk_curacc(p->security, MAY_WRITE);
+	/*
+	 * If the secid isn't 0 we're dealing with some USB IO
+	 * specific behavior. This is not clean. For one thing
+	 * we can't take privilege into account.
+	 */
+	return smk_access(smack_from_secid(secid), p->security, MAY_WRITE);
+}
+
+/**
+ * smack_task_wait - Smack access check for waiting
+ * @p: task to wait for
+ *
+ * Returns 0 if current can wait for p, error code otherwise
+ */
+static int smack_task_wait(struct task_struct *p)
+{
+	int rc;
+
+	rc = smk_access(current->security, p->security, MAY_WRITE);
+	if (rc == 0)
+		return 0;
+
+	/*
+	 * Allow the operation to succeed if either task
+	 * has privilege to perform operations that might
+	 * account for the smack labels having gotten to
+	 * be different in the first place.
+	 *
+	 * This breaks the strict subject/object access
+	 * control ideal, taking the object's privilege
+	 * state into account in the decision as well as
+	 * the smack value.
+	 */
+	if (capable(CAP_MAC_OVERRIDE) || __capable(p, CAP_MAC_OVERRIDE))
+		return 0;
+
+	return rc;
+}
+
+/**
+ * smack_task_to_inode - copy task smack into the inode blob
+ * @p: task to copy from
+ * @inode: inode to copy to
+ *
+ * Sets the smack pointer in the inode security blob
+ */
+static void smack_task_to_inode(struct task_struct *p, struct inode *inode)
+{
+	struct inode_smack *isp = inode->i_security;
+
+	isp->smk_inode = p->security;
+}
+
+/*
+ * Socket hooks.
+ */
+
+/**
+ * smack_sk_alloc_security - Allocate a socket blob
+ * @sk: the socket
+ * @family: unused
+ * @gfp_flags: memory allocation flags
+ *
+ * Assign Smack pointers to current: both the inbound and the
+ * outbound label start out as the label of the current task.
+ *
+ * Returns 0 on success, -ENOMEM if there's no memory
+ */
+static int smack_sk_alloc_security(struct sock *sk, int family, gfp_t gfp_flags)
+{
+	char *csp = current->security;
+	struct socket_smack *ssp;
+
+	ssp = kzalloc(sizeof(struct socket_smack), gfp_flags);
+	if (ssp == NULL)
+		return -ENOMEM;
+
+	ssp->smk_in = csp;
+	ssp->smk_out = csp;
+	ssp->smk_packet[0] = '\0';
+
+	sk->sk_security = ssp;
+
+	return 0;
+}
+
+/**
+ * smack_sk_free_security - Free a socket blob
+ * @sk: the socket
+ *
+ * Frees the blob and clears the blob pointer, as the superblock
+ * and inode free hooks do, so no stale pointer is left behind.
+ */
+static void smack_sk_free_security(struct sock *sk)
+{
+	kfree(sk->sk_security);
+	sk->sk_security = NULL;
+}
+
+/**
+ * smack_set_catset - convert a capset to netlabel mls categories
+ * @catset: the Smack categories
+ * @sap: where to put the netlabel categories
+ *
+ * Allocates and fills attr.mls.cat. Bit N of @catset, counting
+ * from the most significant bit of the first byte, becomes
+ * category N + 1. SMK_LABELLEN bytes of @catset are examined.
+ *
+ * If the category map cannot be allocated @sap is left without
+ * categories and without NETLBL_SECATTR_MLS_CAT set. There is no
+ * return value to report that with.
+ * NOTE(review): callers cannot tell this happened; consider
+ * returning an error code.
+ */
+static void smack_set_catset(char *catset, struct netlbl_lsm_secattr *sap)
+{
+	unsigned char *cp;
+	unsigned char m;
+	int cat;
+	int rc;
+	int byte;
+
+	if (catset == NULL)
+		return;
+
+	/*
+	 * Only advertise categories once there is a map to hold
+	 * them; the allocation is atomic and may well fail.
+	 */
+	sap->attr.mls.cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC);
+	if (sap->attr.mls.cat == NULL)
+		return;
+	sap->flags |= NETLBL_SECATTR_MLS_CAT;
+	sap->attr.mls.cat->startbit = 0;
+
+	for (cat = 1, cp = (unsigned char *)catset, byte = 0;
+	     byte < SMK_LABELLEN; cp++, byte++)
+		for (m = 0x80; m != 0; m >>= 1, cat++) {
+			if ((m & *cp) == 0)
+				continue;
+			rc =
netlbl_secattr_catmap_setbit(sap->attr.mls.cat,
+							  cat, GFP_ATOMIC);
+		}
+}
+
+/**
+ * smack_to_secattr - fill a secattr from a smack value
+ * @smack: the smack value
+ * @nlsp: where the result goes
+ *
+ * Casey says that CIPSO is good enough for now.
+ * It can be used to effect.
+ * It can also be abused to effect when necessary.
+ * Apologies to the TSIG group in general and GW in particular.
+ *
+ * If the label has a CIPSO mapping its level and categories are
+ * used. Otherwise the level is smack_cipso_direct and the label
+ * itself is passed along as the category set. Nothing is filled
+ * in unless smack_net_nltype is NETLBL_NLTYPE_CIPSOV4.
+ */
+static void smack_to_secattr(char *smack, struct netlbl_lsm_secattr *nlsp)
+{
+	struct smack_cipso cipso;
+	int rc;
+
+	switch (smack_net_nltype) {
+	case NETLBL_NLTYPE_CIPSOV4:
+		nlsp->domain = NULL;
+		nlsp->flags = NETLBL_SECATTR_DOMAIN;
+		nlsp->flags |= NETLBL_SECATTR_MLS_LVL;
+
+		rc = smack_to_cipso(smack, &cipso);
+		if (rc == 0) {
+			nlsp->attr.mls.lvl = cipso.smk_level;
+			smack_set_catset(cipso.smk_catset, nlsp);
+		} else {
+			nlsp->attr.mls.lvl = smack_cipso_direct;
+			smack_set_catset(smack, nlsp);
+		}
+		break;
+	default:
+		break;
+	}
+}
+
+/**
+ * smack_netlabel - Set the secattr on a socket
+ * @sk: the socket
+ *
+ * Convert the outbound smack value (smk_out) to a
+ * secattr and attach it to the socket.
+ *
+ * Returns 0 on success or an error code
+ */
+static int smack_netlabel(struct sock *sk)
+{
+	struct socket_smack *ssp = sk->sk_security;
+	struct netlbl_lsm_secattr secattr;
+	int rc = 0;
+
+	netlbl_secattr_init(&secattr);
+	smack_to_secattr(ssp->smk_out, &secattr);
+	if (secattr.flags != NETLBL_SECATTR_NONE)
+		rc = netlbl_sock_setattr(sk, &secattr);
+
+	netlbl_secattr_destroy(&secattr);
+	return rc;
+}
+
+/**
+ * smack_inode_setsecurity - set smack xattrs
+ * @inode: the object
+ * @name: attribute name
+ * @value: attribute value
+ * @size: size of the attribute
+ * @flags: unused
+ *
+ * Sets the named attribute in the appropriate blob
+ *
+ * Returns 0 on success, or an error code: -EACCES if there is no
+ * value, the value is empty or it is longer than SMK_LABELLEN,
+ * -EINVAL if the value cannot be imported as a label, and
+ * -EOPNOTSUPP if the attribute is not one Smack keeps for this
+ * kind of object.
+ */
+static int smack_inode_setsecurity(struct inode *inode, const char *name,
+				   const void *value, size_t size, int flags)
+{
+	char *sp;
+	struct inode_smack *nsp = inode->i_security;
+	struct socket_smack *ssp;
+	struct socket *sock;
+
+	/*
+	 * An attribute value is a counted buffer, not a string.
+	 * Elsewhere in this file a length of 0 is handed to
+	 * smk_import() only along with NUL terminated strings,
+	 * so an empty value is refused here rather than passed on.
+	 * NOTE(review): confirm against smk_import() how a length
+	 * of 0 is treated.
+	 */
+	if (value == NULL || size == 0 || size > SMK_LABELLEN)
+		return -EACCES;
+
+	sp = smk_import(value, size);
+	if (sp == NULL)
+		return -EINVAL;
+
+	if (strcmp(name, XATTR_SMACK_SUFFIX) == 0) {
+		nsp->smk_inode = sp;
+		return 0;
+	}
+	/*
+	 * The rest of the Smack xattrs are only on sockets.
+	 */
+	if (inode->i_sb->s_magic != SOCKFS_MAGIC)
+		return -EOPNOTSUPP;
+
+	sock = SOCKET_I(inode);
+	if (sock == NULL)
+		return -EOPNOTSUPP;
+
+	ssp = sock->sk->sk_security;
+
+	if (strcmp(name, XATTR_SMACK_IPIN) == 0)
+		ssp->smk_in = sp;
+	else if (strcmp(name, XATTR_SMACK_IPOUT) == 0) {
+		ssp->smk_out = sp;
+		return smack_netlabel(sock->sk);
+	} else
+		return -EOPNOTSUPP;
+
+	return 0;
+}
+
+/**
+ * smack_socket_post_create - finish socket setup
+ * @sock: the socket
+ * @family: protocol family
+ * @type: unused
+ * @protocol: unused
+ * @kern: unused
+ *
+ * Sets the netlabel information on the socket. Only PF_INET
+ * sockets are labeled; every other family is left alone.
+ *
+ * Returns 0 on success, an error code otherwise
+ */
+static int smack_socket_post_create(struct socket *sock, int family,
+				    int type, int protocol, int kern)
+{
+	if (family != PF_INET)
+		return 0;
+	/*
+	 * Set the outbound netlbl.
+	 */
+	return smack_netlabel(sock->sk);
+}
+
+/**
+ * smack_flags_to_may - convert S_ to MAY_ values
+ * @flags: the S_ value
+ *
+ * Any read, write or execute bit, whether for user, group or
+ * other, asks for the matching MAY_ access.
+ *
+ * Returns the equivalent MAY_ value
+ */
+static int smack_flags_to_may(int flags)
+{
+	int may = 0;
+
+	if (flags & S_IRUGO)
+		may |= MAY_READ;
+	if (flags & S_IWUGO)
+		may |= MAY_WRITE;
+	if (flags & S_IXUGO)
+		may |= MAY_EXEC;
+
+	return may;
+}
+
+/**
+ * smack_msg_msg_alloc_security - Set the security blob for msg_msg
+ * @msg: the object
+ *
+ * The message gets the label pointer of the current task.
+ *
+ * Returns 0
+ */
+static int smack_msg_msg_alloc_security(struct msg_msg *msg)
+{
+	msg->security = current->security;
+	return 0;
+}
+
+/**
+ * smack_msg_msg_free_security - Clear the security blob for msg_msg
+ * @msg: the object
+ *
+ * Clears the blob pointer
+ */
+static void smack_msg_msg_free_security(struct msg_msg *msg)
+{
+	msg->security = NULL;
+}
+
+/**
+ * smack_of_shm - the smack pointer for the shm
+ * @shp: the object
+ *
+ * Returns a pointer to the smack value
+ */
+static char *smack_of_shm(struct shmid_kernel *shp)
+{
+	return (char *)shp->shm_perm.security;
+}
+
+/**
+ * smack_shm_alloc_security - Set the security blob for shm
+ *
@shp: the object + * + * Returns 0 + */ +static int smack_shm_alloc_security(struct shmid_kernel *shp) +{ + struct kern_ipc_perm *isp = &shp->shm_perm; + + isp->security = current->security; + return 0; +} + +/** + * smack_shm_free_security - Clear the security blob for shm + * @shp: the object + * + * Clears the blob pointer + */ +static void smack_shm_free_security(struct shmid_kernel *shp) +{ + struct kern_ipc_perm *isp = &shp->shm_perm; + + isp->security = NULL; +} + +/** + * smack_shm_associate - Smack access check for shm + * @shp: the object + * @shmflg: access requested + * + * Returns 0 if current has the requested access, error code otherwise + */ +static int smack_shm_associate(struct shmid_kernel *shp, int shmflg) +{ + char *ssp = smack_of_shm(shp); + int may; + + may = smack_flags_to_may(shmflg); + return smk_curacc(ssp, may); +} + +/** + * smack_shm_shmctl - Smack access check for shm + * @shp: the object + * @cmd: what it wants to do + * + * Returns 0 if current has the requested access, error code otherwise + */ +static int smack_shm_shmctl(struct shmid_kernel *shp, int cmd) +{ + char *ssp = smack_of_shm(shp); + int may; + + switch (cmd) { + case IPC_STAT: + case SHM_STAT: + may = MAY_READ; + break; + case IPC_SET: + case SHM_LOCK: + case SHM_UNLOCK: + case IPC_RMID: + may = MAY_READWRITE; + break; + case IPC_INFO: + case SHM_INFO: + /* + * System level information. 
+ */ + return 0; + default: + return -EINVAL; + } + + return smk_curacc(ssp, may); +} + +/** + * smack_shm_shmat - Smack access for shmat + * @shp: the object + * @shmaddr: unused + * @shmflg: access requested + * + * Returns 0 if current has the requested access, error code otherwise + */ +static int smack_shm_shmat(struct shmid_kernel *shp, char __user *shmaddr, + int shmflg) +{ + char *ssp = smack_of_shm(shp); + int may; + + may = smack_flags_to_may(shmflg); + return smk_curacc(ssp, may); +} + +/** + * smack_of_sem - the smack pointer for the sem + * @sma: the object + * + * Returns a pointer to the smack value + */ +static char *smack_of_sem(struct sem_array *sma) +{ + return (char *)sma->sem_perm.security; +} + +/** + * smack_sem_alloc_security - Set the security blob for sem + * @sma: the object + * + * Returns 0 + */ +static int smack_sem_alloc_security(struct sem_array *sma) +{ + struct kern_ipc_perm *isp = &sma->sem_perm; + + isp->security = current->security; + return 0; +} + +/** + * smack_sem_free_security - Clear the security blob for sem + * @sma: the object + * + * Clears the blob pointer + */ +static void smack_sem_free_security(struct sem_array *sma) +{ + struct kern_ipc_perm *isp = &sma->sem_perm; + + isp->security = NULL; +} + +/** + * smack_sem_associate - Smack access check for sem + * @sma: the object + * @semflg: access requested + * + * Returns 0 if current has the requested access, error code otherwise + */ +static int smack_sem_associate(struct sem_array *sma, int semflg) +{ + char *ssp = smack_of_sem(sma); + int may; + + may = smack_flags_to_may(semflg); + return smk_curacc(ssp, may); +} + +/** + * smack_sem_semctl - Smack access check for sem + * @sma: the object + * @cmd: what it wants to do + * + * Returns 0 if current has the requested access, error code otherwise + * (-EINVAL for a command that is not recognized) + */ +static int smack_sem_semctl(struct sem_array *sma, int cmd) +{ + char *ssp = smack_of_sem(sma); + int may; + + switch (cmd) { + case GETPID: + case GETNCNT: + case 
GETZCNT: + case GETVAL: + case GETALL: + case IPC_STAT: + case SEM_STAT: + may = MAY_READ; + break; + case SETVAL: + case SETALL: + case IPC_RMID: + case IPC_SET: + may = MAY_READWRITE; + break; + case IPC_INFO: + case SEM_INFO: + /* + * System level information + */ + return 0; + default: + return -EINVAL; + } + + return smk_curacc(ssp, may); +} + +/** + * smack_sem_semop - Smack checks of semaphore operations + * @sma: the object + * @sops: unused + * @nsops: unused + * @alter: unused + * + * Treated as read and write in all cases. + * + * Returns 0 if access is allowed, error code otherwise + */ +static int smack_sem_semop(struct sem_array *sma, struct sembuf *sops, + unsigned nsops, int alter) +{ + char *ssp = smack_of_sem(sma); + + return smk_curacc(ssp, MAY_READWRITE); +} + +/** + * smack_msg_queue_alloc_security - Set the security blob for msg_queue + * @msq: the object + * + * Returns 0 + */ +static int smack_msg_queue_alloc_security(struct msg_queue *msq) +{ + struct kern_ipc_perm *kisp = &msq->q_perm; + + kisp->security = current->security; + return 0; +} + +/** + * smack_msg_queue_free_security - Clear the security blob for msg_queue + * @msq: the object + * + * Clears the blob pointer + */ +static void smack_msg_queue_free_security(struct msg_queue *msq) +{ + struct kern_ipc_perm *kisp = &msq->q_perm; + + kisp->security = NULL; +} + +/** + * smack_of_msq - the smack pointer for the msq + * @msq: the object + * + * Returns a pointer to the smack value + */ +static char *smack_of_msq(struct msg_queue *msq) +{ + return (char *)msq->q_perm.security; +} + +/** + * smack_msg_queue_associate - Smack access check for msg_queue + * @msq: the object + * @msqflg: access requested + * + * Returns 0 if current has the requested access, error code otherwise + */ +static int smack_msg_queue_associate(struct msg_queue *msq, int msqflg) +{ + char *msp = smack_of_msq(msq); + int may; + + may = smack_flags_to_may(msqflg); + return smk_curacc(msp, may); +} + +/** + * smack_msg_queue_msgctl - 
Smack access check for msg_queue + * @msq: the object + * @cmd: what it wants to do + * + * Returns 0 if current has the requested access, error code otherwise + */ +static int smack_msg_queue_msgctl(struct msg_queue *msq, int cmd) +{ + char *msp = smack_of_msq(msq); + int may; + + switch (cmd) { + case IPC_STAT: + case MSG_STAT: + may = MAY_READ; + break; + case IPC_SET: + case IPC_RMID: + may = MAY_READWRITE; + break; + case IPC_INFO: + case MSG_INFO: + /* + * System level information + */ + return 0; + default: + return -EINVAL; + } + + return smk_curacc(msp, may); +} + +/** + * smack_msg_queue_msgsnd - Smack access check for msg_queue + * @msq: the object + * @msg: unused + * @msqflg: access requested + * + * Returns 0 if current has the requested access, error code otherwise + */ +static int smack_msg_queue_msgsnd(struct msg_queue *msq, struct msg_msg *msg, + int msqflg) +{ + char *msp = smack_of_msq(msq); + int rc; + + rc = smack_flags_to_may(msqflg); + return smk_curacc(msp, rc); +} + +/** + * smack_msg_queue_msgrcv - Smack access check for msg_queue + * @msq: the object + * @msg: unused + * @target: unused + * @type: unused + * @mode: unused + * + * Returns 0 if current has read and write access, error code otherwise + */ +static int smack_msg_queue_msgrcv(struct msg_queue *msq, struct msg_msg *msg, + struct task_struct *target, long type, int mode) +{ + char *msp = smack_of_msq(msq); + + return smk_curacc(msp, MAY_READWRITE); +} + +/** + * smack_ipc_permission - Smack access for ipc_permission() + * @ipp: the object permissions + * @flag: access requested + * + * Returns 0 if current has the requested access, error code otherwise + */ +static int smack_ipc_permission(struct kern_ipc_perm *ipp, short flag) +{ + char *isp = ipp->security; + int may; + + may = smack_flags_to_may(flag); + return smk_curacc(isp, may); +} + +/** + * smack_d_instantiate - Make sure the blob is correct on an inode + * @opt_dentry: dentry used to read the xattr, may be NULL + * @inode: the object + * + * Set the 
inode's security blob if it hasn't been done already. + */ +static void smack_d_instantiate(struct dentry *opt_dentry, struct inode *inode) +{ + struct super_block *sbp; + struct superblock_smack *sbsp; + struct inode_smack *isp; + char *csp = current->security; + char *fetched; + char *final; + struct dentry *dp; + + if (inode == NULL) + return; + + isp = inode->i_security; + + mutex_lock(&isp->smk_lock); + /* + * If the inode is already instantiated + * take the quick way out + */ + if (isp->smk_flags & SMK_INODE_INSTANT) + goto unlockandout; + + sbp = inode->i_sb; + sbsp = sbp->s_security; + /* + * We're going to use the superblock default label + * if there's no label on the file. + */ + final = sbsp->smk_default; + + /* + * This is pretty hackish. + * Casey says that we shouldn't have to do + * file system specific code, but it does help + * with keeping it simple. + */ + switch (sbp->s_magic) { + case SMACK_MAGIC: + /* + * Casey says that it's a little embarrassing + * that the smack file system doesn't do + * extended attributes. + */ + final = smack_known_star.smk_known; + break; + case PIPEFS_MAGIC: + /* + * Casey says pipes are easy (?) + */ + final = smack_known_star.smk_known; + break; + case DEVPTS_SUPER_MAGIC: + /* + * devpts seems content with the label of the task. + * Programs that change smack have to treat the + * pty with respect. + */ + final = csp; + break; + case SOCKFS_MAGIC: + /* + * Casey says sockets get the smack of the task. + */ + final = csp; + break; + case PROC_SUPER_MAGIC: + /* + * Casey says procfs appears not to care. + * The superblock default suffices. + */ + break; + case TMPFS_MAGIC: + /* + * Device labels should come from the filesystem, + * but watch out, because they're volatile, + * getting recreated on every reboot. + */ + final = smack_known_star.smk_known; + /* + * No break. 
+ * + * If a smack value has been set we want to use it, + * but since tmpfs isn't giving us the opportunity + * to set mount options simulate setting the + * superblock default. + */ + default: + /* + * This isn't an understood special case. + * Get the value from the xattr. + * + * No xattr support means, alas, no SMACK label. + * Use the aforeapplied default. + * It would be curious if the label of the task + * does not match that assigned. + */ + if (inode->i_op->getxattr == NULL) + break; + /* + * Get the dentry for xattr. + */ + if (opt_dentry == NULL) { + dp = d_find_alias(inode); + if (dp == NULL) + break; + } else { + dp = dget(opt_dentry); + if (dp == NULL) + break; + } + + fetched = smk_fetch(inode, dp); + if (fetched != NULL) + final = fetched; + + dput(dp); + break; + } + + if (final == NULL) + isp->smk_inode = csp; + else + isp->smk_inode = final; + + isp->smk_flags |= SMK_INODE_INSTANT; + +unlockandout: + mutex_unlock(&isp->smk_lock); + return; +} + +/** + * smack_getprocattr - Smack process attribute access + * @p: the object task + * @name: the name of the attribute in /proc/.../attr + * @value: where to put the result + * + * Places a copy of the task Smack into value + * + * Returns the length of the smack label or an error code + */ +static int smack_getprocattr(struct task_struct *p, char *name, char **value) +{ + char *cp; + int slen; + + if (strcmp(name, "current") != 0) + return -EINVAL; + + cp = kstrdup(p->security, GFP_KERNEL); + if (cp == NULL) + return -ENOMEM; + + slen = strlen(cp); + *value = cp; + return slen; +} + +/** + * smack_setprocattr - Smack process attribute setting + * @p: the object task + * @name: the name of the attribute in /proc/.../attr + * @value: the value to set + * @size: the size of the value + * + * Sets the Smack value of the task. 
Only setting self + * is permitted and only with privilege + * + * Returns the length of the smack label or an error code + */ +static int smack_setprocattr(struct task_struct *p, char *name, + void *value, size_t size) +{ + char *newsmack; + + if (!__capable(p, CAP_MAC_ADMIN)) + return -EPERM; + + /* + * Changing another process' Smack value is too dangerous + * and supports no sane use case. + */ + if (p != current) + return -EPERM; + + if (value == NULL || size == 0 || size >= SMK_LABELLEN) + return -EINVAL; + + if (strcmp(name, "current") != 0) + return -EINVAL; + + newsmack = smk_import(value, size); + if (newsmack == NULL) + return -EINVAL; + + p->security = newsmack; + return size; +} + +/** + * smack_unix_stream_connect - Smack access on UDS + * @sock: one socket + * @other: the other socket + * @newsk: unused + * + * Return 0 if a subject with the smack of sock could access + * an object with the smack of other, otherwise an error code + */ +static int smack_unix_stream_connect(struct socket *sock, + struct socket *other, struct sock *newsk) +{ + struct inode *sp = SOCK_INODE(sock); + struct inode *op = SOCK_INODE(other); + + return smk_access(smk_of_inode(sp), smk_of_inode(op), MAY_READWRITE); +} + +/** + * smack_unix_may_send - Smack access on UDS + * @sock: one socket + * @other: the other socket + * + * Return 0 if a subject with the smack of sock could access + * an object with the smack of other, otherwise an error code + */ +static int smack_unix_may_send(struct socket *sock, struct socket *other) +{ + struct inode *sp = SOCK_INODE(sock); + struct inode *op = SOCK_INODE(other); + + return smk_access(smk_of_inode(sp), smk_of_inode(op), MAY_WRITE); +} + +/** + * smack_from_secattr - Convert a netlabel attr.mls.lvl/attr.mls.cat + * pair to smack + * @sap: netlabel secattr + * @sip: where to put the result + * + * Copies a smack label into sip + */ +static void smack_from_secattr(struct netlbl_lsm_secattr *sap, char *sip) +{ + char smack[SMK_LABELLEN]; 
+ int pcat; + + if ((sap->flags & NETLBL_SECATTR_MLS_LVL) == 0) { + /* + * If there are flags but no level netlabel isn't + * behaving the way we expect it to. + * + * Without guidance regarding the smack value + * for the packet fall back on the network + * ambient value. + */ + strncpy(sip, smack_net_ambient, SMK_MAXLEN); + return; + } + /* + * Get the categories, if any + */ + memset(smack, '\0', SMK_LABELLEN); + if ((sap->flags & NETLBL_SECATTR_MLS_CAT) != 0) + for (pcat = -1;;) { + pcat = netlbl_secattr_catmap_walk(sap->attr.mls.cat, + pcat + 1); + if (pcat < 0) + break; + smack_catset_bit(pcat, smack); + } + /* + * If it is CIPSO using smack direct mapping + * we are already done. WeeHee. + */ + if (sap->attr.mls.lvl == smack_cipso_direct) { + memcpy(sip, smack, SMK_MAXLEN); + return; + } + /* + * Look it up in the supplied table if it is not a direct mapping. + */ + smack_from_cipso(sap->attr.mls.lvl, smack, sip); + return; +} + +/** + * smack_socket_sock_rcv_skb - Smack packet delivery access check + * @sk: socket + * @skb: packet + * + * Returns 0 if the packet should be delivered, an error code otherwise + */ +static int smack_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb) +{ + struct netlbl_lsm_secattr secattr; + struct socket_smack *ssp = sk->sk_security; + char smack[SMK_LABELLEN]; + int rc; + + if (sk->sk_family != PF_INET && sk->sk_family != PF_INET6) + return 0; + + /* + * Translate what netlabel gave us. + */ + memset(smack, '\0', SMK_LABELLEN); + netlbl_secattr_init(&secattr); + rc = netlbl_skbuff_getattr(skb, sk->sk_family, &secattr); + if (rc == 0) + smack_from_secattr(&secattr, smack); + else + strncpy(smack, smack_net_ambient, SMK_MAXLEN); + netlbl_secattr_destroy(&secattr); + /* + * Receiving a packet requires that the other end + * be able to write here. Read access is not required. + * This is the simplest possible security model + * for networking. 
+ */ + return smk_access(smack, ssp->smk_in, MAY_WRITE); +} + +/** + * smack_socket_getpeersec_stream - pull in packet label + * @sock: the socket + * @optval: user's destination + * @optlen: size thereof + * @len: max thereof + * + * Returns zero on success, an error code otherwise + */ +static int smack_socket_getpeersec_stream(struct socket *sock, + char __user *optval, + int __user *optlen, unsigned len) +{ + struct socket_smack *ssp; + int slen; + int rc = 0; + + ssp = sock->sk->sk_security; + slen = strlen(ssp->smk_packet) + 1; + + if (slen > len) + rc = -ERANGE; + else if (copy_to_user(optval, ssp->smk_packet, slen) != 0) + rc = -EFAULT; + + if (put_user(slen, optlen) != 0) + rc = -EFAULT; + + return rc; +} + + +/** + * smack_socket_getpeersec_dgram - pull in packet label + * @sock: the socket + * @skb: packet data + * @secid: pointer to where to put the secid of the packet + * + * Translates the netlabel attributes of the packet into a + * Smack label and stores the matching secid in @secid. + * + * Returns 0 on success, an error code otherwise + */ +static int smack_socket_getpeersec_dgram(struct socket *sock, + struct sk_buff *skb, u32 *secid) + +{ + struct netlbl_lsm_secattr secattr; + struct sock *sk; + char smack[SMK_LABELLEN]; + int family = PF_INET; + u32 s; + int rc; + + /* + * Only works for families with packets. + */ + if (sock != NULL) { + sk = sock->sk; + if (sk->sk_family != PF_INET && sk->sk_family != PF_INET6) + return 0; + family = sk->sk_family; + } + /* + * Translate what netlabel gave us. 
+ */ + memset(smack, '\0', SMK_LABELLEN); + netlbl_secattr_init(&secattr); + rc = netlbl_skbuff_getattr(skb, family, &secattr); + if (rc == 0) + smack_from_secattr(&secattr, smack); + netlbl_secattr_destroy(&secattr); + + /* + * Give up if we couldn't get anything + */ + if (rc != 0) + return rc; + + s = smack_to_secid(smack); + if (s == 0) + return -EINVAL; + + *secid = s; + return 0; +} + +/** + * smack_sock_graft - graft access state between two sockets + * @sk: fresh sock + * @parent: donor socket + * + * Sets the netlabel socket state on sk from parent + */ +static void smack_sock_graft(struct sock *sk, struct socket *parent) +{ + struct socket_smack *ssp; + int rc; + + if (sk == NULL) + return; + + if (sk->sk_family != PF_INET && sk->sk_family != PF_INET6) + return; + + ssp = sk->sk_security; + ssp->smk_in = current->security; + ssp->smk_out = current->security; + ssp->smk_packet[0] = '\0'; + + rc = smack_netlabel(sk); +} + +/** + * smack_inet_conn_request - Smack access check on connect + * @sk: socket involved + * @skb: packet + * @req: unused + * + * Returns 0 if a task with the packet label could write to + * the socket, otherwise an error code + */ +static int smack_inet_conn_request(struct sock *sk, struct sk_buff *skb, + struct request_sock *req) +{ + struct netlbl_lsm_secattr skb_secattr; + struct socket_smack *ssp = sk->sk_security; + char smack[SMK_LABELLEN]; + int rc; + + if (skb == NULL) + return -EACCES; + + memset(smack, '\0', SMK_LABELLEN); + netlbl_secattr_init(&skb_secattr); + rc = netlbl_skbuff_getattr(skb, sk->sk_family, &skb_secattr); + if (rc == 0) + smack_from_secattr(&skb_secattr, smack); + else + strncpy(smack, smack_known_huh.smk_known, SMK_MAXLEN); + netlbl_secattr_destroy(&skb_secattr); + /* + * Receiving a packet requires that the other end + * be able to write here. Read access is not required. + * + * If the request is successful save the peer's label + * so that SO_PEERCRED can report it. 
+ */ + rc = smk_access(smack, ssp->smk_in, MAY_WRITE); + if (rc == 0) + strncpy(ssp->smk_packet, smack, SMK_MAXLEN); + + return rc; +} + +/* + * Key management security hooks + * + * Casey has not tested key support very heavily. + * The permission check is most likely too restrictive. + * If you care about keys please have a look. + */ +#ifdef CONFIG_KEYS + +/** + * smack_key_alloc - Set the key security blob + * @key: object + * @tsk: the task associated with the key + * @flags: unused + * + * No allocation required + * + * Returns 0 + */ +static int smack_key_alloc(struct key *key, struct task_struct *tsk, + unsigned long flags) +{ + key->security = tsk->security; + return 0; +} + +/** + * smack_key_free - Clear the key security blob + * @key: the object + * + * Clear the blob pointer + */ +static void smack_key_free(struct key *key) +{ + key->security = NULL; +} + +/* + * smack_key_permission - Smack access on a key + * @key_ref: gets to the object + * @context: task involved + * @perm: unused + * + * Return 0 if the task has read and write to the object, + * an error code otherwise + */ +static int smack_key_permission(key_ref_t key_ref, + struct task_struct *context, key_perm_t perm) +{ + struct key *keyp; + + keyp = key_ref_to_ptr(key_ref); + if (keyp == NULL) + return -EINVAL; + /* + * If the key hasn't been initialized give it access so that + * it may do so. + */ + if (keyp->security == NULL) + return 0; + /* + * This should not occur + */ + if (context->security == NULL) + return -EACCES; + + return smk_access(context->security, keyp->security, MAY_READWRITE); +} +#endif /* CONFIG_KEYS */ + +/* + * smack_secid_to_secctx - return the smack label for a secid + * @secid: incoming integer + * @secdata: destination + * @seclen: how long it is + * + * Exists for networking code. 
+ */ +static int smack_secid_to_secctx(u32 secid, char **secdata, u32 *seclen) +{ + char *sp = smack_from_secid(secid); + + *secdata = sp; + *seclen = strlen(sp); + return 0; +} + +/* + * smack_release_secctx - don't do anything. + * @secdata: unused + * @seclen: unused + * + * Exists to make sure nothing gets done, and properly + */ +static void smack_release_secctx(char *secdata, u32 seclen) +{ +} + +static struct security_operations smack_ops = { + .ptrace = smack_ptrace, + .capget = cap_capget, + .capset_check = cap_capset_check, + .capset_set = cap_capset_set, + .capable = cap_capable, + .syslog = smack_syslog, + .settime = cap_settime, + .vm_enough_memory = cap_vm_enough_memory, + + .bprm_apply_creds = cap_bprm_apply_creds, + .bprm_set_security = cap_bprm_set_security, + .bprm_secureexec = cap_bprm_secureexec, + + .sb_alloc_security = smack_sb_alloc_security, + .sb_free_security = smack_sb_free_security, + .sb_copy_data = smack_sb_copy_data, + .sb_kern_mount = smack_sb_kern_mount, + .sb_statfs = smack_sb_statfs, + .sb_mount = smack_sb_mount, + .sb_umount = smack_sb_umount, + + .inode_alloc_security = smack_inode_alloc_security, + .inode_free_security = smack_inode_free_security, + .inode_init_security = smack_inode_init_security, + .inode_link = smack_inode_link, + .inode_unlink = smack_inode_unlink, + .inode_rmdir = smack_inode_rmdir, + .inode_rename = smack_inode_rename, + .inode_permission = smack_inode_permission, + .inode_setattr = smack_inode_setattr, + .inode_getattr = smack_inode_getattr, + .inode_setxattr = smack_inode_setxattr, + .inode_post_setxattr = smack_inode_post_setxattr, + .inode_getxattr = smack_inode_getxattr, + .inode_removexattr = smack_inode_removexattr, + .inode_getsecurity = smack_inode_getsecurity, + .inode_setsecurity = smack_inode_setsecurity, + .inode_listsecurity = smack_inode_listsecurity, + + .file_permission = smack_file_permission, + .file_alloc_security = smack_file_alloc_security, + .file_free_security = 
smack_file_free_security, + .file_ioctl = smack_file_ioctl, + .file_lock = smack_file_lock, + .file_fcntl = smack_file_fcntl, + .file_set_fowner = smack_file_set_fowner, + .file_send_sigiotask = smack_file_send_sigiotask, + .file_receive = smack_file_receive, + + .task_alloc_security = smack_task_alloc_security, + .task_free_security = smack_task_free_security, + .task_post_setuid = cap_task_post_setuid, + .task_setpgid = smack_task_setpgid, + .task_getpgid = smack_task_getpgid, + .task_getsid = smack_task_getsid, + .task_getsecid = smack_task_getsecid, + .task_setnice = smack_task_setnice, + .task_setioprio = smack_task_setioprio, + .task_getioprio = smack_task_getioprio, + .task_setscheduler = smack_task_setscheduler, + .task_getscheduler = smack_task_getscheduler, + .task_movememory = smack_task_movememory, + .task_kill = smack_task_kill, + .task_wait = smack_task_wait, + .task_reparent_to_init = cap_task_reparent_to_init, + .task_to_inode = smack_task_to_inode, + + .ipc_permission = smack_ipc_permission, + + .msg_msg_alloc_security = smack_msg_msg_alloc_security, + .msg_msg_free_security = smack_msg_msg_free_security, + + .msg_queue_alloc_security = smack_msg_queue_alloc_security, + .msg_queue_free_security = smack_msg_queue_free_security, + .msg_queue_associate = smack_msg_queue_associate, + .msg_queue_msgctl = smack_msg_queue_msgctl, + .msg_queue_msgsnd = smack_msg_queue_msgsnd, + .msg_queue_msgrcv = smack_msg_queue_msgrcv, + + .shm_alloc_security = smack_shm_alloc_security, + .shm_free_security = smack_shm_free_security, + .shm_associate = smack_shm_associate, + .shm_shmctl = smack_shm_shmctl, + .shm_shmat = smack_shm_shmat, + + .sem_alloc_security = smack_sem_alloc_security, + .sem_free_security = smack_sem_free_security, + .sem_associate = smack_sem_associate, + .sem_semctl = smack_sem_semctl, + .sem_semop = smack_sem_semop, + + .netlink_send = cap_netlink_send, + .netlink_recv = cap_netlink_recv, + + .d_instantiate = smack_d_instantiate, + + .getprocattr 
= smack_getprocattr, + .setprocattr = smack_setprocattr, + + .unix_stream_connect = smack_unix_stream_connect, + .unix_may_send = smack_unix_may_send, + + .socket_post_create = smack_socket_post_create, + .socket_sock_rcv_skb = smack_socket_sock_rcv_skb, + .socket_getpeersec_stream = smack_socket_getpeersec_stream, + .socket_getpeersec_dgram = smack_socket_getpeersec_dgram, + .sk_alloc_security = smack_sk_alloc_security, + .sk_free_security = smack_sk_free_security, + .sock_graft = smack_sock_graft, + .inet_conn_request = smack_inet_conn_request, + /* key management security hooks */ +#ifdef CONFIG_KEYS + .key_alloc = smack_key_alloc, + .key_free = smack_key_free, + .key_permission = smack_key_permission, +#endif /* CONFIG_KEYS */ + .secid_to_secctx = smack_secid_to_secctx, + .release_secctx = smack_release_secctx, +}; + +/** + * smack_init - initialize the smack system + * + * Returns 0 + */ +static __init int smack_init(void) +{ + printk(KERN_INFO "Smack: Initializing.\n"); + + /* + * Set the security state for the initial task. + */ + current->security = &smack_known_floor.smk_known; + + /* + * Initialize locks + */ + spin_lock_init(&smack_known_unset.smk_cipsolock); + spin_lock_init(&smack_known_huh.smk_cipsolock); + spin_lock_init(&smack_known_hat.smk_cipsolock); + spin_lock_init(&smack_known_star.smk_cipsolock); + spin_lock_init(&smack_known_floor.smk_cipsolock); + spin_lock_init(&smack_known_invalid.smk_cipsolock); + + /* + * Register with LSM + */ + if (register_security(&smack_ops)) + panic("smack: Unable to register with kernel.\n"); + + return 0; +} + +/* + * Smack requires early initialization in order to label + * all processes and objects when they are created. 
+ */ +security_initcall(smack_init); + diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c new file mode 100644 index 0000000..15aa37f --- /dev/null +++ b/security/smack/smackfs.c @@ -0,0 +1,981 @@ +/* + * Copyright (C) 2007 Casey Schaufler <casey@schaufler-ca.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, version 2. + * + * Authors: + * Casey Schaufler <casey@schaufler-ca.com> + * Ahmed S. Darwish <darwish.07@gmail.com> + * + * Special thanks to the authors of selinuxfs. + * + * Karl MacMillan <kmacmillan@tresys.com> + * James Morris <jmorris@redhat.com> + * + */ + +#include <linux/kernel.h> +#include <linux/vmalloc.h> +#include <linux/security.h> +#include <linux/mutex.h> +#include <net/netlabel.h> +#include <net/cipso_ipv4.h> +#include <linux/seq_file.h> +#include <linux/ctype.h> +#include "smack.h" + +/* + * smackfs pseudo filesystem. + */ + +enum smk_inos { + SMK_ROOT_INO = 2, + SMK_LOAD = 3, /* load policy */ + SMK_CIPSO = 4, /* load label -> CIPSO mapping */ + SMK_DOI = 5, /* CIPSO DOI */ + SMK_DIRECT = 6, /* CIPSO level indicating direct label */ + SMK_AMBIENT = 7, /* internet ambient label */ + SMK_NLTYPE = 8, /* label scheme to use by default */ +}; + +/* + * List locks + */ +static DEFINE_MUTEX(smack_list_lock); +static DEFINE_MUTEX(smack_cipso_lock); + +/* + * This is the "ambient" label for network traffic. + * If it isn't somehow marked, use this. + * It can be reset via smackfs/ambient + */ +char *smack_net_ambient = smack_known_floor.smk_known; + +/* + * This is the default packet marking scheme for network traffic. + * It can be reset via smackfs/nltype + */ +int smack_net_nltype = NETLBL_NLTYPE_CIPSOV4; + +/* + * This is the level in a CIPSO header that indicates a + * smack label is contained directly in the category set. 
+ * It can be reset via smackfs/direct + */ +int smack_cipso_direct = SMACK_CIPSO_DIRECT_DEFAULT; + +static int smk_cipso_doi_value = SMACK_CIPSO_DOI_DEFAULT; +struct smk_list_entry *smack_list; + +#define SEQ_READ_FINISHED 1 + +/* + * Disable concurrent writing open() operations + */ +static struct semaphore smack_write_sem; + +/* + * Values for parsing cipso rules + * SMK_DIGITLEN: Length of a digit field in a rule. + * SMK_CIPSOMIN: Minimum possible cipso rule length. + */ +#define SMK_DIGITLEN 4 +#define SMK_CIPSOMIN (SMK_MAXLEN + 2 * SMK_DIGITLEN) + +/* + * Seq_file read operations for /smack/load + */ + +static void *load_seq_start(struct seq_file *s, loff_t *pos) +{ + if (*pos == SEQ_READ_FINISHED) + return NULL; + + return smack_list; +} + +static void *load_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct smk_list_entry *skp = ((struct smk_list_entry *) v)->smk_next; + + if (skp == NULL) + *pos = SEQ_READ_FINISHED; + + return skp; +} + +/* + * Print one rule in the format: + * subject object access + * where access is built from the letters r, w, x and a, + * or is a single '-' when the rule grants no access. + */ +static int load_seq_show(struct seq_file *s, void *v) +{ + struct smk_list_entry *slp = (struct smk_list_entry *) v; + struct smack_rule *srp = &slp->smk_rule; + + seq_printf(s, "%s %s", (char *)srp->smk_subject, + (char *)srp->smk_object); + + seq_putc(s, ' '); + + if (srp->smk_access & MAY_READ) + seq_putc(s, 'r'); + if (srp->smk_access & MAY_WRITE) + seq_putc(s, 'w'); + if (srp->smk_access & MAY_EXEC) + seq_putc(s, 'x'); + if (srp->smk_access & MAY_APPEND) + seq_putc(s, 'a'); + if (srp->smk_access == 0) + seq_putc(s, '-'); + + seq_putc(s, '\n'); + + return 0; +} + +static void load_seq_stop(struct seq_file *s, void *v) +{ + /* No-op */ +} + +static struct seq_operations load_seq_ops = { + .start = load_seq_start, + .next = load_seq_next, + .show = load_seq_show, + .stop = load_seq_stop, +}; + +/** + * smk_open_load - open() for /smack/load + * @inode: inode structure representing file + * @file: "load" file pointer + * + * For reading, use load_seq_* seq_file reading operations. 
+ */ +static int smk_open_load(struct inode *inode, struct file *file) +{ + if ((file->f_flags & O_ACCMODE) == O_RDONLY) + return seq_open(file, &load_seq_ops); + + if (down_interruptible(&smack_write_sem)) + return -ERESTARTSYS; + + return 0; +} + +/** + * smk_release_load - release() for /smack/load + * @inode: inode structure representing file + * @file: "load" file pointer + * + * For a reading session, use the seq_file release + * implementation. + * Otherwise, we are at the end of a writing session so + * clean everything up. + */ +static int smk_release_load(struct inode *inode, struct file *file) +{ + if ((file->f_flags & O_ACCMODE) == O_RDONLY) + return seq_release(inode, file); + + up(&smack_write_sem); + return 0; +} + +/** + * smk_set_access - add a rule to the rule list + * @srp: the new rule to add + * + * Looks through the current subject/object/access list for + * the subject/object pair and replaces the access that was + * there. If the pair isn't found add it with the specified + * access. + * + * If no memory is available for a new list entry the rule + * is not added and the list is left unchanged. + */ +static void smk_set_access(struct smack_rule *srp) +{ + struct smk_list_entry *sp; + struct smk_list_entry *newp; + + mutex_lock(&smack_list_lock); + + for (sp = smack_list; sp != NULL; sp = sp->smk_next) + if (sp->smk_rule.smk_subject == srp->smk_subject && + sp->smk_rule.smk_object == srp->smk_object) { + sp->smk_rule.smk_access = srp->smk_access; + break; + } + + if (sp == NULL) { + newp = kzalloc(sizeof(struct smk_list_entry), GFP_KERNEL); + /* + * kzalloc() can fail; never dereference a NULL entry. + */ + if (newp != NULL) { + newp->smk_rule = *srp; + newp->smk_next = smack_list; + smack_list = newp; + } + } + + mutex_unlock(&smack_list_lock); + + return; +} + +/** + * smk_write_load - write() for /smack/load + * @file: file pointer, not actually used + * @buf: where to get the data from + * @count: bytes sent + * @ppos: where to start - must be 0 + * + * Get one smack access rule from above. 
+ * The format is exactly: + * char subject[SMK_LABELLEN] + * char object[SMK_LABELLEN] + * char access[SMK_ACCESSKINDS] + * + * Anything following is commentary and ignored. + * + * writes must be SMK_LABELLEN+SMK_LABELLEN+4 bytes. + */ +#define MINIMUM_LOAD (SMK_LABELLEN + SMK_LABELLEN + SMK_ACCESSKINDS) + +static ssize_t smk_write_load(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct smack_rule rule; + char *data; + int rc = -EINVAL; + + /* + * Must have privilege. + * No partial writes. + * Enough data must be present. + */ + if (!capable(CAP_MAC_ADMIN)) + return -EPERM; + if (*ppos != 0) + return -EINVAL; + if (count < MINIMUM_LOAD) + return -EINVAL; + + data = kzalloc(count, GFP_KERNEL); + if (data == NULL) + return -ENOMEM; + + if (copy_from_user(data, buf, count) != 0) { + rc = -EFAULT; + goto out; + } + + rule.smk_subject = smk_import(data, 0); + if (rule.smk_subject == NULL) + goto out; + + rule.smk_object = smk_import(data + SMK_LABELLEN, 0); + if (rule.smk_object == NULL) + goto out; + + rule.smk_access = 0; + + switch (data[SMK_LABELLEN + SMK_LABELLEN]) { + case '-': + break; + case 'r': + case 'R': + rule.smk_access |= MAY_READ; + break; + default: + goto out; + } + + switch (data[SMK_LABELLEN + SMK_LABELLEN + 1]) { + case '-': + break; + case 'w': + case 'W': + rule.smk_access |= MAY_WRITE; + break; + default: + goto out; + } + + switch (data[SMK_LABELLEN + SMK_LABELLEN + 2]) { + case '-': + break; + case 'x': + case 'X': + rule.smk_access |= MAY_EXEC; + break; + default: + goto out; + } + + switch (data[SMK_LABELLEN + SMK_LABELLEN + 3]) { + case '-': + break; + case 'a': + case 'A': + rule.smk_access |= MAY_READ; + break; + default: + goto out; + } + + smk_set_access(&rule); + rc = count; + +out: + kfree(data); + return rc; +} + +static const struct file_operations smk_load_ops = { + .open = smk_open_load, + .read = seq_read, + .llseek = seq_lseek, + .write = smk_write_load, + .release = smk_release_load, +}; + 
+/**
+ * smk_cipso_doi - initialize the CIPSO domain
+ *
+ * Asks netlabel to delete the mapping for the NULL domain, then
+ * allocates a pass-through (CIPSO_V4_MAP_PASS) DOI definition for
+ * smk_cipso_doi_value whose only valid tag is CIPSO_V4_TAG_RBITMAP
+ * and hands it to netlabel. Failures of either netlabel call are
+ * logged and otherwise ignored; failure to allocate the DOI
+ * definition panics.
+ */
+void smk_cipso_doi(void)
+{
+	int rc;
+	struct cipso_v4_doi *doip;
+	struct netlbl_audit audit_info;
+
+	/*
+	 * Both netlabel calls below are handed &audit_info, so make
+	 * sure they never see uninitialized stack contents.
+	 * NOTE(review): all-zero is a placeholder; whoever knows the
+	 * intended audit identity should fill the fields in properly.
+	 */
+	memset(&audit_info, 0, sizeof(audit_info));
+
+	rc = netlbl_cfg_map_del(NULL, &audit_info);
+	if (rc != 0)
+		printk(KERN_WARNING "%s:%d remove rc = %d\n",
+		       __func__, __LINE__, rc);
+
+	doip = kmalloc(sizeof(struct cipso_v4_doi), GFP_KERNEL);
+	if (doip == NULL)
+		panic("smack: Failed to initialize cipso DOI.\n");
+	doip->map.std = NULL;
+	doip->doi = smk_cipso_doi_value;
+	doip->type = CIPSO_V4_MAP_PASS;
+	doip->tags[0] = CIPSO_V4_TAG_RBITMAP;
+	/* rc is reused as the index for the remaining tag slots. */
+	for (rc = 1; rc < CIPSO_V4_TAG_MAXCNT; rc++)
+		doip->tags[rc] = CIPSO_V4_TAG_INVALID;
+
+	/*
+	 * NOTE(review): doip is not freed here when the add fails;
+	 * confirm who owns it on that path before adding a kfree().
+	 */
+	rc = netlbl_cfg_cipsov4_add_map(doip, NULL, &audit_info);
+	if (rc != 0)
+		printk(KERN_WARNING "%s:%d add rc = %d\n",
+		       __func__, __LINE__, rc);
+}
+
+/*
+ * Seq_file read operations for /smack/cipso
+ */
+
+/*
+ * Start a pass over the label list: always the head of smack_known,
+ * unless a previous pass marked the position SEQ_READ_FINISHED.
+ */
+static void *cipso_seq_start(struct seq_file *s, loff_t *pos)
+{
+	if (*pos == SEQ_READ_FINISHED)
+		return NULL;
+
+	return smack_known;
+}
+
+/*
+ * Advance to the next label that has a cipso mapping. When the list
+ * is exhausted *pos is set to SEQ_READ_FINISHED and NULL is returned.
+ */
+static void *cipso_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+	struct smack_known *skp = ((struct smack_known *) v)->smk_next;
+
+	/*
+	 * Omit labels with no associated cipso value
+	 */
+	while (skp != NULL && !skp->smk_cipso)
+		skp = skp->smk_next;
+
+	if (skp == NULL)
+		*pos = SEQ_READ_FINISHED;
+
+	return skp;
+}
+
+/*
+ * Print cipso labels in format:
+ * label level[/cat[,cat]]
+ *
+ * Categories are numbered from 1, most significant bit of the first
+ * catset byte first. An entry without a cipso mapping prints nothing.
+ */
+static int cipso_seq_show(struct seq_file *s, void *v)
+{
+	struct smack_known  *skp = (struct smack_known *) v;
+	struct smack_cipso *scp = skp->smk_cipso;
+	char *cbp;
+	char sep = '/';
+	int cat = 1;
+	int i;
+	unsigned char m;
+
+	/* cipso_seq_start() may hand us the list head unfiltered. */
+	if (scp == NULL)
+		return 0;
+
+	seq_printf(s, "%s %3d", (char *)&skp->smk_known, scp->smk_level);
+
+	cbp = scp->smk_catset;
+	for (i = 0; i < SMK_LABELLEN; i++)
+		for (m = 0x80; m != 0; m >>= 1) {
+			if (m & cbp[i]) {
+				seq_printf(s, "%c%d", sep, cat);
+				/* '/' before the first category, ',' after. */
+				sep = ',';
+			}
+			cat++;
+		}
+
+	seq_putc(s, '\n');
+
+	return 0;
+}
+
+static void cipso_seq_stop(struct seq_file *s, void *v)
+{
+	/* No-op */
+}
+
+static struct seq_operations cipso_seq_ops = {
+	.start = cipso_seq_start,
+	.stop  = cipso_seq_stop,
+	.next  = cipso_seq_next,
+	.show  = cipso_seq_show,
+};
+
+/**
+ * smk_open_cipso - open() for /smack/cipso
+ * @inode: inode structure representing file
+ * @file: "cipso" file pointer
+ *
+ * Connect our cipso_seq_* operations with /smack/cipso
+ * file_operations
+ *
+ * Returns whatever seq_open() returns.
+ */
+static int smk_open_cipso(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &cipso_seq_ops);
+}
+
+/**
+ * smk_write_cipso - write() for /smack/cipso
+ * @file: file pointer, not actually used
+ * @buf: where to get the data from
+ * @count: bytes sent
+ * @ppos: where to start - must be 0
+ *
+ * Accepts only one cipso rule per write call.
+ * Returns number of bytes written or error code, as appropriate
+ */
+static ssize_t smk_write_cipso(struct file *file, const char __user *buf,
+			       size_t count, loff_t *ppos)
+{
+	struct smack_known *skp;
+	struct smack_cipso *scp = NULL;
+	char mapcatset[SMK_LABELLEN];
+	int maplevel;
+	int cat;
+	int catlen;
+	ssize_t rc = -EINVAL;
+	char *data = NULL;
+	char *rule;
+	int ret;
+	int i;
+
+	/*
+	 * Must have privilege.
+	 * No partial writes.
+	 * Enough data must be present.
+	 */
+	if (!capable(CAP_MAC_ADMIN))
+		return -EPERM;
+	if (*ppos != 0)
+		return -EINVAL;
+	if (count <= SMK_CIPSOMIN)
+		return -EINVAL;
+
+	/* One extra byte so the copy can be NUL terminated for sscanf(). */
+	data = kzalloc(count + 1, GFP_KERNEL);
+	if (data == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(data, buf, count) != 0) {
+		rc = -EFAULT;
+		goto unlockedout;
+	}
+
+	data[count] = '\0';
+	rule = data;
+	/*
+	 * Only allow one writer at a time. Writes should be
+	 * quite rare and small in any case.
+	 */
+	mutex_lock(&smack_cipso_lock);
+
+	skp = smk_import_entry(rule, 0);
+	if (skp == NULL)
+		goto out;
+
+	/*
+	 * The rule is a fixed-width record: a SMK_LABELLEN label, then
+	 * SMK_DIGITLEN wide decimal fields for the level, the category
+	 * count and each category. "%d" also accepts a minus sign, so
+	 * every field is checked against both ends of its range.
+	 */
+	rule += SMK_LABELLEN;
+	ret = sscanf(rule, "%d", &maplevel);
+	if (ret != 1 || maplevel < 0 || maplevel > SMACK_CIPSO_MAXLEVEL)
+		goto out;
+
+	rule += SMK_DIGITLEN;
+	ret = sscanf(rule, "%d", &catlen);
+	if (ret != 1 || catlen < 0 || catlen > SMACK_CIPSO_MAXCATNUM)
+		goto out;
+
+	/* The buffer must be long enough to hold catlen category fields. */
+	if (count <= (SMK_CIPSOMIN + catlen * SMK_DIGITLEN))
+		goto out;
+
+	memset(mapcatset, 0, sizeof(mapcatset));
+
+	for (i = 0; i < catlen; i++) {
+		rule += SMK_DIGITLEN;
+		ret = sscanf(rule, "%d", &cat);
+		/*
+		 * A negative category must never reach the bit setter,
+		 * which writes into the on-stack mapcatset.
+		 * NOTE(review): /smack/cipso reads number categories
+		 * from 1; confirm whether 0 should be rejected too.
+		 */
+		if (ret != 1 || cat < 0 || cat > SMACK_CIPSO_MAXCATVAL)
+			goto out;
+
+		smack_catset_bit(cat, mapcatset);
+	}
+
+	/* Allocate outside the spinlock; GFP_KERNEL may sleep. */
+	if (skp->smk_cipso == NULL) {
+		scp = kzalloc(sizeof(struct smack_cipso), GFP_KERNEL);
+		if (scp == NULL) {
+			rc = -ENOMEM;
+			goto out;
+		}
+	}
+
+	spin_lock_bh(&skp->smk_cipsolock);
+
+	/* Reuse the existing mapping if there is one, else install ours. */
+	if (scp == NULL)
+		scp = skp->smk_cipso;
+	else
+		skp->smk_cipso = scp;
+
+	scp->smk_level = maplevel;
+	memcpy(scp->smk_catset, mapcatset, sizeof(mapcatset));
+
+	spin_unlock_bh(&skp->smk_cipsolock);
+
+	rc = count;
+out:
+	mutex_unlock(&smack_cipso_lock);
+unlockedout:
+	kfree(data);
+	return rc;
+}
+
+static const struct file_operations smk_cipso_ops = {
+	.open           = smk_open_cipso,
+	.read		= seq_read,
+	.llseek         = seq_lseek,
+	.write		= smk_write_cipso,
+	.release        = seq_release,
+};
+
+/**
+ * smk_read_doi - read() for /smack/doi
+ * @filp: file pointer, not actually used
+ * @buf: where to put the result
+ * @count: maximum to send along
+ * @ppos: where to start
+ *
+ * Reports smk_cipso_doi_value as a decimal string. A read at a
+ * nonzero offset returns 0 (end of file).
+ *
+ * Returns number of bytes read or error code, as appropriate
+ */
+static ssize_t smk_read_doi(struct file *filp, char __user *buf,
+			    size_t count, loff_t *ppos)
+{
+	char temp[80];
+	ssize_t rc;
+
+	if (*ppos != 0)
+		return 0;
+
+	sprintf(temp, "%d", smk_cipso_doi_value);
+	rc = simple_read_from_buffer(buf, count, ppos, temp, strlen(temp));
+
+	return rc;
+}
+
+/**
+ * smk_write_doi - write() for /smack/doi
+ * @file: file pointer, not actually used
+ * @buf: where to get the data from
+ * @count: bytes sent
+ * @ppos: where to start
+ *
+ * Parses a decimal integer, stores it in smk_cipso_doi_value and
+ * re-runs smk_cipso_doi() so the new value takes effect.
+ *
+ * Returns number of bytes written or error code, as appropriate
+ */
+static ssize_t smk_write_doi(struct file *file, const char __user *buf,
+			     size_t count, loff_t *ppos)
+{
+	char temp[80];
+	int i;
+
+	if (!capable(CAP_MAC_ADMIN))
+		return -EPERM;
+
+	/* Leave room for the terminator and refuse empty writes. */
+	if (count >= sizeof(temp) || count == 0)
+		return -EINVAL;
+
+	if (copy_from_user(temp, buf, count) != 0)
+		return -EFAULT;
+
+	temp[count] = '\0';
+
+	if (sscanf(temp, "%d", &i) != 1)
+		return -EINVAL;
+
+	smk_cipso_doi_value = i;
+
+	smk_cipso_doi();
+
+	return count;
+}
+
+static const struct file_operations smk_doi_ops = {
+	.read		= smk_read_doi,
+	.write		= smk_write_doi,
+};
+
+/**
+ * smk_read_direct - read() for /smack/direct
+ * @filp: file pointer, not actually used
+ * @buf: where to put the result
+ * @count: maximum to send along
+ * @ppos: where to start
+ *
+ * Reports smack_cipso_direct as a decimal string. A read at a
+ * nonzero offset returns 0 (end of file).
+ *
+ * Returns number of bytes read or error code, as appropriate
+ */
+static ssize_t smk_read_direct(struct file *filp, char __user *buf,
+			       size_t count, loff_t *ppos)
+{
+	char temp[80];
+	ssize_t rc;
+
+	if (*ppos != 0)
+		return 0;
+
+	sprintf(temp, "%d", smack_cipso_direct);
+	rc = simple_read_from_buffer(buf, count, ppos, temp, strlen(temp));
+
+	return rc;
+}
+
+/**
+ * smk_write_direct - write() for /smack/direct
+ * @file: file pointer, not actually used
+ * @buf: where to get the data from
+ * @count: bytes sent
+ * @ppos: where to start
+ *
+ * Parses a decimal integer and stores it in smack_cipso_direct.
+ *
+ * Returns number of bytes written or error code, as appropriate
+ */
+static ssize_t smk_write_direct(struct file *file, const char __user *buf,
+				size_t count, loff_t *ppos)
+{
+	char temp[80];
+	int i;
+
+	if (!capable(CAP_MAC_ADMIN))
+		return -EPERM;
+
+	/* Leave room for the terminator and refuse empty writes. */
+	if (count >= sizeof(temp) || count == 0)
+		return -EINVAL;
+
+	if (copy_from_user(temp, buf, count) != 0)
+		return -EFAULT;
+
+	temp[count] = '\0';
+
+	if (sscanf(temp, "%d", &i) != 1)
+		return -EINVAL;
+
+	smack_cipso_direct = i;
+
+	return count;
+}
+
+static const struct file_operations smk_direct_ops = {
+	.read		= smk_read_direct,
+	.write		= smk_write_direct,
+};
+
+/**
+ * smk_read_ambient - read() for /smack/ambient
+ * @filp: file pointer, not actually used
+ * @buf: where to put the result
+ * @cn: maximum to send along
+ * @ppos: where to start
+ *
+ * Returns number of bytes read or error code, as appropriate
+ */
+static ssize_t smk_read_ambient(struct file *filp, char __user *buf,
+				size_t cn, loff_t *ppos)
+{
+	ssize_t rc;
+	char out[SMK_LABELLEN];
+	int asize;
+
+	if (*ppos != 0)
+		return 0;
+	/*
+	 * Being careful to avoid a problem in the case where
+	 * smack_net_ambient gets changed in midstream.
+	 * Since smack_net_ambient is always set with a value
+	 * from the label list, including initially, and those
+	 * never get freed, the worst case is that the pointer
+	 * gets changed just after this strncpy, in which case
+	 * the value passed up is incorrect. Locking around
+	 * smack_net_ambient wouldn't be any better than this
+	 * copy scheme as by the time the caller got to look
+	 * at the ambient value it would have cleared the lock
+	 * and been changed.
+	 */
+	strncpy(out, smack_net_ambient, SMK_LABELLEN);
+	/*
+	 * strncpy() leaves out unterminated if the source fills all
+	 * SMK_LABELLEN bytes; the strlen() below must never run off
+	 * the end of the stack buffer.
+	 */
+	out[SMK_LABELLEN - 1] = '\0';
+	asize = strlen(out) + 1;
+
+	/* The label and its terminator are handed up in one piece. */
+	if (cn < asize)
+		return -EINVAL;
+
+	rc = simple_read_from_buffer(buf, cn, ppos, out, asize);
+
+	return rc;
+}
+
+/**
+ * smk_write_ambient - write() for /smack/ambient
+ * @file: file pointer, not actually used
+ * @buf: where to get the data from
+ * @count: bytes sent
+ * @ppos: where to start
+ *
+ * Imports the written label and makes it smack_net_ambient.
+ *
+ * Returns number of bytes written or error code, as appropriate
+ */
+static ssize_t smk_write_ambient(struct file *file, const char __user *buf,
+				 size_t count, loff_t *ppos)
+{
+	char in[SMK_LABELLEN];
+	char *smack;
+
+	if (!capable(CAP_MAC_ADMIN))
+		return -EPERM;
+
+	/*
+	 * Refuse empty writes, as the doi and direct files do: nothing
+	 * would be copied into in[], and the import below would be
+	 * handed an uninitialized buffer.
+	 */
+	if (count >= SMK_LABELLEN || count == 0)
+		return -EINVAL;
+
+	if (copy_from_user(in, buf, count) != 0)
+		return -EFAULT;
+
+	/* in[] is not NUL terminated; the length is passed explicitly. */
+	smack = smk_import(in, count);
+	if (smack == NULL)
+		return -EINVAL;
+
+	smack_net_ambient = smack;
+
+	return count;
+}
+
+static const struct file_operations smk_ambient_ops = {
+	.read		= smk_read_ambient,
+	.write		= smk_write_ambient,
+};
+
+/*
+ * One selectable netlabel type: its number, the name reported by
+ * reads of /smack/nltype, and an alias that writes also accept.
+ */
+struct option_names {
+	int	o_number;
+	char	*o_name;
+	char	*o_alias;
+};
+
+/*
+ * NETLBL_NLTYPE_CIPSOV4 is listed twice so that both "cipsov4" and
+ * "cipso" are accepted as aliases for it.
+ */
+static struct option_names netlbl_choices[] = {
+	{ NETLBL_NLTYPE_RIPSO,
+		NETLBL_NLTYPE_RIPSO_NAME,	"ripso" },
+	{ NETLBL_NLTYPE_CIPSOV4,
+		NETLBL_NLTYPE_CIPSOV4_NAME,	"cipsov4" },
+	{ NETLBL_NLTYPE_CIPSOV4,
+		NETLBL_NLTYPE_CIPSOV4_NAME,	"cipso" },
+	{ NETLBL_NLTYPE_CIPSOV6,
+		NETLBL_NLTYPE_CIPSOV6_NAME,	"cipsov6" },
+	{ NETLBL_NLTYPE_UNLABELED,
+		NETLBL_NLTYPE_UNLABELED_NAME,	"unlabeled" },
+};
+
+/**
+ * smk_read_nltype - read() for /smack/nltype
+ * @filp: file pointer, not actually used
+ * @buf: where to put the result
+ * @count: maximum to send along
+ * @ppos: where to start
+ *
+ * Reports the name of the netlbl_choices entry matching
+ * smack_net_nltype, or "unknown" if none matches. Reads asking for
+ * fewer than SMK_LABELLEN bytes are refused with -EINVAL.
+ *
+ * Returns number of bytes read or error code, as appropriate
+ */
+static ssize_t smk_read_nltype(struct file *filp, char __user *buf,
+			       size_t count, loff_t *ppos)
+{
+	char bound[40];
+	ssize_t rc;
+	int i;
+
+	if (count < SMK_LABELLEN)
+		return -EINVAL;
+
+	if (*ppos != 0)
+		return 0;
+
+	sprintf(bound, "unknown");
+
+	for (i = 0; i < ARRAY_SIZE(netlbl_choices); i++)
+		if (smack_net_nltype == netlbl_choices[i].o_number) {
+			sprintf(bound, "%s", netlbl_choices[i].o_name);
+			break;
+		}
+
+	rc = simple_read_from_buffer(buf, count, ppos, bound, strlen(bound));
+
+	return rc;
+}
+
+/**
+ * smk_write_nltype - write() for /smack/nltype
+ * @file: file pointer, not actually used
+ * @buf: where to get the data from
+ * @count: bytes sent
+ * @ppos: where to start
+ *
+ * The written text, cut at the first space or newline, must equal
+ * the name or the alias of a netlbl_choices entry; that entry's
+ * number becomes smack_net_nltype.
+ *
+ * Returns number of bytes written or error code, as appropriate
+ */
+static ssize_t smk_write_nltype(struct file *file, const char __user *buf,
+				size_t count, loff_t *ppos)
+{
+	char bound[40];
+	char *cp;
+	int i;
+
+	if (!capable(CAP_MAC_ADMIN))
+		return -EPERM;
+
+	/* Leave room for the terminator written below. */
+	if (count >= sizeof(bound))
+		return -EINVAL;
+
+	if (copy_from_user(bound, buf, count) != 0)
+		return -EFAULT;
+
+	bound[count] = '\0';
+	cp = strchr(bound, ' ');
+	if (cp != NULL)
+		*cp = '\0';
+	cp = strchr(bound, '\n');
+	if (cp != NULL)
+		*cp = '\0';
+
+	for (i = 0; i < ARRAY_SIZE(netlbl_choices); i++)
+		if (strcmp(bound, netlbl_choices[i].o_name) == 0 ||
+		    strcmp(bound, netlbl_choices[i].o_alias) == 0) {
+			smack_net_nltype = netlbl_choices[i].o_number;
+			return count;
+		}
+	/*
+	 * Not a valid choice.
+	 */
+	return -EINVAL;
+}
+
+static const struct file_operations smk_nltype_ops = {
+	.read		= smk_read_nltype,
+	.write		= smk_write_nltype,
+};
+
+/**
+ * smk_fill_super - fill the /smackfs superblock
+ * @sb: the empty superblock
+ * @data: unused
+ * @silent: unused
+ *
+ * Creates the well known /smack entries (load, cipso, doi, direct,
+ * ambient, nltype), each world readable and owner writable, then
+ * gives the root inode a security blob built from the floor label.
+ *
+ * Returns 0 on success, an error code on failure
+ */
+static int smk_fill_super(struct super_block *sb, void *data, int silent)
+{
+	/* Indexed by the SMK_* inode numbers; the "" entry ends the table. */
+	static struct tree_descr smack_files[] = {
+		[SMK_LOAD]	=
+			{"load", &smk_load_ops, S_IRUGO|S_IWUSR},
+		[SMK_CIPSO]	=
+			{"cipso", &smk_cipso_ops, S_IRUGO|S_IWUSR},
+		[SMK_DOI]	=
+			{"doi", &smk_doi_ops, S_IRUGO|S_IWUSR},
+		[SMK_DIRECT]	=
+			{"direct", &smk_direct_ops, S_IRUGO|S_IWUSR},
+		[SMK_AMBIENT]	=
+			{"ambient", &smk_ambient_ops, S_IRUGO|S_IWUSR},
+		[SMK_NLTYPE]	=
+			{"nltype", &smk_nltype_ops, S_IRUGO|S_IWUSR},
+		/* last one */ {""}
+	};
+	int err = simple_fill_super(sb, SMACK_MAGIC, smack_files);
+
+	if (err == 0) {
+		/* Label the root directory with the floor label. */
+		sb->s_root->d_inode->i_security =
+			new_inode_smack(smack_known_floor.smk_known);
+		return 0;
+	}
+
+	printk(KERN_ERR "%s failed %d while creating inodes\n",
+		__func__, err);
+	return err;
+}
+
+/**
+ * smk_get_sb - get the smackfs superblock
+ * @fs_type: passed along without comment
+ * @flags: passed along without comment
+ * @dev_name: passed along without comment
+ * @data: passed along without comment
+ * @mnt: passed along without comment
+ *
+ * Just passes everything along.
+ *
+ * Returns what the lower level code does.
+ */
+static int smk_get_sb(struct file_system_type *fs_type,
+		      int flags, const char *dev_name, void *data,
+		      struct vfsmount *mnt)
+{
+	/* Single-instance filesystem; smk_fill_super() populates it. */
+	return get_sb_single(fs_type, flags, data, smk_fill_super, mnt);
+}
+
+static struct file_system_type smk_fs_type = {
+	.name		= "smackfs",
+	.get_sb		= smk_get_sb,
+	.kill_sb	= kill_litter_super,
+};
+
+/* Kernel-internal mount of smackfs; NULL if that mount failed. */
+static struct vfsmount *smackfs_mount;
+
+/**
+ * init_smk_fs - register and mount smackfs
+ *
+ * Initializes the /smack/load write semaphore, registers the
+ * smackfs filesystem type and mounts it inside the kernel, then
+ * sets up the CIPSO DOI. The DOI setup runs whether or not the
+ * registration and mount succeeded.
+ *
+ * Returns 0 on success, otherwise the error from registering or
+ * mounting the filesystem.
+ */
+static int __init init_smk_fs(void)
+{
+	int err;
+
+	/*
+	 * The semaphore must be usable before the filesystem can be
+	 * reached: opening /smack/load for writing takes it.
+	 */
+	sema_init(&smack_write_sem, 1);
+
+	err = register_filesystem(&smk_fs_type);
+	if (!err) {
+		smackfs_mount = kern_mount(&smk_fs_type);
+		if (IS_ERR(smackfs_mount)) {
+			printk(KERN_ERR "smackfs:  could not mount!\n");
+			err = PTR_ERR(smackfs_mount);
+			smackfs_mount = NULL;
+		}
+	}
+
+	smk_cipso_doi();
+
+	return err;
+}
+
+__initcall(init_smk_fs);
diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index 6244911..61f5d42 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -23,7 +23,7 @@ #include <linux/file.h> #include <linux/slab.h> #include <linux/time.h> -#include <linux/latency.h> +#include <linux/pm_qos_params.h> #include <linux/uio.h> #include <sound/core.h> #include <sound/control.h> @@ -443,9 +443,11 @@ static int snd_pcm_hw_params(struct snd_pcm_substream *substream, snd_pcm_timer_resolution_change(substream); runtime->status->state = SNDRV_PCM_STATE_SETUP; - remove_acceptable_latency(substream->latency_id); + pm_qos_remove_requirement(PM_QOS_CPU_DMA_LATENCY, + substream->latency_id); if ((usecs = period_to_usecs(runtime)) >= 0) - set_acceptable_latency(substream->latency_id, usecs); + pm_qos_add_requirement(PM_QOS_CPU_DMA_LATENCY, + substream->latency_id, usecs); return 0; _error: /* hardware might be unuseable from this time, @@ -505,7 +507,8 @@ static int snd_pcm_hw_free(struct snd_pcm_substream *substream) if (substream->ops->hw_free) result = 
substream->ops->hw_free(substream); runtime->status->state = SNDRV_PCM_STATE_OPEN; - remove_acceptable_latency(substream->latency_id); + pm_qos_remove_requirement(PM_QOS_CPU_DMA_LATENCY, + substream->latency_id); return result; } |