327 files changed, 10265 insertions, 3996 deletions
diff --git a/Documentation/devicetree/bindings/net/micrel-ksz9021.txt b/Documentation/devicetree/bindings/net/micrel-ksz9021.txt new file mode 100644 index 0000000..997a63f --- /dev/null +++ b/Documentation/devicetree/bindings/net/micrel-ksz9021.txt @@ -0,0 +1,49 @@ +Micrel KSZ9021 Gigabit Ethernet PHY + +Some boards require special tuning values, particularly when it comes to +clock delays. You can specify clock delay values by adding +micrel-specific properties to an Ethernet OF device node. + +All skew control options are specified in picoseconds. The minimum +value is 0, and the maximum value is 3000. + +Optional properties: + - rxc-skew-ps : Skew control of RXC pad + - rxdv-skew-ps : Skew control of RX CTL pad + - txc-skew-ps : Skew control of TXC pad + - txen-skew-ps : Skew control of TX_CTL pad + - rxd0-skew-ps : Skew control of RX data 0 pad + - rxd1-skew-ps : Skew control of RX data 1 pad + - rxd2-skew-ps : Skew control of RX data 2 pad + - rxd3-skew-ps : Skew control of RX data 3 pad + - txd0-skew-ps : Skew control of TX data 0 pad + - txd1-skew-ps : Skew control of TX data 1 pad + - txd2-skew-ps : Skew control of TX data 2 pad + - txd3-skew-ps : Skew control of TX data 3 pad + +Examples: + + /* Attach to an Ethernet device with autodetected PHY */ + &enet { + rxc-skew-ps = <3000>; + rxdv-skew-ps = <0>; + txc-skew-ps = <3000>; + txen-skew-ps = <0>; + status = "okay"; + }; + + /* Attach to an explicitly-specified PHY */ + mdio { + phy0: ethernet-phy@0 { + rxc-skew-ps = <3000>; + rxdv-skew-ps = <0>; + txc-skew-ps = <3000>; + txen-skew-ps = <0>; + reg = <0>; + }; + }; + ethernet@70000 { + status = "okay"; + phy = <&phy0>; + phy-mode = "rgmii-id"; + }; diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 15356ac..7f9d4f5 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2953,7 +2953,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted. improve throughput, but will also increase the amount of memory reserved for use by the client. - swapaccount[=0|1] + swapaccount=[0|1] [KNL] Enable accounting of swap in memory resource controller if no parameter or 1 is given or disable it if 0 is given (See Documentation/cgroups/memory.txt) diff --git a/Documentation/networking/e100.txt b/Documentation/networking/e100.txt index fcb6c71c..13a3212 100644 --- a/Documentation/networking/e100.txt +++ b/Documentation/networking/e100.txt @@ -1,7 +1,7 @@ Linux* Base Driver for the Intel(R) PRO/100 Family of Adapters ============================================================== -November 15, 2005 +March 15, 2011 Contents ======== @@ -122,7 +122,7 @@ Additional Configurations NOTE: This setting is not saved across reboots. - Ethtool + ethtool ------- The driver utilizes the ethtool interface for driver configuration and diff --git a/Documentation/networking/e1000.txt b/Documentation/networking/e1000.txt index 71ca958..437b209 100644 --- a/Documentation/networking/e1000.txt +++ b/Documentation/networking/e1000.txt @@ -1,8 +1,8 @@ -Linux* Base Driver for the Intel(R) PRO/1000 Family of Adapters -=============================================================== +Linux* Base Driver for Intel(R) Ethernet Network Connection +=========================================================== Intel Gigabit Linux driver. -Copyright(c) 1999 - 2010 Intel Corporation. +Copyright(c) 1999 - 2013 Intel Corporation. 
Contents ======== @@ -420,15 +420,15 @@ Additional Configurations - The maximum MTU setting for Jumbo Frames is 16110. This value coincides with the maximum Jumbo Frames size of 16128. - - Using Jumbo Frames at 10 or 100 Mbps may result in poor performance or - loss of link. + - Using Jumbo frames at 10 or 100 Mbps is not supported and may result in + poor performance or loss of link. - Adapters based on the Intel(R) 82542 and 82573V/E controller do not support Jumbo Frames. These correspond to the following product names: Intel(R) PRO/1000 Gigabit Server Adapter Intel(R) PRO/1000 PM Network Connection - Ethtool + ethtool ------- The driver utilizes the ethtool interface for driver configuration and diagnostics, as well as displaying statistical information. The ethtool diff --git a/Documentation/networking/e1000e.txt b/Documentation/networking/e1000e.txt index 97b5ba9..ad2d9f3 100644 --- a/Documentation/networking/e1000e.txt +++ b/Documentation/networking/e1000e.txt @@ -1,8 +1,8 @@ -Linux* Driver for Intel(R) Network Connection -============================================= +Linux* Driver for Intel(R) Ethernet Network Connection +====================================================== Intel Gigabit Linux driver. -Copyright(c) 1999 - 2010 Intel Corporation. +Copyright(c) 1999 - 2013 Intel Corporation. Contents ======== @@ -259,13 +259,16 @@ Additional Configurations - The maximum MTU setting for Jumbo Frames is 9216. This value coincides with the maximum Jumbo Frames size of 9234 bytes. - - Using Jumbo Frames at 10 or 100 Mbps is not supported and may result in + - Using Jumbo frames at 10 or 100 Mbps is not supported and may result in poor performance or loss of link. - Some adapters limit Jumbo Frames sized packets to a maximum of 4096 bytes and some adapters do not support Jumbo Frames. - Ethtool + - Jumbo Frames cannot be configured on an 82579-based network device if + MACsec is enabled on the system. + + ethtool ------- The driver utilizes the ethtool interface for driver configuration and diagnostics, as well as displaying statistical information. We @@ -273,6 +276,9 @@ Additional Configurations http://ftp.kernel.org/pub/software/network/ethtool/ + NOTE: When validating enable/disable tests on some parts (82578, for example) + you need to add a few seconds between tests when working with ethtool. + Speed and Duplex ---------------- Speed and Duplex are configured through the ethtool* utility. For diff --git a/Documentation/networking/igb.txt b/Documentation/networking/igb.txt index 9a2a0371..4ebbd65 100644 --- a/Documentation/networking/igb.txt +++ b/Documentation/networking/igb.txt @@ -1,8 +1,8 @@ -Linux* Base Driver for Intel(R) Network Connection -================================================== +Linux* Base Driver for Intel(R) Ethernet Network Connection +=========================================================== Intel Gigabit Linux driver. -Copyright(c) 1999 - 2010 Intel Corporation. +Copyright(c) 1999 - 2013 Intel Corporation. Contents ======== @@ -36,6 +36,53 @@ Default Value: 0 This parameter adds support for SR-IOV. It causes the driver to spawn up to max_vfs worth of virtual function. +QueuePairs +---------- +Valid Range: 0-1 +Default Value: 1 (TX and RX will be paired onto one interrupt vector) + +If set to 0, when MSI-X is enabled, the TX and RX will attempt to occupy +separate vectors. +This option can be overridden to 1 if there are not sufficient interrupts +available.
This can occur if any combination of RSS, VMDQ, and max_vfs +results in more than 4 queues being used. + +Node +---- +Valid Range: 0-n +Default Value: -1 (off) + + 0 - n: where n is the number of the NUMA node that should be used to + allocate memory for this adapter port. + -1: uses the driver default of allocating memory on whichever processor is + running insmod/modprobe. + + The Node parameter will allow you to pick which NUMA node you want to have + the adapter allocate memory from. All driver structures, in-memory queues, + and receive buffers will be allocated on the node specified. This parameter + is only useful when interrupt affinity is specified, otherwise some portion + of the time the interrupt could run on a different core than the memory is + allocated on, causing slower memory access and impacting throughput, CPU, or + both. + +EEE +--- +Valid Range: 0-1 +Default Value: 1 (enabled) + + A link between two EEE-compliant devices will result in periodic bursts of + data followed by long periods wherein the link is in an idle state. This Low + Power Idle (LPI) state is supported in both 1Gbps and 100Mbps link speeds. + NOTE: EEE support requires autonegotiation. + +DMAC +---- +Valid Range: 0-1 +Default Value: 1 (enabled) + Enables or disables the DMA Coalescing feature. + + Additional Configurations ========================= @@ -55,10 +102,10 @@ Additional Configurations - The maximum MTU setting for Jumbo Frames is 9216. This value coincides with the maximum Jumbo Frames size of 9234 bytes. - - Using Jumbo Frames at 10 or 100 Mbps may result in poor performance or - loss of link. + - Using Jumbo frames at 10 or 100 Mbps is not supported and may result in + poor performance or loss of link. - Ethtool + ethtool ------- The driver utilizes the ethtool interface for driver configuration and diagnostics, as well as displaying statistical information. The latest @@ -106,6 +153,14 @@ Additional Configurations Where n=the VF that attempted to do the spoofing. + Setting MAC Address, VLAN and Rate Limit Using IProute2 Tool + ------------------------------------------------------------ + You can set the MAC address of a Virtual Function (VF), a default VLAN and the + rate limit using the IProute2 tool. Download the latest version of the + iproute2 tool from Sourceforge if your version does not have all the + features you require. + + Support ======= diff --git a/Documentation/networking/igbvf.txt b/Documentation/networking/igbvf.txt index cbfe4ee..40db17a 100644 --- a/Documentation/networking/igbvf.txt +++ b/Documentation/networking/igbvf.txt @@ -1,8 +1,8 @@ -Linux* Base Driver for Intel(R) Network Connection -================================================== +Linux* Base Driver for Intel(R) Ethernet Network Connection +=========================================================== Intel Gigabit Linux driver. -Copyright(c) 1999 - 2010 Intel Corporation. +Copyright(c) 1999 - 2013 Intel Corporation. Contents ======== @@ -55,7 +55,7 @@ networking link on the left to search for your adapter: Additional Configurations ========================= - Ethtool + ethtool ------- The driver utilizes the ethtool interface for driver configuration and diagnostics, as well as displaying statistical information.
The ethtool diff --git a/Documentation/networking/ixgb.txt b/Documentation/networking/ixgb.txt index d75a1f9..1e0c045 100644 --- a/Documentation/networking/ixgb.txt +++ b/Documentation/networking/ixgb.txt @@ -1,7 +1,7 @@ -Linux Base Driver for 10 Gigabit Intel(R) Network Connection -============================================================= +Linux Base Driver for 10 Gigabit Intel(R) Ethernet Network Connection +===================================================================== -October 9, 2007 +March 14, 2011 Contents @@ -274,9 +274,9 @@ Additional Configurations ------------------------------------------------- Configuring a network driver to load properly when the system is started is distribution dependent. Typically, the configuration process involves adding - an alias line to files in /etc/modprobe.d/ as well as editing other system - startup scripts and/or configuration files. Many popular Linux distributions - ship with tools to make these changes for you. To learn the proper way to + an alias line to /etc/modprobe.conf as well as editing other system startup + scripts and/or configuration files. Many popular Linux distributions ship + with tools to make these changes for you. To learn the proper way to configure a network device for your system, refer to your distribution documentation. If during this process you are asked for the driver or module name, the name for the Linux Base Driver for the Intel 10GbE Family of @@ -306,7 +306,7 @@ Additional Configurations with the maximum Jumbo Frames size of 16128. - Ethtool + ethtool ------- The driver utilizes the ethtool interface for driver configuration and diagnostics, as well as displaying statistical information. The ethtool diff --git a/Documentation/networking/ixgbe.txt b/Documentation/networking/ixgbe.txt index af77ed3..96ccceb 100644 --- a/Documentation/networking/ixgbe.txt +++ b/Documentation/networking/ixgbe.txt @@ -1,8 +1,9 @@ -Linux Base Driver for 10 Gigabit PCI Express Intel(R) Network Connection -======================================================================== +Linux* Base Driver for the Intel(R) Ethernet 10 Gigabit PCI Express Family of +Adapters +============================================================================= -Intel Gigabit Linux driver. -Copyright(c) 1999 - 2010 Intel Corporation. +Intel 10 Gigabit Linux driver. +Copyright(c) 1999 - 2013 Intel Corporation. Contents ======== @@ -16,8 +17,8 @@ Contents Identifying Your Adapter ======================== -The driver in this release is compatible with 82598 and 82599-based Intel -Network Connections. +The driver in this release is compatible with 82598, 82599 and X540-based +Intel Network Connections. For more information on how to identify your adapter, go to the Adapter & Driver ID Guide at: @@ -72,7 +73,7 @@ cables that comply with SFF-8431 v4.1 and SFF-8472 v10.4 specifications. Laser turns off for SFP+ when ifconfig down ------------------------------------------- "ifconfig down" turns off the laser for 82599-based SFP+ fiber adapters. -"ifconfig up" turns on the later. +"ifconfig up" turns on the laser. 82598-BASED ADAPTERS @@ -118,6 +119,93 @@ NOTE: For 82598 backplane cards entering 1 gig mode, flow control default behavior is changed to off. Flow control in 1 gig mode on these devices can lead to Tx hangs. +Intel(R) Ethernet Flow Director +------------------------------- +Supports advanced filters that direct receive packets by their flows to +different queues. Enables tight control on routing a flow in the platform. 
+Matches flows and CPU cores for flow affinity. Supports multiple parameters +for flexible flow classification and load balancing. + +Flow Director is enabled only if the kernel is multiple TX queue capable. + +An included script (set_irq_affinity.sh) automates setting the IRQ to CPU +affinity. + +You can verify that the driver is using Flow Director by looking at the counters +in ethtool: fdir_miss and fdir_match. + +Other ethtool Commands: +To enable Flow Director + ethtool -K ethX ntuple on +To add a filter + Use the -U switch. e.g., ethtool -U ethX flow-type tcp4 src-ip 0x178000a + action 1 +To see the list of filters currently present: + ethtool -u ethX + +Perfect Filter: Perfect filter is an interface to load the filter table that +funnels all flows into queue_0 unless an alternative queue is specified using +"action". In that case, any flow that matches the filter criteria will be +directed to the appropriate queue. + +If the queue is defined as -1, the filter will drop matching packets. + +To account for filter matches and misses, there are two stats in ethtool: +fdir_match and fdir_miss. In addition, rx_queue_N_packets shows the number of +packets processed by the Nth queue. + +NOTE: Receive Packet Steering (RPS) and Receive Flow Steering (RFS) are not +compatible with Flow Director. If Flow Director is enabled, these will be +disabled. + +The following three parameters impact Flow Director. + +FdirMode +-------- +Valid Range: 0-2 (0=off, 1=ATR, 2=Perfect filter mode) +Default Value: 1 + + Flow Director filtering modes. + +FdirPballoc +----------- +Valid Range: 0-2 (0=64k, 1=128k, 2=256k) +Default Value: 0 + + Flow Director allocated packet buffer size. + +AtrSampleRate +-------------- +Valid Range: 1-100 +Default Value: 20 + + Software ATR Tx packet sample rate. For example, when set to 20, every 20th + packet is examined to see if it will create a new flow. + +Node +---- +Valid Range: 0-n +Default Value: -1 (off) + + 0 - n: where n is the number of NUMA nodes (i.e. 0 - 3) currently online in + your system + -1: turns this option off + + The Node parameter will allow you to pick which NUMA node you want to have + the adapter allocate memory on. + +max_vfs +------- +Valid Range: 1-63 +Default Value: 0 + + If the value is greater than 0 it will also force the VMDq parameter to be 1 + or more. + + This parameter adds support for SR-IOV. It causes the driver to spawn up to + max_vfs worth of virtual functions. + + Additional Configurations ========================= @@ -221,9 +309,10 @@ http://www.redhat.com/promo/summit/2008/downloads/pdf/Thursday/Mark_Wagner.pdf Known Issues ============ - Enabling SR-IOV in a 32-bit Microsoft* Windows* Server 2008 Guest OS using - Intel (R) 82576-based GbE or Intel (R) 82599-based 10GbE controller under KVM - ----------------------------------------------------------------------------- + Enabling SR-IOV in a 32-bit or 64-bit Microsoft* Windows* Server 2008/R2 + Guest OS using Intel (R) 82576-based GbE or Intel (R) 82599-based 10GbE + controller under KVM + ------------------------------------------------------------------------ KVM Hypervisor/VMM supports direct assignment of a PCIe device to a VM. This includes traditional PCIe devices, as well as SR-IOV-capable devices using Intel 82576-based and 82599-based controllers.
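Taken together, the Flow Director notes above form a short recipe. A minimal sketch of the sequence for steering one TCP flow to a specific receive queue, assuming an interface named eth0 (the interface name, source address and queue number are illustrative, not mandated by the driver):

    # Enable Flow Director (n-tuple filtering) on the interface
    ethtool -K eth0 ntuple on

    # Add a perfect filter: steer TCP/IPv4 packets from 192.168.0.10 to
    # RX queue 1; an action of -1 would drop matching packets instead
    ethtool -U eth0 flow-type tcp4 src-ip 192.168.0.10 action 1

    # List the filters currently installed
    ethtool -u eth0

    # Watch the fdir_match/fdir_miss counters to confirm filters are hit
    ethtool -S eth0 | grep fdir

If the counters do not advance, check that the kernel is multiple-TX-queue capable, which the notes above list as a prerequisite for Flow Director.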
diff --git a/Documentation/networking/ixgbevf.txt b/Documentation/networking/ixgbevf.txt index 5a91a41..53d8d2a 100644 --- a/Documentation/networking/ixgbevf.txt +++ b/Documentation/networking/ixgbevf.txt @@ -1,8 +1,8 @@ -Linux* Base Driver for Intel(R) Network Connection -================================================== +Linux* Base Driver for Intel(R) Ethernet Network Connection +=========================================================== Intel Gigabit Linux driver. -Copyright(c) 1999 - 2010 Intel Corporation. +Copyright(c) 1999 - 2013 Intel Corporation. Contents ======== diff --git a/Documentation/networking/openvswitch.txt b/Documentation/networking/openvswitch.txt index 8fa2dd1..37c20ee 100644 --- a/Documentation/networking/openvswitch.txt +++ b/Documentation/networking/openvswitch.txt @@ -91,6 +91,46 @@ Often we ellipsize arguments not important to the discussion, e.g.: in_port(1), eth(...), eth_type(0x0800), ipv4(...), tcp(...) +Wildcarded flow key format +-------------------------- + +A wildcarded flow is described with two sequences of Netlink attributes +passed over the Netlink socket: a flow key, exactly as described above, and an +optional corresponding flow mask. + +A wildcarded flow can represent a group of exact match flows. Each '1' bit +in the mask specifies an exact match with the corresponding bit in the flow key. +A '0' bit specifies a don't care bit, which will match either a '1' or '0' bit +of an incoming packet. Using wildcarded flows can improve the flow set up rate +by reducing the number of new flows that need to be processed by the user space program. + +Support for the mask Netlink attribute is optional for both the kernel and user +space program. The kernel can ignore the mask attribute, installing an exact +match flow, or reduce the number of don't care bits in the kernel to less than +what was specified by the user space program. In this case, variations in bits +that the kernel does not implement will simply result in additional flow setups. +The kernel module will also work with user space programs that neither support +nor supply flow mask attributes. + +Since the kernel may ignore or modify wildcard bits, it can be difficult for +the userspace program to know exactly what matches are installed. There are +two possible approaches: reactively install flows as they miss the kernel +flow table (and therefore not attempt to determine wildcard changes at all) +or use the kernel's response messages to determine the installed wildcards. + +When interacting with userspace, the kernel should maintain the match portion +of the key exactly as originally installed. This provides a handle to +identify the flow for all future operations. However, when reporting the +mask of an installed flow, the mask should include any restrictions imposed +by the kernel. + +The behavior when using overlapping wildcarded flows is undefined. It is the +responsibility of the user space program to ensure that any incoming packet +can match at most one flow, wildcarded or not. The current implementation +performs best-effort detection of overlapping wildcarded flows and may reject +some but not all of them. However, this behavior may change in future versions.
+ + Basic rule for evolving flow keys --------------------------------- diff --git a/Documentation/networking/tproxy.txt b/Documentation/networking/tproxy.txt index 7b5996d..ec11429 100644 --- a/Documentation/networking/tproxy.txt +++ b/Documentation/networking/tproxy.txt @@ -2,9 +2,8 @@ Transparent proxy support ========================= This feature adds Linux 2.2-like transparent proxy support to current kernels. -To use it, enable NETFILTER_TPROXY, the socket match and the TPROXY target in -your kernel config. You will need policy routing too, so be sure to enable that -as well. +To use it, enable the socket match and the TPROXY target in your kernel config. +You will need policy routing too, so be sure to enable that as well. 1. Making non-local sockets work diff --git a/MAINTAINERS b/MAINTAINERS index 1c6f9db..b2887c5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5884,7 +5884,7 @@ F: drivers/i2c/busses/i2c-omap.c F: include/linux/i2c-omap.h OMAP DEVICE TREE SUPPORT -M: Benoît Cousson <b-cousson@ti.com> +M: Benoît Cousson <bcousson@baylibre.com> M: Tony Lindgren <tony@atomide.com> L: linux-omap@vger.kernel.org L: devicetree@vger.kernel.org @@ -5964,14 +5964,14 @@ S: Maintained F: drivers/char/hw_random/omap-rng.c OMAP HWMOD SUPPORT -M: Benoît Cousson <b-cousson@ti.com> +M: Benoît Cousson <bcousson@baylibre.com> M: Paul Walmsley <paul@pwsan.com> L: linux-omap@vger.kernel.org S: Maintained F: arch/arm/mach-omap2/omap_hwmod.* OMAP HWMOD DATA FOR OMAP4-BASED DEVICES -M: Benoît Cousson <b-cousson@ti.com> +M: Benoît Cousson <bcousson@baylibre.com> L: linux-omap@vger.kernel.org S: Maintained F: arch/arm/mach-omap2/omap_hwmod_44xx_data.c @@ -7367,7 +7367,6 @@ F: drivers/net/ethernet/sfc/ SGI GRU DRIVER M: Dimitri Sivanich <sivanich@sgi.com> -M: Robin Holt <holt@sgi.com> S: Maintained F: drivers/misc/sgi-gru/ @@ -7387,7 +7386,8 @@ S: Maintained for 2.6. 
F: Documentation/sgi-visws.txt SGI XP/XPC/XPNET DRIVER -M: Robin Holt <holt@sgi.com> +M: Cliff Whickman <cpw@sgi.com> +M: Robin Holt <robinmholt@gmail.com> S: Maintained F: drivers/misc/sgi-xp/ @@ -7973,6 +7973,12 @@ F: arch/m68k/sun3*/ F: arch/m68k/include/asm/sun3* F: drivers/net/ethernet/i825xx/sun3* +SUNDANCE NETWORK DRIVER +M: Denis Kirjanov <kda@linux-powerpc.org> +L: netdev@vger.kernel.org +S: Maintained +F: drivers/net/ethernet/dlink/sundance.c + SUPERH M: Paul Mundt <lethal@linux-sh.org> L: linux-sh@vger.kernel.org @@ -1,7 +1,7 @@ VERSION = 3 PATCHLEVEL = 11 SUBLEVEL = 0 -EXTRAVERSION = -rc5 +EXTRAVERSION = -rc6 NAME = Linux for Workgroups # *DOCUMENTATION* diff --git a/arch/arm/boot/dts/at91sam9n12ek.dts b/arch/arm/boot/dts/at91sam9n12ek.dts index d59b70c..3d77dbe 100644 --- a/arch/arm/boot/dts/at91sam9n12ek.dts +++ b/arch/arm/boot/dts/at91sam9n12ek.dts @@ -14,11 +14,11 @@ compatible = "atmel,at91sam9n12ek", "atmel,at91sam9n12", "atmel,at91sam9"; chosen { - bootargs = "mem=128M console=ttyS0,115200 root=/dev/mtdblock1 rw rootfstype=jffs2"; + bootargs = "console=ttyS0,115200 root=/dev/mtdblock1 rw rootfstype=jffs2"; }; memory { - reg = <0x20000000 0x10000000>; + reg = <0x20000000 0x8000000>; }; clocks { diff --git a/arch/arm/boot/dts/at91sam9x5ek.dtsi b/arch/arm/boot/dts/at91sam9x5ek.dtsi index b753855..49e3c45 100644 --- a/arch/arm/boot/dts/at91sam9x5ek.dtsi +++ b/arch/arm/boot/dts/at91sam9x5ek.dtsi @@ -94,8 +94,9 @@ usb0: ohci@00600000 { status = "okay"; - num-ports = <2>; - atmel,vbus-gpio = <&pioD 19 GPIO_ACTIVE_LOW + num-ports = <3>; + atmel,vbus-gpio = <0 /* &pioD 18 GPIO_ACTIVE_LOW *//* Activate to have access to port A */ + &pioD 19 GPIO_ACTIVE_LOW &pioD 20 GPIO_ACTIVE_LOW >; }; diff --git a/arch/arm/boot/dts/sama5d3xmb.dtsi b/arch/arm/boot/dts/sama5d3xmb.dtsi index 8a9e05d..e9521d5 100644 --- a/arch/arm/boot/dts/sama5d3xmb.dtsi +++ b/arch/arm/boot/dts/sama5d3xmb.dtsi @@ -81,6 +81,14 @@ macb1: ethernet@f802c000 { phy-mode = "rmii"; + + #address-cells = <1>; + #size-cells = <0>; + phy0: ethernet-phy@0 { + interrupt-parent = <&pioE>; + interrupts = <30 IRQ_TYPE_EDGE_FALLING>; + reg = <1>; + }; }; pinctrl@fffff200 { diff --git a/arch/arm/boot/dts/tegra20-seaboard.dts b/arch/arm/boot/dts/tegra20-seaboard.dts index 365760b..40e6fb2 100644 --- a/arch/arm/boot/dts/tegra20-seaboard.dts +++ b/arch/arm/boot/dts/tegra20-seaboard.dts @@ -830,6 +830,8 @@ regulator-max-microvolt = <5000000>; enable-active-high; gpio = <&gpio 24 0>; /* PD0 */ + regulator-always-on; + regulator-boot-on; }; }; diff --git a/arch/arm/boot/dts/tegra20-trimslice.dts b/arch/arm/boot/dts/tegra20-trimslice.dts index ed4b901..37c93d3 100644 --- a/arch/arm/boot/dts/tegra20-trimslice.dts +++ b/arch/arm/boot/dts/tegra20-trimslice.dts @@ -412,6 +412,8 @@ regulator-max-microvolt = <5000000>; enable-active-high; gpio = <&gpio 170 0>; /* PV2 */ + regulator-always-on; + regulator-boot-on; }; }; diff --git a/arch/arm/boot/dts/tegra20-whistler.dts b/arch/arm/boot/dts/tegra20-whistler.dts index ab67c94..a3d0eba 100644 --- a/arch/arm/boot/dts/tegra20-whistler.dts +++ b/arch/arm/boot/dts/tegra20-whistler.dts @@ -588,6 +588,8 @@ regulator-max-microvolt = <5000000>; enable-active-high; gpio = <&tca6416 0 0>; /* GPIO_PMU0 */ + regulator-always-on; + regulator-boot-on; }; vbus3_reg: regulator@3 { @@ -598,6 +600,8 @@ regulator-max-microvolt = <5000000>; enable-active-high; gpio = <&tca6416 1 0>; /* GPIO_PMU1 */ + regulator-always-on; + regulator-boot-on; }; }; diff --git a/arch/arm/include/asm/smp_plat.h 
b/arch/arm/include/asm/smp_plat.h index 6462a72..a252c0b 100644 --- a/arch/arm/include/asm/smp_plat.h +++ b/arch/arm/include/asm/smp_plat.h @@ -88,4 +88,7 @@ static inline u32 mpidr_hash_size(void) { return 1 << mpidr_hash.bits; } + +extern int platform_can_cpu_hotplug(void); + #endif diff --git a/arch/arm/include/asm/spinlock.h b/arch/arm/include/asm/spinlock.h index f8b8965..b07c09e 100644 --- a/arch/arm/include/asm/spinlock.h +++ b/arch/arm/include/asm/spinlock.h @@ -107,7 +107,7 @@ static inline int arch_spin_trylock(arch_spinlock_t *lock) " subs %1, %0, %0, ror #16\n" " addeq %0, %0, %4\n" " strexeq %2, %0, [%3]" - : "=&r" (slock), "=&r" (contended), "=r" (res) + : "=&r" (slock), "=&r" (contended), "=&r" (res) : "r" (&lock->slock), "I" (1 << TICKET_SHIFT) : "cc"); } while (res); @@ -168,17 +168,20 @@ static inline void arch_write_lock(arch_rwlock_t *rw) static inline int arch_write_trylock(arch_rwlock_t *rw) { - unsigned long tmp; + unsigned long contended, res; - __asm__ __volatile__( -" ldrex %0, [%1]\n" -" teq %0, #0\n" -" strexeq %0, %2, [%1]" - : "=&r" (tmp) - : "r" (&rw->lock), "r" (0x80000000) - : "cc"); + do { + __asm__ __volatile__( + " ldrex %0, [%2]\n" + " mov %1, #0\n" + " teq %0, #0\n" + " strexeq %1, %3, [%2]" + : "=&r" (contended), "=&r" (res) + : "r" (&rw->lock), "r" (0x80000000) + : "cc"); + } while (res); - if (tmp == 0) { + if (!contended) { smp_mb(); return 1; } else { @@ -254,18 +257,26 @@ static inline void arch_read_unlock(arch_rwlock_t *rw) static inline int arch_read_trylock(arch_rwlock_t *rw) { - unsigned long tmp, tmp2 = 1; + unsigned long contended, res; - __asm__ __volatile__( -" ldrex %0, [%2]\n" -" adds %0, %0, #1\n" -" strexpl %1, %0, [%2]\n" - : "=&r" (tmp), "+r" (tmp2) - : "r" (&rw->lock) - : "cc"); + do { + __asm__ __volatile__( + " ldrex %0, [%2]\n" + " mov %1, #0\n" + " adds %0, %0, #1\n" + " strexpl %1, %0, [%2]" + : "=&r" (contended), "=&r" (res) + : "r" (&rw->lock) + : "cc"); + } while (res); - smp_mb(); - return tmp2 == 0; + /* If the lock is negative, then it is already held for write. */ + if (contended < 0x80000000) { + smp_mb(); + return 1; + } else { + return 0; + } } /* read_can_lock - would read_trylock() succeed? 
*/ diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S index d40d0ef..9cbe70c 100644 --- a/arch/arm/kernel/entry-armv.S +++ b/arch/arm/kernel/entry-armv.S @@ -357,7 +357,8 @@ ENDPROC(__pabt_svc) .endm .macro kuser_cmpxchg_check -#if !defined(CONFIG_CPU_32v6K) && !defined(CONFIG_NEEDS_SYSCALL_FOR_CMPXCHG) +#if !defined(CONFIG_CPU_32v6K) && defined(CONFIG_KUSER_HELPERS) && \ + !defined(CONFIG_NEEDS_SYSCALL_FOR_CMPXCHG) #ifndef CONFIG_MMU #warning "NPTL on non MMU needs fixing" #else diff --git a/arch/arm/kernel/fiq.c b/arch/arm/kernel/fiq.c index 25442f4..fc79202 100644 --- a/arch/arm/kernel/fiq.c +++ b/arch/arm/kernel/fiq.c @@ -84,17 +84,13 @@ int show_fiq_list(struct seq_file *p, int prec) void set_fiq_handler(void *start, unsigned int length) { -#if defined(CONFIG_CPU_USE_DOMAINS) - void *base = (void *)0xffff0000; -#else void *base = vectors_page; -#endif unsigned offset = FIQ_OFFSET; memcpy(base + offset, start, length); + if (!cache_is_vipt_nonaliasing()) + flush_icache_range(base + offset, offset + length); flush_icache_range(0xffff0000 + offset, 0xffff0000 + offset + length); - if (!vectors_high()) - flush_icache_range(offset, offset + length); } int claim_fiq(struct fiq_handler *f) diff --git a/arch/arm/kernel/machine_kexec.c b/arch/arm/kernel/machine_kexec.c index 4fb074c..d7c82df 100644 --- a/arch/arm/kernel/machine_kexec.c +++ b/arch/arm/kernel/machine_kexec.c @@ -15,6 +15,7 @@ #include <asm/mmu_context.h> #include <asm/cacheflush.h> #include <asm/mach-types.h> +#include <asm/smp_plat.h> #include <asm/system_misc.h> extern const unsigned char relocate_new_kernel[]; @@ -39,6 +40,14 @@ int machine_kexec_prepare(struct kimage *image) int i, err; /* + * Validate that if the current HW supports SMP, then the SW supports + * and implements CPU hotplug for the current HW. If not, we won't be + * able to kexec reliably, so fail the prepare operation. + */ + if (num_possible_cpus() > 1 && !platform_can_cpu_hotplug()) + return -EINVAL; + + /* * No segment at default ATAGs address. try to locate * a dtb using magic. */ @@ -134,10 +143,13 @@ void machine_kexec(struct kimage *image) unsigned long reboot_code_buffer_phys; void *reboot_code_buffer; - if (num_online_cpus() > 1) { - pr_err("kexec: error: multiple CPUs still online\n"); - return; - } + /* + * This can only happen if machine_shutdown() failed to disable some + * CPU, and that can only happen if the checks in + * machine_kexec_prepare() were not correct. If this fails, we can't + * reliably kexec anyway, so BUG_ON is appropriate. + */ + BUG_ON(num_online_cpus() > 1); page_list = image->head & PAGE_MASK; diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c index 21f7790..e186ee1 100644 --- a/arch/arm/kernel/perf_event.c +++ b/arch/arm/kernel/perf_event.c @@ -56,7 +56,7 @@ armpmu_map_hw_event(const unsigned (*event_map)[PERF_COUNT_HW_MAX], u64 config) int mapping; if (config >= PERF_COUNT_HW_MAX) - return -ENOENT; + return -EINVAL; mapping = (*event_map)[config]; return mapping == HW_OP_UNSUPPORTED ? 
-ENOENT : mapping; @@ -258,6 +258,9 @@ validate_event(struct pmu_hw_events *hw_events, struct arm_pmu *armpmu = to_arm_pmu(event->pmu); struct pmu *leader_pmu = event->group_leader->pmu; + if (is_software_event(event)) + return 1; + if (event->pmu != leader_pmu || event->state < PERF_EVENT_STATE_OFF) return 1; diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 536c85f..94f6b05 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -462,7 +462,7 @@ int in_gate_area_no_mm(unsigned long addr) { return in_gate_area(NULL, addr); } -#define is_gate_vma(vma) ((vma) = &gate_vma) +#define is_gate_vma(vma) ((vma) == &gate_vma) #else #define is_gate_vma(vma) 0 #endif diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index c2b4f8f..2dc19349e 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -145,6 +145,16 @@ int boot_secondary(unsigned int cpu, struct task_struct *idle) return -ENOSYS; } +int platform_can_cpu_hotplug(void) +{ +#ifdef CONFIG_HOTPLUG_CPU + if (smp_ops.cpu_kill) + return 1; +#endif + + return 0; +} + #ifdef CONFIG_HOTPLUG_CPU static void percpu_timer_stop(void); diff --git a/arch/arm/kvm/coproc.c b/arch/arm/kvm/coproc.c index 4a51990..db9cf69 100644 --- a/arch/arm/kvm/coproc.c +++ b/arch/arm/kvm/coproc.c @@ -146,7 +146,11 @@ static bool pm_fake(struct kvm_vcpu *vcpu, #define access_pmintenclr pm_fake /* Architected CP15 registers. - * Important: Must be sorted ascending by CRn, CRM, Op1, Op2 + * CRn denotes the primary register number, but is copied to the CRm in the + * user space API for 64-bit register access in line with the terminology used + * in the ARM ARM. + * Important: Must be sorted ascending by CRn, CRM, Op1, Op2 and with 64-bit + * registers preceding 32-bit ones. */ static const struct coproc_reg cp15_regs[] = { /* CSSELR: swapped by interrupt.S. */ @@ -154,8 +158,8 @@ static const struct coproc_reg cp15_regs[] = { NULL, reset_unknown, c0_CSSELR }, /* TTBR0/TTBR1: swapped by interrupt.S. */ - { CRm( 2), Op1( 0), is64, NULL, reset_unknown64, c2_TTBR0 }, - { CRm( 2), Op1( 1), is64, NULL, reset_unknown64, c2_TTBR1 }, + { CRm64( 2), Op1( 0), is64, NULL, reset_unknown64, c2_TTBR0 }, + { CRm64( 2), Op1( 1), is64, NULL, reset_unknown64, c2_TTBR1 }, /* TTBCR: swapped by interrupt.S. */ { CRn( 2), CRm( 0), Op1( 0), Op2( 2), is32, @@ -182,7 +186,7 @@ static const struct coproc_reg cp15_regs[] = { NULL, reset_unknown, c6_IFAR }, /* PAR swapped by interrupt.S */ - { CRn( 7), Op1( 0), is64, NULL, reset_unknown64, c7_PAR }, + { CRm64( 7), Op1( 0), is64, NULL, reset_unknown64, c7_PAR }, /* * DC{C,I,CI}SW operations: @@ -399,12 +403,13 @@ static bool index_to_params(u64 id, struct coproc_params *params) | KVM_REG_ARM_OPC1_MASK)) return false; params->is_64bit = true; - params->CRm = ((id & KVM_REG_ARM_CRM_MASK) + /* CRm to CRn: see cp15_to_index for details */ + params->CRn = ((id & KVM_REG_ARM_CRM_MASK) >> KVM_REG_ARM_CRM_SHIFT); params->Op1 = ((id & KVM_REG_ARM_OPC1_MASK) >> KVM_REG_ARM_OPC1_SHIFT); params->Op2 = 0; - params->CRn = 0; + params->CRm = 0; return true; default: return false; @@ -898,7 +903,14 @@ static u64 cp15_to_index(const struct coproc_reg *reg) if (reg->is_64) { val |= KVM_REG_SIZE_U64; val |= (reg->Op1 << KVM_REG_ARM_OPC1_SHIFT); - val |= (reg->CRm << KVM_REG_ARM_CRM_SHIFT); + /* + * CRn always denotes the primary coproc. reg. nr. 
for the + * in-kernel representation, but the user space API uses the + * CRm for the encoding, because it is modelled after the + * MRRC/MCRR instructions: see the ARM ARM rev. c page + * B3-1445 + */ + val |= (reg->CRn << KVM_REG_ARM_CRM_SHIFT); } else { val |= KVM_REG_SIZE_U32; val |= (reg->Op1 << KVM_REG_ARM_OPC1_SHIFT); diff --git a/arch/arm/kvm/coproc.h b/arch/arm/kvm/coproc.h index b7301d3..0461d5c 100644 --- a/arch/arm/kvm/coproc.h +++ b/arch/arm/kvm/coproc.h @@ -135,6 +135,8 @@ static inline int cmp_reg(const struct coproc_reg *i1, return -1; if (i1->CRn != i2->CRn) return i1->CRn - i2->CRn; + if (i1->is_64 != i2->is_64) + return i2->is_64 - i1->is_64; if (i1->CRm != i2->CRm) return i1->CRm - i2->CRm; if (i1->Op1 != i2->Op1) @@ -145,6 +147,7 @@ static inline int cmp_reg(const struct coproc_reg *i1, #define CRn(_x) .CRn = _x #define CRm(_x) .CRm = _x +#define CRm64(_x) .CRn = _x, .CRm = 0 #define Op1(_x) .Op1 = _x #define Op2(_x) .Op2 = _x #define is64 .is_64 = true diff --git a/arch/arm/kvm/coproc_a15.c b/arch/arm/kvm/coproc_a15.c index 685063a..cf93472 100644 --- a/arch/arm/kvm/coproc_a15.c +++ b/arch/arm/kvm/coproc_a15.c @@ -114,7 +114,11 @@ static bool access_l2ectlr(struct kvm_vcpu *vcpu, /* * A15-specific CP15 registers. - * Important: Must be sorted ascending by CRn, CRM, Op1, Op2 + * CRn denotes the primary register number, but is copied to the CRm in the + * user space API for 64-bit register access in line with the terminology used + * in the ARM ARM. + * Important: Must be sorted ascending by CRn, CRM, Op1, Op2 and with 64-bit + * registers preceding 32-bit ones. */ static const struct coproc_reg a15_regs[] = { /* MPIDR: we use VMPIDR for guest access. */ diff --git a/arch/arm/kvm/mmio.c b/arch/arm/kvm/mmio.c index b8e06b7..0c25d94 100644 --- a/arch/arm/kvm/mmio.c +++ b/arch/arm/kvm/mmio.c @@ -63,7 +63,8 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run) static int decode_hsr(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_exit_mmio *mmio) { - unsigned long rt, len; + unsigned long rt; + int len; bool is_write, sign_extend; if (kvm_vcpu_dabt_isextabt(vcpu)) { diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index ca6bea4..0988d9e 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -85,6 +85,12 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) return p; } +static bool page_empty(void *ptr) +{ + struct page *ptr_page = virt_to_page(ptr); + return page_count(ptr_page) == 1; +} + static void clear_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr) { pmd_t *pmd_table = pmd_offset(pud, 0); @@ -103,12 +109,6 @@ static void clear_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr) put_page(virt_to_page(pmd)); } -static bool pmd_empty(pmd_t *pmd) -{ - struct page *pmd_page = virt_to_page(pmd); - return page_count(pmd_page) == 1; -} - static void clear_pte_entry(struct kvm *kvm, pte_t *pte, phys_addr_t addr) { if (pte_present(*pte)) { @@ -118,12 +118,6 @@ static void clear_pte_entry(struct kvm *kvm, pte_t *pte, phys_addr_t addr) } } -static bool pte_empty(pte_t *pte) -{ - struct page *pte_page = virt_to_page(pte); - return page_count(pte_page) == 1; -} - static void unmap_range(struct kvm *kvm, pgd_t *pgdp, unsigned long long start, u64 size) { @@ -132,37 +126,37 @@ static void unmap_range(struct kvm *kvm, pgd_t *pgdp, pmd_t *pmd; pte_t *pte; unsigned long long addr = start, end = start + size; - u64 range; + u64 next; while (addr < end) { pgd = pgdp + pgd_index(addr); pud = pud_offset(pgd, addr); if 
(pud_none(*pud)) { - addr += PUD_SIZE; + addr = pud_addr_end(addr, end); continue; } pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) { - addr += PMD_SIZE; + addr = pmd_addr_end(addr, end); continue; } pte = pte_offset_kernel(pmd, addr); clear_pte_entry(kvm, pte, addr); - range = PAGE_SIZE; + next = addr + PAGE_SIZE; /* If we emptied the pte, walk back up the ladder */ - if (pte_empty(pte)) { + if (page_empty(pte)) { clear_pmd_entry(kvm, pmd, addr); - range = PMD_SIZE; - if (pmd_empty(pmd)) { + next = pmd_addr_end(addr, end); + if (page_empty(pmd) && !page_empty(pud)) { clear_pud_entry(kvm, pud, addr); - range = PUD_SIZE; + next = pud_addr_end(addr, end); } } - addr += range; + addr = next; } } diff --git a/arch/arm/mach-at91/at91sam9x5.c b/arch/arm/mach-at91/at91sam9x5.c index 2abee66..916e5a1 100644 --- a/arch/arm/mach-at91/at91sam9x5.c +++ b/arch/arm/mach-at91/at91sam9x5.c @@ -227,6 +227,8 @@ static struct clk_lookup periph_clocks_lookups[] = { CLKDEV_CON_DEV_ID("usart", "f8020000.serial", &usart1_clk), CLKDEV_CON_DEV_ID("usart", "f8024000.serial", &usart2_clk), CLKDEV_CON_DEV_ID("usart", "f8028000.serial", &usart3_clk), + CLKDEV_CON_DEV_ID("usart", "f8040000.serial", &uart0_clk), + CLKDEV_CON_DEV_ID("usart", "f8044000.serial", &uart1_clk), CLKDEV_CON_DEV_ID("t0_clk", "f8008000.timer", &tcb0_clk), CLKDEV_CON_DEV_ID("t0_clk", "f800c000.timer", &tcb0_clk), CLKDEV_CON_DEV_ID("mci_clk", "f0008000.mmc", &mmc0_clk), diff --git a/arch/arm/mach-davinci/board-dm355-leopard.c b/arch/arm/mach-davinci/board-dm355-leopard.c index dff4ddc..139e42d 100644 --- a/arch/arm/mach-davinci/board-dm355-leopard.c +++ b/arch/arm/mach-davinci/board-dm355-leopard.c @@ -75,6 +75,7 @@ static struct davinci_nand_pdata davinci_nand_data = { .parts = davinci_nand_partitions, .nr_parts = ARRAY_SIZE(davinci_nand_partitions), .ecc_mode = NAND_ECC_HW_SYNDROME, + .ecc_bits = 4, .bbt_options = NAND_BBT_USE_FLASH, }; diff --git a/arch/arm/mach-davinci/board-dm644x-evm.c b/arch/arm/mach-davinci/board-dm644x-evm.c index a33686a..fa4bfaf 100644 --- a/arch/arm/mach-davinci/board-dm644x-evm.c +++ b/arch/arm/mach-davinci/board-dm644x-evm.c @@ -153,6 +153,7 @@ static struct davinci_nand_pdata davinci_evm_nandflash_data = { .parts = davinci_evm_nandflash_partition, .nr_parts = ARRAY_SIZE(davinci_evm_nandflash_partition), .ecc_mode = NAND_ECC_HW, + .ecc_bits = 1, .bbt_options = NAND_BBT_USE_FLASH, .timing = &davinci_evm_nandflash_timing, }; diff --git a/arch/arm/mach-davinci/board-dm646x-evm.c b/arch/arm/mach-davinci/board-dm646x-evm.c index fbb8e5a..0c005e8 100644 --- a/arch/arm/mach-davinci/board-dm646x-evm.c +++ b/arch/arm/mach-davinci/board-dm646x-evm.c @@ -90,6 +90,7 @@ static struct davinci_nand_pdata davinci_nand_data = { .parts = davinci_nand_partitions, .nr_parts = ARRAY_SIZE(davinci_nand_partitions), .ecc_mode = NAND_ECC_HW, + .ecc_bits = 1, .options = 0, }; diff --git a/arch/arm/mach-davinci/board-neuros-osd2.c b/arch/arm/mach-davinci/board-neuros-osd2.c index 2bc112a..808233b 100644 --- a/arch/arm/mach-davinci/board-neuros-osd2.c +++ b/arch/arm/mach-davinci/board-neuros-osd2.c @@ -88,6 +88,7 @@ static struct davinci_nand_pdata davinci_ntosd2_nandflash_data = { .parts = davinci_ntosd2_nandflash_partition, .nr_parts = ARRAY_SIZE(davinci_ntosd2_nandflash_partition), .ecc_mode = NAND_ECC_HW, + .ecc_bits = 1, .bbt_options = NAND_BBT_USE_FLASH, }; diff --git a/arch/arm/mach-omap2/board-n8x0.c b/arch/arm/mach-omap2/board-n8x0.c index f6eeb87..827d150 100644 --- a/arch/arm/mach-omap2/board-n8x0.c +++ 
b/arch/arm/mach-omap2/board-n8x0.c @@ -122,11 +122,7 @@ static struct musb_hdrc_config musb_config = { }; static struct musb_hdrc_platform_data tusb_data = { -#ifdef CONFIG_USB_GADGET_MUSB_HDRC .mode = MUSB_OTG, -#else - .mode = MUSB_HOST, -#endif .set_power = tusb_set_power, .min_power = 25, /* x2 = 50 mA drawn from VBUS as peripheral */ .power = 100, /* Max 100 mA VBUS for host mode */ diff --git a/arch/arm/mach-omap2/board-rx51.c b/arch/arm/mach-omap2/board-rx51.c index d2ea68e..773510556 100644 --- a/arch/arm/mach-omap2/board-rx51.c +++ b/arch/arm/mach-omap2/board-rx51.c @@ -85,7 +85,7 @@ static struct omap_board_mux board_mux[] __initdata = { static struct omap_musb_board_data musb_board_data = { .interface_type = MUSB_INTERFACE_ULPI, - .mode = MUSB_PERIPHERAL, + .mode = MUSB_OTG, .power = 0, }; diff --git a/arch/arm/mach-omap2/usb-musb.c b/arch/arm/mach-omap2/usb-musb.c index 8c4de27..bc89723 100644 --- a/arch/arm/mach-omap2/usb-musb.c +++ b/arch/arm/mach-omap2/usb-musb.c @@ -38,11 +38,8 @@ static struct musb_hdrc_config musb_config = { }; static struct musb_hdrc_platform_data musb_plat = { -#ifdef CONFIG_USB_GADGET_MUSB_HDRC .mode = MUSB_OTG, -#else - .mode = MUSB_HOST, -#endif + /* .clock is set dynamically */ .config = &musb_config, diff --git a/arch/arm/mach-pxa/icontrol.c b/arch/arm/mach-pxa/icontrol.c index fe31bfc..c98511c 100644 --- a/arch/arm/mach-pxa/icontrol.c +++ b/arch/arm/mach-pxa/icontrol.c @@ -73,9 +73,6 @@ static struct pxa2xx_spi_chip mcp251x_chip_info4 = { static struct mcp251x_platform_data mcp251x_info = { .oscillator_frequency = 16E6, - .board_specific_setup = NULL, - .power_enable = NULL, - .transceiver_enable = NULL }; static struct spi_board_info mcp251x_board_info[] = { diff --git a/arch/arm/mach-pxa/zeus.c b/arch/arm/mach-pxa/zeus.c index f5d4364..04a0aea 100644 --- a/arch/arm/mach-pxa/zeus.c +++ b/arch/arm/mach-pxa/zeus.c @@ -29,6 +29,8 @@ #include <linux/i2c/pca953x.h> #include <linux/apm-emulation.h> #include <linux/can/platform/mcp251x.h> +#include <linux/regulator/fixed.h> +#include <linux/regulator/machine.h> #include <asm/mach-types.h> #include <asm/suspend.h> @@ -391,33 +393,34 @@ static struct pxa2xx_spi_master pxa2xx_spi_ssp3_master_info = { }; /* CAN bus on SPI */ -static int zeus_mcp2515_setup(struct spi_device *sdev) -{ - int err; - - err = gpio_request(ZEUS_CAN_SHDN_GPIO, "CAN shutdown"); - if (err) - return err; +static struct regulator_consumer_supply can_regulator_consumer = + REGULATOR_SUPPLY("vdd", "spi3.0"); - err = gpio_direction_output(ZEUS_CAN_SHDN_GPIO, 1); - if (err) { - gpio_free(ZEUS_CAN_SHDN_GPIO); - return err; - } +static struct regulator_init_data can_regulator_init_data = { + .constraints = { + .valid_ops_mask = REGULATOR_CHANGE_STATUS, + }, + .consumer_supplies = &can_regulator_consumer, + .num_consumer_supplies = 1, +}; - return 0; -} +static struct fixed_voltage_config can_regulator_pdata = { + .supply_name = "CAN_SHDN", + .microvolts = 3300000, + .gpio = ZEUS_CAN_SHDN_GPIO, + .init_data = &can_regulator_init_data, +}; -static int zeus_mcp2515_transceiver_enable(int enable) -{ - gpio_set_value(ZEUS_CAN_SHDN_GPIO, !enable); - return 0; -} +static struct platform_device can_regulator_device = { + .name = "reg-fixed-volage", + .id = -1, + .dev = { + .platform_data = &can_regulator_pdata, + }, +}; static struct mcp251x_platform_data zeus_mcp2515_pdata = { .oscillator_frequency = 16*1000*1000, - .board_specific_setup = zeus_mcp2515_setup, - .power_enable = zeus_mcp2515_transceiver_enable, }; static struct spi_board_info 
zeus_spi_board_info[] = { @@ -516,6 +519,7 @@ static struct platform_device *zeus_devices[] __initdata = { &zeus_leds_device, &zeus_pcmcia_device, &zeus_max6369_device, + &can_regulator_device, }; /* AC'97 */ diff --git a/arch/arm/mach-shmobile/board-armadillo800eva.c b/arch/arm/mach-shmobile/board-armadillo800eva.c index c5be60d..3a6ffa2 100644 --- a/arch/arm/mach-shmobile/board-armadillo800eva.c +++ b/arch/arm/mach-shmobile/board-armadillo800eva.c @@ -358,7 +358,6 @@ static struct platform_device usbhsf_device = { static struct sh_eth_plat_data sh_eth_platdata = { .phy = 0x00, /* LAN8710A */ .edmac_endian = EDMAC_LITTLE_ENDIAN, - .register_type = SH_ETH_REG_GIGABIT, .phy_interface = PHY_INTERFACE_MODE_MII, }; diff --git a/arch/arm/mach-shmobile/board-bockw.c b/arch/arm/mach-shmobile/board-bockw.c index 3354a85..fa8885b 100644 --- a/arch/arm/mach-shmobile/board-bockw.c +++ b/arch/arm/mach-shmobile/board-bockw.c @@ -89,7 +89,6 @@ static struct sh_mobile_sdhi_info sdhi0_info = { static struct sh_eth_plat_data ether_platform_data __initdata = { .phy = 0x01, .edmac_endian = EDMAC_LITTLE_ENDIAN, - .register_type = SH_ETH_REG_FAST_RCAR, .phy_interface = PHY_INTERFACE_MODE_RMII, /* * Although the LINK signal is available on the board, it's connected to diff --git a/arch/arm/plat-samsung/init.c b/arch/arm/plat-samsung/init.c index 3e5c461..50a3ea0 100644 --- a/arch/arm/plat-samsung/init.c +++ b/arch/arm/plat-samsung/init.c @@ -55,12 +55,13 @@ void __init s3c_init_cpu(unsigned long idcode, printk("CPU %s (id 0x%08lx)\n", cpu->name, idcode); - if (cpu->map_io == NULL || cpu->init == NULL) { + if (cpu->init == NULL) { printk(KERN_ERR "CPU %s support not enabled\n", cpu->name); panic("Unsupported Samsung CPU"); } - cpu->map_io(); + if (cpu->map_io) + cpu->map_io(); } /* s3c24xx_init_clocks diff --git a/arch/arm/xen/enlighten.c b/arch/arm/xen/enlighten.c index c9770ba..8a6295c 100644 --- a/arch/arm/xen/enlighten.c +++ b/arch/arm/xen/enlighten.c @@ -170,6 +170,7 @@ static void __init xen_percpu_init(void *unused) per_cpu(xen_vcpu, cpu) = vcpup; enable_percpu_irq(xen_events_irq, 0); + put_cpu(); } static void xen_restart(enum reboot_mode reboot_mode, const char *cmd) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index c92de41..b25763b 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -42,14 +42,15 @@ #define TPIDR_EL1 18 /* Thread ID, Privileged */ #define AMAIR_EL1 19 /* Aux Memory Attribute Indirection Register */ #define CNTKCTL_EL1 20 /* Timer Control Register (EL1) */ +#define PAR_EL1 21 /* Physical Address Register */ /* 32bit specific registers. 
Keep them at the end of the range */ -#define DACR32_EL2 21 /* Domain Access Control Register */ -#define IFSR32_EL2 22 /* Instruction Fault Status Register */ -#define FPEXC32_EL2 23 /* Floating-Point Exception Control Register */ -#define DBGVCR32_EL2 24 /* Debug Vector Catch Register */ -#define TEECR32_EL1 25 /* ThumbEE Configuration Register */ -#define TEEHBR32_EL1 26 /* ThumbEE Handler Base Register */ -#define NR_SYS_REGS 27 +#define DACR32_EL2 22 /* Domain Access Control Register */ +#define IFSR32_EL2 23 /* Instruction Fault Status Register */ +#define FPEXC32_EL2 24 /* Floating-Point Exception Control Register */ +#define DBGVCR32_EL2 25 /* Debug Vector Catch Register */ +#define TEECR32_EL1 26 /* ThumbEE Configuration Register */ +#define TEEHBR32_EL1 27 /* ThumbEE Handler Base Register */ +#define NR_SYS_REGS 28 /* 32bit mapping */ #define c0_MPIDR (MPIDR_EL1 * 2) /* MultiProcessor ID Register */ @@ -69,6 +70,8 @@ #define c5_AIFSR (AFSR1_EL1 * 2) /* Auxiliary Instr Fault Status R */ #define c6_DFAR (FAR_EL1 * 2) /* Data Fault Address Register */ #define c6_IFAR (c6_DFAR + 1) /* Instruction Fault Address Register */ +#define c7_PAR (PAR_EL1 * 2) /* Physical Address Register */ +#define c7_PAR_high (c7_PAR + 1) /* PAR top 32 bits */ #define c10_PRRR (MAIR_EL1 * 2) /* Primary Region Remap Register */ #define c10_NMRR (c10_PRRR + 1) /* Normal Memory Remap Register */ #define c12_VBAR (VBAR_EL1 * 2) /* Vector Base Address Register */ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 644d739..0859a4d 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -129,7 +129,7 @@ struct kvm_vcpu_arch { struct kvm_mmu_memory_cache mmu_page_cache; /* Target CPU and feature flags */ - u32 target; + int target; DECLARE_BITMAP(features, KVM_VCPU_MAX_FEATURES); /* Detect first run of a vcpu */ diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index 9ba33c4..12e6ccb 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -107,7 +107,12 @@ armpmu_map_cache_event(const unsigned (*cache_map) static int armpmu_map_event(const unsigned (*event_map)[PERF_COUNT_HW_MAX], u64 config) { - int mapping = (*event_map)[config]; + int mapping; + + if (config >= PERF_COUNT_HW_MAX) + return -EINVAL; + + mapping = (*event_map)[config]; return mapping == HW_OP_UNSUPPORTED ? 
-ENOENT : mapping; } @@ -317,6 +322,9 @@ validate_event(struct pmu_hw_events *hw_events, struct hw_perf_event fake_event = event->hw; struct pmu *leader_pmu = event->group_leader->pmu; + if (is_software_event(event)) + return 1; + if (event->pmu != leader_pmu || event->state <= PERF_EVENT_STATE_OFF) return 1; diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S index ff985e3..1ac0bbb 100644 --- a/arch/arm64/kvm/hyp.S +++ b/arch/arm64/kvm/hyp.S @@ -214,6 +214,7 @@ __kvm_hyp_code_start: mrs x21, tpidr_el1 mrs x22, amair_el1 mrs x23, cntkctl_el1 + mrs x24, par_el1 stp x4, x5, [x3] stp x6, x7, [x3, #16] @@ -225,6 +226,7 @@ __kvm_hyp_code_start: stp x18, x19, [x3, #112] stp x20, x21, [x3, #128] stp x22, x23, [x3, #144] + str x24, [x3, #160] .endm .macro restore_sysregs @@ -243,6 +245,7 @@ __kvm_hyp_code_start: ldp x18, x19, [x3, #112] ldp x20, x21, [x3, #128] ldp x22, x23, [x3, #144] + ldr x24, [x3, #160] msr vmpidr_el2, x4 msr csselr_el1, x5 @@ -264,6 +267,7 @@ __kvm_hyp_code_start: msr tpidr_el1, x21 msr amair_el1, x22 msr cntkctl_el1, x23 + msr par_el1, x24 .endm .macro skip_32bit_state tmp, target @@ -600,6 +604,8 @@ END(__kvm_vcpu_run) // void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa); ENTRY(__kvm_tlb_flush_vmid_ipa) + dsb ishst + kern_hyp_va x0 ldr x2, [x0, #KVM_VTTBR] msr vttbr_el2, x2 @@ -621,6 +627,7 @@ ENTRY(__kvm_tlb_flush_vmid_ipa) ENDPROC(__kvm_tlb_flush_vmid_ipa) ENTRY(__kvm_flush_vm_context) + dsb ishst tlbi alle1is ic ialluis dsb sy @@ -753,6 +760,10 @@ el1_trap: */ tbnz x1, #7, 1f // S1PTW is set + /* Preserve PAR_EL1 */ + mrs x3, par_el1 + push x3, xzr + /* * Permission fault, HPFAR_EL2 is invalid. * Resolve the IPA the hard way using the guest VA. @@ -766,6 +777,8 @@ el1_trap: /* Read result */ mrs x3, par_el1 + pop x0, xzr // Restore PAR_EL1 from the stack + msr par_el1, x0 tbnz x3, #0, 3f // Bail out if we failed the translation ubfx x3, x3, #12, #36 // Extract IPA lsl x3, x3, #4 // and present it like HPFAR diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 9492360..02e9d09 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -211,6 +211,9 @@ static const struct sys_reg_desc sys_reg_descs[] = { /* FAR_EL1 */ { Op0(0b11), Op1(0b000), CRn(0b0110), CRm(0b0000), Op2(0b000), NULL, reset_unknown, FAR_EL1 }, + /* PAR_EL1 */ + { Op0(0b11), Op1(0b000), CRn(0b0111), CRm(0b0100), Op2(0b000), + NULL, reset_unknown, PAR_EL1 }, /* PMINTENSET_EL1 */ { Op0(0b11), Op1(0b000), CRn(0b1001), CRm(0b1110), Op2(0b001), diff --git a/arch/m68k/emu/natfeat.c b/arch/m68k/emu/natfeat.c index 2291a7d..fa277ae 100644 --- a/arch/m68k/emu/natfeat.c +++ b/arch/m68k/emu/natfeat.c @@ -18,9 +18,11 @@ #include <asm/machdep.h> #include <asm/natfeat.h> +extern long nf_get_id2(const char *feature_name); + asm("\n" -" .global nf_get_id,nf_call\n" -"nf_get_id:\n" +" .global nf_get_id2,nf_call\n" +"nf_get_id2:\n" " .short 0x7300\n" " rts\n" "nf_call:\n" @@ -29,12 +31,25 @@ asm("\n" "1: moveq.l #0,%d0\n" " rts\n" " .section __ex_table,\"a\"\n" -" .long nf_get_id,1b\n" +" .long nf_get_id2,1b\n" " .long nf_call,1b\n" " .previous"); -EXPORT_SYMBOL_GPL(nf_get_id); EXPORT_SYMBOL_GPL(nf_call); +long nf_get_id(const char *feature_name) +{ + /* feature_name may be in vmalloc()ed memory, so make a copy */ + char name_copy[32]; + size_t n; + + n = strlcpy(name_copy, feature_name, sizeof(name_copy)); + if (n >= sizeof(name_copy)) + return 0; + + return nf_get_id2(name_copy); +} +EXPORT_SYMBOL_GPL(nf_get_id); + void nfprint(const char *fmt, ...) 
{ static char buf[256]; diff --git a/arch/m68k/include/asm/div64.h b/arch/m68k/include/asm/div64.h index 444ea8a..ef881cf 100644 --- a/arch/m68k/include/asm/div64.h +++ b/arch/m68k/include/asm/div64.h @@ -15,16 +15,17 @@ unsigned long long n64; \ } __n; \ unsigned long __rem, __upper; \ + unsigned long __base = (base); \ \ __n.n64 = (n); \ if ((__upper = __n.n32[0])) { \ asm ("divul.l %2,%1:%0" \ - : "=d" (__n.n32[0]), "=d" (__upper) \ - : "d" (base), "0" (__n.n32[0])); \ + : "=d" (__n.n32[0]), "=d" (__upper) \ + : "d" (__base), "0" (__n.n32[0])); \ } \ asm ("divu.l %2,%1:%0" \ - : "=d" (__n.n32[1]), "=d" (__rem) \ - : "d" (base), "1" (__upper), "0" (__n.n32[1])); \ + : "=d" (__n.n32[1]), "=d" (__rem) \ + : "d" (__base), "1" (__upper), "0" (__n.n32[1])); \ (n) = __n.n64; \ __rem; \ }) diff --git a/arch/mips/math-emu/cp1emu.c b/arch/mips/math-emu/cp1emu.c index e773659..46048d2 100644 --- a/arch/mips/math-emu/cp1emu.c +++ b/arch/mips/math-emu/cp1emu.c @@ -803,6 +803,32 @@ static int isBranchInstr(struct pt_regs *regs, struct mm_decoded_insn dec_insn, dec_insn.next_pc_inc; return 1; break; +#ifdef CONFIG_CPU_CAVIUM_OCTEON + case lwc2_op: /* This is bbit0 on Octeon */ + if ((regs->regs[insn.i_format.rs] & (1ull<<insn.i_format.rt)) == 0) + *contpc = regs->cp0_epc + 4 + (insn.i_format.simmediate << 2); + else + *contpc = regs->cp0_epc + 8; + return 1; + case ldc2_op: /* This is bbit032 on Octeon */ + if ((regs->regs[insn.i_format.rs] & (1ull<<(insn.i_format.rt + 32))) == 0) + *contpc = regs->cp0_epc + 4 + (insn.i_format.simmediate << 2); + else + *contpc = regs->cp0_epc + 8; + return 1; + case swc2_op: /* This is bbit1 on Octeon */ + if (regs->regs[insn.i_format.rs] & (1ull<<insn.i_format.rt)) + *contpc = regs->cp0_epc + 4 + (insn.i_format.simmediate << 2); + else + *contpc = regs->cp0_epc + 8; + return 1; + case sdc2_op: /* This is bbit132 on Octeon */ + if (regs->regs[insn.i_format.rs] & (1ull<<(insn.i_format.rt + 32))) + *contpc = regs->cp0_epc + 4 + (insn.i_format.simmediate << 2); + else + *contpc = regs->cp0_epc + 8; + return 1; +#endif case cop0_op: case cop1_op: case cop2_op: diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index 23a64d2..6d6d92b 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -32,7 +32,7 @@ struct mmu_gather { struct mm_struct *mm; struct mmu_table_batch *batch; unsigned int fullmm; - unsigned long start, unsigned long end; + unsigned long start, end; }; struct mmu_table_batch { diff --git a/arch/sh/boards/board-espt.c b/arch/sh/boards/board-espt.c index 4d94dff..7291e2f 100644 --- a/arch/sh/boards/board-espt.c +++ b/arch/sh/boards/board-espt.c @@ -80,7 +80,6 @@ static struct resource sh_eth_resources[] = { static struct sh_eth_plat_data sh7763_eth_pdata = { .phy = 0, .edmac_endian = EDMAC_LITTLE_ENDIAN, - .register_type = SH_ETH_REG_GIGABIT, .phy_interface = PHY_INTERFACE_MODE_MII, }; diff --git a/arch/sh/boards/board-sh7757lcr.c b/arch/sh/boards/board-sh7757lcr.c index 4f114d1..25c5a93 100644 --- a/arch/sh/boards/board-sh7757lcr.c +++ b/arch/sh/boards/board-sh7757lcr.c @@ -77,7 +77,6 @@ static struct resource sh_eth0_resources[] = { static struct sh_eth_plat_data sh7757_eth0_pdata = { .phy = 1, .edmac_endian = EDMAC_LITTLE_ENDIAN, - .register_type = SH_ETH_REG_FAST_SH4, .set_mdio_gate = sh7757_eth_set_mdio_gate, }; @@ -106,7 +105,6 @@ static struct resource sh_eth1_resources[] = { static struct sh_eth_plat_data sh7757_eth1_pdata = { .phy = 1, .edmac_endian = EDMAC_LITTLE_ENDIAN, - .register_type = 
SH_ETH_REG_FAST_SH4, .set_mdio_gate = sh7757_eth_set_mdio_gate, }; @@ -151,7 +149,6 @@ static struct resource sh_eth_giga0_resources[] = { static struct sh_eth_plat_data sh7757_eth_giga0_pdata = { .phy = 18, .edmac_endian = EDMAC_LITTLE_ENDIAN, - .register_type = SH_ETH_REG_GIGABIT, .set_mdio_gate = sh7757_eth_giga_set_mdio_gate, .phy_interface = PHY_INTERFACE_MODE_RGMII_ID, }; @@ -186,7 +183,6 @@ static struct resource sh_eth_giga1_resources[] = { static struct sh_eth_plat_data sh7757_eth_giga1_pdata = { .phy = 19, .edmac_endian = EDMAC_LITTLE_ENDIAN, - .register_type = SH_ETH_REG_GIGABIT, .set_mdio_gate = sh7757_eth_giga_set_mdio_gate, .phy_interface = PHY_INTERFACE_MODE_RGMII_ID, }; diff --git a/arch/sh/boards/mach-ecovec24/setup.c b/arch/sh/boards/mach-ecovec24/setup.c index 61fade0f..a4f630f 100644 --- a/arch/sh/boards/mach-ecovec24/setup.c +++ b/arch/sh/boards/mach-ecovec24/setup.c @@ -159,7 +159,6 @@ static struct resource sh_eth_resources[] = { static struct sh_eth_plat_data sh_eth_plat = { .phy = 0x1f, /* SMSC LAN8700 */ .edmac_endian = EDMAC_LITTLE_ENDIAN, - .register_type = SH_ETH_REG_FAST_SH4, .phy_interface = PHY_INTERFACE_MODE_MII, .ether_link_active_low = 1 }; diff --git a/arch/sh/boards/mach-se/7724/setup.c b/arch/sh/boards/mach-se/7724/setup.c index b70180e..21e4230 100644 --- a/arch/sh/boards/mach-se/7724/setup.c +++ b/arch/sh/boards/mach-se/7724/setup.c @@ -365,7 +365,7 @@ static struct platform_device keysc_device = { static struct resource sh_eth_resources[] = { [0] = { .start = SH_ETH_ADDR, - .end = SH_ETH_ADDR + 0x1FC, + .end = SH_ETH_ADDR + 0x1FC - 1, .flags = IORESOURCE_MEM, }, [1] = { @@ -377,6 +377,7 @@ static struct resource sh_eth_resources[] = { static struct sh_eth_plat_data sh_eth_plat = { .phy = 0x1f, /* SMSC LAN8187 */ .edmac_endian = EDMAC_LITTLE_ENDIAN, + .phy_interface = PHY_INTERFACE_MODE_MII, }; static struct platform_device sh_eth_device = { diff --git a/arch/sh/boards/mach-sh7763rdp/setup.c b/arch/sh/boards/mach-sh7763rdp/setup.c index 50ba481..2c8fb04 100644 --- a/arch/sh/boards/mach-sh7763rdp/setup.c +++ b/arch/sh/boards/mach-sh7763rdp/setup.c @@ -88,7 +88,6 @@ static struct resource sh_eth_resources[] = { static struct sh_eth_plat_data sh7763_eth_pdata = { .phy = 1, .edmac_endian = EDMAC_LITTLE_ENDIAN, - .register_type = SH_ETH_REG_GIGABIT, .phy_interface = PHY_INTERFACE_MODE_MII, }; diff --git a/arch/sh/kernel/cpu/sh2/setup-sh7619.c b/arch/sh/kernel/cpu/sh2/setup-sh7619.c index bb11e19..4df4d4f 100644 --- a/arch/sh/kernel/cpu/sh2/setup-sh7619.c +++ b/arch/sh/kernel/cpu/sh2/setup-sh7619.c @@ -12,6 +12,7 @@ #include <linux/init.h> #include <linux/serial.h> #include <linux/serial_sci.h> +#include <linux/sh_eth.h> #include <linux/sh_timer.h> #include <linux/io.h> @@ -110,10 +111,16 @@ static struct platform_device scif2_device = { }, }; +static struct sh_eth_plat_data eth_platform_data = { + .phy = 1, + .edmac_endian = EDMAC_LITTLE_ENDIAN, + .phy_interface = PHY_INTERFACE_MODE_MII, +}; + static struct resource eth_resources[] = { [0] = { .start = 0xfb000000, - .end = 0xfb0001c8, + .end = 0xfb0001c7, .flags = IORESOURCE_MEM, }, [1] = { @@ -127,7 +134,7 @@ static struct platform_device eth_device = { .name = "sh7619-ether", .id = -1, .dev = { - .platform_data = (void *)1, + .platform_data = ð_platform_data, }, .num_resources = ARRAY_SIZE(eth_resources), .resource = eth_resources, diff --git a/arch/x86/include/asm/bootparam_utils.h b/arch/x86/include/asm/bootparam_utils.h index 653668d..4a8cb8d 100644 --- a/arch/x86/include/asm/bootparam_utils.h +++ 
b/arch/x86/include/asm/bootparam_utils.h @@ -35,9 +35,9 @@ static void sanitize_boot_params(struct boot_params *boot_params) */ if (boot_params->sentinel) { /* fields in boot_params are left uninitialized, clear them */ - memset(&boot_params->olpc_ofw_header, 0, + memset(&boot_params->ext_ramdisk_image, 0, (char *)&boot_params->efi_info - - (char *)&boot_params->olpc_ofw_header); + (char *)&boot_params->ext_ramdisk_image); memset(&boot_params->kbd_status, 0, (char *)&boot_params->hdr - (char *)&boot_params->kbd_status); diff --git a/arch/x86/include/asm/microcode_amd.h b/arch/x86/include/asm/microcode_amd.h index 50e5c58..4c01917 100644 --- a/arch/x86/include/asm/microcode_amd.h +++ b/arch/x86/include/asm/microcode_amd.h @@ -59,7 +59,7 @@ static inline u16 find_equiv_id(struct equiv_cpu_entry *equiv_cpu_table, extern int __apply_microcode_amd(struct microcode_amd *mc_amd); extern int apply_microcode_amd(int cpu); -extern enum ucode_state load_microcode_amd(int cpu, const u8 *data, size_t size); +extern enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size); #ifdef CONFIG_MICROCODE_AMD_EARLY #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index f654ece..08a0890 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -512,7 +512,7 @@ static void early_init_amd(struct cpuinfo_x86 *c) static const int amd_erratum_383[]; static const int amd_erratum_400[]; -static bool cpu_has_amd_erratum(const int *erratum); +static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum); static void init_amd(struct cpuinfo_x86 *c) { @@ -729,11 +729,11 @@ static void init_amd(struct cpuinfo_x86 *c) value &= ~(1ULL << 24); wrmsrl_safe(MSR_AMD64_BU_CFG2, value); - if (cpu_has_amd_erratum(amd_erratum_383)) + if (cpu_has_amd_erratum(c, amd_erratum_383)) set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH); } - if (cpu_has_amd_erratum(amd_erratum_400)) + if (cpu_has_amd_erratum(c, amd_erratum_400)) set_cpu_bug(c, X86_BUG_AMD_APIC_C1E); rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); @@ -878,23 +878,13 @@ static const int amd_erratum_400[] = static const int amd_erratum_383[] = AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf)); -static bool cpu_has_amd_erratum(const int *erratum) + +static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum) { - struct cpuinfo_x86 *cpu = __this_cpu_ptr(&cpu_info); int osvw_id = *erratum++; u32 range; u32 ms; - /* - * If called early enough that current_cpu_data hasn't been initialized - * yet, fall back to boot_cpu_data. 
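/*
 * An aside on the microcode_amd interface change above: keying
 * load_microcode_amd() on the CPU family rather than a CPU number lets
 * callers that run before cpu_data() is usable (the early loader) pass
 * the family straight from CPUID. The family value is computed the way
 * the hunks just below do it: base family in EAX[11:8] plus extended
 * family in EAX[27:20]. Worked example for EAX = 0x00600f20:
 * ((eax >> 8) & 0xf) = 0xf and ((eax >> 20) & 0xff) = 0x6, so the
 * effective family is 0x15, i.e. a Family 15h part.
 * cpuid_eax_to_family() is a hypothetical helper name wrapping that
 * arithmetic.
 */
#include <linux/types.h>

static inline u8 cpuid_eax_to_family(u32 eax)
{
	return ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
}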
- */ - if (cpu->x86 == 0) - cpu = &boot_cpu_data; - - if (cpu->x86_vendor != X86_VENDOR_AMD) - return false; - if (osvw_id >= 0 && osvw_id < 65536 && cpu_has(cpu, X86_FEATURE_OSVW)) { u64 osvw_len; diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 7a0adb7..7123b5d 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -145,10 +145,9 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) return 0; } -static unsigned int verify_patch_size(int cpu, u32 patch_size, +static unsigned int verify_patch_size(u8 family, u32 patch_size, unsigned int size) { - struct cpuinfo_x86 *c = &cpu_data(cpu); u32 max_size; #define F1XH_MPB_MAX_SIZE 2048 @@ -156,7 +155,7 @@ static unsigned int verify_patch_size(int cpu, u32 patch_size, #define F15H_MPB_MAX_SIZE 4096 #define F16H_MPB_MAX_SIZE 3458 - switch (c->x86) { + switch (family) { case 0x14: max_size = F14H_MPB_MAX_SIZE; break; @@ -277,9 +276,8 @@ static void cleanup(void) * driver cannot continue functioning normally. In such cases, we tear * down everything we've used up so far and exit. */ -static int verify_and_add_patch(unsigned int cpu, u8 *fw, unsigned int leftover) +static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover) { - struct cpuinfo_x86 *c = &cpu_data(cpu); struct microcode_header_amd *mc_hdr; struct ucode_patch *patch; unsigned int patch_size, crnt_size, ret; @@ -299,7 +297,7 @@ static int verify_and_add_patch(unsigned int cpu, u8 *fw, unsigned int leftover) /* check if patch is for the current family */ proc_fam = ((proc_fam >> 8) & 0xf) + ((proc_fam >> 20) & 0xff); - if (proc_fam != c->x86) + if (proc_fam != family) return crnt_size; if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) { @@ -308,7 +306,7 @@ static int verify_and_add_patch(unsigned int cpu, u8 *fw, unsigned int leftover) return crnt_size; } - ret = verify_patch_size(cpu, patch_size, leftover); + ret = verify_patch_size(family, patch_size, leftover); if (!ret) { pr_err("Patch-ID 0x%08x: size mismatch.\n", mc_hdr->patch_id); return crnt_size; @@ -339,7 +337,8 @@ static int verify_and_add_patch(unsigned int cpu, u8 *fw, unsigned int leftover) return crnt_size; } -static enum ucode_state __load_microcode_amd(int cpu, const u8 *data, size_t size) +static enum ucode_state __load_microcode_amd(u8 family, const u8 *data, + size_t size) { enum ucode_state ret = UCODE_ERROR; unsigned int leftover; @@ -362,7 +361,7 @@ static enum ucode_state __load_microcode_amd(int cpu, const u8 *data, size_t siz } while (leftover) { - crnt_size = verify_and_add_patch(cpu, fw, leftover); + crnt_size = verify_and_add_patch(family, fw, leftover); if (crnt_size < 0) return ret; @@ -373,22 +372,22 @@ static enum ucode_state __load_microcode_amd(int cpu, const u8 *data, size_t siz return UCODE_OK; } -enum ucode_state load_microcode_amd(int cpu, const u8 *data, size_t size) +enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size) { enum ucode_state ret; /* free old equiv table */ free_equiv_cpu_table(); - ret = __load_microcode_amd(cpu, data, size); + ret = __load_microcode_amd(family, data, size); if (ret != UCODE_OK) cleanup(); #if defined(CONFIG_MICROCODE_AMD_EARLY) && defined(CONFIG_X86_32) /* save BSP's matching patch for early load */ - if (cpu_data(cpu).cpu_index == boot_cpu_data.cpu_index) { - struct ucode_patch *p = find_patch(cpu); + if (cpu_data(smp_processor_id()).cpu_index == boot_cpu_data.cpu_index) { + struct ucode_patch *p = find_patch(smp_processor_id()); if (p) { 
memset(amd_bsp_mpb, 0, MPB_MAX_SIZE); memcpy(amd_bsp_mpb, p->data, min_t(u32, ksize(p->data), @@ -441,7 +440,7 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device, goto fw_release; } - ret = load_microcode_amd(cpu, fw->data, fw->size); + ret = load_microcode_amd(c->x86, fw->data, fw->size); fw_release: release_firmware(fw); diff --git a/arch/x86/kernel/microcode_amd_early.c b/arch/x86/kernel/microcode_amd_early.c index 1d14ffe..6073104 100644 --- a/arch/x86/kernel/microcode_amd_early.c +++ b/arch/x86/kernel/microcode_amd_early.c @@ -238,25 +238,17 @@ static void __init collect_cpu_sig_on_bsp(void *arg) uci->cpu_sig.sig = cpuid_eax(0x00000001); } #else -static void collect_cpu_info_amd_early(struct cpuinfo_x86 *c, - struct ucode_cpu_info *uci) +void load_ucode_amd_ap(void) { + unsigned int cpu = smp_processor_id(); + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; u32 rev, eax; rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax); eax = cpuid_eax(0x00000001); - uci->cpu_sig.sig = eax; uci->cpu_sig.rev = rev; - c->microcode = rev; - c->x86 = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); -} - -void load_ucode_amd_ap(void) -{ - unsigned int cpu = smp_processor_id(); - - collect_cpu_info_amd_early(&cpu_data(cpu), ucode_cpu_info + cpu); + uci->cpu_sig.sig = eax; if (cpu && !ucode_loaded) { void *ucode; @@ -265,8 +257,10 @@ void load_ucode_amd_ap(void) return; ucode = (void *)(initrd_start + ucode_offset); - if (load_microcode_amd(0, ucode, ucode_size) != UCODE_OK) + eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); + if (load_microcode_amd(eax, ucode, ucode_size) != UCODE_OK) return; + ucode_loaded = true; } @@ -278,6 +272,8 @@ int __init save_microcode_in_initrd_amd(void) { enum ucode_state ret; void *ucode; + u32 eax; + #ifdef CONFIG_X86_32 unsigned int bsp = boot_cpu_data.cpu_index; struct ucode_cpu_info *uci = ucode_cpu_info + bsp; @@ -293,7 +289,10 @@ int __init save_microcode_in_initrd_amd(void) return 0; ucode = (void *)(initrd_start + ucode_offset); - ret = load_microcode_amd(0, ucode, ucode_size); + eax = cpuid_eax(0x00000001); + eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); + + ret = load_microcode_amd(eax, ucode, ucode_size); if (ret != UCODE_OK) return -EINVAL; diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 48f8375..30277e2 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -101,7 +101,7 @@ static void find_start_end(unsigned long flags, unsigned long *begin, *begin = new_begin; } } else { - *begin = mmap_legacy_base(); + *begin = current->mm->mmap_legacy_base; *end = TASK_SIZE; } } diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index f63778c..25e7e13 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -98,7 +98,7 @@ static unsigned long mmap_base(void) * Bottom-up (legacy) layout on X86_32 did not support randomization, X86_64 * does, but not when emulating X86_32 */ -unsigned long mmap_legacy_base(void) +static unsigned long mmap_legacy_base(void) { if (mmap_is_ia32()) return TASK_UNMAPPED_BASE; @@ -112,11 +112,13 @@ unsigned long mmap_legacy_base(void) */ void arch_pick_mmap_layout(struct mm_struct *mm) { + mm->mmap_legacy_base = mmap_legacy_base(); + mm->mmap_base = mmap_base(); + if (mmap_is_legacy()) { - mm->mmap_base = mmap_legacy_base(); + mm->mmap_base = mm->mmap_legacy_base; mm->get_unmapped_area = arch_get_unmapped_area; } else { - mm->mmap_base = mmap_base(); mm->get_unmapped_area = arch_get_unmapped_area_topdown; } } diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 
056d11f..8f3eea6 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -313,6 +313,17 @@ static void xen_align_and_add_e820_region(u64 start, u64 size, int type) e820_add_region(start, end - start, type); } +void xen_ignore_unusable(struct e820entry *list, size_t map_size) +{ + struct e820entry *entry; + unsigned int i; + + for (i = 0, entry = list; i < map_size; i++, entry++) { + if (entry->type == E820_UNUSABLE) + entry->type = E820_RAM; + } +} + /** * machine_specific_memory_setup - Hook for machine specific memory setup. **/ @@ -353,6 +364,17 @@ char * __init xen_memory_setup(void) } BUG_ON(rc); + /* + * Xen won't allow a 1:1 mapping to be created to UNUSABLE + * regions, so if we're using the machine memory map leave the + * region as RAM as it is in the pseudo-physical map. + * + * UNUSABLE regions in domUs are not handled and will need + * a patch in the future. + */ + if (xen_initial_domain()) + xen_ignore_unusable(map, memmap.nr_entries); + /* Make sure the Xen-supplied memory map is well-ordered. */ sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries); diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index ca92754..b81c88e 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -694,8 +694,15 @@ static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus) static int xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle) { int rc; - rc = native_cpu_up(cpu, tidle); - WARN_ON (xen_smp_intr_init(cpu)); + /* + * xen_smp_intr_init() needs to run before native_cpu_up() + * so that IPI vectors are set up on the booting CPU before + * it is marked online in native_cpu_up(). + */ + rc = xen_smp_intr_init(cpu); + WARN_ON(rc); + if (!rc) + rc = native_cpu_up(cpu, tidle); return rc; } diff --git a/drivers/gpu/drm/i915/i915_gem_dmabuf.c b/drivers/gpu/drm/i915/i915_gem_dmabuf.c index dc53a52..9e65783 100644 --- a/drivers/gpu/drm/i915/i915_gem_dmabuf.c +++ b/drivers/gpu/drm/i915/i915_gem_dmabuf.c @@ -85,9 +85,17 @@ static void i915_gem_unmap_dma_buf(struct dma_buf_attachment *attachment, struct sg_table *sg, enum dma_data_direction dir) { + struct drm_i915_gem_object *obj = attachment->dmabuf->priv; + + mutex_lock(&obj->base.dev->struct_mutex); + dma_unmap_sg(attachment->dev, sg->sgl, sg->nents, dir); sg_free_table(sg); kfree(sg); + + i915_gem_object_unpin_pages(obj); + + mutex_unlock(&obj->base.dev->struct_mutex); } static void i915_gem_dmabuf_release(struct dma_buf *dma_buf) diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index e38b457..be79f47 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -10042,6 +10042,8 @@ struct intel_display_error_state { u32 power_well_driver; + int num_transcoders; + struct intel_cursor_error_state { u32 control; u32 position; @@ -10050,16 +10052,7 @@ struct intel_display_error_state { } cursor[I915_MAX_PIPES]; struct intel_pipe_error_state { - enum transcoder cpu_transcoder; - u32 conf; u32 source; - - u32 htotal; - u32 hblank; - u32 hsync; - u32 vtotal; - u32 vblank; - u32 vsync; } pipe[I915_MAX_PIPES]; struct intel_plane_error_state { @@ -10071,6 +10064,19 @@ struct intel_display_error_state { u32 surface; u32 tile_offset; } plane[I915_MAX_PIPES]; + + struct intel_transcoder_error_state { + enum transcoder cpu_transcoder; + + u32 conf; + + u32 htotal; + u32 hblank; + u32 hsync; + u32 vtotal; + u32 vblank; + u32 vsync; + } transcoder[4]; }; struct intel_display_error_state * @@ -10078,9 +10084,17 @@ intel_display_capture_error_state(struct 
drm_device *dev) { drm_i915_private_t *dev_priv = dev->dev_private; struct intel_display_error_state *error; - enum transcoder cpu_transcoder; + int transcoders[] = { + TRANSCODER_A, + TRANSCODER_B, + TRANSCODER_C, + TRANSCODER_EDP, + }; int i; + if (INTEL_INFO(dev)->num_pipes == 0) + return NULL; + error = kmalloc(sizeof(*error), GFP_ATOMIC); if (error == NULL) return NULL; @@ -10089,9 +10103,6 @@ intel_display_capture_error_state(struct drm_device *dev) error->power_well_driver = I915_READ(HSW_PWR_WELL_DRIVER); for_each_pipe(i) { - cpu_transcoder = intel_pipe_to_cpu_transcoder(dev_priv, i); - error->pipe[i].cpu_transcoder = cpu_transcoder; - if (INTEL_INFO(dev)->gen <= 6 || IS_VALLEYVIEW(dev)) { error->cursor[i].control = I915_READ(CURCNTR(i)); error->cursor[i].position = I915_READ(CURPOS(i)); @@ -10115,14 +10126,25 @@ intel_display_capture_error_state(struct drm_device *dev) error->plane[i].tile_offset = I915_READ(DSPTILEOFF(i)); } - error->pipe[i].conf = I915_READ(PIPECONF(cpu_transcoder)); error->pipe[i].source = I915_READ(PIPESRC(i)); - error->pipe[i].htotal = I915_READ(HTOTAL(cpu_transcoder)); - error->pipe[i].hblank = I915_READ(HBLANK(cpu_transcoder)); - error->pipe[i].hsync = I915_READ(HSYNC(cpu_transcoder)); - error->pipe[i].vtotal = I915_READ(VTOTAL(cpu_transcoder)); - error->pipe[i].vblank = I915_READ(VBLANK(cpu_transcoder)); - error->pipe[i].vsync = I915_READ(VSYNC(cpu_transcoder)); + } + + error->num_transcoders = INTEL_INFO(dev)->num_pipes; + if (HAS_DDI(dev_priv->dev)) + error->num_transcoders++; /* Account for eDP. */ + + for (i = 0; i < error->num_transcoders; i++) { + enum transcoder cpu_transcoder = transcoders[i]; + + error->transcoder[i].cpu_transcoder = cpu_transcoder; + + error->transcoder[i].conf = I915_READ(PIPECONF(cpu_transcoder)); + error->transcoder[i].htotal = I915_READ(HTOTAL(cpu_transcoder)); + error->transcoder[i].hblank = I915_READ(HBLANK(cpu_transcoder)); + error->transcoder[i].hsync = I915_READ(HSYNC(cpu_transcoder)); + error->transcoder[i].vtotal = I915_READ(VTOTAL(cpu_transcoder)); + error->transcoder[i].vblank = I915_READ(VBLANK(cpu_transcoder)); + error->transcoder[i].vsync = I915_READ(VSYNC(cpu_transcoder)); } /* In the code above we read the registers without checking if the power @@ -10144,22 +10166,16 @@ intel_display_print_error_state(struct drm_i915_error_state_buf *m, { int i; + if (!error) + return; + err_printf(m, "Num Pipes: %d\n", INTEL_INFO(dev)->num_pipes); if (HAS_POWER_WELL(dev)) err_printf(m, "PWR_WELL_CTL2: %08x\n", error->power_well_driver); for_each_pipe(i) { err_printf(m, "Pipe [%d]:\n", i); - err_printf(m, " CPU transcoder: %c\n", - transcoder_name(error->pipe[i].cpu_transcoder)); - err_printf(m, " CONF: %08x\n", error->pipe[i].conf); err_printf(m, " SRC: %08x\n", error->pipe[i].source); - err_printf(m, " HTOTAL: %08x\n", error->pipe[i].htotal); - err_printf(m, " HBLANK: %08x\n", error->pipe[i].hblank); - err_printf(m, " HSYNC: %08x\n", error->pipe[i].hsync); - err_printf(m, " VTOTAL: %08x\n", error->pipe[i].vtotal); - err_printf(m, " VBLANK: %08x\n", error->pipe[i].vblank); - err_printf(m, " VSYNC: %08x\n", error->pipe[i].vsync); err_printf(m, "Plane [%d]:\n", i); err_printf(m, " CNTR: %08x\n", error->plane[i].control); @@ -10180,5 +10196,17 @@ intel_display_print_error_state(struct drm_i915_error_state_buf *m, err_printf(m, " POS: %08x\n", error->cursor[i].position); err_printf(m, " BASE: %08x\n", error->cursor[i].base); } + + for (i = 0; i < error->num_transcoders; i++) { + err_printf(m, " CPU transcoder: %c\n", + 
transcoder_name(error->transcoder[i].cpu_transcoder)); + err_printf(m, " CONF: %08x\n", error->transcoder[i].conf); + err_printf(m, " HTOTAL: %08x\n", error->transcoder[i].htotal); + err_printf(m, " HBLANK: %08x\n", error->transcoder[i].hblank); + err_printf(m, " HSYNC: %08x\n", error->transcoder[i].hsync); + err_printf(m, " VTOTAL: %08x\n", error->transcoder[i].vtotal); + err_printf(m, " VBLANK: %08x\n", error->transcoder[i].vblank); + err_printf(m, " VSYNC: %08x\n", error->transcoder[i].vsync); + } } #endif diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h index 274b8e1..9f19259 100644 --- a/drivers/gpu/drm/radeon/radeon.h +++ b/drivers/gpu/drm/radeon/radeon.h @@ -2163,7 +2163,7 @@ void cik_mm_wdoorbell(struct radeon_device *rdev, u32 offset, u32 v); WREG32(reg, tmp_); \ } while (0) #define WREG32_AND(reg, and) WREG32_P(reg, 0, and) -#define WREG32_OR(reg, or) WREG32_P(reg, or, ~or) +#define WREG32_OR(reg, or) WREG32_P(reg, or, ~(or)) #define WREG32_PLL_P(reg, val, mask) \ do { \ uint32_t tmp_ = RREG32_PLL(reg); \ diff --git a/drivers/gpu/drm/radeon/radeon_uvd.c b/drivers/gpu/drm/radeon/radeon_uvd.c index f1c1575..b79f4f5 100644 --- a/drivers/gpu/drm/radeon/radeon_uvd.c +++ b/drivers/gpu/drm/radeon/radeon_uvd.c @@ -356,6 +356,14 @@ static int radeon_uvd_cs_msg(struct radeon_cs_parser *p, struct radeon_bo *bo, return -EINVAL; } + if (bo->tbo.sync_obj) { + r = radeon_fence_wait(bo->tbo.sync_obj, false); + if (r) { + DRM_ERROR("Failed waiting for UVD message (%d)!\n", r); + return r; + } + } + r = radeon_bo_kmap(bo, &ptr); if (r) { DRM_ERROR("Failed mapping the UVD message (%d)!\n", r); diff --git a/drivers/gpu/drm/radeon/rv770.c b/drivers/gpu/drm/radeon/rv770.c index bcc68ec..f5e92cf 100644 --- a/drivers/gpu/drm/radeon/rv770.c +++ b/drivers/gpu/drm/radeon/rv770.c @@ -744,10 +744,10 @@ static void rv770_init_golden_registers(struct radeon_device *rdev) (const u32)ARRAY_SIZE(r7xx_golden_dyn_gpr_registers)); radeon_program_register_sequence(rdev, rv730_golden_registers, - (const u32)ARRAY_SIZE(rv770_golden_registers)); + (const u32)ARRAY_SIZE(rv730_golden_registers)); radeon_program_register_sequence(rdev, rv730_mgcg_init, - (const u32)ARRAY_SIZE(rv770_mgcg_init)); + (const u32)ARRAY_SIZE(rv730_mgcg_init)); break; case CHIP_RV710: radeon_program_register_sequence(rdev, @@ -758,18 +758,18 @@ static void rv770_init_golden_registers(struct radeon_device *rdev) (const u32)ARRAY_SIZE(r7xx_golden_dyn_gpr_registers)); radeon_program_register_sequence(rdev, rv710_golden_registers, - (const u32)ARRAY_SIZE(rv770_golden_registers)); + (const u32)ARRAY_SIZE(rv710_golden_registers)); radeon_program_register_sequence(rdev, rv710_mgcg_init, - (const u32)ARRAY_SIZE(rv770_mgcg_init)); + (const u32)ARRAY_SIZE(rv710_mgcg_init)); break; case CHIP_RV740: radeon_program_register_sequence(rdev, rv740_golden_registers, - (const u32)ARRAY_SIZE(rv770_golden_registers)); + (const u32)ARRAY_SIZE(rv740_golden_registers)); radeon_program_register_sequence(rdev, rv740_mgcg_init, - (const u32)ARRAY_SIZE(rv770_mgcg_init)); + (const u32)ARRAY_SIZE(rv740_mgcg_init)); break; default: break; diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c index dc112a7..4296155 100644 --- a/drivers/md/dm-cache-policy-mq.c +++ b/drivers/md/dm-cache-policy-mq.c @@ -959,23 +959,21 @@ out: return r; } -static void remove_mapping(struct mq_policy *mq, dm_oblock_t oblock) +static void mq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock) { - struct entry *e = hash_lookup(mq, 
oblock); + struct mq_policy *mq = to_mq_policy(p); + struct entry *e; + + mutex_lock(&mq->lock); + + e = hash_lookup(mq, oblock); BUG_ON(!e || !e->in_cache); del(mq, e); e->in_cache = false; push(mq, e); -} -static void mq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock) -{ - struct mq_policy *mq = to_mq_policy(p); - - mutex_lock(&mq->lock); - remove_mapping(mq, oblock); mutex_unlock(&mq->lock); } diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 4264a76..7407e65 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1603,7 +1603,8 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev) dev_mc_add(slave_dev, lacpdu_multicast); } - if (vlan_vids_add_by_dev(slave_dev, bond_dev)) { + res = vlan_vids_add_by_dev(slave_dev, bond_dev); + if (res) { pr_err("%s: Error: Couldn't add bond vlan ids to %s\n", bond_dev->name, slave_dev->name); goto err_close; diff --git a/drivers/net/can/at91_can.c b/drivers/net/can/at91_can.c index dbbe97a..3b1ff61 100644 --- a/drivers/net/can/at91_can.c +++ b/drivers/net/can/at91_can.c @@ -1355,7 +1355,7 @@ static int at91_can_probe(struct platform_device *pdev) if (at91_is_sam9263(priv)) dev->sysfs_groups[0] = &at91_sysfs_attr_group; - dev_set_drvdata(&pdev->dev, dev); + platform_set_drvdata(pdev, dev); SET_NETDEV_DEV(dev, &pdev->dev); err = register_candev(dev); diff --git a/drivers/net/can/c_can/c_can_platform.c b/drivers/net/can/c_can/c_can_platform.c index c6f838d..294ced3 100644 --- a/drivers/net/can/c_can/c_can_platform.c +++ b/drivers/net/can/c_can/c_can_platform.c @@ -195,7 +195,7 @@ static int c_can_plat_probe(struct platform_device *pdev) res = platform_get_resource(pdev, IORESOURCE_MEM, 1); priv->raminit_ctrlreg = devm_ioremap_resource(&pdev->dev, res); - if (IS_ERR(priv->raminit_ctrlreg) || priv->instance < 0) + if (IS_ERR(priv->raminit_ctrlreg) || (int)priv->instance < 0) dev_info(&pdev->dev, "control memory is not used for raminit\n"); else priv->raminit = c_can_hw_raminit; diff --git a/drivers/net/can/flexcan.c b/drivers/net/can/flexcan.c index c48174e..71c677e 100644 --- a/drivers/net/can/flexcan.c +++ b/drivers/net/can/flexcan.c @@ -1083,7 +1083,7 @@ static int flexcan_probe(struct platform_device *pdev) netif_napi_add(dev, &priv->napi, flexcan_poll, FLEXCAN_NAPI_WEIGHT); - dev_set_drvdata(&pdev->dev, dev); + platform_set_drvdata(pdev, dev); SET_NETDEV_DEV(dev, &pdev->dev); err = register_flexcandev(dev); diff --git a/drivers/net/can/mcp251x.c b/drivers/net/can/mcp251x.c index 8cda23b..fe7dd69 100644 --- a/drivers/net/can/mcp251x.c +++ b/drivers/net/can/mcp251x.c @@ -37,9 +37,6 @@ * * static struct mcp251x_platform_data mcp251x_info = { * .oscillator_frequency = 8000000, - * .board_specific_setup = &mcp251x_setup, - * .power_enable = mcp251x_power_enable, - * .transceiver_enable = NULL, * }; * * static struct spi_board_info spi_board_info[] = { @@ -76,6 +73,7 @@ #include <linux/slab.h> #include <linux/spi/spi.h> #include <linux/uaccess.h> +#include <linux/regulator/consumer.h> /* SPI interface instruction set */ #define INSTRUCTION_WRITE 0x02 @@ -264,6 +262,8 @@ struct mcp251x_priv { #define AFTER_SUSPEND_POWER 4 #define AFTER_SUSPEND_RESTART 8 int restart_tx; + struct regulator *power; + struct regulator *transceiver; }; #define MCP251X_IS(_model) \ @@ -667,16 +667,25 @@ static int mcp251x_hw_probe(struct spi_device *spi) return (st1 == 0x80 && st2 == 0x07) ? 
1 : 0; } +static int mcp251x_power_enable(struct regulator *reg, int enable) +{ + if (IS_ERR(reg)) + return 0; + + if (enable) + return regulator_enable(reg); + else + return regulator_disable(reg); +} + static void mcp251x_open_clean(struct net_device *net) { struct mcp251x_priv *priv = netdev_priv(net); struct spi_device *spi = priv->spi; - struct mcp251x_platform_data *pdata = spi->dev.platform_data; free_irq(spi->irq, priv); mcp251x_hw_sleep(spi); - if (pdata->transceiver_enable) - pdata->transceiver_enable(0); + mcp251x_power_enable(priv->transceiver, 0); close_candev(net); } @@ -684,7 +693,6 @@ static int mcp251x_stop(struct net_device *net) { struct mcp251x_priv *priv = netdev_priv(net); struct spi_device *spi = priv->spi; - struct mcp251x_platform_data *pdata = spi->dev.platform_data; close_candev(net); @@ -704,8 +712,7 @@ static int mcp251x_stop(struct net_device *net) mcp251x_hw_sleep(spi); - if (pdata->transceiver_enable) - pdata->transceiver_enable(0); + mcp251x_power_enable(priv->transceiver, 0); priv->can.state = CAN_STATE_STOPPED; @@ -928,8 +935,7 @@ static int mcp251x_open(struct net_device *net) { struct mcp251x_priv *priv = netdev_priv(net); struct spi_device *spi = priv->spi; - struct mcp251x_platform_data *pdata = spi->dev.platform_data; - unsigned long flags; + unsigned long flags = IRQF_ONESHOT | IRQF_TRIGGER_FALLING; int ret; ret = open_candev(net); @@ -939,25 +945,17 @@ static int mcp251x_open(struct net_device *net) } mutex_lock(&priv->mcp_lock); - if (pdata->transceiver_enable) - pdata->transceiver_enable(1); + mcp251x_power_enable(priv->transceiver, 1); priv->force_quit = 0; priv->tx_skb = NULL; priv->tx_len = 0; - flags = IRQF_ONESHOT; - if (pdata->irq_flags) - flags |= pdata->irq_flags; - else - flags |= IRQF_TRIGGER_FALLING; - ret = request_threaded_irq(spi->irq, NULL, mcp251x_can_ist, flags, DEVICE_NAME, priv); if (ret) { dev_err(&spi->dev, "failed to acquire irq %d\n", spi->irq); - if (pdata->transceiver_enable) - pdata->transceiver_enable(0); + mcp251x_power_enable(priv->transceiver, 0); close_candev(net); goto open_unlock; } @@ -1026,6 +1024,19 @@ static int mcp251x_can_probe(struct spi_device *spi) CAN_CTRLMODE_LOOPBACK | CAN_CTRLMODE_LISTENONLY; priv->model = spi_get_device_id(spi)->driver_data; priv->net = net; + + priv->power = devm_regulator_get(&spi->dev, "vdd"); + priv->transceiver = devm_regulator_get(&spi->dev, "xceiver"); + if ((PTR_ERR(priv->power) == -EPROBE_DEFER) || + (PTR_ERR(priv->transceiver) == -EPROBE_DEFER)) { + ret = -EPROBE_DEFER; + goto error_power; + } + + ret = mcp251x_power_enable(priv->power, 1); + if (ret) + goto error_power; + spi_set_drvdata(spi, priv); priv->spi = spi; @@ -1068,30 +1079,24 @@ static int mcp251x_can_probe(struct spi_device *spi) } } - if (pdata->power_enable) - pdata->power_enable(1); - - /* Call out to platform specific setup */ - if (pdata->board_specific_setup) - pdata->board_specific_setup(spi); - SET_NETDEV_DEV(net, &spi->dev); /* Configure the SPI bus */ - spi->mode = SPI_MODE_0; + spi->mode = spi->mode ? : SPI_MODE_0; + if (mcp251x_is_2510(spi)) + spi->max_speed_hz = spi->max_speed_hz ? : 5 * 1000 * 1000; + else + spi->max_speed_hz = spi->max_speed_hz ? 
: 10 * 1000 * 1000; spi->bits_per_word = 8; spi_setup(spi); /* Here is OK to not lock the MCP, no one knows about it yet */ if (!mcp251x_hw_probe(spi)) { - dev_info(&spi->dev, "Probe failed\n"); + ret = -ENODEV; goto error_probe; } mcp251x_hw_sleep(spi); - if (pdata->transceiver_enable) - pdata->transceiver_enable(0); - ret = register_candev(net); if (ret) goto error_probe; @@ -1109,13 +1114,13 @@ error_rx_buf: if (!mcp251x_enable_dma) kfree(priv->spi_tx_buf); error_tx_buf: - free_candev(net); if (mcp251x_enable_dma) dma_free_coherent(&spi->dev, PAGE_SIZE, priv->spi_tx_buf, priv->spi_tx_dma); + mcp251x_power_enable(priv->power, 0); +error_power: + free_candev(net); error_alloc: - if (pdata->power_enable) - pdata->power_enable(0); dev_err(&spi->dev, "probe failed\n"); error_out: return ret; @@ -1123,12 +1128,10 @@ error_out: static int mcp251x_can_remove(struct spi_device *spi) { - struct mcp251x_platform_data *pdata = spi->dev.platform_data; struct mcp251x_priv *priv = spi_get_drvdata(spi); struct net_device *net = priv->net; unregister_candev(net); - free_candev(net); if (mcp251x_enable_dma) { dma_free_coherent(&spi->dev, PAGE_SIZE, @@ -1138,8 +1141,9 @@ static int mcp251x_can_remove(struct spi_device *spi) kfree(priv->spi_rx_buf); } - if (pdata->power_enable) - pdata->power_enable(0); + mcp251x_power_enable(priv->power, 0); + + free_candev(net); return 0; } @@ -1149,7 +1153,6 @@ static int mcp251x_can_remove(struct spi_device *spi) static int mcp251x_can_suspend(struct device *dev) { struct spi_device *spi = to_spi_device(dev); - struct mcp251x_platform_data *pdata = spi->dev.platform_data; struct mcp251x_priv *priv = spi_get_drvdata(spi); struct net_device *net = priv->net; @@ -1163,15 +1166,14 @@ static int mcp251x_can_suspend(struct device *dev) netif_device_detach(net); mcp251x_hw_sleep(spi); - if (pdata->transceiver_enable) - pdata->transceiver_enable(0); + mcp251x_power_enable(priv->transceiver, 0); priv->after_suspend = AFTER_SUSPEND_UP; } else { priv->after_suspend = AFTER_SUSPEND_DOWN; } - if (pdata->power_enable) { - pdata->power_enable(0); + if (!IS_ERR(priv->power)) { + regulator_disable(priv->power); priv->after_suspend |= AFTER_SUSPEND_POWER; } @@ -1181,16 +1183,14 @@ static int mcp251x_can_suspend(struct device *dev) static int mcp251x_can_resume(struct device *dev) { struct spi_device *spi = to_spi_device(dev); - struct mcp251x_platform_data *pdata = spi->dev.platform_data; struct mcp251x_priv *priv = spi_get_drvdata(spi); if (priv->after_suspend & AFTER_SUSPEND_POWER) { - pdata->power_enable(1); + mcp251x_power_enable(priv->power, 1); queue_work(priv->wq, &priv->restart_work); } else { if (priv->after_suspend & AFTER_SUSPEND_UP) { - if (pdata->transceiver_enable) - pdata->transceiver_enable(1); + mcp251x_power_enable(priv->transceiver, 1); queue_work(priv->wq, &priv->restart_work); } else { priv->after_suspend = 0; diff --git a/drivers/net/ethernet/broadcom/bcm63xx_enet.c b/drivers/net/ethernet/broadcom/bcm63xx_enet.c index b1bcd4b..190219e 100644 --- a/drivers/net/ethernet/broadcom/bcm63xx_enet.c +++ b/drivers/net/ethernet/broadcom/bcm63xx_enet.c @@ -1747,11 +1747,10 @@ static int bcm_enet_probe(struct platform_device *pdev) if (!bcm_enet_shared_base[0]) return -ENODEV; - res_mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); res_irq = platform_get_resource(pdev, IORESOURCE_IRQ, 0); res_irq_rx = platform_get_resource(pdev, IORESOURCE_IRQ, 1); res_irq_tx = platform_get_resource(pdev, IORESOURCE_IRQ, 2); - if (!res_mem || !res_irq || !res_irq_rx || !res_irq_tx) + if 
(!res_irq || !res_irq_rx || !res_irq_tx) return -ENODEV; ret = 0; @@ -1767,9 +1766,10 @@ static int bcm_enet_probe(struct platform_device *pdev) if (ret) goto out; - priv->base = devm_request_and_ioremap(&pdev->dev, res_mem); - if (priv->base == NULL) { - ret = -ENOMEM; + res_mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); + priv->base = devm_ioremap_resource(&pdev->dev, res_mem); + if (IS_ERR(priv->base)) { + ret = PTR_ERR(priv->base); goto out; } @@ -2836,7 +2836,6 @@ static int bcm_enetsw_remove(struct platform_device *pdev) res = platform_get_resource(pdev, IORESOURCE_MEM, 0); release_mem_region(res->start, resource_size(res)); - platform_set_drvdata(pdev, NULL); free_netdev(dev); return 0; } diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h index 126dec4..12202f8 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h @@ -1333,6 +1333,8 @@ enum { BNX2X_SP_RTNL_VFPF_CHANNEL_DOWN, BNX2X_SP_RTNL_RX_MODE, BNX2X_SP_RTNL_HYPERVISOR_VLAN, + BNX2X_SP_RTNL_TX_STOP, + BNX2X_SP_RTNL_TX_RESUME, }; struct bnx2x_prev_path_list { diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_dcb.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_dcb.c index f9122f2..fcf2761 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_dcb.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_dcb.c @@ -30,10 +30,8 @@ #include "bnx2x_dcb.h" /* forward declarations of dcbx related functions */ -static int bnx2x_dcbx_stop_hw_tx(struct bnx2x *bp); static void bnx2x_pfc_set_pfc(struct bnx2x *bp); static void bnx2x_dcbx_update_ets_params(struct bnx2x *bp); -static int bnx2x_dcbx_resume_hw_tx(struct bnx2x *bp); static void bnx2x_dcbx_get_ets_pri_pg_tbl(struct bnx2x *bp, u32 *set_configuration_ets_pg, u32 *pri_pg_tbl); @@ -425,30 +423,52 @@ static void bnx2x_pfc_set_pfc(struct bnx2x *bp) bnx2x_pfc_clear(bp); } -static int bnx2x_dcbx_stop_hw_tx(struct bnx2x *bp) +int bnx2x_dcbx_stop_hw_tx(struct bnx2x *bp) { struct bnx2x_func_state_params func_params = {NULL}; + int rc; func_params.f_obj = &bp->func_obj; func_params.cmd = BNX2X_F_CMD_TX_STOP; + __set_bit(RAMROD_COMP_WAIT, &func_params.ramrod_flags); + __set_bit(RAMROD_RETRY, &func_params.ramrod_flags); + DP(BNX2X_MSG_DCB, "STOP TRAFFIC\n"); - return bnx2x_func_state_change(bp, &func_params); + + rc = bnx2x_func_state_change(bp, &func_params); + if (rc) { + BNX2X_ERR("Unable to hold traffic for HW configuration\n"); + bnx2x_panic(); + } + + return rc; } -static int bnx2x_dcbx_resume_hw_tx(struct bnx2x *bp) +int bnx2x_dcbx_resume_hw_tx(struct bnx2x *bp) { struct bnx2x_func_state_params func_params = {NULL}; struct bnx2x_func_tx_start_params *tx_params = &func_params.params.tx_start; + int rc; func_params.f_obj = &bp->func_obj; func_params.cmd = BNX2X_F_CMD_TX_START; + __set_bit(RAMROD_COMP_WAIT, &func_params.ramrod_flags); + __set_bit(RAMROD_RETRY, &func_params.ramrod_flags); + bnx2x_dcbx_fw_struct(bp, tx_params); DP(BNX2X_MSG_DCB, "START TRAFFIC\n"); - return bnx2x_func_state_change(bp, &func_params); + + rc = bnx2x_func_state_change(bp, &func_params); + if (rc) { + BNX2X_ERR("Unable to resume traffic after HW configuration\n"); + bnx2x_panic(); + } + + return rc; } static void bnx2x_dcbx_2cos_limit_update_ets_config(struct bnx2x *bp) @@ -744,7 +764,9 @@ void bnx2x_dcbx_set_params(struct bnx2x *bp, u32 state) if (IS_MF(bp)) bnx2x_link_sync_notify(bp); - bnx2x_dcbx_stop_hw_tx(bp); + set_bit(BNX2X_SP_RTNL_TX_STOP, &bp->sp_rtnl_state); + + 
schedule_delayed_work(&bp->sp_rtnl_task, 0); return; } @@ -757,7 +779,9 @@ void bnx2x_dcbx_set_params(struct bnx2x *bp, u32 state) /* ets may affect cmng configuration: reinit it in hw */ bnx2x_set_local_cmng(bp); - bnx2x_dcbx_resume_hw_tx(bp); + set_bit(BNX2X_SP_RTNL_TX_RESUME, &bp->sp_rtnl_state); + + schedule_delayed_work(&bp->sp_rtnl_task, 0); return; case BNX2X_DCBX_STATE_TX_RELEASED: @@ -2367,21 +2391,24 @@ static u8 bnx2x_dcbnl_get_featcfg(struct net_device *netdev, int featid, case DCB_FEATCFG_ATTR_PG: if (bp->dcbx_local_feat.ets.enabled) *flags |= DCB_FEATCFG_ENABLE; - if (bp->dcbx_error & DCBX_LOCAL_ETS_ERROR) + if (bp->dcbx_error & (DCBX_LOCAL_ETS_ERROR | + DCBX_REMOTE_MIB_ERROR)) *flags |= DCB_FEATCFG_ERROR; break; case DCB_FEATCFG_ATTR_PFC: if (bp->dcbx_local_feat.pfc.enabled) *flags |= DCB_FEATCFG_ENABLE; if (bp->dcbx_error & (DCBX_LOCAL_PFC_ERROR | - DCBX_LOCAL_PFC_MISMATCH)) + DCBX_LOCAL_PFC_MISMATCH | + DCBX_REMOTE_MIB_ERROR)) *flags |= DCB_FEATCFG_ERROR; break; case DCB_FEATCFG_ATTR_APP: if (bp->dcbx_local_feat.app.enabled) *flags |= DCB_FEATCFG_ENABLE; if (bp->dcbx_error & (DCBX_LOCAL_APP_ERROR | - DCBX_LOCAL_APP_MISMATCH)) + DCBX_LOCAL_APP_MISMATCH | + DCBX_REMOTE_MIB_ERROR)) *flags |= DCB_FEATCFG_ERROR; break; default: diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_dcb.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_dcb.h index 125bd1b..804b8f6 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_dcb.h +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_dcb.h @@ -199,4 +199,7 @@ extern const struct dcbnl_rtnl_ops bnx2x_dcbnl_ops; int bnx2x_dcbnl_update_applist(struct bnx2x *bp, bool delall); #endif /* BCM_DCBNL */ +int bnx2x_dcbx_stop_hw_tx(struct bnx2x *bp); +int bnx2x_dcbx_resume_hw_tx(struct bnx2x *bp); + #endif /* BNX2X_DCB_H */ diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c index 7f4ec80..17f117c 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c @@ -2261,6 +2261,23 @@ static void bnx2x_set_requested_fc(struct bnx2x *bp) bp->link_params.req_fc_auto_adv = BNX2X_FLOW_CTRL_BOTH; } +static void bnx2x_init_dropless_fc(struct bnx2x *bp) +{ + u32 pause_enabled = 0; + + if (!CHIP_IS_E1(bp) && bp->dropless_fc && bp->link_vars.link_up) { + if (bp->link_vars.flow_ctrl & BNX2X_FLOW_CTRL_TX) + pause_enabled = 1; + + REG_WR(bp, BAR_USTRORM_INTMEM + + USTORM_ETH_PAUSE_ENABLED_OFFSET(BP_PORT(bp)), + pause_enabled); + } + + DP(NETIF_MSG_IFUP | NETIF_MSG_LINK, "dropless_fc is %s\n", + pause_enabled ? 
"enabled" : "disabled"); +} + int bnx2x_initial_phy_init(struct bnx2x *bp, int load_mode) { int rc, cfx_idx = bnx2x_get_link_cfg_idx(bp); @@ -2294,6 +2311,8 @@ int bnx2x_initial_phy_init(struct bnx2x *bp, int load_mode) bnx2x_release_phy_lock(bp); + bnx2x_init_dropless_fc(bp); + bnx2x_calc_fc_adv(bp); if (bp->link_vars.link_up) { @@ -2315,6 +2334,8 @@ void bnx2x_link_set(struct bnx2x *bp) bnx2x_phy_init(&bp->link_params, &bp->link_vars); bnx2x_release_phy_lock(bp); + bnx2x_init_dropless_fc(bp); + bnx2x_calc_fc_adv(bp); } else BNX2X_ERR("Bootcode is missing - can not set link\n"); @@ -2556,20 +2577,9 @@ static void bnx2x_link_attn(struct bnx2x *bp) bnx2x_link_update(&bp->link_params, &bp->link_vars); - if (bp->link_vars.link_up) { + bnx2x_init_dropless_fc(bp); - /* dropless flow control */ - if (!CHIP_IS_E1(bp) && bp->dropless_fc) { - int port = BP_PORT(bp); - u32 pause_enabled = 0; - - if (bp->link_vars.flow_ctrl & BNX2X_FLOW_CTRL_TX) - pause_enabled = 1; - - REG_WR(bp, BAR_USTRORM_INTMEM + - USTORM_ETH_PAUSE_ENABLED_OFFSET(port), - pause_enabled); - } + if (bp->link_vars.link_up) { if (bp->link_vars.mac_type != MAC_TYPE_EMAC) { struct host_port_stats *pstats; @@ -9643,6 +9653,12 @@ sp_rtnl_not_reset: &bp->sp_rtnl_state)) bnx2x_pf_set_vfs_vlan(bp); + if (test_and_clear_bit(BNX2X_SP_RTNL_TX_STOP, &bp->sp_rtnl_state)) + bnx2x_dcbx_stop_hw_tx(bp); + + if (test_and_clear_bit(BNX2X_SP_RTNL_TX_RESUME, &bp->sp_rtnl_state)) + bnx2x_dcbx_resume_hw_tx(bp); + /* work which needs rtnl lock not-taken (as it takes the lock itself and * can be called from other contexts as well) */ @@ -11145,6 +11161,9 @@ static bool bnx2x_get_dropless_info(struct bnx2x *bp) int tmp; u32 cfg; + if (IS_VF(bp)) + return 0; + if (IS_MF(bp) && !CHIP_IS_E1x(bp)) { /* Take function: tmp = func */ tmp = BP_ABS_FUNC(bp); diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c index 1d925fd..fbc026c 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c @@ -1755,11 +1755,8 @@ void bnx2x_iov_init_dq(struct bnx2x *bp) void bnx2x_iov_init_dmae(struct bnx2x *bp) { - DP(BNX2X_MSG_IOV, "SRIOV is %s\n", IS_SRIOV(bp) ? "ON" : "OFF"); - if (!IS_SRIOV(bp)) - return; - - REG_WR(bp, DMAE_REG_BACKWARD_COMP_EN, 0); + if (pci_find_ext_capability(bp->pdev, PCI_EXT_CAP_ID_SRIOV)) + REG_WR(bp, DMAE_REG_BACKWARD_COMP_EN, 0); } static int bnx2x_vf_bus(struct bnx2x *bp, int vfid) @@ -3092,8 +3089,9 @@ void bnx2x_disable_sriov(struct bnx2x *bp) pci_disable_sriov(bp->pdev); } -static int bnx2x_vf_ndo_sanity(struct bnx2x *bp, int vfidx, - struct bnx2x_virtf *vf) +static int bnx2x_vf_ndo_prep(struct bnx2x *bp, int vfidx, + struct bnx2x_virtf **vf, + struct pf_vf_bulletin_content **bulletin) { if (bp->state != BNX2X_STATE_OPEN) { BNX2X_ERR("vf ndo called though PF is down\n"); @@ -3111,12 +3109,22 @@ static int bnx2x_vf_ndo_sanity(struct bnx2x *bp, int vfidx, return -EINVAL; } - if (!vf) { + /* init members */ + *vf = BP_VF(bp, vfidx); + *bulletin = BP_VF_BULLETIN(bp, vfidx); + + if (!*vf) { BNX2X_ERR("vf ndo called but vf was null. vfidx was %d\n", vfidx); return -EINVAL; } + if (!*bulletin) { + BNX2X_ERR("vf ndo called but Bulletin Board struct is null. 
vfidx was %d\n", + vfidx); + return -EINVAL; + } + return 0; } @@ -3124,17 +3132,19 @@ int bnx2x_get_vf_config(struct net_device *dev, int vfidx, struct ifla_vf_info *ivi) { struct bnx2x *bp = netdev_priv(dev); - struct bnx2x_virtf *vf = BP_VF(bp, vfidx); - struct bnx2x_vlan_mac_obj *mac_obj = &bnx2x_vfq(vf, 0, mac_obj); - struct bnx2x_vlan_mac_obj *vlan_obj = &bnx2x_vfq(vf, 0, vlan_obj); - struct pf_vf_bulletin_content *bulletin = BP_VF_BULLETIN(bp, vfidx); + struct bnx2x_virtf *vf = NULL; + struct pf_vf_bulletin_content *bulletin = NULL; + struct bnx2x_vlan_mac_obj *mac_obj; + struct bnx2x_vlan_mac_obj *vlan_obj; int rc; - /* sanity */ - rc = bnx2x_vf_ndo_sanity(bp, vfidx, vf); + /* sanity and init */ + rc = bnx2x_vf_ndo_prep(bp, vfidx, &vf, &bulletin); if (rc) return rc; - if (!mac_obj || !vlan_obj || !bulletin) { + mac_obj = &bnx2x_vfq(vf, 0, mac_obj); + vlan_obj = &bnx2x_vfq(vf, 0, vlan_obj); + if (!mac_obj || !vlan_obj) { BNX2X_ERR("VF partially initialized\n"); return -EINVAL; } @@ -3191,11 +3201,11 @@ int bnx2x_set_vf_mac(struct net_device *dev, int vfidx, u8 *mac) { struct bnx2x *bp = netdev_priv(dev); int rc, q_logical_state; - struct bnx2x_virtf *vf = BP_VF(bp, vfidx); - struct pf_vf_bulletin_content *bulletin = BP_VF_BULLETIN(bp, vfidx); + struct bnx2x_virtf *vf = NULL; + struct pf_vf_bulletin_content *bulletin = NULL; - /* sanity */ - rc = bnx2x_vf_ndo_sanity(bp, vfidx, vf); + /* sanity and init */ + rc = bnx2x_vf_ndo_prep(bp, vfidx, &vf, &bulletin); if (rc) return rc; if (!is_valid_ether_addr(mac)) { @@ -3257,11 +3267,11 @@ int bnx2x_set_vf_vlan(struct net_device *dev, int vfidx, u16 vlan, u8 qos) { struct bnx2x *bp = netdev_priv(dev); int rc, q_logical_state; - struct bnx2x_virtf *vf = BP_VF(bp, vfidx); - struct pf_vf_bulletin_content *bulletin = BP_VF_BULLETIN(bp, vfidx); + struct bnx2x_virtf *vf = NULL; + struct pf_vf_bulletin_content *bulletin = NULL; - /* sanity */ - rc = bnx2x_vf_ndo_sanity(bp, vfidx, vf); + /* sanity and init */ + rc = bnx2x_vf_ndo_prep(bp, vfidx, &vf, &bulletin); if (rc) return rc; diff --git a/drivers/net/ethernet/brocade/bna/cna.h b/drivers/net/ethernet/brocade/bna/cna.h index c37f706..43405f6 100644 --- a/drivers/net/ethernet/brocade/bna/cna.h +++ b/drivers/net/ethernet/brocade/bna/cna.h @@ -37,8 +37,8 @@ extern char bfa_version[]; -#define CNA_FW_FILE_CT "ctfw-3.2.1.0.bin" -#define CNA_FW_FILE_CT2 "ct2fw-3.2.1.0.bin" +#define CNA_FW_FILE_CT "ctfw-3.2.1.1.bin" +#define CNA_FW_FILE_CT2 "ct2fw-3.2.1.1.bin" #define FC_SYMNAME_MAX 256 /*!< max name server symbolic name size */ #pragma pack(1) diff --git a/drivers/net/ethernet/cadence/macb.c b/drivers/net/ethernet/cadence/macb.c index e866608..fe06ab0 100644 --- a/drivers/net/ethernet/cadence/macb.c +++ b/drivers/net/ethernet/cadence/macb.c @@ -27,6 +27,7 @@ #include <linux/phy.h> #include <linux/of.h> #include <linux/of_device.h> +#include <linux/of_mdio.h> #include <linux/of_net.h> #include <linux/pinctrl/consumer.h> @@ -314,6 +315,7 @@ static int macb_mii_probe(struct net_device *dev) int macb_mii_init(struct macb *bp) { struct macb_platform_data *pdata; + struct device_node *np; int err = -ENXIO, i; /* Enable management port */ @@ -335,21 +337,46 @@ int macb_mii_init(struct macb *bp) bp->mii_bus->parent = &bp->dev->dev; pdata = bp->pdev->dev.platform_data; - if (pdata) - bp->mii_bus->phy_mask = pdata->phy_mask; - bp->mii_bus->irq = kmalloc(sizeof(int)*PHY_MAX_ADDR, GFP_KERNEL); if (!bp->mii_bus->irq) { err = -ENOMEM; goto err_out_free_mdiobus; } - for (i = 0; i < PHY_MAX_ADDR; i++) - 
bp->mii_bus->irq[i] = PHY_POLL; - dev_set_drvdata(&bp->dev->dev, bp->mii_bus); - if (mdiobus_register(bp->mii_bus)) + np = bp->pdev->dev.of_node; + if (np) { + /* try dt phy registration */ + err = of_mdiobus_register(bp->mii_bus, np); + + /* fallback to standard phy registration if no phy were + found during dt phy registration */ + if (!err && !phy_find_first(bp->mii_bus)) { + for (i = 0; i < PHY_MAX_ADDR; i++) { + struct phy_device *phydev; + + phydev = mdiobus_scan(bp->mii_bus, i); + if (IS_ERR(phydev)) { + err = PTR_ERR(phydev); + break; + } + } + + if (err) + goto err_out_unregister_bus; + } + } else { + for (i = 0; i < PHY_MAX_ADDR; i++) + bp->mii_bus->irq[i] = PHY_POLL; + + if (pdata) + bp->mii_bus->phy_mask = pdata->phy_mask; + + err = mdiobus_register(bp->mii_bus); + } + + if (err) goto err_out_free_mdio_irq; if (macb_mii_probe(bp->dev) != 0) { diff --git a/drivers/net/ethernet/cisco/enic/Makefile b/drivers/net/ethernet/cisco/enic/Makefile index e52296d..239e1e4 100644 --- a/drivers/net/ethernet/cisco/enic/Makefile +++ b/drivers/net/ethernet/cisco/enic/Makefile @@ -2,5 +2,5 @@ obj-$(CONFIG_ENIC) := enic.o enic-y := enic_main.o vnic_cq.o vnic_intr.o vnic_wq.o \ enic_res.o enic_dev.o enic_pp.o vnic_dev.o vnic_rq.o vnic_vic.o \ - enic_ethtool.o + enic_ethtool.o enic_api.o diff --git a/drivers/net/ethernet/cisco/enic/enic.h b/drivers/net/ethernet/cisco/enic/enic.h index 2e37c63..be16731 100644 --- a/drivers/net/ethernet/cisco/enic/enic.h +++ b/drivers/net/ethernet/cisco/enic/enic.h @@ -32,8 +32,8 @@ #define DRV_NAME "enic" #define DRV_DESCRIPTION "Cisco VIC Ethernet NIC Driver" -#define DRV_VERSION "2.1.1.39" -#define DRV_COPYRIGHT "Copyright 2008-2011 Cisco Systems, Inc" +#define DRV_VERSION "2.1.1.43" +#define DRV_COPYRIGHT "Copyright 2008-2013 Cisco Systems, Inc" #define ENIC_BARS_MAX 6 @@ -96,6 +96,7 @@ struct enic { #ifdef CONFIG_PCI_IOV u16 num_vfs; #endif + spinlock_t enic_api_lock; struct enic_port_profile *pp; /* work queue cache line section */ diff --git a/drivers/net/ethernet/cisco/enic/enic_api.c b/drivers/net/ethernet/cisco/enic/enic_api.c new file mode 100644 index 0000000..e13efbd --- /dev/null +++ b/drivers/net/ethernet/cisco/enic/enic_api.c @@ -0,0 +1,48 @@ +/** + * Copyright 2013 Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
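/*
 * An aside on the bcm63xx_enet probe change earlier in this series of
 * hunks: devm_request_and_ioremap() returned NULL on every failure and
 * lost the reason, while devm_ioremap_resource() encodes it in an
 * ERR_PTR() (-EINVAL for a bad or missing resource, -EBUSY if the
 * region is already claimed, -ENOMEM if the mapping fails), so the
 * caller can propagate the real error. It also validates a NULL
 * resource itself, which is why the patch can fetch the resource late
 * and drop the explicit check. probe_map_regs() is a hypothetical
 * helper sketching the idiom.
 */
#include <linux/err.h>
#include <linux/io.h>
#include <linux/platform_device.h>

static int probe_map_regs(struct platform_device *pdev,
			  void __iomem **base)
{
	struct resource *res;

	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
	*base = devm_ioremap_resource(&pdev->dev, res);
	if (IS_ERR(*base))
		return PTR_ERR(*base);	/* real reason, not blanket -ENOMEM */

	return 0;
}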
+ * + */ + +#include <linux/netdevice.h> +#include <linux/spinlock.h> + +#include "vnic_dev.h" +#include "vnic_devcmd.h" + +#include "enic_res.h" +#include "enic.h" +#include "enic_api.h" + +int enic_api_devcmd_proxy_by_index(struct net_device *netdev, int vf, + enum vnic_devcmd_cmd cmd, u64 *a0, u64 *a1, int wait) +{ + int err; + struct enic *enic = netdev_priv(netdev); + struct vnic_dev *vdev = enic->vdev; + + spin_lock(&enic->enic_api_lock); + spin_lock(&enic->devcmd_lock); + + vnic_dev_cmd_proxy_by_index_start(vdev, vf); + err = vnic_dev_cmd(vdev, cmd, a0, a1, wait); + vnic_dev_cmd_proxy_end(vdev); + + spin_unlock(&enic->devcmd_lock); + spin_unlock(&enic->enic_api_lock); + + return err; +} +EXPORT_SYMBOL(enic_api_devcmd_proxy_by_index); diff --git a/drivers/net/ethernet/cisco/enic/enic_api.h b/drivers/net/ethernet/cisco/enic/enic_api.h new file mode 100644 index 0000000..6b9f925 --- /dev/null +++ b/drivers/net/ethernet/cisco/enic/enic_api.h @@ -0,0 +1,30 @@ +/** + * Copyright 2013 Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef __ENIC_API_H__ +#define __ENIC_API_H__ + +#include <linux/netdevice.h> + +#include "vnic_dev.h" +#include "vnic_devcmd.h" + +int enic_api_devcmd_proxy_by_index(struct net_device *netdev, int vf, + enum vnic_devcmd_cmd cmd, u64 *a0, u64 *a1, int wait); + +#endif diff --git a/drivers/net/ethernet/cisco/enic/enic_main.c b/drivers/net/ethernet/cisco/enic/enic_main.c index b12b32b..bcf15b1 100644 --- a/drivers/net/ethernet/cisco/enic/enic_main.c +++ b/drivers/net/ethernet/cisco/enic/enic_main.c @@ -1733,6 +1733,7 @@ static void enic_reset(struct work_struct *work) rtnl_lock(); + spin_lock(&enic->enic_api_lock); enic_dev_hang_notify(enic); enic_stop(enic->netdev); enic_dev_hang_reset(enic); @@ -1741,6 +1742,8 @@ static void enic_reset(struct work_struct *work) enic_set_rss_nic_cfg(enic); enic_dev_set_ig_vlan_rewrite_mode(enic); enic_open(enic->netdev); + spin_unlock(&enic->enic_api_lock); + call_netdevice_notifiers(NETDEV_REBOOT, enic->netdev); rtnl_unlock(); } @@ -2153,6 +2156,7 @@ static int enic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) */ spin_lock_init(&enic->devcmd_lock); + spin_lock_init(&enic->enic_api_lock); /* * Set ingress vlan rewrite mode before vnic initialization diff --git a/drivers/net/ethernet/cisco/enic/enic_res.h b/drivers/net/ethernet/cisco/enic/enic_res.h index 25be273..69f60af 100644 --- a/drivers/net/ethernet/cisco/enic/enic_res.h +++ b/drivers/net/ethernet/cisco/enic/enic_res.h @@ -47,6 +47,9 @@ static inline void enic_queue_wq_desc_ex(struct vnic_wq *wq, int offload_mode, int cq_entry, int sop, int eop, int loopback) { struct wq_enet_desc *desc = vnic_wq_next_desc(wq); + u8 desc_skip_cnt = 1; + u8 compressed_send = 0; + u64 wrid = 0; wq_enet_desc_enc(desc, (u64)dma_addr | VNIC_PADDR_TARGET, @@ -59,7 +62,8 @@ static 
inline void enic_queue_wq_desc_ex(struct vnic_wq *wq, (u16)vlan_tag, (u8)loopback); - vnic_wq_post(wq, os_buf, dma_addr, len, sop, eop); + vnic_wq_post(wq, os_buf, dma_addr, len, sop, eop, desc_skip_cnt, + (u8)cq_entry, compressed_send, wrid); } static inline void enic_queue_wq_desc_cont(struct vnic_wq *wq, @@ -120,6 +124,7 @@ static inline void enic_queue_rq_desc(struct vnic_rq *rq, dma_addr_t dma_addr, unsigned int len) { struct rq_enet_desc *desc = vnic_rq_next_desc(rq); + u64 wrid = 0; u8 type = os_buf_index ? RQ_ENET_TYPE_NOT_SOP : RQ_ENET_TYPE_ONLY_SOP; @@ -127,7 +132,7 @@ static inline void enic_queue_rq_desc(struct vnic_rq *rq, (u64)dma_addr | VNIC_PADDR_TARGET, type, (u16)len); - vnic_rq_post(rq, os_buf, os_buf_index, dma_addr, len); + vnic_rq_post(rq, os_buf, os_buf_index, dma_addr, len, wrid); } struct enic; diff --git a/drivers/net/ethernet/cisco/enic/vnic_devcmd.h b/drivers/net/ethernet/cisco/enic/vnic_devcmd.h index 23d5552..b9a0d78 100644 --- a/drivers/net/ethernet/cisco/enic/vnic_devcmd.h +++ b/drivers/net/ethernet/cisco/enic/vnic_devcmd.h @@ -281,11 +281,25 @@ enum vnic_devcmd_cmd { * 0 if no VIF-CONFIG-INFO TLV was ever received. */ CMD_CONFIG_INFO_GET = _CMDC(_CMD_DIR_RW, _CMD_VTYPE_ALL, 44), + /* INT13 API: (u64)a0=paddr to vnic_int13_params struct + * (u32)a1=INT13_CMD_xxx + */ + CMD_INT13_ALL = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ALL, 45), + + /* Set default vlan: + * in: (u16)a0=new default vlan + * (u16)a1=zero for overriding vlan with param a0, + * non-zero for resetting vlan to the default + * out: (u16)a0=old default vlan + */ + CMD_SET_DEFAULT_VLAN = _CMDC(_CMD_DIR_RW, _CMD_VTYPE_ALL, 46), + /* init_prov_info2: * Variant of CMD_INIT_PROV_INFO, where it will not try to enable * the vnic until CMD_ENABLE2 is issued. * (u64)a0=paddr of vnic_devcmd_provinfo - * (u32)a1=sizeof provision info */ + * (u32)a1=sizeof provision info + */ CMD_INIT_PROV_INFO2 = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 47), /* enable2: @@ -339,16 +353,57 @@ enum vnic_devcmd_cmd { CMD_INTR_COAL_CONVERT = _CMDC(_CMD_DIR_READ, _CMD_VTYPE_ALL, 50), /* - * cmd_set_mac_addr - * set mac address + * Set the predefined mac address as default * in: * (u48)a0 = mac addr - * */ CMD_SET_MAC_ADDR = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 55), + + /* Update the provisioning info of the given VIF + * (u64)a0=paddr of vnic_devcmd_provinfo + * (u32)a1=sizeof provision info + */ + CMD_PROV_INFO_UPDATE = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 56), + + /* Add a filter. + * in: (u64) a0= filter address + * (u32) a1= size of filter + * out: (u32) a0=filter identifier + */ + CMD_ADD_FILTER = _CMDC(_CMD_DIR_RW, _CMD_VTYPE_ENET, 58), + + /* Delete a filter. 
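/*
 * A usage sketch of the proxy entry point added in enic_api.c, paired
 * with the CMD_SET_DEFAULT_VLAN calling convention documented above
 * (in: a0 = new default vlan, a1 = 0 to override; out: a0 = old
 * default vlan). set_vf_default_vlan(), vf_idx, vlan_id and the wait
 * value are illustrative, not taken from the patch.
 */
#include <linux/netdevice.h>
#include "enic_api.h"

static int set_vf_default_vlan(struct net_device *netdev, int vf_idx,
			       u16 vlan_id)
{
	u64 a0 = vlan_id;	/* new default vlan */
	u64 a1 = 0;		/* 0: override with a0 */
	int err;

	err = enic_api_devcmd_proxy_by_index(netdev, vf_idx,
					     CMD_SET_DEFAULT_VLAN,
					     &a0, &a1, 1000 /* wait */);
	/* on success, a0 now holds the previous default vlan */
	return err;
}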
+ * in: (u32) a0=filter identifier + */ + CMD_DEL_FILTER = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 59), + + /* Enable a Queue Pair in User space NIC + * in: (u32) a0=Queue Pair number + * (u32) a1= command + */ + CMD_QP_ENABLE = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 60), + + /* Disable a Queue Pair in User space NIC + * in: (u32) a0=Queue Pair number + * (u32) a1= command + */ + CMD_QP_DISABLE = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 61), + + /* Stats dump Queue Pair in User space NIC + * in: (u32) a0=Queue Pair number + * (u64) a1=host buffer addr for status dump + * (u32) a2=length of the buffer + */ + CMD_QP_STATS_DUMP = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 62), + + /* Clear stats for Queue Pair in User space NIC + * in: (u32) a0=Queue Pair number + */ + CMD_QP_STATS_CLEAR = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 63), }; /* CMD_ENABLE2 flags */ +#define CMD_ENABLE2_STANDBY 0x0 #define CMD_ENABLE2_ACTIVE 0x1 /* flags for CMD_OPEN */ @@ -364,6 +419,9 @@ enum vnic_devcmd_cmd { #define CMD_PFILTER_PROMISCUOUS 0x08 #define CMD_PFILTER_ALL_MULTICAST 0x10 +/* Commands for CMD_QP_ENABLE/CMD_QP_DISABLE */ +#define CMD_QP_RQWQ 0x0 /* rewrite modes for CMD_IG_VLAN_REWRITE_MODE */ #define IG_VLAN_REWRITE_MODE_DEFAULT_TRUNK 0 #define IG_VLAN_REWRITE_MODE_UNTAG_DEFAULT_VLAN 1 @@ -390,6 +448,7 @@ enum vnic_devcmd_error { ERR_EMAXRES = 10, ERR_ENOTSUPPORTED = 11, ERR_EINPROGRESS = 12, + ERR_MAX }; /* @@ -435,6 +494,115 @@ struct vnic_devcmd_provinfo { u8 data[0]; }; +/* These are used in flags field of different filters to denote + * valid fields used. + */ +#define FILTER_FIELD_VALID(fld) (1 << (fld - 1)) + +#define FILTER_FIELDS_USNIC ( \ + FILTER_FIELD_VALID(1) | \ + FILTER_FIELD_VALID(2) | \ + FILTER_FIELD_VALID(3) | \ + FILTER_FIELD_VALID(4)) + +#define FILTER_FIELDS_IPV4_5TUPLE ( \ + FILTER_FIELD_VALID(1) | \ + FILTER_FIELD_VALID(2) | \ + FILTER_FIELD_VALID(3) | \ + FILTER_FIELD_VALID(4) | \ + FILTER_FIELD_VALID(5)) + +#define FILTER_FIELDS_MAC_VLAN ( \ + FILTER_FIELD_VALID(1) | \ + FILTER_FIELD_VALID(2)) + +#define FILTER_FIELD_USNIC_VLAN FILTER_FIELD_VALID(1) +#define FILTER_FIELD_USNIC_ETHTYPE FILTER_FIELD_VALID(2) +#define FILTER_FIELD_USNIC_PROTO FILTER_FIELD_VALID(3) +#define FILTER_FIELD_USNIC_ID FILTER_FIELD_VALID(4) + +struct filter_usnic_id { + u32 flags; + u16 vlan; + u16 ethtype; + u8 proto_version; + u32 usnic_id; +} __packed; + +#define FILTER_FIELD_5TUP_PROTO FILTER_FIELD_VALID(1) +#define FILTER_FIELD_5TUP_SRC_AD FILTER_FIELD_VALID(2) +#define FILTER_FIELD_5TUP_DST_AD FILTER_FIELD_VALID(3) +#define FILTER_FIELD_5TUP_SRC_PT FILTER_FIELD_VALID(4) +#define FILTER_FIELD_5TUP_DST_PT FILTER_FIELD_VALID(5) + +/* Enums for the protocol field. */ +enum protocol_e { + PROTO_UDP = 0, + PROTO_TCP = 1, +}; + +struct filter_ipv4_5tuple { + u32 flags; + u32 protocol; + u32 src_addr; + u32 dst_addr; + u16 src_port; + u16 dst_port; +} __packed; + +#define FILTER_FIELD_VMQ_VLAN FILTER_FIELD_VALID(1) +#define FILTER_FIELD_VMQ_MAC FILTER_FIELD_VALID(2) + +struct filter_mac_vlan { + u32 flags; + u16 vlan; + u8 mac_addr[6]; +} __packed; + +/* Specifies the filter_action type. */ +enum { + FILTER_ACTION_RQ_STEERING = 0, + FILTER_ACTION_MAX +}; + +struct filter_action { + u32 type; + union { + u32 rq_idx; + } u; +} __packed; + +/* Specifies the filter type.
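/*
 * A worked expansion of the validity masks defined above:
 * FILTER_FIELD_VALID(fld) is (1 << (fld - 1)), so field numbers are
 * 1-based while the bits are 0-based:
 *
 *   FILTER_FIELD_VALID(1) == 0x01
 *   FILTER_FIELD_VALID(2) == 0x02
 *   FILTER_FIELD_VALID(5) == 0x10
 *
 *   FILTER_FIELDS_MAC_VLAN    == 0x03   (fields 1-2)
 *   FILTER_FIELDS_USNIC       == 0x0f   (fields 1-4)
 *   FILTER_FIELDS_IPV4_5TUPLE == 0x1f   (fields 1-5)
 *
 * A sketch of filling the struct filter defined just below for a
 * 5-tuple match on protocol and destination port only;
 * fill_tcp_dst_filter() is a hypothetical helper.
 */
static void fill_tcp_dst_filter(struct filter *f, u16 port)
{
	f->type = FILTER_IPV4_5TUPLE;
	f->u.ipv4.flags = FILTER_FIELD_5TUP_PROTO |
			  FILTER_FIELD_5TUP_DST_PT;	/* == 0x11 */
	f->u.ipv4.protocol = PROTO_TCP;
	f->u.ipv4.dst_port = port;
}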
*/ +enum filter_type { + FILTER_USNIC_ID = 0, + FILTER_IPV4_5TUPLE = 1, + FILTER_MAC_VLAN = 2, + FILTER_MAX +}; + +struct filter { + u32 type; + union { + struct filter_usnic_id usnic; + struct filter_ipv4_5tuple ipv4; + struct filter_mac_vlan mac_vlan; + } u; +} __packed; + +enum { + CLSF_TLV_FILTER = 0, + CLSF_TLV_ACTION = 1, +}; + +/* Maximum size of buffer to CMD_ADD_FILTER */ +#define FILTER_MAX_BUF_SIZE 100 + +struct filter_tlv { + u_int32_t type; + u_int32_t length; + u_int32_t val[0]; +}; + /* * Writing cmd register causes STAT_BUSY to get set in status register. * When cmd completes, STAT_BUSY will be cleared. diff --git a/drivers/net/ethernet/cisco/enic/vnic_rq.c b/drivers/net/ethernet/cisco/enic/vnic_rq.c index 7e1488f..36a2ed6 100644 --- a/drivers/net/ethernet/cisco/enic/vnic_rq.c +++ b/drivers/net/ethernet/cisco/enic/vnic_rq.c @@ -30,12 +30,9 @@ static int vnic_rq_alloc_bufs(struct vnic_rq *rq) { struct vnic_rq_buf *buf; - struct vnic_dev *vdev; unsigned int i, j, count = rq->ring.desc_count; unsigned int blks = VNIC_RQ_BUF_BLKS_NEEDED(count); - vdev = rq->vdev; - for (i = 0; i < blks; i++) { rq->bufs[i] = kzalloc(VNIC_RQ_BUF_BLK_SZ(count), GFP_ATOMIC); if (!rq->bufs[i]) @@ -141,7 +138,7 @@ void vnic_rq_init(struct vnic_rq *rq, unsigned int cq_index, unsigned int error_interrupt_enable, unsigned int error_interrupt_offset) { - u32 fetch_index; + u32 fetch_index = 0; /* Use current fetch_index as the ring starting point */ fetch_index = ioread32(&rq->ctrl->fetch_index); diff --git a/drivers/net/ethernet/cisco/enic/vnic_rq.h b/drivers/net/ethernet/cisco/enic/vnic_rq.h index 2056586..ee7bc95 100644 --- a/drivers/net/ethernet/cisco/enic/vnic_rq.h +++ b/drivers/net/ethernet/cisco/enic/vnic_rq.h @@ -72,6 +72,7 @@ struct vnic_rq_buf { unsigned int len; unsigned int index; void *desc; + uint64_t wr_id; }; struct vnic_rq { @@ -110,7 +111,8 @@ static inline unsigned int vnic_rq_next_index(struct vnic_rq *rq) static inline void vnic_rq_post(struct vnic_rq *rq, void *os_buf, unsigned int os_buf_index, - dma_addr_t dma_addr, unsigned int len) + dma_addr_t dma_addr, unsigned int len, + uint64_t wrid) { struct vnic_rq_buf *buf = rq->to_use; @@ -118,6 +120,7 @@ static inline void vnic_rq_post(struct vnic_rq *rq, buf->os_buf_index = os_buf_index; buf->dma_addr = dma_addr; buf->len = len; + buf->wr_id = wrid; buf = buf->next; rq->to_use = buf; diff --git a/drivers/net/ethernet/cisco/enic/vnic_wq.c b/drivers/net/ethernet/cisco/enic/vnic_wq.c index 5e0d7a2..3e6b8d5 100644 --- a/drivers/net/ethernet/cisco/enic/vnic_wq.c +++ b/drivers/net/ethernet/cisco/enic/vnic_wq.c @@ -30,12 +30,9 @@ static int vnic_wq_alloc_bufs(struct vnic_wq *wq) { struct vnic_wq_buf *buf; - struct vnic_dev *vdev; unsigned int i, j, count = wq->ring.desc_count; unsigned int blks = VNIC_WQ_BUF_BLKS_NEEDED(count); - vdev = wq->vdev; - for (i = 0; i < blks; i++) { wq->bufs[i] = kzalloc(VNIC_WQ_BUF_BLK_SZ(count), GFP_ATOMIC); if (!wq->bufs[i]) diff --git a/drivers/net/ethernet/cisco/enic/vnic_wq.h b/drivers/net/ethernet/cisco/enic/vnic_wq.h index 7dd937a..2c6c708 100644 --- a/drivers/net/ethernet/cisco/enic/vnic_wq.h +++ b/drivers/net/ethernet/cisco/enic/vnic_wq.h @@ -58,6 +58,10 @@ struct vnic_wq_buf { unsigned int index; int sop; void *desc; + uint64_t wr_id; /* Cookie */ + uint8_t cq_entry; /* Gets completion event from hw */ + uint8_t desc_skip_cnt; /* Num descs to occupy */ + uint8_t compressed_send; /* Both hdr and payload in one desc */ }; /* Break the vnic_wq_buf allocations into blocks of 32/64 entries */ @@ -102,14 
+106,20 @@ static inline void *vnic_wq_next_desc(struct vnic_wq *wq) static inline void vnic_wq_post(struct vnic_wq *wq, void *os_buf, dma_addr_t dma_addr, - unsigned int len, int sop, int eop) + unsigned int len, int sop, int eop, + uint8_t desc_skip_cnt, uint8_t cq_entry, + uint8_t compressed_send, uint64_t wrid) { struct vnic_wq_buf *buf = wq->to_use; buf->sop = sop; + buf->cq_entry = cq_entry; + buf->compressed_send = compressed_send; + buf->desc_skip_cnt = desc_skip_cnt; buf->os_buf = eop ? os_buf : NULL; buf->dma_addr = dma_addr; buf->len = len; + buf->wr_id = wrid; buf = buf->next; if (eop) { @@ -123,7 +133,7 @@ static inline void vnic_wq_post(struct vnic_wq *wq, } wq->to_use = buf; - wq->ring.desc_avail--; + wq->ring.desc_avail -= desc_skip_cnt; } static inline void vnic_wq_service(struct vnic_wq *wq, diff --git a/drivers/net/ethernet/dlink/sundance.c b/drivers/net/ethernet/dlink/sundance.c index 50d9c63..bf3bf6f 100644 --- a/drivers/net/ethernet/dlink/sundance.c +++ b/drivers/net/ethernet/dlink/sundance.c @@ -469,6 +469,17 @@ static void sundance_reset(struct net_device *dev, unsigned long reset_cmd) } } +#ifdef CONFIG_NET_POLL_CONTROLLER +static void sundance_poll_controller(struct net_device *dev) +{ + struct netdev_private *np = netdev_priv(dev); + + disable_irq(np->pci_dev->irq); + intr_handler(np->pci_dev->irq, dev); + enable_irq(np->pci_dev->irq); +} +#endif + static const struct net_device_ops netdev_ops = { .ndo_open = netdev_open, .ndo_stop = netdev_close, @@ -480,6 +491,9 @@ static const struct net_device_ops netdev_ops = { .ndo_change_mtu = change_mtu, .ndo_set_mac_address = sundance_set_mac_addr, .ndo_validate_addr = eth_validate_addr, +#ifdef CONFIG_NET_POLL_CONTROLLER + .ndo_poll_controller = sundance_poll_controller, +#endif }; static int sundance_probe1(struct pci_dev *pdev, diff --git a/drivers/net/ethernet/emulex/benet/be.h b/drivers/net/ethernet/emulex/benet/be.h index 11c815d..ace5050 100644 --- a/drivers/net/ethernet/emulex/benet/be.h +++ b/drivers/net/ethernet/emulex/benet/be.h @@ -99,14 +99,18 @@ static inline char *nic_name(struct pci_dev *pdev) #define MCC_Q_LEN 128 /* total size not to exceed 8 pages */ #define MCC_CQ_LEN 256 -#define BE3_MAX_RSS_QS 8 #define BE2_MAX_RSS_QS 4 -#define MAX_RSS_QS BE3_MAX_RSS_QS -#define MAX_RX_QS (MAX_RSS_QS + 1) /* RSS qs + 1 def Rx */ +#define BE3_MAX_RSS_QS 16 +#define BE3_MAX_TX_QS 16 +#define BE3_MAX_EVT_QS 16 + +#define MAX_RX_QS 32 +#define MAX_EVT_QS 32 +#define MAX_TX_QS 32 -#define MAX_TX_QS 8 #define MAX_ROCE_EQS 5 -#define MAX_MSIX_VECTORS (MAX_RSS_QS + MAX_ROCE_EQS) /* RSS qs + RoCE */ +#define MAX_MSIX_VECTORS 32 +#define MIN_MSIX_VECTORS 1 #define BE_TX_BUDGET 256 #define BE_NAPI_WEIGHT 64 #define MAX_RX_POST BE_NAPI_WEIGHT /* Frags posted at a time */ @@ -189,6 +193,7 @@ struct be_eq_obj { u32 cur_eqd; /* in usecs */ u8 idx; /* array index */ + u8 msix_idx; u16 tx_budget; u16 spurious_intr; struct napi_struct napi; @@ -352,6 +357,18 @@ struct phy_info { u32 supported; }; +struct be_resources { + u16 max_vfs; /* Total VFs "really" supported by FW/HW */ + u16 max_mcast_mac; + u16 max_tx_qs; + u16 max_rss_qs; + u16 max_rx_qs; + u16 max_uc_mac; /* Max UC MACs programmable */ + u16 max_vlans; /* Number of vlans supported */ + u16 max_evt_qs; + u32 if_cap_flags; +}; + struct be_adapter { struct pci_dev *pdev; struct net_device *netdev; @@ -369,18 +386,19 @@ struct be_adapter { spinlock_t mcc_lock; /* For serializing mcc cmds to BE card */ spinlock_t mcc_cq_lock; - u32 num_msix_vec; - u32 num_evt_qs; - struct 
be_eq_obj eq_obj[MAX_MSIX_VECTORS]; + u16 cfg_num_qs; /* configured via set-channels */ + u16 num_evt_qs; + u16 num_msix_vec; + struct be_eq_obj eq_obj[MAX_EVT_QS]; struct msix_entry msix_entries[MAX_MSIX_VECTORS]; bool isr_registered; /* TX Rings */ - u32 num_tx_qs; + u16 num_tx_qs; struct be_tx_obj tx_obj[MAX_TX_QS]; /* Rx rings */ - u32 num_rx_qs; + u16 num_rx_qs; struct be_rx_obj rx_obj[MAX_RX_QS]; u32 big_page_size; /* Compounded page size shared by rx wrbs */ @@ -430,8 +448,8 @@ struct be_adapter { u32 flash_status; struct completion flash_compl; - u32 num_vfs; /* Number of VFs provisioned by PF driver */ - u32 dev_num_vfs; /* Number of VFs supported by HW */ + struct be_resources res; /* resources available for the func */ + u16 num_vfs; /* Number of VFs provisioned by PF */ u8 virtfn; struct be_vf_cfg *vf_cfg; bool be3_native; @@ -446,21 +464,13 @@ struct be_adapter { u16 qnq_vid; u32 msg_enable; int be_get_temp_freq; - u16 max_mcast_mac; - u16 max_tx_queues; - u16 max_rss_queues; - u16 max_rx_queues; - u16 max_pmac_cnt; - u16 max_vlans; - u16 max_event_queues; - u32 if_cap_flags; u8 pf_number; u64 rss_flags; }; #define be_physfn(adapter) (!adapter->virtfn) #define sriov_enabled(adapter) (adapter->num_vfs > 0) -#define sriov_want(adapter) (adapter->dev_num_vfs && num_vfs && \ +#define sriov_want(adapter) (be_max_vfs(adapter) && num_vfs && \ be_physfn(adapter)) #define for_all_vfs(adapter, vf_cfg, i) \ for (i = 0, vf_cfg = &adapter->vf_cfg[i]; i < adapter->num_vfs; \ @@ -469,6 +479,26 @@ struct be_adapter { #define ON 1 #define OFF 0 +#define be_max_vlans(adapter) (adapter->res.max_vlans) +#define be_max_uc(adapter) (adapter->res.max_uc_mac) +#define be_max_mc(adapter) (adapter->res.max_mcast_mac) +#define be_max_vfs(adapter) (adapter->res.max_vfs) +#define be_max_rss(adapter) (adapter->res.max_rss_qs) +#define be_max_txqs(adapter) (adapter->res.max_tx_qs) +#define be_max_prio_txqs(adapter) (adapter->res.max_prio_tx_qs) +#define be_max_rxqs(adapter) (adapter->res.max_rx_qs) +#define be_max_eqs(adapter) (adapter->res.max_evt_qs) +#define be_if_cap_flags(adapter) (adapter->res.if_cap_flags) + +static inline u16 be_max_qs(struct be_adapter *adapter) +{ + /* If no RSS, need at least the one def RXQ */ + u16 num = max_t(u16, be_max_rss(adapter), 1); + + num = min(num, be_max_eqs(adapter)); + return min_t(u16, num, num_online_cpus()); +} + #define lancer_chip(adapter) (adapter->pdev->device == OC_DEVICE_ID3 || \ adapter->pdev->device == OC_DEVICE_ID4) @@ -672,6 +702,8 @@ extern int be_load_fw(struct be_adapter *adapter, u8 *func); extern bool be_is_wol_supported(struct be_adapter *adapter); extern bool be_pause_supported(struct be_adapter *adapter); extern u32 be_get_fw_log_level(struct be_adapter *adapter); +int be_update_queues(struct be_adapter *adapter); +int be_poll(struct napi_struct *napi, int budget); /* * internal function to initialize-cleanup roce device.
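The be_max_qs() helper added above is what bounds the new ethtool set-channels support: be_get_config() later clamps the user-requested cfg_num_qs to min(cfg_num_qs, be_max_qs(adapter)). A minimal stand-alone sketch of that clamping arithmetic (plain C with hypothetical values; illustration only, not part of the patch):

#include <stdio.h>

/* Mirrors the queue-related fields of struct be_resources (hypothetical values). */
struct res {
	unsigned short max_rss_qs;
	unsigned short max_evt_qs;
};

/* Mirrors be_max_qs(): even without RSS one default RXQ is needed, and the
 * usable count is capped first by EQs, then by online CPUs. */
static unsigned short sketch_max_qs(const struct res *r, unsigned short cpus)
{
	unsigned short num = r->max_rss_qs > 1 ? r->max_rss_qs : 1;

	if (num > r->max_evt_qs)
		num = r->max_evt_qs;
	return num < cpus ? num : cpus;
}

int main(void)
{
	struct res r = { .max_rss_qs = 16, .max_evt_qs = 8 };

	/* 16 RSS queues, 8 EQs, 4 online CPUs -> 4 combined channels */
	printf("usable combined channels: %d\n", sketch_max_qs(&r, 4));
	return 0;
}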
diff --git a/drivers/net/ethernet/emulex/benet/be_cmds.c b/drivers/net/ethernet/emulex/benet/be_cmds.c index 85923e2..52c9085 100644 --- a/drivers/net/ethernet/emulex/benet/be_cmds.c +++ b/drivers/net/ethernet/emulex/benet/be_cmds.c @@ -633,6 +633,12 @@ static inline struct be_sge *nonembedded_sgl(struct be_mcc_wrb *wrb) return &wrb->payload.sgl[0]; } +static inline void fill_wrb_tags(struct be_mcc_wrb *wrb, + unsigned long addr) +{ + wrb->tag0 = addr & 0xFFFFFFFF; + wrb->tag1 = upper_32_bits(addr); +} /* Don't touch the hdr after it's prepared */ /* mem will be NULL for embedded commands */ @@ -641,17 +647,12 @@ static void be_wrb_cmd_hdr_prepare(struct be_cmd_req_hdr *req_hdr, struct be_mcc_wrb *wrb, struct be_dma_mem *mem) { struct be_sge *sge; - unsigned long addr = (unsigned long)req_hdr; - u64 req_addr = addr; req_hdr->opcode = opcode; req_hdr->subsystem = subsystem; req_hdr->request_length = cpu_to_le32(cmd_len - sizeof(*req_hdr)); req_hdr->version = 0; - - wrb->tag0 = req_addr & 0xFFFFFFFF; - wrb->tag1 = upper_32_bits(req_addr); - + fill_wrb_tags(wrb, (ulong) req_hdr); wrb->payload_length = cmd_len; if (mem) { wrb->embedded |= (1 & MCC_WRB_SGE_CNT_MASK) << @@ -678,31 +679,6 @@ static void be_cmd_page_addrs_prepare(struct phys_addr *pages, u32 max_pages, } } -/* Converts interrupt delay in microseconds to multiplier value */ -static u32 eq_delay_to_mult(u32 usec_delay) -{ -#define MAX_INTR_RATE 651042 - const u32 round = 10; - u32 multiplier; - - if (usec_delay == 0) - multiplier = 0; - else { - u32 interrupt_rate = 1000000 / usec_delay; - /* Max delay, corresponding to the lowest interrupt rate */ - if (interrupt_rate == 0) - multiplier = 1023; - else { - multiplier = (MAX_INTR_RATE - interrupt_rate) * round; - multiplier /= interrupt_rate; - /* Round the multiplier to the closest value.*/ - multiplier = (multiplier + round/2) / round; - multiplier = min(multiplier, (u32)1023); - } - } - return multiplier; -} - static inline struct be_mcc_wrb *wrb_from_mbox(struct be_adapter *adapter) { struct be_dma_mem *mbox_mem = &adapter->mbox_mem; @@ -730,6 +706,78 @@ static struct be_mcc_wrb *wrb_from_mccq(struct be_adapter *adapter) return wrb; } +static bool use_mcc(struct be_adapter *adapter) +{ + return adapter->mcc_obj.q.created; +} + +/* Must be used only in process context */ +static int be_cmd_lock(struct be_adapter *adapter) +{ + if (use_mcc(adapter)) { + spin_lock_bh(&adapter->mcc_lock); + return 0; + } else { + return mutex_lock_interruptible(&adapter->mbox_lock); + } +} + +/* Must be used only in process context */ +static void be_cmd_unlock(struct be_adapter *adapter) +{ + if (use_mcc(adapter)) + spin_unlock_bh(&adapter->mcc_lock); + else + return mutex_unlock(&adapter->mbox_lock); +} + +static struct be_mcc_wrb *be_cmd_copy(struct be_adapter *adapter, + struct be_mcc_wrb *wrb) +{ + struct be_mcc_wrb *dest_wrb; + + if (use_mcc(adapter)) { + dest_wrb = wrb_from_mccq(adapter); + if (!dest_wrb) + return NULL; + } else { + dest_wrb = wrb_from_mbox(adapter); + } + + memcpy(dest_wrb, wrb, sizeof(*wrb)); + if (wrb->embedded & cpu_to_le32(MCC_WRB_EMBEDDED_MASK)) + fill_wrb_tags(dest_wrb, (ulong) embedded_payload(wrb)); + + return dest_wrb; +} + +/* Must be used only in process context */ +static int be_cmd_notify_wait(struct be_adapter *adapter, + struct be_mcc_wrb *wrb) +{ + struct be_mcc_wrb *dest_wrb; + int status; + + status = be_cmd_lock(adapter); + if (status) + return status; + + dest_wrb = be_cmd_copy(adapter, wrb); + if (!dest_wrb) + return -EBUSY; + + if (use_mcc(adapter)) + 
status = be_mcc_notify_wait(adapter); + else + status = be_mbox_notify_wait(adapter); + + if (!status) + memcpy(wrb, dest_wrb, sizeof(*wrb)); + + be_cmd_unlock(adapter); + return status; +} + /* Tell fw we're about to start firing cmds by writing a * special pattern across the wrb hdr; uses mbox */ @@ -790,13 +838,12 @@ int be_cmd_fw_clean(struct be_adapter *adapter) return status; } -int be_cmd_eq_create(struct be_adapter *adapter, - struct be_queue_info *eq, int eq_delay) +int be_cmd_eq_create(struct be_adapter *adapter, struct be_eq_obj *eqo) { struct be_mcc_wrb *wrb; struct be_cmd_req_eq_create *req; - struct be_dma_mem *q_mem = &eq->dma_mem; - int status; + struct be_dma_mem *q_mem = &eqo->q.dma_mem; + int status, ver = 0; if (mutex_lock_interruptible(&adapter->mbox_lock)) return -1; @@ -807,15 +854,18 @@ int be_cmd_eq_create(struct be_adapter *adapter, be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, OPCODE_COMMON_EQ_CREATE, sizeof(*req), wrb, NULL); + /* Support for EQ_CREATEv2 available only SH-R onwards */ + if (!(BEx_chip(adapter) || lancer_chip(adapter))) + ver = 2; + + req->hdr.version = ver; req->num_pages = cpu_to_le16(PAGES_4K_SPANNED(q_mem->va, q_mem->size)); AMAP_SET_BITS(struct amap_eq_context, valid, req->context, 1); /* 4byte eqe*/ AMAP_SET_BITS(struct amap_eq_context, size, req->context, 0); AMAP_SET_BITS(struct amap_eq_context, count, req->context, - __ilog2_u32(eq->len/256)); - AMAP_SET_BITS(struct amap_eq_context, delaymult, req->context, - eq_delay_to_mult(eq_delay)); + __ilog2_u32(eqo->q.len / 256)); be_dws_cpu_to_le(req->context, sizeof(req->context)); be_cmd_page_addrs_prepare(req->pages, ARRAY_SIZE(req->pages), q_mem); @@ -823,8 +873,10 @@ int be_cmd_eq_create(struct be_adapter *adapter, status = be_mbox_notify_wait(adapter); if (!status) { struct be_cmd_resp_eq_create *resp = embedded_payload(wrb); - eq->id = le16_to_cpu(resp->eq_id); - eq->created = true; + eqo->q.id = le16_to_cpu(resp->eq_id); + eqo->msix_idx = + (ver == 2) ? 
le16_to_cpu(resp->msix_idx) : eqo->idx; + eqo->q.created = true; } mutex_unlock(&adapter->mbox_lock); @@ -1130,25 +1182,16 @@ int be_cmd_mccq_create(struct be_adapter *adapter, int be_cmd_txq_create(struct be_adapter *adapter, struct be_tx_obj *txo) { - struct be_mcc_wrb *wrb; + struct be_mcc_wrb wrb = {0}; struct be_cmd_req_eth_tx_create *req; struct be_queue_info *txq = &txo->q; struct be_queue_info *cq = &txo->cq; struct be_dma_mem *q_mem = &txq->dma_mem; int status, ver = 0; - spin_lock_bh(&adapter->mcc_lock); - - wrb = wrb_from_mccq(adapter); - if (!wrb) { - status = -EBUSY; - goto err; - } - - req = embedded_payload(wrb); - + req = embedded_payload(&wrb); be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_ETH, - OPCODE_ETH_TX_CREATE, sizeof(*req), wrb, NULL); + OPCODE_ETH_TX_CREATE, sizeof(*req), &wrb, NULL); if (lancer_chip(adapter)) { req->hdr.version = 1; @@ -1166,12 +1209,11 @@ int be_cmd_txq_create(struct be_adapter *adapter, struct be_tx_obj *txo) req->cq_id = cpu_to_le16(cq->id); req->queue_size = be_encoded_q_len(txq->len); be_cmd_page_addrs_prepare(req->pages, ARRAY_SIZE(req->pages), q_mem); - ver = req->hdr.version; - status = be_mcc_notify_wait(adapter); + status = be_cmd_notify_wait(adapter, &wrb); if (!status) { - struct be_cmd_resp_eth_tx_create *resp = embedded_payload(wrb); + struct be_cmd_resp_eth_tx_create *resp = embedded_payload(&wrb); txq->id = le16_to_cpu(resp->cid); if (ver == 2) txo->db_offset = le32_to_cpu(resp->db_offset); @@ -1180,9 +1222,6 @@ int be_cmd_txq_create(struct be_adapter *adapter, struct be_tx_obj *txo) txq->created = true; } -err: - spin_unlock_bh(&adapter->mcc_lock); - return status; } @@ -1311,44 +1350,32 @@ err: } /* Create an rx filtering policy configuration on an i/f - * Uses MCCQ + * Will use MBOX only if MCCQ has not been created. 
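+ * At probe time the i/f is created before the MCCQ exists, so this path ends up on the MBOX then.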
*/ int be_cmd_if_create(struct be_adapter *adapter, u32 cap_flags, u32 en_flags, u32 *if_handle, u32 domain) { - struct be_mcc_wrb *wrb; + struct be_mcc_wrb wrb = {0}; struct be_cmd_req_if_create *req; int status; - spin_lock_bh(&adapter->mcc_lock); - - wrb = wrb_from_mccq(adapter); - if (!wrb) { - status = -EBUSY; - goto err; - } - req = embedded_payload(wrb); - + req = embedded_payload(&wrb); be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, - OPCODE_COMMON_NTWK_INTERFACE_CREATE, sizeof(*req), wrb, NULL); + OPCODE_COMMON_NTWK_INTERFACE_CREATE, sizeof(*req), &wrb, NULL); req->hdr.domain = domain; req->capability_flags = cpu_to_le32(cap_flags); req->enable_flags = cpu_to_le32(en_flags); - req->pmac_invalid = true; - status = be_mcc_notify_wait(adapter); + status = be_cmd_notify_wait(adapter, &wrb); if (!status) { - struct be_cmd_resp_if_create *resp = embedded_payload(wrb); + struct be_cmd_resp_if_create *resp = embedded_payload(&wrb); *if_handle = le32_to_cpu(resp->interface_id); /* Hack to retrieve VF's pmac-id on BE3 */ if (BE3_chip(adapter) && !be_physfn(adapter)) adapter->pmac_id[0] = le32_to_cpu(resp->pmac_id); } - -err: - spin_unlock_bh(&adapter->mcc_lock); return status; } @@ -1797,8 +1824,7 @@ int be_cmd_rx_filter(struct be_adapter *adapter, u32 flags, u32 value) */ req->if_flags_mask |= cpu_to_le32(BE_IF_FLAGS_MCAST_PROMISCUOUS & - adapter->if_cap_flags); - + be_if_cap_flags(adapter)); req->mcast_num = cpu_to_le32(netdev_mc_count(adapter->netdev)); netdev_for_each_mc_addr(ha, adapter->netdev) memcpy(req->mcast_mac[i++].byte, ha->addr, ETH_ALEN); @@ -3087,30 +3113,63 @@ err: return status; } -static struct be_nic_resource_desc *be_get_nic_desc(u8 *buf, u32 desc_count, - u32 max_buf_size) +static struct be_nic_res_desc *be_get_nic_desc(u8 *buf, u32 desc_count) { - struct be_nic_resource_desc *desc = (struct be_nic_resource_desc *)buf; + struct be_res_desc_hdr *hdr = (struct be_res_desc_hdr *)buf; int i; for (i = 0; i < desc_count; i++) { - desc->desc_len = desc->desc_len ? : RESOURCE_DESC_SIZE; - if (((void *)desc + desc->desc_len) > - (void *)(buf + max_buf_size)) - return NULL; + if (hdr->desc_type == NIC_RESOURCE_DESC_TYPE_V0 || + hdr->desc_type == NIC_RESOURCE_DESC_TYPE_V1) + return (struct be_nic_res_desc *)hdr; - if (desc->desc_type == NIC_RESOURCE_DESC_TYPE_V0 || - desc->desc_type == NIC_RESOURCE_DESC_TYPE_V1) - return desc; - - desc = (void *)desc + desc->desc_len; + hdr->desc_len = hdr->desc_len ? : RESOURCE_DESC_SIZE_V0; + hdr = (void *)hdr + hdr->desc_len; } + return NULL; +} + +static struct be_pcie_res_desc *be_get_pcie_desc(u8 devfn, u8 *buf, + u32 desc_count) +{ + struct be_res_desc_hdr *hdr = (struct be_res_desc_hdr *)buf; + struct be_pcie_res_desc *pcie; + int i; + + for (i = 0; i < desc_count; i++) { + if ((hdr->desc_type == PCIE_RESOURCE_DESC_TYPE_V0 || + hdr->desc_type == PCIE_RESOURCE_DESC_TYPE_V1)) { + pcie = (struct be_pcie_res_desc *)hdr; + if (pcie->pf_num == devfn) + return pcie; + } + hdr->desc_len = hdr->desc_len ? 
: RESOURCE_DESC_SIZE_V0; + hdr = (void *)hdr + hdr->desc_len; + } return NULL; } +static void be_copy_nic_desc(struct be_resources *res, + struct be_nic_res_desc *desc) +{ + res->max_uc_mac = le16_to_cpu(desc->unicast_mac_count); + res->max_vlans = le16_to_cpu(desc->vlan_count); + res->max_mcast_mac = le16_to_cpu(desc->mcast_mac_count); + res->max_tx_qs = le16_to_cpu(desc->txq_count); + res->max_rss_qs = le16_to_cpu(desc->rssq_count); + res->max_rx_qs = le16_to_cpu(desc->rq_count); + res->max_evt_qs = le16_to_cpu(desc->eq_count); + /* Clear flags that driver is not interested in */ + res->if_cap_flags = le32_to_cpu(desc->cap_flags) & + BE_IF_CAP_FLAGS_WANT; + /* Need 1 RXQ as the default RXQ */ + if (res->max_rss_qs && res->max_rss_qs == res->max_rx_qs) + res->max_rss_qs -= 1; +} + /* Uses Mbox */ -int be_cmd_get_func_config(struct be_adapter *adapter) +int be_cmd_get_func_config(struct be_adapter *adapter, struct be_resources *res) { struct be_mcc_wrb *wrb; struct be_cmd_req_get_func_config *req; @@ -3149,28 +3208,16 @@ int be_cmd_get_func_config(struct be_adapter *adapter) if (!status) { struct be_cmd_resp_get_func_config *resp = cmd.va; u32 desc_count = le32_to_cpu(resp->desc_count); - struct be_nic_resource_desc *desc; + struct be_nic_res_desc *desc; - desc = be_get_nic_desc(resp->func_param, desc_count, - sizeof(resp->func_param)); + desc = be_get_nic_desc(resp->func_param, desc_count); if (!desc) { status = -EINVAL; goto err; } adapter->pf_number = desc->pf_num; - adapter->max_pmac_cnt = le16_to_cpu(desc->unicast_mac_count); - adapter->max_vlans = le16_to_cpu(desc->vlan_count); - adapter->max_mcast_mac = le16_to_cpu(desc->mcast_mac_count); - adapter->max_tx_queues = le16_to_cpu(desc->txq_count); - adapter->max_rss_queues = le16_to_cpu(desc->rssq_count); - adapter->max_rx_queues = le16_to_cpu(desc->rq_count); - - adapter->max_event_queues = le16_to_cpu(desc->eq_count); - adapter->if_cap_flags = le32_to_cpu(desc->cap_flags); - - /* Clear flags that driver is not interested in */ - adapter->if_cap_flags &= BE_IF_CAP_FLAGS_WANT; + be_copy_nic_desc(res, desc); } err: mutex_unlock(&adapter->mbox_lock); @@ -3241,54 +3288,51 @@ err: } /* Uses sync mcc, if MCCQ is already created otherwise mbox */ -int be_cmd_get_profile_config(struct be_adapter *adapter, u32 *cap_flags, - u16 *txq_count, u8 domain) +int be_cmd_get_profile_config(struct be_adapter *adapter, + struct be_resources *res, u8 domain) { + struct be_cmd_resp_get_profile_config *resp; + struct be_pcie_res_desc *pcie; + struct be_nic_res_desc *nic; struct be_queue_info *mccq = &adapter->mcc_obj.q; struct be_dma_mem cmd; + u32 desc_count; int status; memset(&cmd, 0, sizeof(struct be_dma_mem)); - if (!lancer_chip(adapter)) - cmd.size = sizeof(struct be_cmd_resp_get_profile_config_v1); - else - cmd.size = sizeof(struct be_cmd_resp_get_profile_config); - cmd.va = pci_alloc_consistent(adapter->pdev, cmd.size, - &cmd.dma); - if (!cmd.va) { - dev_err(&adapter->pdev->dev, "Memory alloc failure\n"); + cmd.size = sizeof(struct be_cmd_resp_get_profile_config); + cmd.va = pci_alloc_consistent(adapter->pdev, cmd.size, &cmd.dma); + if (!cmd.va) return -ENOMEM; - } if (!mccq->created) status = be_cmd_get_profile_config_mbox(adapter, domain, &cmd); else status = be_cmd_get_profile_config_mccq(adapter, domain, &cmd); - if (!status) { - struct be_cmd_resp_get_profile_config *resp = cmd.va; - u32 desc_count = le32_to_cpu(resp->desc_count); - struct be_nic_resource_desc *desc; + if (status) + goto err; - desc = be_get_nic_desc(resp->func_param, 
desc_count, - sizeof(resp->func_param)); + resp = cmd.va; + desc_count = le32_to_cpu(resp->desc_count); + + pcie = be_get_pcie_desc(adapter->pdev->devfn, resp->func_param, + desc_count); + if (pcie) + res->max_vfs = le16_to_cpu(pcie->num_vfs); + + nic = be_get_nic_desc(resp->func_param, desc_count); + if (nic) + be_copy_nic_desc(res, nic); - if (!desc) { - status = -EINVAL; - goto err; - } - if (cap_flags) - *cap_flags = le32_to_cpu(desc->cap_flags); - if (txq_count) - *txq_count = le32_to_cpu(desc->txq_count); - } err: if (cmd.va) - pci_free_consistent(adapter->pdev, cmd.size, - cmd.va, cmd.dma); + pci_free_consistent(adapter->pdev, cmd.size, cmd.va, cmd.dma); return status; } -/* Uses sync mcc */ +/* Currently only Lancer uses this command and it supports version 0 only + * Uses sync mcc + */ int be_cmd_set_profile_config(struct be_adapter *adapter, u32 bps, u8 domain) { @@ -3309,12 +3353,10 @@ int be_cmd_set_profile_config(struct be_adapter *adapter, u32 bps, be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, OPCODE_COMMON_SET_PROFILE_CONFIG, sizeof(*req), wrb, NULL); - req->hdr.domain = domain; req->desc_count = cpu_to_le32(1); - - req->nic_desc.desc_type = NIC_RESOURCE_DESC_TYPE_V0; - req->nic_desc.desc_len = RESOURCE_DESC_SIZE; + req->nic_desc.hdr.desc_type = NIC_RESOURCE_DESC_TYPE_V0; + req->nic_desc.hdr.desc_len = RESOURCE_DESC_SIZE_V0; req->nic_desc.flags = (1 << QUN) | (1 << IMM) | (1 << NOSV); req->nic_desc.pf_num = adapter->pf_number; req->nic_desc.vf_num = domain; diff --git a/drivers/net/ethernet/emulex/benet/be_cmds.h b/drivers/net/ethernet/emulex/benet/be_cmds.h index 6237192..52f3d4c 100644 --- a/drivers/net/ethernet/emulex/benet/be_cmds.h +++ b/drivers/net/ethernet/emulex/benet/be_cmds.h @@ -307,7 +307,7 @@ struct be_cmd_req_eq_create { struct be_cmd_resp_eq_create { struct be_cmd_resp_hdr resp_hdr; u16 eq_id; /* sword */ - u16 rsvd0; /* sword */ + u16 msix_idx; /* available only in v2 */ } __packed; /******************** Mac query ***************************/ @@ -1718,11 +1718,13 @@ struct be_cmd_req_set_ext_fat_caps { struct be_fat_conf_params set_params; }; -#define RESOURCE_DESC_SIZE 88 +#define RESOURCE_DESC_SIZE_V0 72 +#define RESOURCE_DESC_SIZE_V1 88 +#define PCIE_RESOURCE_DESC_TYPE_V0 0x40 #define NIC_RESOURCE_DESC_TYPE_V0 0x41 +#define PCIE_RESOURCE_DESC_TYPE_V1 0x50 #define NIC_RESOURCE_DESC_TYPE_V1 0x51 -#define MAX_RESOURCE_DESC 4 -#define MAX_RESOURCE_DESC_V1 32 +#define MAX_RESOURCE_DESC 264 /* QOS unit number */ #define QUN 4 @@ -1731,9 +1733,30 @@ struct be_cmd_req_set_ext_fat_caps { /* No save */ #define NOSV 7 -struct be_nic_resource_desc { +struct be_res_desc_hdr { u8 desc_type; u8 desc_len; +} __packed; + +struct be_pcie_res_desc { + struct be_res_desc_hdr hdr; + u8 rsvd0; + u8 flags; + u16 rsvd1; + u8 pf_num; + u8 rsvd2; + u32 rsvd3; + u8 sriov_state; + u8 pf_state; + u8 pf_type; + u8 rsvd4; + u16 num_vfs; + u16 rsvd5; + u32 rsvd6[17]; +} __packed; + +struct be_nic_res_desc { + struct be_res_desc_hdr hdr; u8 rsvd1; u8 flags; u8 vf_num; @@ -1762,7 +1785,7 @@ struct be_nic_resource_desc { u8 wol_param; u16 rsvd7; u32 rsvd8[3]; -}; +} __packed; struct be_cmd_req_get_func_config { struct be_cmd_req_hdr hdr; @@ -1771,7 +1794,7 @@ struct be_cmd_req_get_func_config { struct be_cmd_resp_get_func_config { struct be_cmd_resp_hdr hdr; u32 desc_count; - u8 func_param[MAX_RESOURCE_DESC * RESOURCE_DESC_SIZE]; + u8 func_param[MAX_RESOURCE_DESC * RESOURCE_DESC_SIZE_V1]; }; #define ACTIVE_PROFILE_TYPE 0x2 @@ -1783,26 +1806,20 @@ struct 
be_cmd_req_get_profile_config { }; struct be_cmd_resp_get_profile_config { - struct be_cmd_req_hdr hdr; - u32 desc_count; - u8 func_param[MAX_RESOURCE_DESC * RESOURCE_DESC_SIZE]; -}; - -struct be_cmd_resp_get_profile_config_v1 { - struct be_cmd_req_hdr hdr; + struct be_cmd_resp_hdr hdr; u32 desc_count; - u8 func_param[MAX_RESOURCE_DESC_V1 * RESOURCE_DESC_SIZE]; + u8 func_param[MAX_RESOURCE_DESC * RESOURCE_DESC_SIZE_V1]; }; struct be_cmd_req_set_profile_config { struct be_cmd_req_hdr hdr; u32 rsvd; u32 desc_count; - struct be_nic_resource_desc nic_desc; + struct be_nic_res_desc nic_desc; }; struct be_cmd_resp_set_profile_config { - struct be_cmd_req_hdr hdr; + struct be_cmd_resp_hdr hdr; }; struct be_cmd_enable_disable_vf { @@ -1851,8 +1868,7 @@ extern int be_cmd_if_create(struct be_adapter *adapter, u32 cap_flags, u32 en_flags, u32 *if_handle, u32 domain); extern int be_cmd_if_destroy(struct be_adapter *adapter, int if_handle, u32 domain); -extern int be_cmd_eq_create(struct be_adapter *adapter, - struct be_queue_info *eq, int eq_delay); +extern int be_cmd_eq_create(struct be_adapter *adapter, struct be_eq_obj *eqo); extern int be_cmd_cq_create(struct be_adapter *adapter, struct be_queue_info *cq, struct be_queue_info *eq, bool no_delay, int num_cqe_dma_coalesce); @@ -1964,10 +1980,10 @@ extern int lancer_initiate_dump(struct be_adapter *adapter); extern bool dump_present(struct be_adapter *adapter); extern int lancer_test_and_set_rdy_state(struct be_adapter *adapter); extern int be_cmd_query_port_name(struct be_adapter *adapter, u8 *port_name); -extern int be_cmd_get_func_config(struct be_adapter *adapter); -extern int be_cmd_get_profile_config(struct be_adapter *adapter, u32 *cap_flags, - u16 *txq_count, u8 domain); - +int be_cmd_get_func_config(struct be_adapter *adapter, + struct be_resources *res); +int be_cmd_get_profile_config(struct be_adapter *adapter, + struct be_resources *res, u8 domain); extern int be_cmd_set_profile_config(struct be_adapter *adapter, u32 bps, u8 domain); extern int be_cmd_get_if_id(struct be_adapter *adapter, diff --git a/drivers/net/ethernet/emulex/benet/be_ethtool.c b/drivers/net/ethernet/emulex/benet/be_ethtool.c index 4f8c941..b440a1f 100644 --- a/drivers/net/ethernet/emulex/benet/be_ethtool.c +++ b/drivers/net/ethernet/emulex/benet/be_ethtool.c @@ -1119,6 +1119,29 @@ static int be_set_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *cmd) return status; } +static void be_get_channels(struct net_device *netdev, + struct ethtool_channels *ch) +{ + struct be_adapter *adapter = netdev_priv(netdev); + + ch->combined_count = adapter->num_evt_qs; + ch->max_combined = be_max_qs(adapter); +} + +static int be_set_channels(struct net_device *netdev, + struct ethtool_channels *ch) +{ + struct be_adapter *adapter = netdev_priv(netdev); + + if (ch->rx_count || ch->tx_count || ch->other_count || + !ch->combined_count || ch->combined_count > be_max_qs(adapter)) + return -EINVAL; + + adapter->cfg_num_qs = ch->combined_count; + + return be_update_queues(adapter); +} + const struct ethtool_ops be_ethtool_ops = { .get_settings = be_get_settings, .get_drvinfo = be_get_drvinfo, @@ -1145,4 +1168,6 @@ const struct ethtool_ops be_ethtool_ops = { .self_test = be_self_test, .get_rxnfc = be_get_rxnfc, .set_rxnfc = be_set_rxnfc, + .get_channels = be_get_channels, + .set_channels = be_set_channels }; diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index ff2b40d..50116f8 100644 --- 
a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -1001,7 +1001,7 @@ static int be_vid_config(struct be_adapter *adapter) if (adapter->promiscuous) return 0; - if (adapter->vlans_added > adapter->max_vlans) + if (adapter->vlans_added > be_max_vlans(adapter)) goto set_vlan_promisc; /* Construct VLAN Table to give to HW */ @@ -1042,7 +1042,7 @@ static int be_vlan_add_vid(struct net_device *netdev, __be16 proto, u16 vid) goto ret; adapter->vlan_tag[vid] = 1; - if (adapter->vlans_added <= (adapter->max_vlans + 1)) + if (adapter->vlans_added <= (be_max_vlans(adapter) + 1)) status = be_vid_config(adapter); if (!status) @@ -1068,7 +1068,7 @@ static int be_vlan_rem_vid(struct net_device *netdev, __be16 proto, u16 vid) goto ret; adapter->vlan_tag[vid] = 0; - if (adapter->vlans_added <= adapter->max_vlans) + if (adapter->vlans_added <= be_max_vlans(adapter)) status = be_vid_config(adapter); if (!status) @@ -1101,7 +1101,7 @@ static void be_set_rx_mode(struct net_device *netdev) /* Enable multicast promisc if num configured exceeds what we support */ if (netdev->flags & IFF_ALLMULTI || - netdev_mc_count(netdev) > adapter->max_mcast_mac) { + netdev_mc_count(netdev) > be_max_mc(adapter)) { be_cmd_rx_filter(adapter, IFF_ALLMULTI, ON); goto done; } @@ -1115,7 +1115,7 @@ static void be_set_rx_mode(struct net_device *netdev) adapter->pmac_id[i], 0); } - if (netdev_uc_count(netdev) > adapter->max_pmac_cnt) { + if (netdev_uc_count(netdev) > be_max_uc(adapter)) { be_cmd_rx_filter(adapter, IFF_PROMISC, ON); adapter->promiscuous = true; goto done; @@ -1913,6 +1913,7 @@ static void be_evt_queues_destroy(struct be_adapter *adapter) if (eqo->q.created) { be_eq_clean(eqo); be_cmd_q_destroy(adapter, &eqo->q, QTYPE_EQ); + netif_napi_del(&eqo->napi); } be_queue_free(adapter, &eqo->q); } @@ -1924,9 +1925,12 @@ static int be_evt_queues_create(struct be_adapter *adapter) struct be_eq_obj *eqo; int i, rc; - adapter->num_evt_qs = num_irqs(adapter); + adapter->num_evt_qs = min_t(u16, num_irqs(adapter), + adapter->cfg_num_qs); for_all_evt_queues(adapter, eqo, i) { + netif_napi_add(adapter->netdev, &eqo->napi, be_poll, + BE_NAPI_WEIGHT); eqo->adapter = adapter; eqo->tx_budget = BE_TX_BUDGET; eqo->idx = i; @@ -1939,7 +1943,7 @@ static int be_evt_queues_create(struct be_adapter *adapter) if (rc) return rc; - rc = be_cmd_eq_create(adapter, eq, eqo->cur_eqd); + rc = be_cmd_eq_create(adapter, eqo); if (rc) return rc; } @@ -2013,31 +2017,13 @@ static void be_tx_queues_destroy(struct be_adapter *adapter) } } -static int be_num_txqs_want(struct be_adapter *adapter) -{ - if ((!lancer_chip(adapter) && sriov_want(adapter)) || - be_is_mc(adapter) || - (!lancer_chip(adapter) && !be_physfn(adapter)) || - BE2_chip(adapter)) - return 1; - else - return adapter->max_tx_queues; -} - -static int be_tx_cqs_create(struct be_adapter *adapter) +static int be_tx_qs_create(struct be_adapter *adapter) { struct be_queue_info *cq, *eq; - int status; struct be_tx_obj *txo; - u8 i; + int status, i; - adapter->num_tx_qs = be_num_txqs_want(adapter); - if (adapter->num_tx_qs != MAX_TX_QS) { - rtnl_lock(); - netif_set_real_num_tx_queues(adapter->netdev, - adapter->num_tx_qs); - rtnl_unlock(); - } + adapter->num_tx_qs = min(adapter->num_evt_qs, be_max_txqs(adapter)); for_all_tx_queues(adapter, txo, i) { cq = &txo->cq; @@ -2053,16 +2039,7 @@ static int be_tx_cqs_create(struct be_adapter *adapter) status = be_cmd_cq_create(adapter, cq, eq, false, 3); if (status) return status; - } - return 0; -} -static int 
be_tx_qs_create(struct be_adapter *adapter) -{ - struct be_tx_obj *txo; - int i, status; - - for_all_tx_queues(adapter, txo, i) { status = be_queue_alloc(adapter, &txo->q, TX_Q_LEN, sizeof(struct be_eth_wrb)); if (status) @@ -2098,17 +2075,14 @@ static int be_rx_cqs_create(struct be_adapter *adapter) struct be_rx_obj *rxo; int rc, i; - /* We'll create as many RSS rings as there are irqs. - * But when there's only one irq there's no use creating RSS rings + /* We can create as many RSS rings as there are EQs. */ + adapter->num_rx_qs = adapter->num_evt_qs; + + /* We'll use RSS only if at least 2 RSS rings are supported. + * When RSS is used, we'll need a default RXQ for non-IP traffic. */ - adapter->num_rx_qs = (num_irqs(adapter) > 1) ? - num_irqs(adapter) + 1 : 1; - if (adapter->num_rx_qs != MAX_RX_QS) { - rtnl_lock(); - netif_set_real_num_rx_queues(adapter->netdev, - adapter->num_rx_qs); - rtnl_unlock(); - } + if (adapter->num_rx_qs > 1) + adapter->num_rx_qs++; adapter->big_page_size = (1 << get_order(rx_frag_size)) * PAGE_SIZE; for_all_rx_queues(adapter, rxo, i) { @@ -2260,7 +2234,7 @@ static bool be_process_tx(struct be_adapter *adapter, struct be_tx_obj *txo, return (work_done < budget); /* Done */ } -static int be_poll(struct napi_struct *napi, int budget) +int be_poll(struct napi_struct *napi, int budget) { struct be_eq_obj *eqo = container_of(napi, struct be_eq_obj, napi); struct be_adapter *adapter = eqo->adapter; @@ -2372,38 +2346,24 @@ static void be_msix_disable(struct be_adapter *adapter) if (msix_enabled(adapter)) { pci_disable_msix(adapter->pdev); adapter->num_msix_vec = 0; + adapter->num_msix_roce_vec = 0; } } -static uint be_num_rss_want(struct be_adapter *adapter) -{ - u32 num = 0; - - if ((adapter->function_caps & BE_FUNCTION_CAPS_RSS) && - (lancer_chip(adapter) || - (!sriov_want(adapter) && be_physfn(adapter)))) { - num = adapter->max_rss_queues; - num = min_t(u32, num, (u32)netif_get_num_default_rss_queues()); - } - return num; -} - static int be_msix_enable(struct be_adapter *adapter) { -#define BE_MIN_MSIX_VECTORS 1 - int i, status, num_vec, num_roce_vec = 0; + int i, status, num_vec; struct device *dev = &adapter->pdev->dev; - /* If RSS queues are not used, need a vec for default RX Q */ - num_vec = min(be_num_rss_want(adapter), num_online_cpus()); - if (be_roce_supported(adapter)) { - num_roce_vec = min_t(u32, MAX_ROCE_MSIX_VECTORS, - (num_online_cpus() + 1)); - num_roce_vec = min(num_roce_vec, MAX_ROCE_EQS); - num_vec += num_roce_vec; - num_vec = min(num_vec, MAX_MSIX_VECTORS); - } - num_vec = max(num_vec, BE_MIN_MSIX_VECTORS); + /* If RoCE is supported, program the max number of NIC vectors that + * may be configured via set-channels, along with vectors needed for + * RoCE. Else, just program the number we'll use initially. 
+ */ + if (be_roce_supported(adapter)) + num_vec = min_t(int, 2 * be_max_eqs(adapter), + 2 * num_online_cpus()); + else + num_vec = adapter->cfg_num_qs; for (i = 0; i < num_vec; i++) adapter->msix_entries[i].entry = i; @@ -2411,7 +2371,7 @@ static int be_msix_enable(struct be_adapter *adapter) status = pci_enable_msix(adapter->pdev, adapter->msix_entries, num_vec); if (status == 0) { goto done; - } else if (status >= BE_MIN_MSIX_VECTORS) { + } else if (status >= MIN_MSIX_VECTORS) { num_vec = status; status = pci_enable_msix(adapter->pdev, adapter->msix_entries, num_vec); @@ -2420,30 +2380,29 @@ static int be_msix_enable(struct be_adapter *adapter) } dev_warn(dev, "MSIx enable failed\n"); + /* INTx is not supported in VFs, so fail probe if enable_msix fails */ if (!be_physfn(adapter)) return status; return 0; done: - if (be_roce_supported(adapter)) { - if (num_vec > num_roce_vec) { - adapter->num_msix_vec = num_vec - num_roce_vec; - adapter->num_msix_roce_vec = - num_vec - adapter->num_msix_vec; - } else { - adapter->num_msix_vec = num_vec; - adapter->num_msix_roce_vec = 0; - } - } else - adapter->num_msix_vec = num_vec; - dev_info(dev, "enabled %d MSI-x vector(s)\n", adapter->num_msix_vec); + if (be_roce_supported(adapter) && num_vec > MIN_MSIX_VECTORS) { + adapter->num_msix_roce_vec = num_vec / 2; + dev_info(dev, "enabled %d MSI-x vector(s) for RoCE\n", + adapter->num_msix_roce_vec); + } + + adapter->num_msix_vec = num_vec - adapter->num_msix_roce_vec; + + dev_info(dev, "enabled %d MSI-x vector(s) for NIC\n", + adapter->num_msix_vec); return 0; } static inline int be_msix_vec_get(struct be_adapter *adapter, struct be_eq_obj *eqo) { - return adapter->msix_entries[eqo->idx].vector; + return adapter->msix_entries[eqo->msix_idx].vector; } static int be_msix_register(struct be_adapter *adapter) @@ -2556,8 +2515,8 @@ static int be_close(struct net_device *netdev) /* Wait for all pending tx completions to arrive so that * all tx skbs are freed. 
*/ - be_tx_compl_clean(adapter); netif_tx_disable(netdev); + be_tx_compl_clean(adapter); be_rx_qs_destroy(adapter); @@ -2795,14 +2754,27 @@ done: adapter->num_vfs = 0; } -static int be_clear(struct be_adapter *adapter) +static void be_clear_queues(struct be_adapter *adapter) { - int i; + be_mcc_queues_destroy(adapter); + be_rx_cqs_destroy(adapter); + be_tx_queues_destroy(adapter); + be_evt_queues_destroy(adapter); +} +static void be_cancel_worker(struct be_adapter *adapter) +{ if (adapter->flags & BE_FLAGS_WORKER_SCHEDULED) { cancel_delayed_work_sync(&adapter->work); adapter->flags &= ~BE_FLAGS_WORKER_SCHEDULED; } +} + +static int be_clear(struct be_adapter *adapter) +{ + int i; + + be_cancel_worker(adapter); if (sriov_enabled(adapter)) be_vf_clear(adapter); @@ -2815,10 +2787,7 @@ static int be_clear(struct be_adapter *adapter) be_cmd_if_destroy(adapter, adapter->if_handle, 0); - be_mcc_queues_destroy(adapter); - be_rx_cqs_destroy(adapter); - be_tx_queues_destroy(adapter); - be_evt_queues_destroy(adapter); + be_clear_queues(adapter); kfree(adapter->pmac_id); adapter->pmac_id = NULL; @@ -2829,6 +2798,7 @@ static int be_clear(struct be_adapter *adapter) static int be_vfs_if_create(struct be_adapter *adapter) { + struct be_resources res = {0}; struct be_vf_cfg *vf_cfg; u32 cap_flags, en_flags, vf; int status; @@ -2837,9 +2807,12 @@ static int be_vfs_if_create(struct be_adapter *adapter) BE_IF_FLAGS_MULTICAST; for_all_vfs(adapter, vf_cfg, vf) { - if (!BE3_chip(adapter)) - be_cmd_get_profile_config(adapter, &cap_flags, - NULL, vf + 1); + if (!BE3_chip(adapter)) { + status = be_cmd_get_profile_config(adapter, &res, + vf + 1); + if (!status) + cap_flags = res.if_cap_flags; + } /* If a FW profile exists, then cap_flags are updated */ en_flags = cap_flags & (BE_IF_FLAGS_UNTAGGED | @@ -2885,10 +2858,10 @@ static int be_vf_setup(struct be_adapter *adapter) dev_warn(dev, "Ignoring num_vfs=%d setting\n", num_vfs); adapter->num_vfs = old_vfs; } else { - if (num_vfs > adapter->dev_num_vfs) + if (num_vfs > be_max_vfs(adapter)) dev_info(dev, "Device supports %d VFs and not %d\n", - adapter->dev_num_vfs, num_vfs); - adapter->num_vfs = min_t(u16, num_vfs, adapter->dev_num_vfs); + be_max_vfs(adapter), num_vfs); + adapter->num_vfs = min_t(u16, num_vfs, be_max_vfs(adapter)); if (!adapter->num_vfs) return 0; } @@ -2967,6 +2940,51 @@ err: return status; } +/* On BE2/BE3 FW does not suggest the supported limits */ +static void BEx_get_resources(struct be_adapter *adapter, + struct be_resources *res) +{ + struct pci_dev *pdev = adapter->pdev; + bool use_sriov = false; + + if (BE3_chip(adapter) && be_physfn(adapter)) { + int max_vfs; + + max_vfs = pci_sriov_get_totalvfs(pdev); + res->max_vfs = max_vfs > 0 ? min(MAX_VFS, max_vfs) : 0; + use_sriov = res->max_vfs && num_vfs; + } + + if (be_physfn(adapter)) + res->max_uc_mac = BE_UC_PMAC_COUNT; + else + res->max_uc_mac = BE_VF_UC_PMAC_COUNT; + + if (adapter->function_mode & FLEX10_MODE) + res->max_vlans = BE_NUM_VLANS_SUPPORTED/8; + else + res->max_vlans = BE_NUM_VLANS_SUPPORTED; + res->max_mcast_mac = BE_MAX_MC; + + if (BE2_chip(adapter) || use_sriov || be_is_mc(adapter) || + !be_physfn(adapter)) + res->max_tx_qs = 1; + else + res->max_tx_qs = BE3_MAX_TX_QS; + + if ((adapter->function_caps & BE_FUNCTION_CAPS_RSS) && + !use_sriov && be_physfn(adapter)) + res->max_rss_qs = (adapter->be3_native) ? + BE3_MAX_RSS_QS : BE2_MAX_RSS_QS; + res->max_rx_qs = res->max_rss_qs + 1; + + res->max_evt_qs = be_physfn(adapter) ? 
BE3_MAX_EVT_QS : 1; + + res->if_cap_flags = BE_IF_CAP_FLAGS_WANT; + if (!(adapter->function_caps & BE_FUNCTION_CAPS_RSS)) + res->if_cap_flags &= ~BE_IF_FLAGS_RSS; +} + static void be_setup_init(struct be_adapter *adapter) { adapter->vlan_prio_bmap = 0xff; @@ -2980,76 +2998,56 @@ static void be_setup_init(struct be_adapter *adapter) adapter->cmd_privileges = MIN_PRIVILEGES; } -static void be_get_resources(struct be_adapter *adapter) +static int be_get_resources(struct be_adapter *adapter) { - u16 dev_num_vfs; - int pos, status; - bool profile_present = false; - u16 txq_count = 0; + struct device *dev = &adapter->pdev->dev; + struct be_resources res = {0}; + int status; - if (!BEx_chip(adapter)) { - status = be_cmd_get_func_config(adapter); - if (!status) - profile_present = true; - } else if (BE3_chip(adapter) && be_physfn(adapter)) { - be_cmd_get_profile_config(adapter, NULL, &txq_count, 0); + if (BEx_chip(adapter)) { + BEx_get_resources(adapter, &res); + adapter->res = res; } - if (profile_present) { - adapter->max_tx_queues = min_t(u16, adapter->max_tx_queues, - MAX_TX_QS); - adapter->max_rss_queues = min_t(u16, adapter->max_rss_queues, - BE3_MAX_RSS_QS); - adapter->max_event_queues = min_t(u16, - adapter->max_event_queues, - BE3_MAX_RSS_QS); - - if (adapter->max_rss_queues && - adapter->max_rss_queues == adapter->max_rx_queues) - adapter->max_rss_queues -= 1; - - if (adapter->max_event_queues < adapter->max_rss_queues) - adapter->max_rss_queues = adapter->max_event_queues; - - } else { - if (be_physfn(adapter)) - adapter->max_pmac_cnt = BE_UC_PMAC_COUNT; - else - adapter->max_pmac_cnt = BE_VF_UC_PMAC_COUNT; + /* For BE3 only check if FW suggests a different max-txqs value */ + if (BE3_chip(adapter)) { + status = be_cmd_get_profile_config(adapter, &res, 0); + if (!status && res.max_tx_qs) + adapter->res.max_tx_qs = + min(adapter->res.max_tx_qs, res.max_tx_qs); + } - if (adapter->function_mode & FLEX10_MODE) - adapter->max_vlans = BE_NUM_VLANS_SUPPORTED/8; - else - adapter->max_vlans = BE_NUM_VLANS_SUPPORTED; + /* For Lancer, SH etc read per-function resource limits from FW. + * GET_FUNC_CONFIG returns per function guaranteed limits. + * GET_PROFILE_CONFIG returns PCI-E related limits PF-pool limits + */ + if (!BEx_chip(adapter)) { + status = be_cmd_get_func_config(adapter, &res); + if (status) + return status; - adapter->max_mcast_mac = BE_MAX_MC; - adapter->max_tx_queues = txq_count ? txq_count : MAX_TX_QS; - adapter->max_tx_queues = min_t(u16, adapter->max_tx_queues, - MAX_TX_QS); - adapter->max_rss_queues = (adapter->be3_native) ? 
- BE3_MAX_RSS_QS : BE2_MAX_RSS_QS; - adapter->max_event_queues = BE3_MAX_RSS_QS; + /* If RoCE may be enabled stash away half the EQs for RoCE */ + if (be_roce_supported(adapter)) + res.max_evt_qs /= 2; + adapter->res = res; - adapter->if_cap_flags = BE_IF_FLAGS_UNTAGGED | - BE_IF_FLAGS_BROADCAST | - BE_IF_FLAGS_MULTICAST | - BE_IF_FLAGS_PASS_L3L4_ERRORS | - BE_IF_FLAGS_MCAST_PROMISCUOUS | - BE_IF_FLAGS_VLAN_PROMISCUOUS | - BE_IF_FLAGS_PROMISCUOUS; + if (be_physfn(adapter)) { + status = be_cmd_get_profile_config(adapter, &res, 0); + if (status) + return status; + adapter->res.max_vfs = res.max_vfs; + } - if (adapter->function_caps & BE_FUNCTION_CAPS_RSS) - adapter->if_cap_flags |= BE_IF_FLAGS_RSS; + dev_info(dev, "Max: txqs %d, rxqs %d, rss %d, eqs %d, vfs %d\n", + be_max_txqs(adapter), be_max_rxqs(adapter), + be_max_rss(adapter), be_max_eqs(adapter), + be_max_vfs(adapter)); + dev_info(dev, "Max: uc-macs %d, mc-macs %d, vlans %d\n", + be_max_uc(adapter), be_max_mc(adapter), + be_max_vlans(adapter)); } - pos = pci_find_ext_capability(adapter->pdev, PCI_EXT_CAP_ID_SRIOV); - if (pos) { - pci_read_config_word(adapter->pdev, pos + PCI_SRIOV_TOTAL_VF, - &dev_num_vfs); - if (BE3_chip(adapter)) - dev_num_vfs = min_t(u16, dev_num_vfs, MAX_VFS); - adapter->dev_num_vfs = dev_num_vfs; - } + return 0; } /* Routine to query per function resource limits */ @@ -3062,20 +3060,22 @@ static int be_get_config(struct be_adapter *adapter) &adapter->function_caps, &adapter->asic_rev); if (status) - goto err; + return status; - be_get_resources(adapter); + status = be_get_resources(adapter); + if (status) + return status; /* primary mac needs 1 pmac entry */ - adapter->pmac_id = kcalloc(adapter->max_pmac_cnt + 1, - sizeof(u32), GFP_KERNEL); - if (!adapter->pmac_id) { - status = -ENOMEM; - goto err; - } + adapter->pmac_id = kcalloc(be_max_uc(adapter) + 1, sizeof(u32), + GFP_KERNEL); + if (!adapter->pmac_id) + return -ENOMEM; -err: - return status; + /* Sanitize cfg_num_qs based on HW and platform limits */ + adapter->cfg_num_qs = min(adapter->cfg_num_qs, be_max_qs(adapter)); + + return 0; } static int be_mac_setup(struct be_adapter *adapter) @@ -3104,64 +3104,127 @@ static int be_mac_setup(struct be_adapter *adapter) return 0; } -static int be_setup(struct be_adapter *adapter) +static void be_schedule_worker(struct be_adapter *adapter) { - struct device *dev = &adapter->pdev->dev; - u32 en_flags; - u32 tx_fc, rx_fc; - int status; - - be_setup_init(adapter); + schedule_delayed_work(&adapter->work, msecs_to_jiffies(1000)); + adapter->flags |= BE_FLAGS_WORKER_SCHEDULED; +} - if (!lancer_chip(adapter)) - be_cmd_req_native_mode(adapter); +static int be_setup_queues(struct be_adapter *adapter) +{ + struct net_device *netdev = adapter->netdev; + int status; - status = be_get_config(adapter); + status = be_evt_queues_create(adapter); if (status) goto err; - status = be_msix_enable(adapter); + status = be_tx_qs_create(adapter); if (status) goto err; - status = be_evt_queues_create(adapter); + status = be_rx_cqs_create(adapter); if (status) goto err; - status = be_tx_cqs_create(adapter); + status = be_mcc_queues_create(adapter); if (status) goto err; - status = be_rx_cqs_create(adapter); + status = netif_set_real_num_rx_queues(netdev, adapter->num_rx_qs); if (status) goto err; - status = be_mcc_queues_create(adapter); + status = netif_set_real_num_tx_queues(netdev, adapter->num_tx_qs); if (status) goto err; - be_cmd_get_fn_privileges(adapter, &adapter->cmd_privileges, 0); - /* In UMC mode FW does not return right privileges. 
- * Override with correct privilege equivalent to PF. + return 0; +err: + dev_err(&adapter->pdev->dev, "queue_setup failed\n"); + return status; +} + +int be_update_queues(struct be_adapter *adapter) +{ + struct net_device *netdev = adapter->netdev; + int status; + + if (netif_running(netdev)) + be_close(netdev); + + be_cancel_worker(adapter); + + /* If any vectors have been shared with RoCE we cannot re-program + * the MSIx table. */ - if (be_is_mc(adapter)) - adapter->cmd_privileges = MAX_PRIVILEGES; + if (!adapter->num_msix_roce_vec) + be_msix_disable(adapter); + + be_clear_queues(adapter); + + if (!msix_enabled(adapter)) { + status = be_msix_enable(adapter); + if (status) + return status; + } + + status = be_setup_queues(adapter); + if (status) + return status; + + be_schedule_worker(adapter); + + if (netif_running(netdev)) + status = be_open(netdev); + + return status; +} + +static int be_setup(struct be_adapter *adapter) +{ + struct device *dev = &adapter->pdev->dev; + u32 tx_fc, rx_fc, en_flags; + int status; + + be_setup_init(adapter); + + if (!lancer_chip(adapter)) + be_cmd_req_native_mode(adapter); + + status = be_get_config(adapter); + if (status) + goto err; + + status = be_msix_enable(adapter); + if (status) + goto err; en_flags = BE_IF_FLAGS_UNTAGGED | BE_IF_FLAGS_BROADCAST | - BE_IF_FLAGS_MULTICAST | BE_IF_FLAGS_PASS_L3L4_ERRORS; + BE_IF_FLAGS_MULTICAST | BE_IF_FLAGS_PASS_L3L4_ERRORS; if (adapter->function_caps & BE_FUNCTION_CAPS_RSS) en_flags |= BE_IF_FLAGS_RSS; - en_flags = en_flags & adapter->if_cap_flags; - status = be_cmd_if_create(adapter, adapter->if_cap_flags, en_flags, + en_flags = en_flags & be_if_cap_flags(adapter); + status = be_cmd_if_create(adapter, be_if_cap_flags(adapter), en_flags, &adapter->if_handle, 0); - if (status != 0) + if (status) goto err; - status = be_mac_setup(adapter); + /* Updating real_num_tx/rx_queues() requires rtnl_lock() */ + rtnl_lock(); + status = be_setup_queues(adapter); + rtnl_unlock(); if (status) goto err; - status = be_tx_qs_create(adapter); + be_cmd_get_fn_privileges(adapter, &adapter->cmd_privileges, 0); + /* In UMC mode FW does not return right privileges. + * Override with correct privilege equivalent to PF. 
+ */ + if (be_is_mc(adapter)) + adapter->cmd_privileges = MAX_PRIVILEGES; + + status = be_mac_setup(adapter); if (status) goto err; @@ -3178,8 +3241,8 @@ static int be_setup(struct be_adapter *adapter) be_cmd_set_flow_control(adapter, adapter->tx_fc, adapter->rx_fc); - if (be_physfn(adapter)) { - if (adapter->dev_num_vfs) + if (be_physfn(adapter) && num_vfs) { + if (be_max_vfs(adapter)) be_vf_setup(adapter); else dev_warn(dev, "device doesn't support SRIOV\n"); @@ -3189,8 +3252,7 @@ static int be_setup(struct be_adapter *adapter) if (!status && be_pause_supported(adapter)) adapter->phy.fc_autoneg = 1; - schedule_delayed_work(&adapter->work, msecs_to_jiffies(1000)); - adapter->flags |= BE_FLAGS_WORKER_SCHEDULED; + be_schedule_worker(adapter); return 0; err: be_clear(adapter); @@ -3756,8 +3818,6 @@ static const struct net_device_ops be_netdev_ops = { static void be_netdev_init(struct net_device *netdev) { struct be_adapter *adapter = netdev_priv(netdev); - struct be_eq_obj *eqo; - int i; netdev->hw_features |= NETIF_F_SG | NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | NETIF_F_RXCSUM | @@ -3780,9 +3840,6 @@ static void be_netdev_init(struct net_device *netdev) netdev->netdev_ops = &be_netdev_ops; SET_ETHTOOL_OPS(netdev, &be_ethtool_ops); - - for_all_evt_queues(adapter, eqo, i) - netif_napi_add(netdev, &eqo->napi, be_poll, BE_NAPI_WEIGHT); } static void be_unmap_pci_bars(struct be_adapter *adapter) @@ -4045,6 +4102,7 @@ static int be_get_initial_config(struct be_adapter *adapter) level = be_get_fw_log_level(adapter); adapter->msg_enable = level <= FW_LOG_LEVEL_DEFAULT ? NETIF_MSG_HW : 0; + adapter->cfg_num_qs = netif_get_num_default_rss_queues(); return 0; } diff --git a/drivers/net/ethernet/emulex/benet/be_roce.c b/drivers/net/ethernet/emulex/benet/be_roce.c index 645e846..9cd5415 100644 --- a/drivers/net/ethernet/emulex/benet/be_roce.c +++ b/drivers/net/ethernet/emulex/benet/be_roce.c @@ -60,7 +60,7 @@ static void _be_roce_dev_add(struct be_adapter *adapter) */ num_vec = adapter->num_msix_vec + adapter->num_msix_roce_vec; dev_info.intr_mode = BE_INTERRUPT_MODE_MSIX; - dev_info.msix.num_vectors = min(num_vec, MAX_ROCE_MSIX_VECTORS); + dev_info.msix.num_vectors = min(num_vec, MAX_MSIX_VECTORS); /* provide start index of the vector, * so in case of linear usage, * it can use the base as starting point. 
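With MAX_ROCE_MSIX_VECTORS replaced by MAX_MSIX_VECTORS below, the NIC/RoCE vector split now happens entirely in be_msix_enable() above: when RoCE is supported and more than MIN_MSIX_VECTORS were granted, RoCE takes num_vec / 2 and the NIC keeps the remainder. A stand-alone sketch of that split (plain C, hypothetical vector counts; illustration only, not part of the patch):

#include <stdio.h>

#define MIN_MSIX_VECTORS 1

/* Mirrors the tail of be_msix_enable(): half the granted pool goes to RoCE,
 * the NIC keeps the rest (and the extra vector when the grant is odd). */
static void split_vectors(int granted, int roce_supported,
			  int *nic_vec, int *roce_vec)
{
	*roce_vec = (roce_supported && granted > MIN_MSIX_VECTORS) ?
		    granted / 2 : 0;
	*nic_vec = granted - *roce_vec;
}

int main(void)
{
	int nic, roce;

	split_vectors(9, 1, &nic, &roce);
	printf("NIC %d, RoCE %d\n", nic, roce);	/* prints: NIC 5, RoCE 4 */
	return 0;
}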
diff --git a/drivers/net/ethernet/emulex/benet/be_roce.h b/drivers/net/ethernet/emulex/benet/be_roce.h index 2765729..2cd1129 100644 --- a/drivers/net/ethernet/emulex/benet/be_roce.h +++ b/drivers/net/ethernet/emulex/benet/be_roce.h @@ -29,7 +29,7 @@ enum be_interrupt_mode { BE_INTERRUPT_MODE_MSI = 2, }; -#define MAX_ROCE_MSIX_VECTORS 16 +#define MAX_MSIX_VECTORS 32 struct be_dev_info { u8 __iomem *db; u64 unmapped_db; @@ -45,7 +45,7 @@ struct be_dev_info { struct { int num_vectors; int start_vector; - u32 vector_list[MAX_ROCE_MSIX_VECTORS]; + u32 vector_list[MAX_MSIX_VECTORS]; } msix; }; diff --git a/drivers/net/ethernet/freescale/fec_mpc52xx_phy.c b/drivers/net/ethernet/freescale/fec_mpc52xx_phy.c index 360a578..e052890 100644 --- a/drivers/net/ethernet/freescale/fec_mpc52xx_phy.c +++ b/drivers/net/ethernet/freescale/fec_mpc52xx_phy.c @@ -123,12 +123,10 @@ static int mpc52xx_fec_mdio_probe(struct platform_device *of) static int mpc52xx_fec_mdio_remove(struct platform_device *of) { - struct device *dev = &of->dev; - struct mii_bus *bus = dev_get_drvdata(dev); + struct mii_bus *bus = platform_get_drvdata(of); struct mpc52xx_fec_mdio_priv *priv = bus->priv; mdiobus_unregister(bus); - dev_set_drvdata(dev, NULL); iounmap(priv->regs); kfree(priv); mdiobus_free(bus); diff --git a/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c b/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c index 8de53a1..6b60582 100644 --- a/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c +++ b/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c @@ -583,7 +583,6 @@ static struct sk_buff *tx_skb_align_workaround(struct net_device *dev, struct sk_buff *skb) { struct sk_buff *new_skb; - struct fs_enet_private *fep = netdev_priv(dev); /* Alloc new skb */ new_skb = netdev_alloc_skb(dev, skb->len + 4); @@ -1000,6 +999,8 @@ static int fs_enet_probe(struct platform_device *ofdev) struct fs_enet_private *fep; struct fs_platform_info *fpi; const u32 *data; + struct clk *clk; + int err; const u8 *mac_addr; const char *phy_connection_type; int privsize, len, ret = -ENODEV; @@ -1037,6 +1038,20 @@ static int fs_enet_probe(struct platform_device *ofdev) fpi->use_rmii = 1; } + /* make clock lookup non-fatal (the driver is shared among platforms), + * but require enable to succeed when a clock was specified/found, + * keep a reference to the clock upon successful acquisition + */ + clk = devm_clk_get(&ofdev->dev, "per"); + if (!IS_ERR(clk)) { + err = clk_prepare_enable(clk); + if (err) { + ret = err; + goto out_free_fpi; + } + fpi->clk_per = clk; + } + privsize = sizeof(*fep) + sizeof(struct sk_buff **) * (fpi->rx_ring + fpi->tx_ring); @@ -1108,6 +1123,8 @@ out_free_dev: free_netdev(ndev); out_put: of_node_put(fpi->phy_node); + if (fpi->clk_per) + clk_disable_unprepare(fpi->clk_per); out_free_fpi: kfree(fpi); return ret; @@ -1124,6 +1141,8 @@ static int fs_enet_remove(struct platform_device *ofdev) fep->ops->cleanup_data(ndev); dev_set_drvdata(fep->dev, NULL); of_node_put(fep->fpi->phy_node); + if (fep->fpi->clk_per) + clk_disable_unprepare(fep->fpi->clk_per); free_netdev(ndev); return 0; } diff --git a/drivers/net/ethernet/freescale/fsl_pq_mdio.c b/drivers/net/ethernet/freescale/fsl_pq_mdio.c index c93a056..c4f6506 100644 --- a/drivers/net/ethernet/freescale/fsl_pq_mdio.c +++ b/drivers/net/ethernet/freescale/fsl_pq_mdio.c @@ -409,7 +409,7 @@ static int fsl_pq_mdio_probe(struct platform_device *pdev) priv->regs = priv->map + data->mii_offset; new_bus->parent = &pdev->dev; - dev_set_drvdata(&pdev->dev, new_bus); + 
platform_set_drvdata(pdev, new_bus); if (data->get_tbipa) { for_each_child_of_node(np, tbi) { @@ -468,8 +468,6 @@ static int fsl_pq_mdio_remove(struct platform_device *pdev) mdiobus_unregister(bus); - dev_set_drvdata(device, NULL); - iounmap(priv->map); mdiobus_free(bus); diff --git a/drivers/net/ethernet/freescale/ucc_geth.c b/drivers/net/ethernet/freescale/ucc_geth.c index 3c43dac..5930c39 100644 --- a/drivers/net/ethernet/freescale/ucc_geth.c +++ b/drivers/net/ethernet/freescale/ucc_geth.c @@ -3911,14 +3911,12 @@ static int ucc_geth_probe(struct platform_device* ofdev) static int ucc_geth_remove(struct platform_device* ofdev) { - struct device *device = &ofdev->dev; - struct net_device *dev = dev_get_drvdata(device); + struct net_device *dev = platform_get_drvdata(ofdev); struct ucc_geth_private *ugeth = netdev_priv(dev); unregister_netdev(dev); free_netdev(dev); ucc_geth_memclean(ugeth); - dev_set_drvdata(device, NULL); return 0; } diff --git a/drivers/net/ethernet/ibm/emac/core.c b/drivers/net/ethernet/ibm/emac/core.c index d300a0c..2d3b064 100644 --- a/drivers/net/ethernet/ibm/emac/core.c +++ b/drivers/net/ethernet/ibm/emac/core.c @@ -2955,8 +2955,6 @@ static int emac_remove(struct platform_device *ofdev) DBG(dev, "remove" NL); - dev_set_drvdata(&ofdev->dev, NULL); - unregister_netdev(dev->ndev); cancel_work_sync(&dev->reset_work); diff --git a/drivers/net/ethernet/intel/e1000e/82571.c b/drivers/net/ethernet/intel/e1000e/82571.c index 104fcec..8fed74e 100644 --- a/drivers/net/ethernet/intel/e1000e/82571.c +++ b/drivers/net/ethernet/intel/e1000e/82571.c @@ -1011,6 +1011,11 @@ static s32 e1000_reset_hw_82571(struct e1000_hw *hw) /* Must release MDIO ownership and mutex after MAC reset. */ switch (hw->mac.type) { + case e1000_82573: + /* Release mutex only if the hw semaphore is acquired */ + if (!ret_val) + e1000_put_hw_semaphore_82573(hw); + break; case e1000_82574: case e1000_82583: /* Release mutex only if the hw semaphore is acquired */ diff --git a/drivers/net/ethernet/intel/e1000e/ethtool.c b/drivers/net/ethernet/intel/e1000e/ethtool.c index e4ebd7d..a8633b8 100644 --- a/drivers/net/ethernet/intel/e1000e/ethtool.c +++ b/drivers/net/ethernet/intel/e1000e/ethtool.c @@ -1665,7 +1665,7 @@ static int e1000_run_loopback_test(struct e1000_adapter *adapter) ret_val = 13; /* ret_val is the same as mis-compare */ break; } - if (jiffies >= (time + 20)) { + if (time_after(jiffies, time + 20)) { ret_val = 14; /* error code for time out error */ break; } diff --git a/drivers/net/ethernet/intel/e1000e/hw.h b/drivers/net/ethernet/intel/e1000e/hw.h index b799fd9..b7f3843 100644 --- a/drivers/net/ethernet/intel/e1000e/hw.h +++ b/drivers/net/ethernet/intel/e1000e/hw.h @@ -233,7 +233,8 @@ union e1000_rx_desc_extended { #define MAX_PS_BUFFERS 4 /* Number of packet split data buffers (not including the header buffer) */ -#define PS_PAGE_BUFFERS (MAX_PS_BUFFERS - 1) +#define PS_PAGE_BUFFERS (MAX_PS_BUFFERS - 1) + /* Receive Descriptor - Packet Split */ union e1000_rx_desc_packet_split { struct { diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index e6d2c0f..e87e9b0 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -64,8 +64,6 @@ static int debug = -1; module_param(debug, int, 0); MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all)"); -static void e1000e_disable_aspm(struct pci_dev *pdev, u16 state); - static const struct e1000_info *e1000_info_tbl[] = { [board_82571] = &e1000_82571_info, 
[board_82572] = &e1000_82572_info, @@ -6001,11 +5999,18 @@ static int __e1000_shutdown(struct pci_dev *pdev, bool runtime) * correctable error when the MAC transitions from D0 to D3. To * prevent this we need to mask off the correctable errors on the * downstream port of the pci-e switch. + * + * We don't have the associated upstream bridge while assigning + * the PCI device into guest. For example, the KVM on power is + * one of the cases. */ if (adapter->flags & FLAG_IS_QUAD_PORT) { struct pci_dev *us_dev = pdev->bus->self; u16 devctl; + if (!us_dev) + return 0; + pcie_capability_read_word(us_dev, PCI_EXP_DEVCTL, &devctl); pcie_capability_write_word(us_dev, PCI_EXP_DEVCTL, (devctl & ~PCI_EXP_DEVCTL_CERE)); @@ -6019,38 +6024,73 @@ static int __e1000_shutdown(struct pci_dev *pdev, bool runtime) return 0; } -#ifdef CONFIG_PCIEASPM -static void __e1000e_disable_aspm(struct pci_dev *pdev, u16 state) +/** + * e1000e_disable_aspm - Disable ASPM states + * @pdev: pointer to PCI device struct + * @state: bit-mask of ASPM states to disable + * + * Some devices *must* have certain ASPM states disabled per hardware errata. + **/ +static void e1000e_disable_aspm(struct pci_dev *pdev, u16 state) { + struct pci_dev *parent = pdev->bus->self; + u16 aspm_dis_mask = 0; + u16 pdev_aspmc, parent_aspmc; + + switch (state) { + case PCIE_LINK_STATE_L0S: + case PCIE_LINK_STATE_L0S | PCIE_LINK_STATE_L1: + aspm_dis_mask |= PCI_EXP_LNKCTL_ASPM_L0S; + /* fall-through - can't have L1 without L0s */ + case PCIE_LINK_STATE_L1: + aspm_dis_mask |= PCI_EXP_LNKCTL_ASPM_L1; + break; + default: + return; + } + + pcie_capability_read_word(pdev, PCI_EXP_LNKCTL, &pdev_aspmc); + pdev_aspmc &= PCI_EXP_LNKCTL_ASPMC; + + if (parent) { + pcie_capability_read_word(parent, PCI_EXP_LNKCTL, + &parent_aspmc); + parent_aspmc &= PCI_EXP_LNKCTL_ASPMC; + } + + /* Nothing to do if the ASPM states to be disabled already are */ + if (!(pdev_aspmc & aspm_dis_mask) && + (!parent || !(parent_aspmc & aspm_dis_mask))) + return; + + dev_info(&pdev->dev, "Disabling ASPM %s %s\n", + (aspm_dis_mask & pdev_aspmc & PCI_EXP_LNKCTL_ASPM_L0S) ? + "L0s" : "", + (aspm_dis_mask & pdev_aspmc & PCI_EXP_LNKCTL_ASPM_L1) ? + "L1" : ""); + +#ifdef CONFIG_PCIEASPM pci_disable_link_state_locked(pdev, state); -} -#else -static void __e1000e_disable_aspm(struct pci_dev *pdev, u16 state) -{ - u16 aspm_ctl = 0; - if (state & PCIE_LINK_STATE_L0S) - aspm_ctl |= PCI_EXP_LNKCTL_ASPM_L0S; - if (state & PCIE_LINK_STATE_L1) - aspm_ctl |= PCI_EXP_LNKCTL_ASPM_L1; + /* Double-check ASPM control. If not disabled by the above, the + * BIOS is preventing that from happening (or CONFIG_PCIEASPM is + * not enabled); override by writing PCI config space directly. + */ + pcie_capability_read_word(pdev, PCI_EXP_LNKCTL, &pdev_aspmc); + pdev_aspmc &= PCI_EXP_LNKCTL_ASPMC; + + if (!(aspm_dis_mask & pdev_aspmc)) + return; +#endif /* Both device and parent should have the same ASPM setting. * Disable ASPM in downstream component first and then upstream. */ - pcie_capability_clear_word(pdev, PCI_EXP_LNKCTL, aspm_ctl); - - if (pdev->bus->self) - pcie_capability_clear_word(pdev->bus->self, PCI_EXP_LNKCTL, - aspm_ctl); -} -#endif -static void e1000e_disable_aspm(struct pci_dev *pdev, u16 state) -{ - dev_info(&pdev->dev, "Disabling ASPM %s %s\n", - (state & PCIE_LINK_STATE_L0S) ? "L0s" : "", - (state & PCIE_LINK_STATE_L1) ? 
"L1" : ""); + pcie_capability_clear_word(pdev, PCI_EXP_LNKCTL, aspm_dis_mask); - __e1000e_disable_aspm(pdev, state); + if (parent) + pcie_capability_clear_word(parent, PCI_EXP_LNKCTL, + aspm_dis_mask); } #ifdef CONFIG_PM diff --git a/drivers/net/ethernet/intel/igb/e1000_82575.c b/drivers/net/ethernet/intel/igb/e1000_82575.c index f21a91a..d398fad 100644 --- a/drivers/net/ethernet/intel/igb/e1000_82575.c +++ b/drivers/net/ethernet/intel/igb/e1000_82575.c @@ -238,6 +238,7 @@ static s32 igb_init_nvm_params_82575(struct e1000_hw *hw) size = (u16)((eecd & E1000_EECD_SIZE_EX_MASK) >> E1000_EECD_SIZE_EX_SHIFT); + /* Added to a constant, "size" becomes the left-shift value * for setting word_size. */ @@ -250,86 +251,52 @@ static s32 igb_init_nvm_params_82575(struct e1000_hw *hw) size = 15; nvm->word_size = 1 << size; - if (hw->mac.type < e1000_i210) { - nvm->opcode_bits = 8; - nvm->delay_usec = 1; - - switch (nvm->override) { - case e1000_nvm_override_spi_large: - nvm->page_size = 32; - nvm->address_bits = 16; - break; - case e1000_nvm_override_spi_small: - nvm->page_size = 8; - nvm->address_bits = 8; - break; - default: - nvm->page_size = eecd & E1000_EECD_ADDR_BITS ? 32 : 8; - nvm->address_bits = eecd & E1000_EECD_ADDR_BITS ? - 16 : 8; - break; - } - if (nvm->word_size == (1 << 15)) - nvm->page_size = 128; + nvm->opcode_bits = 8; + nvm->delay_usec = 1; - nvm->type = e1000_nvm_eeprom_spi; - } else { - nvm->type = e1000_nvm_flash_hw; + switch (nvm->override) { + case e1000_nvm_override_spi_large: + nvm->page_size = 32; + nvm->address_bits = 16; + break; + case e1000_nvm_override_spi_small: + nvm->page_size = 8; + nvm->address_bits = 8; + break; + default: + nvm->page_size = eecd & E1000_EECD_ADDR_BITS ? 32 : 8; + nvm->address_bits = eecd & E1000_EECD_ADDR_BITS ? 
+ 16 : 8; + break; } + if (nvm->word_size == (1 << 15)) + nvm->page_size = 128; + + nvm->type = e1000_nvm_eeprom_spi; /* NVM Function Pointers */ + nvm->ops.acquire = igb_acquire_nvm_82575; + nvm->ops.release = igb_release_nvm_82575; + nvm->ops.write = igb_write_nvm_spi; + nvm->ops.validate = igb_validate_nvm_checksum; + nvm->ops.update = igb_update_nvm_checksum; + if (nvm->word_size < (1 << 15)) + nvm->ops.read = igb_read_nvm_eerd; + else + nvm->ops.read = igb_read_nvm_spi; + + /* override generic family function pointers for specific descendants */ switch (hw->mac.type) { case e1000_82580: nvm->ops.validate = igb_validate_nvm_checksum_82580; nvm->ops.update = igb_update_nvm_checksum_82580; - nvm->ops.acquire = igb_acquire_nvm_82575; - nvm->ops.release = igb_release_nvm_82575; - if (nvm->word_size < (1 << 15)) - nvm->ops.read = igb_read_nvm_eerd; - else - nvm->ops.read = igb_read_nvm_spi; - nvm->ops.write = igb_write_nvm_spi; break; case e1000_i354: case e1000_i350: nvm->ops.validate = igb_validate_nvm_checksum_i350; nvm->ops.update = igb_update_nvm_checksum_i350; - nvm->ops.acquire = igb_acquire_nvm_82575; - nvm->ops.release = igb_release_nvm_82575; - if (nvm->word_size < (1 << 15)) - nvm->ops.read = igb_read_nvm_eerd; - else - nvm->ops.read = igb_read_nvm_spi; - nvm->ops.write = igb_write_nvm_spi; - break; - case e1000_i210: - nvm->ops.validate = igb_validate_nvm_checksum_i210; - nvm->ops.update = igb_update_nvm_checksum_i210; - nvm->ops.acquire = igb_acquire_nvm_i210; - nvm->ops.release = igb_release_nvm_i210; - nvm->ops.read = igb_read_nvm_srrd_i210; - nvm->ops.write = igb_write_nvm_srwr_i210; - nvm->ops.valid_led_default = igb_valid_led_default_i210; - break; - case e1000_i211: - nvm->ops.acquire = igb_acquire_nvm_i210; - nvm->ops.release = igb_release_nvm_i210; - nvm->ops.read = igb_read_nvm_i211; - nvm->ops.valid_led_default = igb_valid_led_default_i210; - nvm->ops.validate = NULL; - nvm->ops.update = NULL; - nvm->ops.write = NULL; break; default: - nvm->ops.validate = igb_validate_nvm_checksum; - nvm->ops.update = igb_update_nvm_checksum; - nvm->ops.acquire = igb_acquire_nvm_82575; - nvm->ops.release = igb_release_nvm_82575; - if (nvm->word_size < (1 << 15)) - nvm->ops.read = igb_read_nvm_eerd; - else - nvm->ops.read = igb_read_nvm_spi; - nvm->ops.write = igb_write_nvm_spi; break; } @@ -516,6 +483,8 @@ static s32 igb_get_invariants_82575(struct e1000_hw *hw) case E1000_DEV_ID_I210_FIBER: case E1000_DEV_ID_I210_SERDES: case E1000_DEV_ID_I210_SGMII: + case E1000_DEV_ID_I210_COPPER_FLASHLESS: + case E1000_DEV_ID_I210_SERDES_FLASHLESS: mac->type = e1000_i210; break; case E1000_DEV_ID_I211_COPPER: @@ -601,6 +570,15 @@ static s32 igb_get_invariants_82575(struct e1000_hw *hw) /* NVM initialization */ ret_val = igb_init_nvm_params_82575(hw); + switch (hw->mac.type) { + case e1000_i210: + case e1000_i211: + ret_val = igb_init_nvm_params_i210(hw); + break; + default: + break; + } + if (ret_val) goto out; @@ -1320,7 +1298,7 @@ void igb_shutdown_serdes_link_82575(struct e1000_hw *hw) **/ static s32 igb_reset_hw_82575(struct e1000_hw *hw) { - u32 ctrl, icr; + u32 ctrl; s32 ret_val; /* Prevent the PCI-E bus from sticking if there is no TLP connection @@ -1365,7 +1343,7 @@ static s32 igb_reset_hw_82575(struct e1000_hw *hw) /* Clear any pending interrupt events. 
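* Masking all causes via IMC first, then reading ICR, acknowledges anything already latched; ICR is read-to-clear and the value itself is unused, which is why the icr local is dropped.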
*/ wr32(E1000_IMC, 0xffffffff); - icr = rd32(E1000_ICR); + rd32(E1000_ICR); /* Install any alternate MAC address into RAR0 */ ret_val = igb_check_alt_mac_addr(hw); @@ -2103,10 +2081,9 @@ static s32 igb_reset_hw_82580(struct e1000_hw *hw) s32 ret_val = 0; /* BH SW mailbox bit in SW_FW_SYNC */ u16 swmbsw_mask = E1000_SW_SYNCH_MB; - u32 ctrl, icr; + u32 ctrl; bool global_device_reset = hw->dev_spec._82575.global_device_reset; - hw->dev_spec._82575.global_device_reset = false; /* due to hw errata, global device reset doesn't always @@ -2165,7 +2142,7 @@ static s32 igb_reset_hw_82580(struct e1000_hw *hw) /* Clear any pending interrupt events. */ wr32(E1000_IMC, 0xffffffff); - icr = rd32(E1000_ICR); + rd32(E1000_ICR); ret_val = igb_reset_mdicnfg_82580(hw); if (ret_val) diff --git a/drivers/net/ethernet/intel/igb/e1000_defines.h b/drivers/net/ethernet/intel/igb/e1000_defines.h index aa201ab..60559af 100644 --- a/drivers/net/ethernet/intel/igb/e1000_defines.h +++ b/drivers/net/ethernet/intel/igb/e1000_defines.h @@ -620,6 +620,7 @@ #define E1000_EECD_SIZE_EX_SHIFT 11 #define E1000_EECD_FLUPD_I210 0x00800000 /* Update FLASH */ #define E1000_EECD_FLUDONE_I210 0x04000000 /* Update FLASH done*/ +#define E1000_EECD_FLASH_DETECTED_I210 0x00080000 /* FLASH detected */ #define E1000_FLUDONE_ATTEMPTS 20000 #define E1000_EERD_EEWR_MAX_COUNT 512 /* buffered EEPROM words rw */ #define E1000_I210_FIFO_SEL_RX 0x00 @@ -627,6 +628,11 @@ #define E1000_I210_FIFO_SEL_TX_LEGACY E1000_I210_FIFO_SEL_TX_QAV(0) #define E1000_I210_FIFO_SEL_BMC2OS_TX 0x06 #define E1000_I210_FIFO_SEL_BMC2OS_RX 0x01 +#define E1000_I210_FLASH_SECTOR_SIZE 0x1000 /* 4KB FLASH sector unit size */ +/* Secure FLASH mode requires removing MSb */ +#define E1000_I210_FW_PTR_MASK 0x7FFF +/* Firmware code revision field word offset*/ +#define E1000_I210_FW_VER_OFFSET 328 #define E1000_EECD_FLUPD_I210 0x00800000 /* Update FLASH */ #define E1000_EECD_FLUDONE_I210 0x04000000 /* Update FLASH done*/ #define E1000_FLUDONE_ATTEMPTS 20000 @@ -665,20 +671,26 @@ #define NVM_INIT_CTRL_4 0x0013 #define NVM_LED_1_CFG 0x001C #define NVM_LED_0_2_CFG 0x001F - -/* NVM version defines */ #define NVM_ETRACK_WORD 0x0042 +#define NVM_ETRACK_HIWORD 0x0043 #define NVM_COMB_VER_OFF 0x0083 #define NVM_COMB_VER_PTR 0x003d -#define NVM_MAJOR_MASK 0xF000 -#define NVM_MINOR_MASK 0x0FF0 -#define NVM_BUILD_MASK 0x000F -#define NVM_COMB_VER_MASK 0x00FF -#define NVM_MAJOR_SHIFT 12 -#define NVM_MINOR_SHIFT 4 -#define NVM_COMB_VER_SHFT 8 -#define NVM_VER_INVALID 0xFFFF -#define NVM_ETRACK_SHIFT 16 + +/* NVM version defines */ +#define NVM_MAJOR_MASK 0xF000 +#define NVM_MINOR_MASK 0x0FF0 +#define NVM_IMAGE_ID_MASK 0x000F +#define NVM_COMB_VER_MASK 0x00FF +#define NVM_MAJOR_SHIFT 12 +#define NVM_MINOR_SHIFT 4 +#define NVM_COMB_VER_SHFT 8 +#define NVM_VER_INVALID 0xFFFF +#define NVM_ETRACK_SHIFT 16 +#define NVM_ETRACK_VALID 0x8000 +#define NVM_NEW_DEC_MASK 0x0F00 +#define NVM_HEX_CONV 16 +#define NVM_HEX_TENS 10 + #define NVM_ETS_CFG 0x003E #define NVM_ETS_LTHRES_DELTA_MASK 0x07C0 #define NVM_ETS_LTHRES_DELTA_SHIFT 6 diff --git a/drivers/net/ethernet/intel/igb/e1000_hw.h b/drivers/net/ethernet/intel/igb/e1000_hw.h index 94d7866..37a9c06 100644 --- a/drivers/net/ethernet/intel/igb/e1000_hw.h +++ b/drivers/net/ethernet/intel/igb/e1000_hw.h @@ -67,6 +67,8 @@ struct e1000_hw; #define E1000_DEV_ID_I210_FIBER 0x1536 #define E1000_DEV_ID_I210_SERDES 0x1537 #define E1000_DEV_ID_I210_SGMII 0x1538 +#define E1000_DEV_ID_I210_COPPER_FLASHLESS 0x157B +#define E1000_DEV_ID_I210_SERDES_FLASHLESS 0x157C 
#define E1000_DEV_ID_I211_COPPER 0x1539 #define E1000_DEV_ID_I354_BACKPLANE_1GBPS 0x1F40 #define E1000_DEV_ID_I354_SGMII 0x1F41 @@ -110,6 +112,7 @@ enum e1000_nvm_type { e1000_nvm_none, e1000_nvm_eeprom_spi, e1000_nvm_flash_hw, + e1000_nvm_invm, e1000_nvm_flash_sw }; diff --git a/drivers/net/ethernet/intel/igb/e1000_i210.c b/drivers/net/ethernet/intel/igb/e1000_i210.c index ddb3cf5..0c03933 100644 --- a/drivers/net/ethernet/intel/igb/e1000_i210.c +++ b/drivers/net/ethernet/intel/igb/e1000_i210.c @@ -335,57 +335,101 @@ s32 igb_write_nvm_srwr_i210(struct e1000_hw *hw, u16 offset, u16 words, } /** - * igb_read_nvm_i211 - Read NVM wrapper function for I211 + * igb_read_invm_word_i210 - Reads OTP + * @hw: pointer to the HW structure + * @address: the word address (aka eeprom offset) to read + * @data: pointer to the data read + * + * Reads 16-bit words from the OTP. Return error when the word is not + * stored in OTP. + **/ +static s32 igb_read_invm_word_i210(struct e1000_hw *hw, u8 address, u16 *data) +{ + s32 status = -E1000_ERR_INVM_VALUE_NOT_FOUND; + u32 invm_dword; + u16 i; + u8 record_type, word_address; + + for (i = 0; i < E1000_INVM_SIZE; i++) { + invm_dword = rd32(E1000_INVM_DATA_REG(i)); + /* Get record type */ + record_type = INVM_DWORD_TO_RECORD_TYPE(invm_dword); + if (record_type == E1000_INVM_UNINITIALIZED_STRUCTURE) + break; + if (record_type == E1000_INVM_CSR_AUTOLOAD_STRUCTURE) + i += E1000_INVM_CSR_AUTOLOAD_DATA_SIZE_IN_DWORDS; + if (record_type == E1000_INVM_RSA_KEY_SHA256_STRUCTURE) + i += E1000_INVM_RSA_KEY_SHA256_DATA_SIZE_IN_DWORDS; + if (record_type == E1000_INVM_WORD_AUTOLOAD_STRUCTURE) { + word_address = INVM_DWORD_TO_WORD_ADDRESS(invm_dword); + if (word_address == address) { + *data = INVM_DWORD_TO_WORD_DATA(invm_dword); + hw_dbg("Read INVM Word 0x%02x = %x", + address, *data); + status = E1000_SUCCESS; + break; + } + } + } + if (status != E1000_SUCCESS) + hw_dbg("Requested word 0x%02x not found in OTP\n", address); + return status; +} + +/** + * igb_read_invm_i210 - Read invm wrapper function for I210/I211 * @hw: pointer to the HW structure * @words: number of words to read * @data: pointer to the data read * * Wrapper function to return data formerly found in the NVM. 
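* Only the MAC address is required to actually be present; for the other known offsets the wrapper substitutes a part default when the word is missing, so callers still succeed.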
**/ -s32 igb_read_nvm_i211(struct e1000_hw *hw, u16 offset, u16 words, - u16 *data) +static s32 igb_read_invm_i210(struct e1000_hw *hw, u16 offset, + u16 words __always_unused, u16 *data) { s32 ret_val = E1000_SUCCESS; /* Only the MAC addr is required to be present in the iNVM */ switch (offset) { case NVM_MAC_ADDR: - ret_val = igb_read_invm_i211(hw, offset, &data[0]); - ret_val |= igb_read_invm_i211(hw, offset+1, &data[1]); - ret_val |= igb_read_invm_i211(hw, offset+2, &data[2]); + ret_val = igb_read_invm_word_i210(hw, (u8)offset, &data[0]); + ret_val |= igb_read_invm_word_i210(hw, (u8)offset+1, + &data[1]); + ret_val |= igb_read_invm_word_i210(hw, (u8)offset+2, + &data[2]); if (ret_val != E1000_SUCCESS) hw_dbg("MAC Addr not found in iNVM\n"); break; case NVM_INIT_CTRL_2: - ret_val = igb_read_invm_i211(hw, (u8)offset, data); + ret_val = igb_read_invm_word_i210(hw, (u8)offset, data); if (ret_val != E1000_SUCCESS) { *data = NVM_INIT_CTRL_2_DEFAULT_I211; ret_val = E1000_SUCCESS; } break; case NVM_INIT_CTRL_4: - ret_val = igb_read_invm_i211(hw, (u8)offset, data); + ret_val = igb_read_invm_word_i210(hw, (u8)offset, data); if (ret_val != E1000_SUCCESS) { *data = NVM_INIT_CTRL_4_DEFAULT_I211; ret_val = E1000_SUCCESS; } break; case NVM_LED_1_CFG: - ret_val = igb_read_invm_i211(hw, (u8)offset, data); + ret_val = igb_read_invm_word_i210(hw, (u8)offset, data); if (ret_val != E1000_SUCCESS) { *data = NVM_LED_1_CFG_DEFAULT_I211; ret_val = E1000_SUCCESS; } break; case NVM_LED_0_2_CFG: - igb_read_invm_i211(hw, offset, data); + ret_val = igb_read_invm_word_i210(hw, (u8)offset, data); if (ret_val != E1000_SUCCESS) { *data = NVM_LED_0_2_CFG_DEFAULT_I211; ret_val = E1000_SUCCESS; } break; case NVM_ID_LED_SETTINGS: - ret_val = igb_read_invm_i211(hw, (u8)offset, data); + ret_val = igb_read_invm_word_i210(hw, (u8)offset, data); if (ret_val != E1000_SUCCESS) { *data = ID_LED_RESERVED_FFFF; ret_val = E1000_SUCCESS; @@ -411,48 +455,6 @@ s32 igb_read_nvm_i211(struct e1000_hw *hw, u16 offset, u16 words, } /** - * igb_read_invm_i211 - Reads OTP - * @hw: pointer to the HW structure - * @address: the word address (aka eeprom offset) to read - * @data: pointer to the data read - * - * Reads 16-bit words from the OTP. Return error when the word is not - * stored in OTP. 
- **/ -s32 igb_read_invm_i211(struct e1000_hw *hw, u16 address, u16 *data) -{ - s32 status = -E1000_ERR_INVM_VALUE_NOT_FOUND; - u32 invm_dword; - u16 i; - u8 record_type, word_address; - - for (i = 0; i < E1000_INVM_SIZE; i++) { - invm_dword = rd32(E1000_INVM_DATA_REG(i)); - /* Get record type */ - record_type = INVM_DWORD_TO_RECORD_TYPE(invm_dword); - if (record_type == E1000_INVM_UNINITIALIZED_STRUCTURE) - break; - if (record_type == E1000_INVM_CSR_AUTOLOAD_STRUCTURE) - i += E1000_INVM_CSR_AUTOLOAD_DATA_SIZE_IN_DWORDS; - if (record_type == E1000_INVM_RSA_KEY_SHA256_STRUCTURE) - i += E1000_INVM_RSA_KEY_SHA256_DATA_SIZE_IN_DWORDS; - if (record_type == E1000_INVM_WORD_AUTOLOAD_STRUCTURE) { - word_address = INVM_DWORD_TO_WORD_ADDRESS(invm_dword); - if (word_address == (u8)address) { - *data = INVM_DWORD_TO_WORD_DATA(invm_dword); - hw_dbg("Read INVM Word 0x%02x = %x", - address, *data); - status = E1000_SUCCESS; - break; - } - } - } - if (status != E1000_SUCCESS) - hw_dbg("Requested word 0x%02x not found in OTP\n", address); - return status; -} - -/** * igb_read_invm_version - Reads iNVM version and image type * @hw: pointer to the HW structure * @invm_ver: version structure for the version read @@ -661,6 +663,23 @@ static s32 igb_pool_flash_update_done_i210(struct e1000_hw *hw) } /** + * igb_get_flash_presence_i210 - Check if flash device is detected. + * @hw: pointer to the HW structure + * + **/ +bool igb_get_flash_presence_i210(struct e1000_hw *hw) +{ + u32 eec = 0; + bool ret_val = false; + + eec = rd32(E1000_EECD); + if (eec & E1000_EECD_FLASH_DETECTED_I210) + ret_val = true; + + return ret_val; +} + +/** * igb_update_flash_i210 - Commit EEPROM to the flash * @hw: pointer to the HW structure * @@ -786,3 +805,33 @@ s32 igb_write_xmdio_reg(struct e1000_hw *hw, u16 addr, u8 dev_addr, u16 data) { return __igb_access_xmdio_reg(hw, addr, dev_addr, &data, false); } + +/** + * igb_init_nvm_params_i210 - Init NVM func ptrs. 
+ * @hw: pointer to the HW structure + **/ +s32 igb_init_nvm_params_i210(struct e1000_hw *hw) +{ + s32 ret_val = 0; + struct e1000_nvm_info *nvm = &hw->nvm; + + nvm->ops.acquire = igb_acquire_nvm_i210; + nvm->ops.release = igb_release_nvm_i210; + nvm->ops.valid_led_default = igb_valid_led_default_i210; + + /* NVM Function Pointers */ + if (igb_get_flash_presence_i210(hw)) { + hw->nvm.type = e1000_nvm_flash_hw; + nvm->ops.read = igb_read_nvm_srrd_i210; + nvm->ops.write = igb_write_nvm_srwr_i210; + nvm->ops.validate = igb_validate_nvm_checksum_i210; + nvm->ops.update = igb_update_nvm_checksum_i210; + } else { + hw->nvm.type = e1000_nvm_invm; + nvm->ops.read = igb_read_invm_i210; + nvm->ops.write = NULL; + nvm->ops.validate = NULL; + nvm->ops.update = NULL; + } + return ret_val; +} diff --git a/drivers/net/ethernet/intel/igb/e1000_i210.h b/drivers/net/ethernet/intel/igb/e1000_i210.h index 5caa332..dde3c4b 100644 --- a/drivers/net/ethernet/intel/igb/e1000_i210.h +++ b/drivers/net/ethernet/intel/igb/e1000_i210.h @@ -35,20 +35,19 @@ extern s32 igb_write_nvm_srwr_i210(struct e1000_hw *hw, u16 offset, u16 words, u16 *data); extern s32 igb_read_nvm_srrd_i210(struct e1000_hw *hw, u16 offset, u16 words, u16 *data); -extern s32 igb_read_invm_i211(struct e1000_hw *hw, u16 address, u16 *data); extern s32 igb_acquire_swfw_sync_i210(struct e1000_hw *hw, u16 mask); extern void igb_release_swfw_sync_i210(struct e1000_hw *hw, u16 mask); extern s32 igb_acquire_nvm_i210(struct e1000_hw *hw); extern void igb_release_nvm_i210(struct e1000_hw *hw); extern s32 igb_valid_led_default_i210(struct e1000_hw *hw, u16 *data); -extern s32 igb_read_nvm_i211(struct e1000_hw *hw, u16 offset, u16 words, - u16 *data); extern s32 igb_read_invm_version(struct e1000_hw *hw, struct e1000_fw_version *invm_ver); extern s32 igb_read_xmdio_reg(struct e1000_hw *hw, u16 addr, u8 dev_addr, u16 *data); extern s32 igb_write_xmdio_reg(struct e1000_hw *hw, u16 addr, u8 dev_addr, u16 data); +extern s32 igb_init_nvm_params_i210(struct e1000_hw *hw); +extern bool igb_get_flash_presence_i210(struct e1000_hw *hw); #define E1000_STM_OPCODE 0xDB00 #define E1000_EEPROM_FLASH_SIZE_WORD 0x11 diff --git a/drivers/net/ethernet/intel/igb/e1000_nvm.c b/drivers/net/ethernet/intel/igb/e1000_nvm.c index 7f9cd7c..a7db7f3 100644 --- a/drivers/net/ethernet/intel/igb/e1000_nvm.c +++ b/drivers/net/ethernet/intel/igb/e1000_nvm.c @@ -709,11 +709,16 @@ out: **/ void igb_get_fw_version(struct e1000_hw *hw, struct e1000_fw_version *fw_vers) { - u16 eeprom_verh, eeprom_verl, comb_verh, comb_verl, comb_offset; - u16 fw_version; + u16 eeprom_verh, eeprom_verl, etrack_test, fw_version; + u8 q, hval, rem, result; + u16 comb_verh, comb_verl, comb_offset; memset(fw_vers, 0, sizeof(struct e1000_fw_version)); + /* basic eeprom version numbers and bits used vary by part and by tool + * used to create the nvm images. Check which data format we have. 
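+ * An image whose high word masks to NVM_ETRACK_VALID carries an eTrack ID and uses the newer layout; anything else is decoded with the classic major/minor words.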
+ */ + hw->nvm.ops.read(hw, NVM_ETRACK_HIWORD, 1, &etrack_test); switch (hw->mac.type) { case e1000_i211: igb_read_invm_version(hw, fw_vers); @@ -721,30 +726,30 @@ void igb_get_fw_version(struct e1000_hw *hw, struct e1000_fw_version *fw_vers) case e1000_82575: case e1000_82576: case e1000_82580: - case e1000_i354: - case e1000_i350: - case e1000_i210: + /* Use this format, unless EETRACK ID exists, + * then use alternate format + */ + if ((etrack_test & NVM_MAJOR_MASK) != NVM_ETRACK_VALID) { + hw->nvm.ops.read(hw, NVM_VERSION, 1, &fw_version); + fw_vers->eep_major = (fw_version & NVM_MAJOR_MASK) + >> NVM_MAJOR_SHIFT; + fw_vers->eep_minor = (fw_version & NVM_MINOR_MASK) + >> NVM_MINOR_SHIFT; + fw_vers->eep_build = (fw_version & NVM_IMAGE_ID_MASK); + goto etrack_id; + } break; - default: - return; - } - /* basic eeprom version numbers */ - hw->nvm.ops.read(hw, NVM_VERSION, 1, &fw_version); - fw_vers->eep_major = (fw_version & NVM_MAJOR_MASK) >> NVM_MAJOR_SHIFT; - fw_vers->eep_minor = (fw_version & NVM_MINOR_MASK); - - /* etrack id */ - hw->nvm.ops.read(hw, NVM_ETRACK_WORD, 1, &eeprom_verl); - hw->nvm.ops.read(hw, (NVM_ETRACK_WORD + 1), 1, &eeprom_verh); - fw_vers->etrack_id = (eeprom_verh << NVM_ETRACK_SHIFT) | eeprom_verl; - - switch (hw->mac.type) { case e1000_i210: - case e1000_i354: + if (!(igb_get_flash_presence_i210(hw))) { + igb_read_invm_version(hw, fw_vers); + return; + } + /* fall through */ case e1000_i350: /* find combo image version */ hw->nvm.ops.read(hw, NVM_COMB_VER_PTR, 1, &comb_offset); - if ((comb_offset != 0x0) && (comb_offset != NVM_VER_INVALID)) { + if ((comb_offset != 0x0) && + (comb_offset != NVM_VER_INVALID)) { hw->nvm.ops.read(hw, (NVM_COMB_VER_OFF + comb_offset + 1), 1, &comb_verh); @@ -760,15 +765,42 @@ void igb_get_fw_version(struct e1000_hw *hw, struct e1000_fw_version *fw_vers) fw_vers->or_major = comb_verl >> NVM_COMB_VER_SHFT; fw_vers->or_build = - ((comb_verl << NVM_COMB_VER_SHFT) - | (comb_verh >> NVM_COMB_VER_SHFT)); + (comb_verl << NVM_COMB_VER_SHFT) + | (comb_verh >> NVM_COMB_VER_SHFT); fw_vers->or_patch = comb_verh & NVM_COMB_VER_MASK; } } break; default: - break; + return; + } + hw->nvm.ops.read(hw, NVM_VERSION, 1, &fw_version); + fw_vers->eep_major = (fw_version & NVM_MAJOR_MASK) + >> NVM_MAJOR_SHIFT; + + /* check for old style version format in newer images*/ + if ((fw_version & NVM_NEW_DEC_MASK) == 0x0) { + eeprom_verl = (fw_version & NVM_COMB_VER_MASK); + } else { + eeprom_verl = (fw_version & NVM_MINOR_MASK) + >> NVM_MINOR_SHIFT; + } + /* Convert minor value to hex before assigning to output struct + * Val to be converted will not be higher than 99, per tool output + */ + q = eeprom_verl / NVM_HEX_CONV; + hval = q * NVM_HEX_TENS; + rem = eeprom_verl % NVM_HEX_CONV; + result = hval + rem; + fw_vers->eep_minor = result; + +etrack_id: + if ((etrack_test & NVM_MAJOR_MASK) == NVM_ETRACK_VALID) { + hw->nvm.ops.read(hw, NVM_ETRACK_WORD, 1, &eeprom_verl); + hw->nvm.ops.read(hw, (NVM_ETRACK_WORD + 1), 1, &eeprom_verh); + fw_vers->etrack_id = (eeprom_verh << NVM_ETRACK_SHIFT) + | eeprom_verl; } return; } diff --git a/drivers/net/ethernet/intel/igb/e1000_nvm.h b/drivers/net/ethernet/intel/igb/e1000_nvm.h index 6bfc0c4..433b741 100644 --- a/drivers/net/ethernet/intel/igb/e1000_nvm.h +++ b/drivers/net/ethernet/intel/igb/e1000_nvm.h @@ -44,6 +44,7 @@ struct e1000_fw_version { u32 etrack_id; u16 eep_major; u16 eep_minor; + u16 eep_build; u8 invm_major; u8 invm_minor; diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h index 
15ea8dc..c1fae7a 100644 --- a/drivers/net/ethernet/intel/igb/igb.h +++ b/drivers/net/ethernet/intel/igb/igb.h @@ -343,6 +343,8 @@ struct hwmon_buff { }; #endif +#define IGB_RETA_SIZE 128 + /* board specific private data structure */ struct igb_adapter { unsigned long active_vlans[BITS_TO_LONGS(VLAN_N_VID)]; @@ -444,6 +446,8 @@ struct igb_adapter { struct i2c_algo_bit_data i2c_algo; struct i2c_adapter i2c_adap; struct i2c_client *i2c_client; + u32 rss_indir_tbl_init; + u8 rss_indir_tbl[IGB_RETA_SIZE]; }; #define IGB_FLAG_HAS_MSI (1 << 0) @@ -480,6 +484,7 @@ extern int igb_up(struct igb_adapter *); extern void igb_down(struct igb_adapter *); extern void igb_reinit_locked(struct igb_adapter *); extern void igb_reset(struct igb_adapter *); +extern void igb_write_rss_indir_tbl(struct igb_adapter *); extern int igb_set_spd_dplx(struct igb_adapter *, u32, u8); extern int igb_setup_tx_resources(struct igb_ring *); extern int igb_setup_rx_resources(struct igb_ring *); diff --git a/drivers/net/ethernet/intel/igb/igb_ethtool.c b/drivers/net/ethernet/intel/igb/igb_ethtool.c index 85fe7b5..ce9b5a9 100644 --- a/drivers/net/ethernet/intel/igb/igb_ethtool.c +++ b/drivers/net/ethernet/intel/igb/igb_ethtool.c @@ -1335,12 +1335,23 @@ static int igb_reg_test(struct igb_adapter *adapter, u64 *data) static int igb_eeprom_test(struct igb_adapter *adapter, u64 *data) { + struct e1000_hw *hw = &adapter->hw; + *data = 0; - /* Validate eeprom on all parts but i211 */ - if (adapter->hw.mac.type != e1000_i211) { + /* Validate eeprom on all parts but flashless */ + switch (hw->mac.type) { + case e1000_i210: + case e1000_i211: + if (igb_get_flash_presence_i210(hw)) { + if (adapter->hw.nvm.ops.validate(&adapter->hw) < 0) + *data = 2; + } + break; + default: if (adapter->hw.nvm.ops.validate(&adapter->hw) < 0) *data = 2; + break; } return *data; @@ -2672,7 +2683,9 @@ static int igb_set_eee(struct net_device *netdev, igb_set_eee_i350(hw); /* reset link */ - if (!netif_running(netdev)) + if (netif_running(netdev)) + igb_reinit_locked(adapter); + else igb_reset(adapter); } @@ -2771,6 +2784,90 @@ static void igb_ethtool_complete(struct net_device *netdev) pm_runtime_put(&adapter->pdev->dev); } +static u32 igb_get_rxfh_indir_size(struct net_device *netdev) +{ + return IGB_RETA_SIZE; +} + +static int igb_get_rxfh_indir(struct net_device *netdev, u32 *indir) +{ + struct igb_adapter *adapter = netdev_priv(netdev); + int i; + + for (i = 0; i < IGB_RETA_SIZE; i++) + indir[i] = adapter->rss_indir_tbl[i]; + + return 0; +} + +void igb_write_rss_indir_tbl(struct igb_adapter *adapter) +{ + struct e1000_hw *hw = &adapter->hw; + u32 reg = E1000_RETA(0); + u32 shift = 0; + int i = 0; + + switch (hw->mac.type) { + case e1000_82575: + shift = 6; + break; + case e1000_82576: + /* 82576 supports 2 RSS queues for SR-IOV */ + if (adapter->vfs_allocated_count) + shift = 3; + break; + default: + break; + } + + while (i < IGB_RETA_SIZE) { + u32 val = 0; + int j; + + for (j = 3; j >= 0; j--) { + val <<= 8; + val |= adapter->rss_indir_tbl[i + j]; + } + + wr32(reg, val << shift); + reg += 4; + i += 4; + } +} + +static int igb_set_rxfh_indir(struct net_device *netdev, const u32 *indir) +{ + struct igb_adapter *adapter = netdev_priv(netdev); + struct e1000_hw *hw = &adapter->hw; + int i; + u32 num_queues; + + num_queues = adapter->rss_queues; + + switch (hw->mac.type) { + case e1000_82576: + /* 82576 supports 2 RSS queues for SR-IOV */ + if (adapter->vfs_allocated_count) + num_queues = 2; + break; + default: + break; + } + + /* Verify user input. 
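+ * Every entry must reference one of the active RX queues (only 2 when VFs are enabled on 82576); a single bad entry rejects the whole table with -EINVAL.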
*/ + for (i = 0; i < IGB_RETA_SIZE; i++) + if (indir[i] >= num_queues) + return -EINVAL; + + + for (i = 0; i < IGB_RETA_SIZE; i++) + adapter->rss_indir_tbl[i] = indir[i]; + + igb_write_rss_indir_tbl(adapter); + + return 0; +} + static const struct ethtool_ops igb_ethtool_ops = { .get_settings = igb_get_settings, .set_settings = igb_set_settings, @@ -2804,6 +2901,9 @@ static const struct ethtool_ops igb_ethtool_ops = { .set_eee = igb_set_eee, .get_module_info = igb_get_module_info, .get_module_eeprom = igb_get_module_eeprom, + .get_rxfh_indir_size = igb_get_rxfh_indir_size, + .get_rxfh_indir = igb_get_rxfh_indir, + .set_rxfh_indir = igb_set_rxfh_indir, .begin = igb_ethtool_begin, .complete = igb_ethtool_complete, }; diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index c1d72c0..df33c4b 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -85,6 +85,8 @@ static DEFINE_PCI_DEVICE_TABLE(igb_pci_tbl) = { { PCI_VDEVICE(INTEL, E1000_DEV_ID_I210_FIBER), board_82575 }, { PCI_VDEVICE(INTEL, E1000_DEV_ID_I210_SERDES), board_82575 }, { PCI_VDEVICE(INTEL, E1000_DEV_ID_I210_SGMII), board_82575 }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_I210_COPPER_FLASHLESS), board_82575 }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_I210_SERDES_FLASHLESS), board_82575 }, { PCI_VDEVICE(INTEL, E1000_DEV_ID_I350_COPPER), board_82575 }, { PCI_VDEVICE(INTEL, E1000_DEV_ID_I350_FIBER), board_82575 }, { PCI_VDEVICE(INTEL, E1000_DEV_ID_I350_SERDES), board_82575 }, @@ -1013,7 +1015,7 @@ static void igb_free_q_vector(struct igb_adapter *adapter, int v_idx) adapter->q_vector[v_idx] = NULL; netif_napi_del(&q_vector->napi); - /* ixgbe_get_stats64() might access the rings on this vector, + /* igb_get_stats64() might access the rings on this vector, * we must wait a grace period before freeing it. 
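* kfree_rcu() defers the actual free until every reader that may still hold the vector under rcu_read_lock() has finished.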
*/ kfree_rcu(q_vector, rcu); @@ -1929,12 +1931,17 @@ void igb_set_fw_version(struct igb_adapter *adapter) igb_get_fw_version(hw, &fw); switch (hw->mac.type) { + case e1000_i210: case e1000_i211: - snprintf(adapter->fw_version, sizeof(adapter->fw_version), - "%2d.%2d-%d", - fw.invm_major, fw.invm_minor, fw.invm_img_type); - break; - + if (!(igb_get_flash_presence_i210(hw))) { + snprintf(adapter->fw_version, + sizeof(adapter->fw_version), + "%2d.%2d-%d", + fw.invm_major, fw.invm_minor, + fw.invm_img_type); + break; + } + /* fall through */ default: /* if option is rom valid, display its version too */ if (fw.or_valid) { @@ -1944,11 +1951,16 @@ void igb_set_fw_version(struct igb_adapter *adapter) fw.eep_major, fw.eep_minor, fw.etrack_id, fw.or_major, fw.or_build, fw.or_patch); /* no option rom */ - } else { + } else if (fw.etrack_id != 0X0000) { snprintf(adapter->fw_version, - sizeof(adapter->fw_version), - "%d.%d, 0x%08x", - fw.eep_major, fw.eep_minor, fw.etrack_id); + sizeof(adapter->fw_version), + "%d.%d, 0x%08x", + fw.eep_major, fw.eep_minor, fw.etrack_id); + } else { + snprintf(adapter->fw_version, + sizeof(adapter->fw_version), + "%d.%d.%d", + fw.eep_major, fw.eep_minor, fw.eep_build); } break; } @@ -2166,15 +2178,28 @@ static int igb_probe(struct pci_dev *pdev, const struct pci_device_id *ent) */ hw->mac.ops.reset_hw(hw); - /* make sure the NVM is good , i211 parts have special NVM that - * doesn't contain a checksum + /* make sure the NVM is good , i211/i210 parts can have special NVM + * that doesn't contain a checksum */ - if (hw->mac.type != e1000_i211) { + switch (hw->mac.type) { + case e1000_i210: + case e1000_i211: + if (igb_get_flash_presence_i210(hw)) { + if (hw->nvm.ops.validate(hw) < 0) { + dev_err(&pdev->dev, + "The NVM Checksum Is Not Valid\n"); + err = -EIO; + goto err_eeprom; + } + } + break; + default: if (hw->nvm.ops.validate(hw) < 0) { dev_err(&pdev->dev, "The NVM Checksum Is Not Valid\n"); err = -EIO; goto err_eeprom; } + break; } /* copy the MAC address out of the NVM */ @@ -2436,6 +2461,11 @@ static int igb_enable_sriov(struct pci_dev *pdev, int num_vfs) int err = 0; int i; + if (!adapter->msix_entries) { + err = -EPERM; + goto out; + } + if (!num_vfs) goto out; else if (old_vfs && old_vfs == num_vfs) @@ -3096,7 +3126,7 @@ static void igb_setup_mrqc(struct igb_adapter *adapter) { struct e1000_hw *hw = &adapter->hw; u32 mrqc, rxcsum; - u32 j, num_rx_queues, shift = 0; + u32 j, num_rx_queues; static const u32 rsskey[10] = { 0xDA565A6D, 0xC20E5B25, 0x3D256741, 0xB08FA343, 0xCB2BCAD0, 0xB4307BAE, 0xA32DCB77, 0x0CF23080, 0x3BB7426A, @@ -3109,35 +3139,21 @@ static void igb_setup_mrqc(struct igb_adapter *adapter) num_rx_queues = adapter->rss_queues; switch (hw->mac.type) { - case e1000_82575: - shift = 6; - break; case e1000_82576: /* 82576 supports 2 RSS queues for SR-IOV */ - if (adapter->vfs_allocated_count) { - shift = 3; + if (adapter->vfs_allocated_count) num_rx_queues = 2; - } break; default: break; } - /* Populate the indirection table 4 entries at a time. To do this - * we are generating the results for n and n+2 and then interleaving - * those with the results with n+1 and n+3. 
- */ - for (j = 0; j < 32; j++) { - /* first pass generates n and n+2 */ - u32 base = ((j * 0x00040004) + 0x00020000) * num_rx_queues; - u32 reta = (base & 0x07800780) >> (7 - shift); - - /* second pass generates n+1 and n+3 */ - base += 0x00010001 * num_rx_queues; - reta |= (base & 0x07800780) << (1 + shift); - - wr32(E1000_RETA(j), reta); + if (adapter->rss_indir_tbl_init != num_rx_queues) { + for (j = 0; j < IGB_RETA_SIZE; j++) + adapter->rss_indir_tbl[j] = (j * num_rx_queues) / IGB_RETA_SIZE; + adapter->rss_indir_tbl_init = num_rx_queues; } + igb_write_rss_indir_tbl(adapter); /* Disable raw packet checksumming so that RSS hash is placed in * descriptor on writeback. No need to enable TCP/UDP/IP checksum @@ -3844,7 +3860,6 @@ bool igb_has_link(struct igb_adapter *adapter) { struct e1000_hw *hw = &adapter->hw; bool link_active = false; - s32 ret_val = 0; /* get_link_status is set on LSC (link status) interrupt or * rx sequence error interrupt. get_link_status will stay @@ -3853,16 +3868,11 @@ bool igb_has_link(struct igb_adapter *adapter) */ switch (hw->phy.media_type) { case e1000_media_type_copper: - if (hw->mac.get_link_status) { - ret_val = hw->mac.ops.check_for_link(hw); - link_active = !hw->mac.get_link_status; - } else { - link_active = true; - } - break; + if (!hw->mac.get_link_status) + return true; case e1000_media_type_internal_serdes: - ret_val = hw->mac.ops.check_for_link(hw); - link_active = hw->mac.serdes_has_link; + hw->mac.ops.check_for_link(hw); + link_active = !hw->mac.get_link_status; break; default: case e1000_media_type_unknown: @@ -4814,6 +4824,10 @@ static int igb_change_mtu(struct net_device *netdev, int new_mtu) return -EINVAL; } + /* adjust max frame to be at least the size of a standard frame */ + if (max_frame < (ETH_FRAME_LEN + ETH_FCS_LEN)) + max_frame = ETH_FRAME_LEN + ETH_FCS_LEN; + while (test_and_set_bit(__IGB_RESETTING, &adapter->state)) msleep(1); @@ -4865,6 +4879,8 @@ void igb_update_stats(struct igb_adapter *adapter, bytes = 0; packets = 0; + + rcu_read_lock(); for (i = 0; i < adapter->num_rx_queues; i++) { u32 rqdpc = rd32(E1000_RQDPC(i)); struct igb_ring *ring = adapter->rx_ring[i]; @@ -4900,6 +4916,7 @@ void igb_update_stats(struct igb_adapter *adapter, } net_stats->tx_bytes = bytes; net_stats->tx_packets = packets; + rcu_read_unlock(); /* read stats registers */ adapter->stats.crcerrs += rd32(E1000_CRCERRS); diff --git a/drivers/net/ethernet/intel/igb/igb_ptp.c b/drivers/net/ethernet/intel/igb/igb_ptp.c index 7e8c477..5a54e3d 100644 --- a/drivers/net/ethernet/intel/igb/igb_ptp.c +++ b/drivers/net/ethernet/intel/igb/igb_ptp.c @@ -97,14 +97,14 @@ static cycle_t igb_ptp_read_82580(const struct cyclecounter *cc) { struct igb_adapter *igb = container_of(cc, struct igb_adapter, cc); struct e1000_hw *hw = &igb->hw; + u32 lo, hi; u64 val; - u32 lo, hi, jk; /* The timestamp latches on lowest register read. For the 82580 * the lowest register is SYSTIMR instead of SYSTIML. However we only * need to provide nanosecond resolution, so we just ignore it. */ - jk = rd32(E1000_SYSTIMR); + rd32(E1000_SYSTIMR); lo = rd32(E1000_SYSTIML); hi = rd32(E1000_SYSTIMH); @@ -118,13 +118,13 @@ static cycle_t igb_ptp_read_82580(const struct cyclecounter *cc) static void igb_ptp_read_i210(struct igb_adapter *adapter, struct timespec *ts) { struct e1000_hw *hw = &adapter->hw; - u32 sec, nsec, jk; + u32 sec, nsec; /* The timestamp latches on lowest register read. For I210/I211, the * lowest register is SYSTIMR. 
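* Reading SYSTIMR first latches SYSTIML/SYSTIMH, so the three registers are sampled as one coherent timestamp.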
Since we only need to provide nanosecond * resolution, we can ignore it. */ - jk = rd32(E1000_SYSTIMR); + rd32(E1000_SYSTIMR); nsec = rd32(E1000_SYSTIML); sec = rd32(E1000_SYSTIMH); diff --git a/drivers/net/ethernet/mellanox/mlx4/en_dcb_nl.c b/drivers/net/ethernet/mellanox/mlx4/en_dcb_nl.c index 9d4a1ea..b4881b6 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_dcb_nl.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_dcb_nl.c @@ -160,6 +160,7 @@ static int mlx4_en_dcbnl_ieee_setpfc(struct net_device *dev, struct ieee_pfc *pfc) { struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_port_profile *prof = priv->prof; struct mlx4_en_dev *mdev = priv->mdev; int err; @@ -169,15 +170,17 @@ static int mlx4_en_dcbnl_ieee_setpfc(struct net_device *dev, pfc->mbc, pfc->delay); - priv->prof->rx_pause = priv->prof->tx_pause = !!pfc->pfc_en; - priv->prof->rx_ppp = priv->prof->tx_ppp = pfc->pfc_en; + prof->rx_pause = !pfc->pfc_en; + prof->tx_pause = !pfc->pfc_en; + prof->rx_ppp = pfc->pfc_en; + prof->tx_ppp = pfc->pfc_en; err = mlx4_SET_PORT_general(mdev->dev, priv->port, priv->rx_skb_size + ETH_FCS_LEN, - priv->prof->tx_pause, - priv->prof->tx_ppp, - priv->prof->rx_pause, - priv->prof->rx_ppp); + prof->tx_pause, + prof->tx_ppp, + prof->rx_pause, + prof->rx_ppp); if (err) en_err(priv, "Failed setting pause params\n"); diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c index 6dcca98..0698c82 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c @@ -362,6 +362,15 @@ static void mlx4_en_process_tx_cq(struct net_device *dev, struct mlx4_en_cq *cq) */ rmb(); + if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == + MLX4_CQE_OPCODE_ERROR)) { + struct mlx4_err_cqe *cqe_err = (struct mlx4_err_cqe *)cqe; + + en_err(priv, "CQE error - vendor syndrome: 0x%x syndrome: 0x%x\n", + cqe_err->vendor_err_syndrome, + cqe_err->syndrome); + } + /* Skip over last polled CQE */ new_index = be16_to_cpu(cqe->wqe_index) & size_mask; @@ -579,17 +588,15 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_dev *mdev = priv->mdev; + struct device *ddev = priv->ddev; struct mlx4_en_tx_ring *ring; struct mlx4_en_tx_desc *tx_desc; struct mlx4_wqe_data_seg *data; - struct skb_frag_struct *frag; struct mlx4_en_tx_info *tx_info; - struct ethhdr *ethh; int tx_ind = 0; int nr_txbb; int desc_size; int real_size; - dma_addr_t dma; u32 index, bf_index; __be32 op_own; u16 vlan_tag = 0; @@ -665,6 +672,61 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) tx_info->skb = skb; tx_info->nr_txbb = nr_txbb; + if (lso_header_size) + data = ((void *)&tx_desc->lso + ALIGN(lso_header_size + 4, + DS_SIZE)); + else + data = &tx_desc->data; + + /* valid only for none inline segments */ + tx_info->data_offset = (void *)data - (void *)tx_desc; + + tx_info->linear = (lso_header_size < skb_headlen(skb) && + !is_inline(skb, NULL)) ? 
1 : 0; + + data += skb_shinfo(skb)->nr_frags + tx_info->linear - 1; + + if (is_inline(skb, &fragptr)) { + tx_info->inl = 1; + } else { + /* Map fragments */ + for (i = skb_shinfo(skb)->nr_frags - 1; i >= 0; i--) { + struct skb_frag_struct *frag; + dma_addr_t dma; + + frag = &skb_shinfo(skb)->frags[i]; + dma = skb_frag_dma_map(ddev, frag, + 0, skb_frag_size(frag), + DMA_TO_DEVICE); + if (dma_mapping_error(ddev, dma)) + goto tx_drop_unmap; + + data->addr = cpu_to_be64(dma); + data->lkey = cpu_to_be32(mdev->mr.key); + wmb(); + data->byte_count = cpu_to_be32(skb_frag_size(frag)); + --data; + } + + /* Map linear part */ + if (tx_info->linear) { + u32 byte_count = skb_headlen(skb) - lso_header_size; + dma_addr_t dma; + + dma = dma_map_single(ddev, skb->data + + lso_header_size, byte_count, + PCI_DMA_TODEVICE); + if (dma_mapping_error(ddev, dma)) + goto tx_drop_unmap; + + data->addr = cpu_to_be64(dma); + data->lkey = cpu_to_be32(mdev->mr.key); + wmb(); + data->byte_count = cpu_to_be32(byte_count); + } + tx_info->inl = 0; + } + /* * For timestamping add flag to skb_shinfo and * set flag for further reference @@ -689,6 +751,8 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) } if (priv->flags & MLX4_EN_FLAG_ENABLE_HW_LOOPBACK) { + struct ethhdr *ethh; + /* Copy dst mac address to wqe. This allows loopback in eSwitch, * so that VFs and PF can communicate with each other */ @@ -711,8 +775,6 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) /* Copy headers; * note that we already verified that it is linear */ memcpy(tx_desc->lso.header, skb->data, lso_header_size); - data = ((void *) &tx_desc->lso + - ALIGN(lso_header_size + 4, DS_SIZE)); priv->port_stats.tso_packets++; i = ((skb->len - lso_header_size) / skb_shinfo(skb)->gso_size) + @@ -724,7 +786,6 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) op_own = cpu_to_be32(MLX4_OPCODE_SEND) | ((ring->prod & ring->size) ? cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0); - data = &tx_desc->data; tx_info->nr_bytes = max_t(unsigned int, skb->len, ETH_ZLEN); ring->packets++; @@ -733,38 +794,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) netdev_tx_sent_queue(ring->tx_queue, tx_info->nr_bytes); AVG_PERF_COUNTER(priv->pstats.tx_pktsz_avg, skb->len); - - /* valid only for none inline segments */ - tx_info->data_offset = (void *) data - (void *) tx_desc; - - tx_info->linear = (lso_header_size < skb_headlen(skb) && !is_inline(skb, NULL)) ? 
1 : 0; - data += skb_shinfo(skb)->nr_frags + tx_info->linear - 1; - - if (!is_inline(skb, &fragptr)) { - /* Map fragments */ - for (i = skb_shinfo(skb)->nr_frags - 1; i >= 0; i--) { - frag = &skb_shinfo(skb)->frags[i]; - dma = skb_frag_dma_map(priv->ddev, frag, - 0, skb_frag_size(frag), - DMA_TO_DEVICE); - data->addr = cpu_to_be64(dma); - data->lkey = cpu_to_be32(mdev->mr.key); - wmb(); - data->byte_count = cpu_to_be32(skb_frag_size(frag)); - --data; - } - - /* Map linear part */ - if (tx_info->linear) { - dma = dma_map_single(priv->ddev, skb->data + lso_header_size, - skb_headlen(skb) - lso_header_size, PCI_DMA_TODEVICE); - data->addr = cpu_to_be64(dma); - data->lkey = cpu_to_be32(mdev->mr.key); - wmb(); - data->byte_count = cpu_to_be32(skb_headlen(skb) - lso_header_size); - } - tx_info->inl = 0; - } else { + if (tx_info->inl) { build_inline_wqe(tx_desc, skb, real_size, &vlan_tag, tx_ind, fragptr); tx_info->inl = 1; } @@ -804,6 +834,16 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; +tx_drop_unmap: + en_err(priv, "DMA mapping error\n"); + + for (i++; i < skb_shinfo(skb)->nr_frags; i++) { + data++; + dma_unmap_page(ddev, (dma_addr_t) be64_to_cpu(data->addr), + be32_to_cpu(data->byte_count), + PCI_DMA_TODEVICE); + } + tx_drop: dev_kfree_skb_any(skb); priv->stats.tx_dropped++; diff --git a/drivers/net/ethernet/moxa/moxart_ether.c b/drivers/net/ethernet/moxa/moxart_ether.c index abd2c54..83c2091 100644 --- a/drivers/net/ethernet/moxa/moxart_ether.c +++ b/drivers/net/ethernet/moxa/moxart_ether.c @@ -314,6 +314,7 @@ static int moxart_mac_start_xmit(struct sk_buff *skb, struct net_device *ndev) unsigned int len; unsigned int tx_head = priv->tx_head; u32 txdes1; + int ret = NETDEV_TX_BUSY; desc = priv->tx_desc_base + (TX_REG_DESC_SIZE * tx_head); @@ -321,7 +322,7 @@ static int moxart_mac_start_xmit(struct sk_buff *skb, struct net_device *ndev) if (readl(desc + TX_REG_OFFSET_DESC0) & TX_DESC0_DMA_OWN) { net_dbg_ratelimited("no TX space for packet\n"); priv->stats.tx_dropped++; - return NETDEV_TX_BUSY; + goto out_unlock; } len = skb->len > TX_BUF_SIZE ? 
TX_BUF_SIZE : skb->len; @@ -330,7 +331,7 @@ static int moxart_mac_start_xmit(struct sk_buff *skb, struct net_device *ndev) len, DMA_TO_DEVICE); if (dma_mapping_error(&ndev->dev, priv->tx_mapping[tx_head])) { netdev_err(ndev, "DMA mapping error\n"); - return NETDEV_TX_BUSY; + goto out_unlock; } priv->tx_len[tx_head] = len; @@ -360,10 +361,11 @@ static int moxart_mac_start_xmit(struct sk_buff *skb, struct net_device *ndev) priv->tx_head = TX_NEXT(tx_head); ndev->trans_start = jiffies; - + ret = NETDEV_TX_OK; +out_unlock: spin_unlock_irq(&priv->txlock); - return NETDEV_TX_OK; + return ret; } static struct net_device_stats *moxart_mac_get_stats(struct net_device *ndev) @@ -531,7 +533,6 @@ static int moxart_remove(struct platform_device *pdev) unregister_netdev(ndev); free_irq(ndev->irq, ndev); moxart_mac_free_memory(ndev); - platform_set_drvdata(pdev, NULL); free_netdev(ndev); return 0; diff --git a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c index 66c0400..50a1d4a 100644 --- a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c +++ b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c @@ -74,6 +74,7 @@ #ifdef CONFIG_MTRR #include <asm/mtrr.h> #endif +#include <net/busy_poll.h> #include "myri10ge_mcp.h" #include "myri10ge_mcp_gen_header.h" @@ -194,6 +195,21 @@ struct myri10ge_slice_state { int cpu; __be32 __iomem *dca_tag; #endif +#ifdef CONFIG_NET_RX_BUSY_POLL + unsigned int state; +#define SLICE_STATE_IDLE 0 +#define SLICE_STATE_NAPI 1 /* NAPI owns this slice */ +#define SLICE_STATE_POLL 2 /* poll owns this slice */ +#define SLICE_LOCKED (SLICE_STATE_NAPI | SLICE_STATE_POLL) +#define SLICE_STATE_NAPI_YIELD 4 /* NAPI yielded this slice */ +#define SLICE_STATE_POLL_YIELD 8 /* poll yielded this slice */ +#define SLICE_USER_PEND (SLICE_STATE_POLL | SLICE_STATE_POLL_YIELD) + spinlock_t lock; + unsigned long lock_napi_yield; + unsigned long lock_poll_yield; + unsigned long busy_poll_miss; + unsigned long busy_poll_cnt; +#endif /* CONFIG_NET_RX_BUSY_POLL */ char irq_desc[32]; }; @@ -909,6 +925,92 @@ abort: return status; } +#ifdef CONFIG_NET_RX_BUSY_POLL +static inline void myri10ge_ss_init_lock(struct myri10ge_slice_state *ss) +{ + spin_lock_init(&ss->lock); + ss->state = SLICE_STATE_IDLE; +} + +static inline bool myri10ge_ss_lock_napi(struct myri10ge_slice_state *ss) +{ + int rc = true; + spin_lock(&ss->lock); + if ((ss->state & SLICE_LOCKED)) { + WARN_ON((ss->state & SLICE_STATE_NAPI)); + ss->state |= SLICE_STATE_NAPI_YIELD; + rc = false; + ss->lock_napi_yield++; + } else + ss->state = SLICE_STATE_NAPI; + spin_unlock(&ss->lock); + return rc; +} + +static inline void myri10ge_ss_unlock_napi(struct myri10ge_slice_state *ss) +{ + spin_lock(&ss->lock); + WARN_ON((ss->state & (SLICE_STATE_POLL | SLICE_STATE_NAPI_YIELD))); + ss->state = SLICE_STATE_IDLE; + spin_unlock(&ss->lock); +} + +static inline bool myri10ge_ss_lock_poll(struct myri10ge_slice_state *ss) +{ + int rc = true; + spin_lock_bh(&ss->lock); + if ((ss->state & SLICE_LOCKED)) { + ss->state |= SLICE_STATE_POLL_YIELD; + rc = false; + ss->lock_poll_yield++; + } else + ss->state |= SLICE_STATE_POLL; + spin_unlock_bh(&ss->lock); + return rc; +} + +static inline void myri10ge_ss_unlock_poll(struct myri10ge_slice_state *ss) +{ + spin_lock_bh(&ss->lock); + WARN_ON((ss->state & SLICE_STATE_NAPI)); + ss->state = SLICE_STATE_IDLE; + spin_unlock_bh(&ss->lock); +} + +static inline bool myri10ge_ss_busy_polling(struct myri10ge_slice_state *ss) +{ + WARN_ON(!(ss->state & SLICE_LOCKED)); + return 
(ss->state & SLICE_USER_PEND); +} +#else /* CONFIG_NET_RX_BUSY_POLL */ +static inline void myri10ge_ss_init_lock(struct myri10ge_slice_state *ss) +{ +} + +static inline bool myri10ge_ss_lock_napi(struct myri10ge_slice_state *ss) +{ + return false; +} + +static inline void myri10ge_ss_unlock_napi(struct myri10ge_slice_state *ss) +{ +} + +static inline bool myri10ge_ss_lock_poll(struct myri10ge_slice_state *ss) +{ + return false; +} + +static inline void myri10ge_ss_unlock_poll(struct myri10ge_slice_state *ss) +{ +} + +static inline bool myri10ge_ss_busy_polling(struct myri10ge_slice_state *ss) +{ + return false; +} +#endif + static int myri10ge_reset(struct myri10ge_priv *mgp) { struct myri10ge_cmd cmd; @@ -1300,6 +1402,8 @@ myri10ge_vlan_rx(struct net_device *dev, void *addr, struct sk_buff *skb) } } +#define MYRI10GE_HLEN 64 /* Bytes to copy from page to skb linear memory */ + static inline int myri10ge_rx_done(struct myri10ge_slice_state *ss, int len, __wsum csum) { @@ -1311,6 +1415,7 @@ myri10ge_rx_done(struct myri10ge_slice_state *ss, int len, __wsum csum) struct pci_dev *pdev = mgp->pdev; struct net_device *dev = mgp->dev; u8 *va; + bool polling; if (len <= mgp->small_bytes) { rx = &ss->rx_small; @@ -1325,7 +1430,15 @@ myri10ge_rx_done(struct myri10ge_slice_state *ss, int len, __wsum csum) va = page_address(rx->info[idx].page) + rx->info[idx].page_offset; prefetch(va); - skb = napi_get_frags(&ss->napi); + /* When busy polling in user context, allocate skb and copy headers to + * skb's linear memory ourselves. When not busy polling, use the napi + * gro api. + */ + polling = myri10ge_ss_busy_polling(ss); + if (polling) + skb = netdev_alloc_skb(dev, MYRI10GE_HLEN + 16); + else + skb = napi_get_frags(&ss->napi); if (unlikely(skb == NULL)) { ss->stats.rx_dropped++; for (i = 0, remainder = len; remainder > 0; i++) { @@ -1364,8 +1477,29 @@ myri10ge_rx_done(struct myri10ge_slice_state *ss, int len, __wsum csum) } myri10ge_vlan_rx(mgp->dev, va, skb); skb_record_rx_queue(skb, ss - &mgp->ss[0]); + skb_mark_napi_id(skb, &ss->napi); + + if (polling) { + int hlen; + + /* myri10ge_vlan_rx might have moved the header, so compute + * length and address again. + */ + hlen = MYRI10GE_HLEN > skb->len ? skb->len : MYRI10GE_HLEN; + va = page_address(skb_frag_page(&rx_frags[0])) + + rx_frags[0].page_offset; + /* Copy header into the skb linear memory */ + skb_copy_to_linear_data(skb, va, hlen); + rx_frags[0].page_offset += hlen; + rx_frags[0].size -= hlen; + skb->data_len -= hlen; + skb->tail += hlen; + skb->protocol = eth_type_trans(skb, dev); + netif_receive_skb(skb); + } + else + napi_gro_frags(&ss->napi); - napi_gro_frags(&ss->napi); return 1; } @@ -1524,10 +1658,14 @@ static int myri10ge_poll(struct napi_struct *napi, int budget) if (ss->mgp->dca_enabled) myri10ge_update_dca(ss); #endif + /* Try later if the busy_poll handler is running. 
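+ * Returning the whole budget reports no progress, so NAPI keeps the poll scheduled and retries once the slice lock is free.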
*/ + if (!myri10ge_ss_lock_napi(ss)) + return budget; /* process as many rx events as NAPI will allow */ work_done = myri10ge_clean_rx_done(ss, budget); + myri10ge_ss_unlock_napi(ss); if (work_done < budget) { napi_complete(napi); put_be32(htonl(3), ss->irq_claim); @@ -1535,6 +1673,34 @@ static int myri10ge_poll(struct napi_struct *napi, int budget) return work_done; } +#ifdef CONFIG_NET_RX_BUSY_POLL +static int myri10ge_busy_poll(struct napi_struct *napi) +{ + struct myri10ge_slice_state *ss = + container_of(napi, struct myri10ge_slice_state, napi); + struct myri10ge_priv *mgp = ss->mgp; + int work_done; + + /* Poll only when the link is up */ + if (mgp->link_state != MXGEFW_LINK_UP) + return LL_FLUSH_FAILED; + + if (!myri10ge_ss_lock_poll(ss)) + return LL_FLUSH_BUSY; + + /* Process a small number of packets */ + work_done = myri10ge_clean_rx_done(ss, 4); + if (work_done) + ss->busy_poll_cnt += work_done; + else + ss->busy_poll_miss++; + + myri10ge_ss_unlock_poll(ss); + + return work_done; +} +#endif /* CONFIG_NET_RX_BUSY_POLL */ + static irqreturn_t myri10ge_intr(int irq, void *arg) { struct myri10ge_slice_state *ss = arg; @@ -1742,6 +1908,10 @@ static const char myri10ge_gstrings_slice_stats[][ETH_GSTRING_LEN] = { "tx_pkt_start", "tx_pkt_done", "tx_req", "tx_done", "rx_small_cnt", "rx_big_cnt", "wake_queue", "stop_queue", "tx_linearized", +#ifdef CONFIG_NET_RX_BUSY_POLL + "rx_lock_napi_yield", "rx_lock_poll_yield", "rx_busy_poll_miss", + "rx_busy_poll_cnt", +#endif }; #define MYRI10GE_NET_STATS_LEN 21 @@ -1842,6 +2012,12 @@ myri10ge_get_ethtool_stats(struct net_device *netdev, data[i++] = (unsigned int)ss->tx.wake_queue; data[i++] = (unsigned int)ss->tx.stop_queue; data[i++] = (unsigned int)ss->tx.linearized; +#ifdef CONFIG_NET_RX_BUSY_POLL + data[i++] = ss->lock_napi_yield; + data[i++] = ss->lock_poll_yield; + data[i++] = ss->busy_poll_miss; + data[i++] = ss->busy_poll_cnt; +#endif } } @@ -2405,6 +2581,9 @@ static int myri10ge_open(struct net_device *dev) goto abort_with_rings; } + /* Initialize the slice spinlock and state used for polling */ + myri10ge_ss_init_lock(ss); + /* must happen prior to any irq */ napi_enable(&(ss)->napi); } @@ -2481,9 +2660,19 @@ static int myri10ge_close(struct net_device *dev) del_timer_sync(&mgp->watchdog_timer); mgp->running = MYRI10GE_ETH_STOPPING; + local_bh_disable(); /* myri10ge_ss_lock_napi needs bh disabled */ for (i = 0; i < mgp->num_slices; i++) { napi_disable(&mgp->ss[i].napi); + /* Lock the slice to prevent the busy_poll handler from + * accessing it. Later when we bring the NIC up, myri10ge_open + * resets the slice including this lock. + */ + while (!myri10ge_ss_lock_napi(&mgp->ss[i])) { + pr_info("Slice %d locked\n", i); + mdelay(1); + } } + local_bh_enable(); netif_carrier_off(dev); netif_tx_stop_all_queues(dev); @@ -3569,8 +3758,11 @@ static void myri10ge_free_slices(struct myri10ge_priv *mgp) ss->fw_stats, ss->fw_stats_bus); ss->fw_stats = NULL; } + napi_hash_del(&ss->napi); netif_napi_del(&ss->napi); } + /* Wait till napi structs are no longer used, and then free ss. 
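+ * napi_hash_del() above removed the slices from the busy-poll hash; synchronize_rcu() guarantees no lookup still walking that hash under RCU can touch ss once it is freed.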
*/ + synchronize_rcu(); kfree(mgp->ss); mgp->ss = NULL; } @@ -3606,6 +3798,7 @@ static int myri10ge_alloc_slices(struct myri10ge_priv *mgp) ss->dev = mgp->dev; netif_napi_add(ss->dev, &ss->napi, myri10ge_poll, myri10ge_napi_weight); + napi_hash_add(&ss->napi); } return 0; abort: @@ -3748,6 +3941,9 @@ static const struct net_device_ops myri10ge_netdev_ops = { .ndo_change_mtu = myri10ge_change_mtu, .ndo_set_rx_mode = myri10ge_set_multicast_list, .ndo_set_mac_address = myri10ge_set_mac_address, +#ifdef CONFIG_NET_RX_BUSY_POLL + .ndo_busy_poll = myri10ge_busy_poll, +#endif }; static int myri10ge_probe(struct pci_dev *pdev, const struct pci_device_id *ent) diff --git a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_ethtool.c b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_ethtool.c index 1129db0..f0ceb89 100644 --- a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_ethtool.c +++ b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_ethtool.c @@ -118,6 +118,7 @@ static int pch_gbe_set_settings(struct net_device *netdev, * filled by get_settings() on a down link, speed is -1: */ if (speed == UINT_MAX) { speed = SPEED_1000; + ethtool_cmd_speed_set(ecmd, speed); ecmd->duplex = DUPLEX_FULL; } ret = mii_ethtool_sset(&adapter->mii, ecmd); diff --git a/drivers/net/ethernet/qlogic/Kconfig b/drivers/net/ethernet/qlogic/Kconfig index 0e17972..f59e6be 100644 --- a/drivers/net/ethernet/qlogic/Kconfig +++ b/drivers/net/ethernet/qlogic/Kconfig @@ -45,6 +45,17 @@ config QLCNIC_SRIOV This allows for virtual function acceleration in virtualized environments. +config QLCNIC_DCB + bool "QLOGIC QLCNIC 82XX and 83XX family DCB Support" + depends on QLCNIC && DCB + default y + ---help--- + This configuration parameter enables DCB support in QLE83XX + and QLE82XX Converged Ethernet devices. This allows for DCB + get operations support through rtNetlink interface. Only CEE + mode of DCB is supported. PG and PFC values are related only + to Tx. + config QLGE tristate "QLogic QLGE 10Gb Ethernet Driver Support" depends on PCI diff --git a/drivers/net/ethernet/qlogic/qlcnic/Makefile b/drivers/net/ethernet/qlogic/qlcnic/Makefile index 4b1fb3f..a848d29 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/Makefile +++ b/drivers/net/ethernet/qlogic/qlcnic/Makefile @@ -11,3 +11,5 @@ qlcnic-y := qlcnic_hw.o qlcnic_main.o qlcnic_init.o \ qlcnic_minidump.o qlcnic_sriov_common.o qlcnic-$(CONFIG_QLCNIC_SRIOV) += qlcnic_sriov_pf.o + +qlcnic-$(CONFIG_QLCNIC_DCB) += qlcnic_dcb.o diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic.h b/drivers/net/ethernet/qlogic/qlcnic/qlcnic.h index 7387354..22bd425 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic.h +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic.h @@ -34,11 +34,12 @@ #include "qlcnic_hdr.h" #include "qlcnic_hw.h" #include "qlcnic_83xx_hw.h" +#include "qlcnic_dcb.h" #define _QLCNIC_LINUX_MAJOR 5 -#define _QLCNIC_LINUX_MINOR 2 -#define _QLCNIC_LINUX_SUBVERSION 46 -#define QLCNIC_LINUX_VERSIONID "5.2.46" +#define _QLCNIC_LINUX_MINOR 3 +#define _QLCNIC_LINUX_SUBVERSION 49 +#define QLCNIC_LINUX_VERSIONID "5.3.49" #define QLCNIC_DRV_IDC_VER 0x01 #define QLCNIC_DRIVER_VERSION ((_QLCNIC_LINUX_MAJOR << 16) |\ (_QLCNIC_LINUX_MINOR << 8) | (_QLCNIC_LINUX_SUBVERSION)) @@ -97,6 +98,9 @@ #define TX_STOP_THRESH ((MAX_SKB_FRAGS >> 2) + MAX_TSO_HEADER_DESC \ + MGMT_CMD_DESC_RESV) #define QLCNIC_MAX_TX_TIMEOUTS 2 +#define QLCNIC_MAX_TX_RINGS 8 +#define QLCNIC_MAX_SDS_RINGS 8 + /* * Following are the states of the Phantom. Phantom will set them and * Host will read to check if the fields are correct. 
@@ -467,6 +471,8 @@ struct qlcnic_hardware_context { u32 mbox_aen[QLC_83XX_MBX_AEN_CNT]; u32 mbox_reg[4]; struct qlcnic_mailbox *mailbox; + u8 extend_lb_time; + u8 phys_port_id[ETH_ALEN]; }; struct qlcnic_adapter_stats { @@ -514,6 +520,7 @@ struct qlcnic_host_sds_ring { u32 num_desc; void __iomem *crb_sts_consumer; + struct qlcnic_host_tx_ring *tx_ring; struct status_desc *desc_head; struct qlcnic_adapter *adapter; struct napi_struct napi; @@ -531,9 +538,17 @@ struct qlcnic_host_tx_ring { void __iomem *crb_intr_mask; char name[IFNAMSIZ + 12]; u16 ctx_id; + + u32 state; u32 producer; u32 sw_consumer; u32 num_desc; + + u64 xmit_on; + u64 xmit_off; + u64 xmit_called; + u64 xmit_finished; + void __iomem *crb_cmd_producer; struct cmd_desc_type0 *desc_head; struct qlcnic_adapter *adapter; @@ -558,7 +573,6 @@ struct qlcnic_recv_context { u32 state; u16 context_id; u16 virt_port; - }; /* HW context creation */ @@ -603,6 +617,7 @@ struct qlcnic_recv_context { #define QLCNIC_CAP0_LRO_CONTIGUOUS (1 << 8) #define QLCNIC_CAP0_VALIDOFF (1 << 11) #define QLCNIC_CAP0_LRO_MSS (1 << 21) +#define QLCNIC_CAP0_TX_MULTI (1 << 22) /* * Context state @@ -630,7 +645,7 @@ struct qlcnic_hostrq_rds_ring { struct qlcnic_hostrq_rx_ctx { __le64 host_rsp_dma_addr; /* Response dma'd here */ - __le32 capabilities[4]; /* Flag bit vector */ + __le32 capabilities[4]; /* Flag bit vector */ __le32 host_int_crb_mode; /* Interrupt crb usage */ __le32 host_rds_crb_mode; /* RDS crb usage */ /* These ring offsets are relative to data[0] below */ @@ -801,6 +816,7 @@ struct qlcnic_mac_list_s { #define QLCNIC_C2H_OPCODE_CONFIG_LOOPBACK 0x8f #define QLCNIC_C2H_OPCODE_GET_LINKEVENT_RESPONSE 0x8D +#define QLCNIC_C2H_OPCODE_GET_DCB_AEN 0x90 #define VPORT_MISS_MODE_DROP 0 /* drop all unmatched */ #define VPORT_MISS_MODE_ACCEPT_ALL 1 /* accept all packets */ @@ -813,6 +829,7 @@ struct qlcnic_mac_list_s { #define QLCNIC_FW_CAPABILITY_BDG BIT_8 #define QLCNIC_FW_CAPABILITY_FVLANTX BIT_9 #define QLCNIC_FW_CAPABILITY_HW_LRO BIT_10 +#define QLCNIC_FW_CAPABILITY_2_MULTI_TX BIT_4 #define QLCNIC_FW_CAPABILITY_MULTI_LOOPBACK BIT_27 #define QLCNIC_FW_CAPABILITY_MORE_CAPS BIT_31 @@ -912,6 +929,8 @@ struct qlcnic_ipaddr { #define QLCNIC_FW_LRO_MSS_CAP 0x8000 #define QLCNIC_TX_INTR_SHARED 0x10000 #define QLCNIC_APP_CHANGED_FLAGS 0x20000 +#define QLCNIC_HAS_PHYS_PORT_ID 0x40000 + #define QLCNIC_IS_MSI_FAMILY(adapter) \ ((adapter)->flags & (QLCNIC_MSI_ENABLED | QLCNIC_MSIX_ENABLED)) #define QLCNIC_IS_TSO_CAPABLE(adapter) \ @@ -921,6 +940,7 @@ struct qlcnic_ipaddr { #define QLCNIC_BEACON_DISABLE 0xD #define QLCNIC_DEF_NUM_STS_DESC_RINGS 4 +#define QLCNIC_DEF_NUM_TX_RINGS 4 #define QLCNIC_MSIX_TBL_SPACE 8192 #define QLCNIC_PCI_REG_MSIX_TBL 0x44 #define QLCNIC_MSIX_TBL_PGSIZE 4096 @@ -936,10 +956,13 @@ struct qlcnic_ipaddr { #define __QLCNIC_DIAG_RES_ALLOC 6 #define __QLCNIC_LED_ENABLE 7 #define __QLCNIC_ELB_INPROGRESS 8 +#define __QLCNIC_MULTI_TX_UNIQUE 9 #define __QLCNIC_SRIOV_ENABLE 10 #define __QLCNIC_SRIOV_CAPABLE 11 #define __QLCNIC_MBX_POLL_ENABLE 12 #define __QLCNIC_DIAG_MODE 13 +#define __QLCNIC_DCB_STATE 14 +#define __QLCNIC_DCB_IN_AEN 15 #define QLCNIC_INTERRUPT_TEST 1 #define QLCNIC_LOOPBACK_TEST 2 @@ -1043,6 +1066,7 @@ struct qlcnic_adapter { struct delayed_work fw_work; struct delayed_work idc_aen_work; struct delayed_work mbx_poll_work; + struct qlcnic_dcb *dcb; struct qlcnic_filter_hash fhash; struct qlcnic_filter_hash rx_fhash; @@ -1481,7 +1505,8 @@ void qlcnic_fw_destroy_ctx(struct qlcnic_adapter *adapter); void 
qlcnic_reset_rx_buffers_list(struct qlcnic_adapter *adapter); void qlcnic_release_rx_buffers(struct qlcnic_adapter *adapter); -void qlcnic_release_tx_buffers(struct qlcnic_adapter *adapter); +void qlcnic_release_tx_buffers(struct qlcnic_adapter *, + struct qlcnic_host_tx_ring *); int qlcnic_check_fw_status(struct qlcnic_adapter *adapter); void qlcnic_watchdog_task(struct work_struct *work); @@ -1493,6 +1518,7 @@ void __qlcnic_set_multi(struct net_device *, u16); int qlcnic_nic_add_mac(struct qlcnic_adapter *, const u8 *, u16); int qlcnic_nic_del_mac(struct qlcnic_adapter *, const u8 *); void qlcnic_82xx_free_mac_list(struct qlcnic_adapter *adapter); +int qlcnic_82xx_read_phys_port_id(struct qlcnic_adapter *); int qlcnic_fw_cmd_set_mtu(struct qlcnic_adapter *adapter, int mtu); int qlcnic_fw_cmd_set_drv_version(struct qlcnic_adapter *, u32); @@ -1514,8 +1540,9 @@ int qlcnic_reset_context(struct qlcnic_adapter *); void qlcnic_diag_free_res(struct net_device *netdev, int max_sds_rings); int qlcnic_diag_alloc_res(struct net_device *netdev, int test); netdev_tx_t qlcnic_xmit_frame(struct sk_buff *skb, struct net_device *netdev); -int qlcnic_set_max_rss(struct qlcnic_adapter *, u8, size_t); +int qlcnic_set_max_rss(struct qlcnic_adapter *, u8, int); int qlcnic_validate_max_rss(struct qlcnic_adapter *, __u32); +int qlcnic_validate_max_tx_rings(struct qlcnic_adapter *, u32 txq); void qlcnic_alloc_lb_filters_mem(struct qlcnic_adapter *adapter); void qlcnic_82xx_set_mac_filter_count(struct qlcnic_adapter *); int qlcnic_enable_msix(struct qlcnic_adapter *, u32); @@ -1542,6 +1569,7 @@ void qlcnic_free_sds_rings(struct qlcnic_recv_context *); void qlcnic_advert_link_change(struct qlcnic_adapter *, int); void qlcnic_free_tx_rings(struct qlcnic_adapter *); int qlcnic_alloc_tx_rings(struct qlcnic_adapter *, struct net_device *); +void qlcnic_dump_mbx(struct qlcnic_adapter *, struct qlcnic_cmd_args *); void qlcnic_create_sysfs_entries(struct qlcnic_adapter *adapter); void qlcnic_remove_sysfs_entries(struct qlcnic_adapter *adapter); @@ -1604,6 +1632,26 @@ static inline u32 qlcnic_tx_avail(struct qlcnic_host_tx_ring *tx_ring) tx_ring->producer; } +static inline int qlcnic_set_real_num_queues(struct qlcnic_adapter *adapter, + struct net_device *netdev) +{ + int err, tx_q; + + tx_q = adapter->max_drv_tx_rings; + + netdev->num_tx_queues = tx_q; + netdev->real_num_tx_queues = tx_q; + + err = netif_set_real_num_tx_queues(netdev, tx_q); + if (err) + dev_err(&adapter->pdev->dev, "failed to set %d Tx queues\n", + tx_q); + else + dev_info(&adapter->pdev->dev, "set %d Tx queues\n", tx_q); + + return err; +} + struct qlcnic_nic_template { int (*config_bridged_mode) (struct qlcnic_adapter *, u32); int (*config_led) (struct qlcnic_adapter *, u32, u32); @@ -1640,8 +1688,8 @@ struct qlcnic_hardware_ops { int (*read_reg) (struct qlcnic_adapter *, ulong, int *); int (*write_reg) (struct qlcnic_adapter *, ulong, u32); void (*get_ocm_win) (struct qlcnic_hardware_context *); - int (*get_mac_address) (struct qlcnic_adapter *, u8 *); - int (*setup_intr) (struct qlcnic_adapter *, u8); + int (*get_mac_address) (struct qlcnic_adapter *, u8 *, u8); + int (*setup_intr) (struct qlcnic_adapter *, u8, int); int (*alloc_mbx_args)(struct qlcnic_cmd_args *, struct qlcnic_adapter *, u32); int (*mbx_cmd) (struct qlcnic_adapter *, struct qlcnic_cmd_args *); @@ -1674,6 +1722,7 @@ struct qlcnic_hardware_ops { int (*get_board_info) (struct qlcnic_adapter *); void (*set_mac_filter_count) (struct qlcnic_adapter *); void (*free_mac_list) (struct 
qlcnic_adapter *); + int (*read_phys_port_id) (struct qlcnic_adapter *); }; extern struct qlcnic_nic_template qlcnic_vf_ops; @@ -1702,14 +1751,15 @@ static inline int qlcnic_hw_write_wx_2M(struct qlcnic_adapter *adapter, } static inline int qlcnic_get_mac_address(struct qlcnic_adapter *adapter, - u8 *mac) + u8 *mac, u8 function) { - return adapter->ahw->hw_ops->get_mac_address(adapter, mac); + return adapter->ahw->hw_ops->get_mac_address(adapter, mac, function); } -static inline int qlcnic_setup_intr(struct qlcnic_adapter *adapter, u8 num_intr) +static inline int qlcnic_setup_intr(struct qlcnic_adapter *adapter, + u8 num_intr, int txq) { - return adapter->ahw->hw_ops->setup_intr(adapter, num_intr); + return adapter->ahw->hw_ops->setup_intr(adapter, num_intr, txq); } static inline int qlcnic_alloc_mbx_args(struct qlcnic_cmd_args *mbx, @@ -1900,6 +1950,12 @@ static inline void qlcnic_set_mac_filter_count(struct qlcnic_adapter *adapter) adapter->ahw->hw_ops->set_mac_filter_count(adapter); } +static inline void qlcnic_read_phys_port_id(struct qlcnic_adapter *adapter) +{ + if (adapter->ahw->hw_ops->read_phys_port_id) + adapter->ahw->hw_ops->read_phys_port_id(adapter); +} + static inline void qlcnic_dev_request_reset(struct qlcnic_adapter *adapter, u32 key) { @@ -1931,16 +1987,45 @@ static inline void qlcnic_config_ipaddr(struct qlcnic_adapter *adapter, adapter->nic_ops->config_ipaddr(adapter, ip, cmd); } +static inline bool qlcnic_check_multi_tx(struct qlcnic_adapter *adapter) +{ + return test_bit(__QLCNIC_MULTI_TX_UNIQUE, &adapter->state); +} + +static inline void qlcnic_disable_multi_tx(struct qlcnic_adapter *adapter) +{ + test_and_clear_bit(__QLCNIC_MULTI_TX_UNIQUE, &adapter->state); + adapter->max_drv_tx_rings = 1; +} + +/* When operating in multi Tx mode, the driver needs to write 0x1 + * to the src register, instead of 0x0, to disable receiving interrupts. + */ static inline void qlcnic_disable_int(struct qlcnic_host_sds_ring *sds_ring) { - writel(0, sds_ring->crb_intr_mask); + struct qlcnic_adapter *adapter = sds_ring->adapter; + + if (qlcnic_check_multi_tx(adapter) && + !adapter->ahw->diag_test && + (adapter->flags & QLCNIC_MSIX_ENABLED)) + writel(0x1, sds_ring->crb_intr_mask); + else + writel(0, sds_ring->crb_intr_mask); } +/* When operating in multi Tx mode, the driver needs to write 0x0 + * to the src register, instead of 0x1, to enable receiving interrupts.
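+ * (With multi Tx enabled, crb_intr_mask points at the per-vector interrupt-source register taken from intr_tbl[].src, where writing 1 masks the vector and 0 unmasks it, the reverse of the legacy mask register's polarity.)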
+ */ static inline void qlcnic_enable_int(struct qlcnic_host_sds_ring *sds_ring) { struct qlcnic_adapter *adapter = sds_ring->adapter; - writel(0x1, sds_ring->crb_intr_mask); + if (qlcnic_check_multi_tx(adapter) && + !adapter->ahw->diag_test && + (adapter->flags & QLCNIC_MSIX_ENABLED)) + writel(0, sds_ring->crb_intr_mask); + else + writel(0x1, sds_ring->crb_intr_mask); if (!QLCNIC_IS_MSI_FAMILY(adapter)) writel(0xfbff, adapter->tgt_mask_reg); @@ -1972,9 +2057,11 @@ extern const struct ethtool_ops qlcnic_ethtool_failed_ops; __func__, ##_args); \ } while (0) -#define PCI_DEVICE_ID_QLOGIC_QLE834X 0x8030 +#define PCI_DEVICE_ID_QLOGIC_QLE824X 0x8020 +#define PCI_DEVICE_ID_QLOGIC_QLE834X 0x8030 #define PCI_DEVICE_ID_QLOGIC_VF_QLE834X 0x8430 -#define PCI_DEVICE_ID_QLOGIC_QLE824X 0x8020 +#define PCI_DEVICE_ID_QLOGIC_QLE844X 0x8040 +#define PCI_DEVICE_ID_QLOGIC_VF_QLE844X 0x8440 static inline bool qlcnic_82xx_check(struct qlcnic_adapter *adapter) { @@ -1988,6 +2075,8 @@ static inline bool qlcnic_83xx_check(struct qlcnic_adapter *adapter) bool status; status = ((device == PCI_DEVICE_ID_QLOGIC_QLE834X) || + (device == PCI_DEVICE_ID_QLOGIC_QLE844X) || + (device == PCI_DEVICE_ID_QLOGIC_VF_QLE844X) || (device == PCI_DEVICE_ID_QLOGIC_VF_QLE834X)) ? true : false; return status; @@ -2001,7 +2090,105 @@ static inline bool qlcnic_sriov_pf_check(struct qlcnic_adapter *adapter) static inline bool qlcnic_sriov_vf_check(struct qlcnic_adapter *adapter) { unsigned short device = adapter->pdev->device; + bool status; + + status = ((device == PCI_DEVICE_ID_QLOGIC_VF_QLE834X) || + (device == PCI_DEVICE_ID_QLOGIC_VF_QLE844X)) ? true : false; + + return status; +} + +static inline int qlcnic_dcb_get_hw_capability(struct qlcnic_adapter *adapter) +{ + struct qlcnic_dcb *dcb = adapter->dcb; + + if (dcb && dcb->ops->get_hw_capability) + return dcb->ops->get_hw_capability(adapter); + + return 0; +} + +static inline void qlcnic_dcb_free(struct qlcnic_adapter *adapter) +{ + struct qlcnic_dcb *dcb = adapter->dcb; + + if (dcb && dcb->ops->free) + dcb->ops->free(adapter); +} + +static inline int qlcnic_dcb_attach(struct qlcnic_adapter *adapter) +{ + struct qlcnic_dcb *dcb = adapter->dcb; + + if (dcb && dcb->ops->attach) + return dcb->ops->attach(adapter); + + return 0; +} + +static inline int +qlcnic_dcb_query_hw_capability(struct qlcnic_adapter *adapter, char *buf) +{ + struct qlcnic_dcb *dcb = adapter->dcb; + + if (dcb && dcb->ops->query_hw_capability) + return dcb->ops->query_hw_capability(adapter, buf); + + return 0; +} + +static inline void qlcnic_dcb_get_info(struct qlcnic_adapter *adapter) +{ + struct qlcnic_dcb *dcb = adapter->dcb; + + if (dcb && dcb->ops->get_info) + dcb->ops->get_info(adapter); +} + +static inline int +qlcnic_dcb_query_cee_param(struct qlcnic_adapter *adapter, char *buf, u8 type) +{ + struct qlcnic_dcb *dcb = adapter->dcb; + + if (dcb && dcb->ops->query_cee_param) + return dcb->ops->query_cee_param(adapter, buf, type); + + return 0; +} + +static inline int qlcnic_dcb_get_cee_cfg(struct qlcnic_adapter *adapter) +{ + struct qlcnic_dcb *dcb = adapter->dcb; + + if (dcb && dcb->ops->get_cee_cfg) + return dcb->ops->get_cee_cfg(adapter); + + return 0; +} + +static inline void +qlcnic_dcb_register_aen(struct qlcnic_adapter *adapter, u8 flag) +{ + struct qlcnic_dcb *dcb = adapter->dcb; + + if (dcb && dcb->ops->register_aen) + dcb->ops->register_aen(adapter, flag); +} + +static inline void qlcnic_dcb_handle_aen(struct qlcnic_adapter *adapter, + void *msg) +{ + struct qlcnic_dcb *dcb = adapter->dcb; + + if (dcb 
&& dcb->ops->handle_aen) + dcb->ops->handle_aen(adapter, msg); +} + +static inline void qlcnic_dcb_init_dcbnl_ops(struct qlcnic_adapter *adapter) +{ + struct qlcnic_dcb *dcb = adapter->dcb; - return (device == PCI_DEVICE_ID_QLOGIC_VF_QLE834X) ? true : false; + if (dcb && dcb->ops->init_dcbnl_ops) + dcb->ops->init_dcbnl_ops(adapter); } #endif /* __QLCNIC_H_ */ diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c index e53caaa..8fce1d3 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c @@ -67,6 +67,8 @@ static const struct qlcnic_mailbox_metadata qlcnic_83xx_mbx_tbl[] = { {QLCNIC_CMD_ADD_RCV_RINGS, 130, 26}, {QLCNIC_CMD_CONFIG_VPORT, 4, 4}, {QLCNIC_CMD_BC_EVENT_SETUP, 2, 1}, + {QLCNIC_CMD_DCB_QUERY_CAP, 1, 2}, + {QLCNIC_CMD_DCB_QUERY_PARAM, 2, 50}, }; const u32 qlcnic_83xx_ext_reg_tbl[] = { @@ -261,7 +263,7 @@ int qlcnic_83xx_wrt_reg_indirect(struct qlcnic_adapter *adapter, ulong addr, } } -int qlcnic_83xx_setup_intr(struct qlcnic_adapter *adapter, u8 num_intr) +int qlcnic_83xx_setup_intr(struct qlcnic_adapter *adapter, u8 num_intr, int txq) { int err, i, num_msix; struct qlcnic_hardware_context *ahw = adapter->ahw; @@ -695,8 +697,8 @@ static void qlcnic_83xx_handle_link_aen(struct qlcnic_adapter *adapter, static void qlcnic_83xx_handle_idc_comp_aen(struct qlcnic_adapter *adapter, u32 data[]); -static void qlcnic_dump_mbx(struct qlcnic_adapter *adapter, - struct qlcnic_cmd_args *cmd) +void qlcnic_dump_mbx(struct qlcnic_adapter *adapter, + struct qlcnic_cmd_args *cmd) { int i; @@ -860,9 +862,9 @@ static void qlcnic_83xx_handle_idc_comp_aen(struct qlcnic_adapter *adapter, void __qlcnic_83xx_process_aen(struct qlcnic_adapter *adapter) { + struct qlcnic_hardware_context *ahw = adapter->ahw; u32 event[QLC_83XX_MBX_AEN_CNT]; int i; - struct qlcnic_hardware_context *ahw = adapter->ahw; for (i = 0; i < QLC_83XX_MBX_AEN_CNT; i++) event[i] = readl(QLCNIC_MBX_FW(ahw, i)); @@ -882,6 +884,7 @@ void __qlcnic_83xx_process_aen(struct qlcnic_adapter *adapter) &adapter->idc_aen_work, 0); break; case QLCNIC_MBX_TIME_EXTEND_EVENT: + ahw->extend_lb_time = event[1] >> 8 & 0xf; break; case QLCNIC_MBX_BC_EVENT: qlcnic_sriov_handle_bc_event(adapter, event[1]); @@ -894,6 +897,9 @@ void __qlcnic_83xx_process_aen(struct qlcnic_adapter *adapter) dev_info(&adapter->pdev->dev, "SFP Removed AEN:0x%x.\n", QLCNIC_MBX_RSP(event[0])); break; + case QLCNIC_MBX_DCBX_CONFIG_CHANGE_EVENT: + qlcnic_dcb_handle_aen(adapter, (void *)&event[1]); + break; default: dev_dbg(&adapter->pdev->dev, "Unsupported AEN:0x%x.\n", QLCNIC_MBX_RSP(event[0])); @@ -1690,7 +1696,7 @@ int qlcnic_83xx_loopback_test(struct net_device *netdev, u8 mode) /* Make sure carrier is off and queue is stopped during loopback */ if (netif_running(netdev)) { netif_carrier_off(netdev); - netif_stop_queue(netdev); + netif_tx_stop_all_queues(netdev); } ret = qlcnic_do_lb_test(adapter, mode); @@ -1706,13 +1712,28 @@ fail_diag_alloc: return ret; } +static void qlcnic_extend_lb_idc_cmpltn_wait(struct qlcnic_adapter *adapter, + u32 *max_wait_count) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + int temp; + + netdev_info(adapter->netdev, "Received loopback IDC time extend event for 0x%x seconds\n", + ahw->extend_lb_time); + temp = ahw->extend_lb_time * 1000; + *max_wait_count += temp / QLC_83XX_LB_MSLEEP_COUNT; + ahw->extend_lb_time = 0; +} + int qlcnic_83xx_set_lb_mode(struct qlcnic_adapter *adapter, u8 mode) { struct
qlcnic_hardware_context *ahw = adapter->ahw; struct net_device *netdev = adapter->netdev; + u32 config, max_wait_count; int status = 0, loop = 0; - u32 config; + ahw->extend_lb_time = 0; + max_wait_count = QLC_83XX_LB_WAIT_COUNT; status = qlcnic_83xx_get_port_config(adapter); if (status) return status; @@ -1754,9 +1775,14 @@ int qlcnic_83xx_set_lb_mode(struct qlcnic_adapter *adapter, u8 mode) clear_bit(QLC_83XX_IDC_COMP_AEN, &ahw->idc.status); return -EBUSY; } - if (loop++ > QLC_83XX_LB_WAIT_COUNT) { - netdev_err(netdev, - "Did not receive IDC completion AEN\n"); + + if (ahw->extend_lb_time) + qlcnic_extend_lb_idc_cmpltn_wait(adapter, + &max_wait_count); + + if (loop++ > max_wait_count) { + netdev_err(netdev, "%s: Did not receive loopback IDC completion AEN\n", + __func__); clear_bit(QLC_83XX_IDC_COMP_AEN, &ahw->idc.status); qlcnic_83xx_clear_lb_mode(adapter, mode); return -ETIMEDOUT; @@ -1771,10 +1797,12 @@ int qlcnic_83xx_set_lb_mode(struct qlcnic_adapter *adapter, u8 mode) int qlcnic_83xx_clear_lb_mode(struct qlcnic_adapter *adapter, u8 mode) { struct qlcnic_hardware_context *ahw = adapter->ahw; + u32 config = ahw->port_config, max_wait_count; struct net_device *netdev = adapter->netdev; int status = 0, loop = 0; - u32 config = ahw->port_config; + ahw->extend_lb_time = 0; + max_wait_count = QLC_83XX_LB_WAIT_COUNT; set_bit(QLC_83XX_IDC_COMP_AEN, &ahw->idc.status); if (mode == QLCNIC_ILB_MODE) ahw->port_config &= ~QLC_83XX_CFG_LOOPBACK_HSS; @@ -1802,9 +1830,13 @@ int qlcnic_83xx_clear_lb_mode(struct qlcnic_adapter *adapter, u8 mode) return -EBUSY; } - if (loop++ > QLC_83XX_LB_WAIT_COUNT) { - netdev_err(netdev, - "Did not receive IDC completion AEN\n"); + if (ahw->extend_lb_time) + qlcnic_extend_lb_idc_cmpltn_wait(adapter, + &max_wait_count); + + if (loop++ > max_wait_count) { + netdev_err(netdev, "%s: Did not receive loopback IDC completion AEN\n", + __func__); clear_bit(QLC_83XX_IDC_COMP_AEN, &ahw->idc.status); return -ETIMEDOUT; } @@ -2010,12 +2042,14 @@ void qlcnic_83xx_configure_mac(struct qlcnic_adapter *adapter, u8 *mac, cmd->req.arg[1] = type; } -int qlcnic_83xx_get_mac_address(struct qlcnic_adapter *adapter, u8 *mac) +int qlcnic_83xx_get_mac_address(struct qlcnic_adapter *adapter, u8 *mac, + u8 function) { int err, i; struct qlcnic_cmd_args cmd; u32 mac_low, mac_high; + function = 0; err = qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_MAC_ADDRESS); if (err) return err; diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.h b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.h index dd22ef3..0fc5616 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.h +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.h @@ -84,9 +84,11 @@ /* Firmware image definitions */ #define QLC_83XX_BOOTLOADER_FLASH_ADDR 0x10000 #define QLC_83XX_FW_FILE_NAME "83xx_fw.bin" +#define QLC_84XX_FW_FILE_NAME "84xx_fw.bin" #define QLC_83XX_BOOT_FROM_FLASH 0 #define QLC_83XX_BOOT_FROM_FILE 0x12345678 +#define QLC_FW_FILE_NAME_LEN 20 #define QLC_83XX_MAX_RESET_SEQ_ENTRIES 16 #define QLC_83XX_MBX_POST_BC_OP 0x1 @@ -404,6 +406,7 @@ enum qlcnic_83xx_states { #define QLC_83XX_MAX_MC_COUNT 38 #define QLC_83XX_MAX_UC_COUNT 4096 +#define QLC_83XX_PVID_STRIP_CAPABILITY BIT_22 #define QLC_83XX_GET_FUNC_MODE_FROM_NPAR_INFO(val) (val & 0x80000000) #define QLC_83XX_GET_LRO_CAPABILITY(val) (val & 0x20) #define QLC_83XX_GET_LSO_CAPABILITY(val) (val & 0x40) @@ -520,7 +523,7 @@ enum qlc_83xx_ext_regs { /* 83xx functions */ int qlcnic_83xx_get_fw_version(struct qlcnic_adapter *); int
qlcnic_83xx_issue_cmd(struct qlcnic_adapter *, struct qlcnic_cmd_args *); -int qlcnic_83xx_setup_intr(struct qlcnic_adapter *, u8); +int qlcnic_83xx_setup_intr(struct qlcnic_adapter *, u8, int); void qlcnic_83xx_get_func_no(struct qlcnic_adapter *); int qlcnic_83xx_cam_lock(struct qlcnic_adapter *); void qlcnic_83xx_cam_unlock(struct qlcnic_adapter *); @@ -561,7 +564,7 @@ int qlcnic_83xx_setup_link_event(struct qlcnic_adapter *, int); void qlcnic_83xx_process_rcv_ring_diag(struct qlcnic_host_sds_ring *); int qlcnic_83xx_config_intrpt(struct qlcnic_adapter *, bool); int qlcnic_83xx_sre_macaddr_change(struct qlcnic_adapter *, u8 *, u16, u8); -int qlcnic_83xx_get_mac_address(struct qlcnic_adapter *, u8 *); +int qlcnic_83xx_get_mac_address(struct qlcnic_adapter *, u8 *, u8); void qlcnic_83xx_configure_mac(struct qlcnic_adapter *, u8 *, u8, struct qlcnic_cmd_args *); int qlcnic_83xx_alloc_mbx_args(struct qlcnic_cmd_args *, diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_init.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_init.c index c97e2e0..a969ac2 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_init.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_init.c @@ -635,6 +635,8 @@ int qlcnic_83xx_idc_reattach_driver(struct qlcnic_adapter *adapter) if (adapter->portnum == 0) qlcnic_set_drv_version(adapter); + + qlcnic_dcb_get_info(adapter); qlcnic_83xx_idc_attach_driver(adapter); return 0; @@ -1948,12 +1950,36 @@ static void qlcnic_83xx_init_hw(struct qlcnic_adapter *p_dev) dev_err(&p_dev->pdev->dev, "%s: failed\n", __func__); } +static inline void qlcnic_83xx_get_fw_file_name(struct qlcnic_adapter *adapter, + char *file_name) +{ + struct pci_dev *pdev = adapter->pdev; + + memset(file_name, 0, QLC_FW_FILE_NAME_LEN); + + switch (pdev->device) { + case PCI_DEVICE_ID_QLOGIC_QLE834X: + strncpy(file_name, QLC_83XX_FW_FILE_NAME, + QLC_FW_FILE_NAME_LEN); + break; + case PCI_DEVICE_ID_QLOGIC_QLE844X: + strncpy(file_name, QLC_84XX_FW_FILE_NAME, + QLC_FW_FILE_NAME_LEN); + break; + default: + dev_err(&pdev->dev, "%s: Invalid device id\n", + __func__); + } +} + static int qlcnic_83xx_load_fw_image_from_host(struct qlcnic_adapter *adapter) { + char fw_file_name[QLC_FW_FILE_NAME_LEN]; int err = -EIO; - if (request_firmware(&adapter->ahw->fw_info.fw, - QLC_83XX_FW_FILE_NAME, &(adapter->pdev->dev))) { + qlcnic_83xx_get_fw_file_name(adapter, fw_file_name); + if (request_firmware(&adapter->ahw->fw_info.fw, fw_file_name, + &(adapter->pdev->dev))) { dev_err(&adapter->pdev->dev, "No file FW image, loading flash FW image.\n"); QLC_SHARED_REG_WR32(adapter, QLCNIC_FW_IMG_VALID, @@ -2177,7 +2203,7 @@ int qlcnic_83xx_init(struct qlcnic_adapter *adapter, int pci_using_dac) if (err) goto detach_mbx; - err = qlcnic_setup_intr(adapter, 0); + err = qlcnic_setup_intr(adapter, 0, 0); if (err) { dev_err(&adapter->pdev->dev, "Failed to setup interrupt\n"); goto disable_intr; @@ -2204,6 +2230,9 @@ int qlcnic_83xx_init(struct qlcnic_adapter *adapter, int pci_using_dac) if (err) goto disable_mbx_intr; + if (adapter->dcb && qlcnic_dcb_attach(adapter)) + qlcnic_clear_dcb_ops(adapter); + /* Periodically monitor device status */ qlcnic_83xx_idc_poll_dev_state(&adapter->fw_work.work); return 0; diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c index d09389b..bf3b17e 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c @@ -38,6 +38,9 @@ static const struct qlcnic_mailbox_metadata 
qlcnic_mbx_tbl[] = { {QLCNIC_CMD_GET_TEMP_HDR, 4, 1}, {QLCNIC_CMD_82XX_SET_DRV_VER, 4, 1}, {QLCNIC_CMD_GET_LED_STATUS, 4, 2}, + {QLCNIC_CMD_MQ_TX_CONFIG_INTR, 2, 3}, + {QLCNIC_CMD_DCB_QUERY_CAP, 1, 2}, + {QLCNIC_CMD_DCB_QUERY_PARAM, 4, 1}, }; static inline u32 qlcnic_get_cmd_signature(struct qlcnic_hardware_context *ahw) @@ -171,6 +174,7 @@ int qlcnic_82xx_issue_cmd(struct qlcnic_adapter *adapter, break; } dev_err(&pdev->dev, fmt, cmd->rsp.arg[0]); + qlcnic_dump_mbx(adapter, cmd); } else if (rsp == QLCNIC_CDRP_RSP_OK) cmd->rsp.arg[0] = QLCNIC_RCODE_SUCCESS; @@ -243,40 +247,38 @@ qlcnic_fw_cmd_set_mtu(struct qlcnic_adapter *adapter, int mtu) int qlcnic_82xx_fw_cmd_create_rx_ctx(struct qlcnic_adapter *adapter) { - void *addr; - struct qlcnic_hostrq_rx_ctx *prq; - struct qlcnic_cardrsp_rx_ctx *prsp; - struct qlcnic_hostrq_rds_ring *prq_rds; - struct qlcnic_hostrq_sds_ring *prq_sds; + struct qlcnic_recv_context *recv_ctx = adapter->recv_ctx; + struct qlcnic_hardware_context *ahw = adapter->ahw; + dma_addr_t hostrq_phys_addr, cardrsp_phys_addr; + struct net_device *netdev = adapter->netdev; + u32 temp_intr_crb_mode, temp_rds_crb_mode; struct qlcnic_cardrsp_rds_ring *prsp_rds; struct qlcnic_cardrsp_sds_ring *prsp_sds; + struct qlcnic_hostrq_rds_ring *prq_rds; + struct qlcnic_hostrq_sds_ring *prq_sds; struct qlcnic_host_rds_ring *rds_ring; struct qlcnic_host_sds_ring *sds_ring; - struct qlcnic_cmd_args cmd; - - dma_addr_t hostrq_phys_addr, cardrsp_phys_addr; - u64 phys_addr; - + struct qlcnic_cardrsp_rx_ctx *prsp; + struct qlcnic_hostrq_rx_ctx *prq; u8 i, nrds_rings, nsds_rings; - u16 temp_u16; + struct qlcnic_cmd_args cmd; size_t rq_size, rsp_size; u32 cap, reg, val, reg2; + u64 phys_addr; + u16 temp_u16; + void *addr; int err; - struct qlcnic_recv_context *recv_ctx = adapter->recv_ctx; - nrds_rings = adapter->max_rds_rings; nsds_rings = adapter->max_sds_rings; - rq_size = - SIZEOF_HOSTRQ_RX(struct qlcnic_hostrq_rx_ctx, nrds_rings, - nsds_rings); - rsp_size = - SIZEOF_CARDRSP_RX(struct qlcnic_cardrsp_rx_ctx, nrds_rings, - nsds_rings); + rq_size = SIZEOF_HOSTRQ_RX(struct qlcnic_hostrq_rx_ctx, nrds_rings, + nsds_rings); + rsp_size = SIZEOF_CARDRSP_RX(struct qlcnic_cardrsp_rx_ctx, nrds_rings, + nsds_rings); addr = dma_alloc_coherent(&adapter->pdev->dev, rq_size, - &hostrq_phys_addr, GFP_KERNEL); + &hostrq_phys_addr, GFP_KERNEL); if (addr == NULL) return -ENOMEM; prq = addr; @@ -295,15 +297,20 @@ int qlcnic_82xx_fw_cmd_create_rx_ctx(struct qlcnic_adapter *adapter) | QLCNIC_CAP0_VALIDOFF); cap |= (QLCNIC_CAP0_JUMBO_CONTIGUOUS | QLCNIC_CAP0_LRO_CONTIGUOUS); - temp_u16 = offsetof(struct qlcnic_hostrq_rx_ctx, msix_handler); - prq->valid_field_offset = cpu_to_le16(temp_u16); - prq->txrx_sds_binding = nsds_rings - 1; + if (qlcnic_check_multi_tx(adapter) && + !adapter->ahw->diag_test) { + cap |= QLCNIC_CAP0_TX_MULTI; + } else { + temp_u16 = offsetof(struct qlcnic_hostrq_rx_ctx, msix_handler); + prq->valid_field_offset = cpu_to_le16(temp_u16); + prq->txrx_sds_binding = nsds_rings - 1; + temp_intr_crb_mode = QLCNIC_HOST_INT_CRB_MODE_SHARED; + prq->host_int_crb_mode = cpu_to_le32(temp_intr_crb_mode); + temp_rds_crb_mode = QLCNIC_HOST_RDS_CRB_MODE_UNIQUE; + prq->host_rds_crb_mode = cpu_to_le32(temp_rds_crb_mode); + } prq->capabilities[0] = cpu_to_le32(cap); - prq->host_int_crb_mode = - cpu_to_le32(QLCNIC_HOST_INT_CRB_MODE_SHARED); - prq->host_rds_crb_mode = - cpu_to_le32(QLCNIC_HOST_RDS_CRB_MODE_UNIQUE); prq->num_rds_rings = cpu_to_le16(nrds_rings); prq->num_sds_rings = cpu_to_le16(nsds_rings); @@ -317,10 
+324,8 @@ int qlcnic_82xx_fw_cmd_create_rx_ctx(struct qlcnic_adapter *adapter) le32_to_cpu(prq->rds_ring_offset)); for (i = 0; i < nrds_rings; i++) { - rds_ring = &recv_ctx->rds_rings[i]; rds_ring->producer = 0; - prq_rds[i].host_phys_addr = cpu_to_le64(rds_ring->phys_addr); prq_rds[i].ring_size = cpu_to_le32(rds_ring->num_desc); prq_rds[i].ring_kind = cpu_to_le32(i); @@ -331,14 +336,16 @@ int qlcnic_82xx_fw_cmd_create_rx_ctx(struct qlcnic_adapter *adapter) le32_to_cpu(prq->sds_ring_offset)); for (i = 0; i < nsds_rings; i++) { - sds_ring = &recv_ctx->sds_rings[i]; sds_ring->consumer = 0; memset(sds_ring->desc_head, 0, STATUS_DESC_RINGSIZE(sds_ring)); - prq_sds[i].host_phys_addr = cpu_to_le64(sds_ring->phys_addr); prq_sds[i].ring_size = cpu_to_le32(sds_ring->num_desc); - prq_sds[i].msi_index = cpu_to_le16(i); + if (qlcnic_check_multi_tx(adapter) && + !adapter->ahw->diag_test) + prq_sds[i].msi_index = cpu_to_le16(ahw->intr_tbl[i].id); + else + prq_sds[i].msi_index = cpu_to_le16(i); } phys_addr = hostrq_phys_addr; @@ -361,9 +368,8 @@ int qlcnic_82xx_fw_cmd_create_rx_ctx(struct qlcnic_adapter *adapter) for (i = 0; i < le16_to_cpu(prsp->num_rds_rings); i++) { rds_ring = &recv_ctx->rds_rings[i]; - reg = le32_to_cpu(prsp_rds[i].host_producer_crb); - rds_ring->crb_rcv_producer = adapter->ahw->pci_base0 + reg; + rds_ring->crb_rcv_producer = ahw->pci_base0 + reg; } prsp_sds = ((struct qlcnic_cardrsp_sds_ring *) @@ -371,24 +377,30 @@ int qlcnic_82xx_fw_cmd_create_rx_ctx(struct qlcnic_adapter *adapter) for (i = 0; i < le16_to_cpu(prsp->num_sds_rings); i++) { sds_ring = &recv_ctx->sds_rings[i]; - reg = le32_to_cpu(prsp_sds[i].host_consumer_crb); - reg2 = le32_to_cpu(prsp_sds[i].interrupt_crb); + if (qlcnic_check_multi_tx(adapter) && !adapter->ahw->diag_test) + reg2 = ahw->intr_tbl[i].src; + else + reg2 = le32_to_cpu(prsp_sds[i].interrupt_crb); - sds_ring->crb_sts_consumer = adapter->ahw->pci_base0 + reg; - sds_ring->crb_intr_mask = adapter->ahw->pci_base0 + reg2; + sds_ring->crb_intr_mask = ahw->pci_base0 + reg2; + sds_ring->crb_sts_consumer = ahw->pci_base0 + reg; } recv_ctx->state = le32_to_cpu(prsp->host_ctx_state); recv_ctx->context_id = le16_to_cpu(prsp->context_id); recv_ctx->virt_port = prsp->virt_port; + netdev_info(netdev, "Rx Context[%d] Created, state 0x%x\n", + recv_ctx->context_id, recv_ctx->state); qlcnic_free_mbx_args(&cmd); + out_free_rsp: dma_free_coherent(&adapter->pdev->dev, rsp_size, prsp, cardrsp_phys_addr); out_free_rq: dma_free_coherent(&adapter->pdev->dev, rq_size, prq, hostrq_phys_addr); + return err; } @@ -416,16 +428,19 @@ int qlcnic_82xx_fw_cmd_create_tx_ctx(struct qlcnic_adapter *adapter, struct qlcnic_host_tx_ring *tx_ring, int ring) { + struct qlcnic_hardware_context *ahw = adapter->ahw; + struct net_device *netdev = adapter->netdev; struct qlcnic_hostrq_tx_ctx *prq; struct qlcnic_hostrq_cds_ring *prq_cds; struct qlcnic_cardrsp_tx_ctx *prsp; - void *rq_addr, *rsp_addr; - size_t rq_size, rsp_size; - u32 temp; struct qlcnic_cmd_args cmd; - int err; - u64 phys_addr; - dma_addr_t rq_phys_addr, rsp_phys_addr; + u32 temp, intr_mask, temp_int_crb_mode; + dma_addr_t rq_phys_addr, rsp_phys_addr; + int temp_nsds_rings, index, err; + void *rq_addr, *rsp_addr; + size_t rq_size, rsp_size; + u64 phys_addr; + u16 msix_id; /* reset host resources */ tx_ring->producer = 0; @@ -447,18 +462,28 @@ int qlcnic_82xx_fw_cmd_create_tx_ctx(struct qlcnic_adapter *adapter, } prq = rq_addr; - prsp = rsp_addr; prq->host_rsp_dma_addr = cpu_to_le64(rsp_phys_addr); temp = (QLCNIC_CAP0_LEGACY_CONTEXT | 
QLCNIC_CAP0_LEGACY_MN | - QLCNIC_CAP0_LSO); + QLCNIC_CAP0_LSO); + if (qlcnic_check_multi_tx(adapter) && !adapter->ahw->diag_test) + temp |= QLCNIC_CAP0_TX_MULTI; + prq->capabilities[0] = cpu_to_le32(temp); - prq->host_int_crb_mode = - cpu_to_le32(QLCNIC_HOST_INT_CRB_MODE_SHARED); - prq->msi_index = 0; + if (qlcnic_check_multi_tx(adapter) && + !adapter->ahw->diag_test) { + temp_nsds_rings = adapter->max_sds_rings; + index = temp_nsds_rings + ring; + msix_id = ahw->intr_tbl[index].id; + prq->msi_index = cpu_to_le16(msix_id); + } else { + temp_int_crb_mode = QLCNIC_HOST_INT_CRB_MODE_SHARED; + prq->host_int_crb_mode = cpu_to_le32(temp_int_crb_mode); + prq->msi_index = 0; + } prq->interrupt_ctl = 0; prq->cmd_cons_dma_addr = cpu_to_le64(tx_ring->hw_cons_phys_addr); @@ -480,15 +505,25 @@ int qlcnic_82xx_fw_cmd_create_tx_ctx(struct qlcnic_adapter *adapter, err = qlcnic_issue_cmd(adapter, &cmd); if (err == QLCNIC_RCODE_SUCCESS) { + tx_ring->state = le32_to_cpu(prsp->host_ctx_state); temp = le32_to_cpu(prsp->cds_ring.host_producer_crb); tx_ring->crb_cmd_producer = adapter->ahw->pci_base0 + temp; tx_ring->ctx_id = le16_to_cpu(prsp->context_id); + if (qlcnic_check_multi_tx(adapter) && + !adapter->ahw->diag_test && + (adapter->flags & QLCNIC_MSIX_ENABLED)) { + index = adapter->max_sds_rings + ring; + intr_mask = ahw->intr_tbl[index].src; + tx_ring->crb_intr_mask = ahw->pci_base0 + intr_mask; + } + + netdev_info(netdev, "Tx Context[0x%x] Created, state 0x%x\n", + tx_ring->ctx_id, tx_ring->state); } else { - dev_err(&adapter->pdev->dev, - "Failed to create tx ctx in firmware%d\n", err); + netdev_err(netdev, "Failed to create tx ctx in firmware%d\n", + err); err = -EIO; } - qlcnic_free_mbx_args(&cmd); out_free_rsp: @@ -618,6 +653,13 @@ int qlcnic_fw_create_ctx(struct qlcnic_adapter *dev) } } + if (qlcnic_82xx_check(dev) && (dev->flags & QLCNIC_MSIX_ENABLED) && + qlcnic_check_multi_tx(dev) && !dev->ahw->diag_test) { + err = qlcnic_82xx_mq_intrpt(dev, 1); + if (err) + return err; + } + err = qlcnic_fw_cmd_create_rx_ctx(dev); if (err) goto err_out; @@ -639,13 +681,19 @@ int qlcnic_fw_create_ctx(struct qlcnic_adapter *dev) } set_bit(__QLCNIC_FW_ATTACHED, &dev->state); + return 0; err_out: + if (qlcnic_82xx_check(dev) && (dev->flags & QLCNIC_MSIX_ENABLED) && + qlcnic_check_multi_tx(dev) && !dev->ahw->diag_test) + qlcnic_82xx_config_intrpt(dev, 0); + if (qlcnic_83xx_check(dev) && (dev->flags & QLCNIC_MSIX_ENABLED)) { if (dev->ahw->diag_test != QLCNIC_LOOPBACK_TEST) qlcnic_83xx_config_intrpt(dev, 0); } + return err; } @@ -659,6 +707,12 @@ void qlcnic_fw_destroy_ctx(struct qlcnic_adapter *adapter) qlcnic_fw_cmd_del_tx_ctx(adapter, &adapter->tx_ring[ring]); + if (qlcnic_82xx_check(adapter) && + (adapter->flags & QLCNIC_MSIX_ENABLED) && + qlcnic_check_multi_tx(adapter) && + !adapter->ahw->diag_test) + qlcnic_82xx_config_intrpt(adapter, 0); + if (qlcnic_83xx_check(adapter) && (adapter->flags & QLCNIC_MSIX_ENABLED)) { if (adapter->ahw->diag_test != QLCNIC_LOOPBACK_TEST) @@ -723,8 +777,54 @@ void qlcnic_free_hw_resources(struct qlcnic_adapter *adapter) } } +int qlcnic_82xx_config_intrpt(struct qlcnic_adapter *adapter, u8 op_type) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + struct net_device *netdev = adapter->netdev; + struct qlcnic_cmd_args cmd; + u32 type, val; + int i, err = 0; + + for (i = 0; i < ahw->num_msix; i++) { + qlcnic_alloc_mbx_args(&cmd, adapter, + QLCNIC_CMD_MQ_TX_CONFIG_INTR); + type = op_type ? 
QLCNIC_INTRPT_ADD : QLCNIC_INTRPT_DEL; + val = type | (ahw->intr_tbl[i].type << 4); + if (ahw->intr_tbl[i].type == QLCNIC_INTRPT_MSIX) + val |= (ahw->intr_tbl[i].id << 16); + cmd.req.arg[1] = val; + err = qlcnic_issue_cmd(adapter, &cmd); + if (err) { + netdev_err(netdev, "Failed to %s interrupts %d\n", + op_type == QLCNIC_INTRPT_ADD ? "Add" : + "Delete", err); + qlcnic_free_mbx_args(&cmd); + return err; + } + val = cmd.rsp.arg[1]; + if (LSB(val)) { + netdev_info(netdev, + "failed to configure interrupt for %d\n", + ahw->intr_tbl[i].id); + continue; + } + if (op_type) { + ahw->intr_tbl[i].id = MSW(val); + ahw->intr_tbl[i].enabled = 1; + ahw->intr_tbl[i].src = cmd.rsp.arg[2]; + } else { + ahw->intr_tbl[i].id = i; + ahw->intr_tbl[i].enabled = 0; + ahw->intr_tbl[i].src = 0; + } + qlcnic_free_mbx_args(&cmd); + } + + return err; +} -int qlcnic_82xx_get_mac_address(struct qlcnic_adapter *adapter, u8 *mac) +int qlcnic_82xx_get_mac_address(struct qlcnic_adapter *adapter, u8 *mac, + u8 function) { int err, i; struct qlcnic_cmd_args cmd; @@ -734,7 +834,7 @@ int qlcnic_82xx_get_mac_address(struct qlcnic_adapter *adapter, u8 *mac) if (err) return err; - cmd.req.arg[1] = adapter->ahw->pci_func | BIT_8; + cmd.req.arg[1] = function | BIT_8; err = qlcnic_issue_cmd(adapter, &cmd); if (err == QLCNIC_RCODE_SUCCESS) { diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_dcb.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_dcb.c new file mode 100644 index 0000000..2e10e79 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_dcb.c @@ -0,0 +1,1179 @@ +/* + * QLogic qlcnic NIC Driver + * Copyright (c) 2009-2013 QLogic Corporation + * + * See LICENSE.qlcnic for copyright and licensing details. + */ + +#include <linux/types.h> +#include "qlcnic.h" + +#define QLC_DCB_NUM_PARAM 3 +#define QLC_DCB_LOCAL_IDX 0 +#define QLC_DCB_OPER_IDX 1 +#define QLC_DCB_PEER_IDX 2 + +#define QLC_DCB_GET_MAP(V) (1 << V) + +#define QLC_DCB_AEN_BIT 0x2 +#define QLC_DCB_FW_VER 0x2 +#define QLC_DCB_MAX_TC 0x8 +#define QLC_DCB_MAX_APP 0x8 +#define QLC_DCB_MAX_PRIO QLC_DCB_MAX_TC +#define QLC_DCB_MAX_PG QLC_DCB_MAX_TC + +#define QLC_DCB_TSA_SUPPORT(V) (V & 0x1) +#define QLC_DCB_ETS_SUPPORT(V) ((V >> 1) & 0x1) +#define QLC_DCB_VERSION_SUPPORT(V) ((V >> 2) & 0xf) +#define QLC_DCB_MAX_NUM_TC(V) ((V >> 20) & 0xf) +#define QLC_DCB_MAX_NUM_ETS_TC(V) ((V >> 24) & 0xf) +#define QLC_DCB_MAX_NUM_PFC_TC(V) ((V >> 28) & 0xf) +#define QLC_DCB_GET_TC_PRIO(X, P) ((X >> (P * 3)) & 0x7) +#define QLC_DCB_GET_PGID_PRIO(X, P) ((X >> (P * 8)) & 0xff) +#define QLC_DCB_GET_BWPER_PG(X, P) ((X >> (P * 8)) & 0xff) +#define QLC_DCB_GET_TSA_PG(X, P) ((X >> (P * 8)) & 0xff) +#define QLC_DCB_GET_PFC_PRIO(X, P) (((X >> 24) >> P) & 0x1) +#define QLC_DCB_GET_PROTO_ID_APP(X) ((X >> 8) & 0xffff) +#define QLC_DCB_GET_SELECTOR_APP(X) (X & 0xff) + +#define QLC_DCB_LOCAL_PARAM_FWID 0x3 +#define QLC_DCB_OPER_PARAM_FWID 0x1 +#define QLC_DCB_PEER_PARAM_FWID 0x2 + +#define QLC_83XX_DCB_GET_NUMAPP(X) ((X >> 2) & 0xf) +#define QLC_83XX_DCB_TSA_VALID(X) (X & 0x1) +#define QLC_83XX_DCB_PFC_VALID(X) ((X >> 1) & 0x1) +#define QLC_83XX_DCB_GET_PRIOMAP_APP(X) (X >> 24) + +#define QLC_82XX_DCB_GET_NUMAPP(X) ((X >> 12) & 0xf) +#define QLC_82XX_DCB_TSA_VALID(X) ((X >> 4) & 0x1) +#define QLC_82XX_DCB_PFC_VALID(X) ((X >> 5) & 0x1) +#define QLC_82XX_DCB_GET_PRIOVAL_APP(X) ((X >> 24) & 0x7) +#define QLC_82XX_DCB_GET_PRIOMAP_APP(X) (1 << X) +#define QLC_82XX_DCB_PRIO_TC_MAP (0x76543210) + +static const struct dcbnl_rtnl_ops qlcnic_dcbnl_ops; + +static void qlcnic_dcb_aen_work(struct 
work_struct *); +static void qlcnic_dcb_data_cee_param_map(struct qlcnic_adapter *); + +static inline void __qlcnic_init_dcbnl_ops(struct qlcnic_adapter *); +static void __qlcnic_dcb_free(struct qlcnic_adapter *); +static int __qlcnic_dcb_attach(struct qlcnic_adapter *); +static int __qlcnic_dcb_query_hw_capability(struct qlcnic_adapter *, char *); +static void __qlcnic_dcb_get_info(struct qlcnic_adapter *); + +static int qlcnic_82xx_dcb_get_hw_capability(struct qlcnic_adapter *); +static int qlcnic_82xx_dcb_query_cee_param(struct qlcnic_adapter *, char *, u8); +static int qlcnic_82xx_dcb_get_cee_cfg(struct qlcnic_adapter *); +static void qlcnic_82xx_dcb_handle_aen(struct qlcnic_adapter *, void *); + +static int qlcnic_83xx_dcb_get_hw_capability(struct qlcnic_adapter *); +static int qlcnic_83xx_dcb_query_cee_param(struct qlcnic_adapter *, char *, u8); +static int qlcnic_83xx_dcb_get_cee_cfg(struct qlcnic_adapter *); +static int qlcnic_83xx_dcb_register_aen(struct qlcnic_adapter *, bool); +static void qlcnic_83xx_dcb_handle_aen(struct qlcnic_adapter *, void *); + +struct qlcnic_dcb_capability { + bool tsa_capability; + bool ets_capability; + u8 max_num_tc; + u8 max_ets_tc; + u8 max_pfc_tc; + u8 dcb_capability; +}; + +struct qlcnic_dcb_param { + u32 hdr_prio_pfc_map[2]; + u32 prio_pg_map[2]; + u32 pg_bw_map[2]; + u32 pg_tsa_map[2]; + u32 app[QLC_DCB_MAX_APP]; +}; + +struct qlcnic_dcb_mbx_params { + /* 1st local, 2nd operational 3rd remote */ + struct qlcnic_dcb_param type[3]; + u32 prio_tc_map; +}; + +struct qlcnic_82xx_dcb_param_mbx_le { + __le32 hdr_prio_pfc_map[2]; + __le32 prio_pg_map[2]; + __le32 pg_bw_map[2]; + __le32 pg_tsa_map[2]; + __le32 app[QLC_DCB_MAX_APP]; +}; + +enum qlcnic_dcb_selector { + QLC_SELECTOR_DEF = 0x0, + QLC_SELECTOR_ETHER, + QLC_SELECTOR_TCP, + QLC_SELECTOR_UDP, +}; + +enum qlcnic_dcb_prio_type { + QLC_PRIO_NONE = 0, + QLC_PRIO_GROUP, + QLC_PRIO_LINK, +}; + +enum qlcnic_dcb_pfc_type { + QLC_PFC_DISABLED = 0, + QLC_PFC_FULL, + QLC_PFC_TX, + QLC_PFC_RX +}; + +struct qlcnic_dcb_prio_cfg { + bool valid; + enum qlcnic_dcb_pfc_type pfc_type; +}; + +struct qlcnic_dcb_pg_cfg { + bool valid; + u8 total_bw_percent; /* of Link/ port BW */ + u8 prio_count; + u8 tsa_type; +}; + +struct qlcnic_dcb_tc_cfg { + bool valid; + struct qlcnic_dcb_prio_cfg prio_cfg[QLC_DCB_MAX_PRIO]; + enum qlcnic_dcb_prio_type prio_type; /* always prio_link */ + u8 link_percent; /* % of link bandwidth */ + u8 bwg_percent; /* % of BWG's bandwidth */ + u8 up_tc_map; + u8 pgid; +}; + +struct qlcnic_dcb_app { + bool valid; + enum qlcnic_dcb_selector selector; + u16 protocol; + u8 priority; +}; + +struct qlcnic_dcb_cee { + struct qlcnic_dcb_tc_cfg tc_cfg[QLC_DCB_MAX_TC]; + struct qlcnic_dcb_pg_cfg pg_cfg[QLC_DCB_MAX_PG]; + struct qlcnic_dcb_app app[QLC_DCB_MAX_APP]; + bool tc_param_valid; + bool pfc_mode_enable; +}; + +struct qlcnic_dcb_cfg { + /* 0 - local, 1 - operational, 2 - remote */ + struct qlcnic_dcb_cee type[QLC_DCB_NUM_PARAM]; + struct qlcnic_dcb_capability capability; + u32 version; +}; + +static struct qlcnic_dcb_ops qlcnic_83xx_dcb_ops = { + .init_dcbnl_ops = __qlcnic_init_dcbnl_ops, + .free = __qlcnic_dcb_free, + .attach = __qlcnic_dcb_attach, + .query_hw_capability = __qlcnic_dcb_query_hw_capability, + .get_info = __qlcnic_dcb_get_info, + + .get_hw_capability = qlcnic_83xx_dcb_get_hw_capability, + .query_cee_param = qlcnic_83xx_dcb_query_cee_param, + .get_cee_cfg = qlcnic_83xx_dcb_get_cee_cfg, + .register_aen = qlcnic_83xx_dcb_register_aen, + .handle_aen = qlcnic_83xx_dcb_handle_aen, +}; + 
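+/* Note: the 82xx ops table below provides no .register_aen handler; the
+ * qlcnic_dcb_register_aen() wrapper in qlcnic.h tests the pointer before
+ * calling, so DCB AEN registration is simply skipped on 82xx parts.
+ */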
+static struct qlcnic_dcb_ops qlcnic_82xx_dcb_ops = { + .init_dcbnl_ops = __qlcnic_init_dcbnl_ops, + .free = __qlcnic_dcb_free, + .attach = __qlcnic_dcb_attach, + .query_hw_capability = __qlcnic_dcb_query_hw_capability, + .get_info = __qlcnic_dcb_get_info, + + .get_hw_capability = qlcnic_82xx_dcb_get_hw_capability, + .query_cee_param = qlcnic_82xx_dcb_query_cee_param, + .get_cee_cfg = qlcnic_82xx_dcb_get_cee_cfg, + .handle_aen = qlcnic_82xx_dcb_handle_aen, +}; + +static u8 qlcnic_dcb_get_num_app(struct qlcnic_adapter *adapter, u32 val) +{ + if (qlcnic_82xx_check(adapter)) + return QLC_82XX_DCB_GET_NUMAPP(val); + else + return QLC_83XX_DCB_GET_NUMAPP(val); +} + +static inline u8 qlcnic_dcb_pfc_hdr_valid(struct qlcnic_adapter *adapter, + u32 val) +{ + if (qlcnic_82xx_check(adapter)) + return QLC_82XX_DCB_PFC_VALID(val); + else + return QLC_83XX_DCB_PFC_VALID(val); +} + +static inline u8 qlcnic_dcb_tsa_hdr_valid(struct qlcnic_adapter *adapter, + u32 val) +{ + if (qlcnic_82xx_check(adapter)) + return QLC_82XX_DCB_TSA_VALID(val); + else + return QLC_83XX_DCB_TSA_VALID(val); +} + +static inline u8 qlcnic_dcb_get_prio_map_app(struct qlcnic_adapter *adapter, + u32 val) +{ + if (qlcnic_82xx_check(adapter)) + return QLC_82XX_DCB_GET_PRIOMAP_APP(val); + else + return QLC_83XX_DCB_GET_PRIOMAP_APP(val); +} + +static int qlcnic_dcb_prio_count(u8 up_tc_map) +{ + int j; + + for (j = 0; j < QLC_DCB_MAX_TC; j++) + if (up_tc_map & QLC_DCB_GET_MAP(j)) + break; + + return j; +} + +static inline void __qlcnic_init_dcbnl_ops(struct qlcnic_adapter *adapter) +{ + if (test_bit(__QLCNIC_DCB_STATE, &adapter->state)) + adapter->netdev->dcbnl_ops = &qlcnic_dcbnl_ops; +} + +void qlcnic_set_dcb_ops(struct qlcnic_adapter *adapter) +{ + if (qlcnic_82xx_check(adapter)) + adapter->dcb->ops = &qlcnic_82xx_dcb_ops; + else if (qlcnic_83xx_check(adapter)) + adapter->dcb->ops = &qlcnic_83xx_dcb_ops; +} + +int __qlcnic_register_dcb(struct qlcnic_adapter *adapter) +{ + struct qlcnic_dcb *dcb; + + dcb = kzalloc(sizeof(struct qlcnic_dcb), GFP_ATOMIC); + if (!dcb) + return -ENOMEM; + + adapter->dcb = dcb; + dcb->adapter = adapter; + qlcnic_set_dcb_ops(adapter); + + return 0; +} + +static void __qlcnic_dcb_free(struct qlcnic_adapter *adapter) +{ + struct qlcnic_dcb *dcb = adapter->dcb; + + if (!dcb) + return; + + qlcnic_dcb_register_aen(adapter, 0); + + while (test_bit(__QLCNIC_DCB_IN_AEN, &adapter->state)) + usleep_range(10000, 11000); + + cancel_delayed_work_sync(&dcb->aen_work); + + if (dcb->wq) { + destroy_workqueue(dcb->wq); + dcb->wq = NULL; + } + + kfree(dcb->cfg); + dcb->cfg = NULL; + kfree(dcb->param); + dcb->param = NULL; + kfree(dcb); + adapter->dcb = NULL; +} + +static void __qlcnic_dcb_get_info(struct qlcnic_adapter *adapter) +{ + qlcnic_dcb_get_hw_capability(adapter); + qlcnic_dcb_get_cee_cfg(adapter); + qlcnic_dcb_register_aen(adapter, 1); +} + +static int __qlcnic_dcb_attach(struct qlcnic_adapter *adapter) +{ + struct qlcnic_dcb *dcb = adapter->dcb; + int err = 0; + + INIT_DELAYED_WORK(&dcb->aen_work, qlcnic_dcb_aen_work); + + dcb->wq = create_singlethread_workqueue("qlcnic-dcb"); + if (!dcb->wq) { + dev_err(&adapter->pdev->dev, + "DCB workqueue allocation failed. 
DCB will be disabled\n"); + return -1; + } + + dcb->cfg = kzalloc(sizeof(struct qlcnic_dcb_cfg), GFP_ATOMIC); + if (!dcb->cfg) { + err = -ENOMEM; + goto out_free_wq; + } + + dcb->param = kzalloc(sizeof(struct qlcnic_dcb_mbx_params), GFP_ATOMIC); + if (!dcb->param) { + err = -ENOMEM; + goto out_free_cfg; + } + + qlcnic_dcb_get_info(adapter); + + return 0; +out_free_cfg: + kfree(dcb->cfg); + dcb->cfg = NULL; + +out_free_wq: + destroy_workqueue(dcb->wq); + dcb->wq = NULL; + + return err; +} + +static int __qlcnic_dcb_query_hw_capability(struct qlcnic_adapter *adapter, + char *buf) +{ + struct qlcnic_cmd_args cmd; + u32 mbx_out; + int err; + + err = qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_DCB_QUERY_CAP); + if (err) + return err; + + err = qlcnic_issue_cmd(adapter, &cmd); + if (err) { + dev_err(&adapter->pdev->dev, + "Failed to query DCBX capability, err %d\n", err); + } else { + mbx_out = cmd.rsp.arg[1]; + if (buf) + memcpy(buf, &mbx_out, sizeof(u32)); + } + + qlcnic_free_mbx_args(&cmd); + + return err; +} + +static int __qlcnic_dcb_get_capability(struct qlcnic_adapter *adapter, u32 *val) +{ + struct qlcnic_dcb_capability *cap = &adapter->dcb->cfg->capability; + u32 mbx_out; + int err; + + memset(cap, 0, sizeof(struct qlcnic_dcb_capability)); + + err = qlcnic_dcb_query_hw_capability(adapter, (char *)val); + if (err) + return err; + + mbx_out = *val; + if (QLC_DCB_TSA_SUPPORT(mbx_out)) + cap->tsa_capability = true; + + if (QLC_DCB_ETS_SUPPORT(mbx_out)) + cap->ets_capability = true; + + cap->max_num_tc = QLC_DCB_MAX_NUM_TC(mbx_out); + cap->max_ets_tc = QLC_DCB_MAX_NUM_ETS_TC(mbx_out); + cap->max_pfc_tc = QLC_DCB_MAX_NUM_PFC_TC(mbx_out); + + if (cap->max_num_tc > QLC_DCB_MAX_TC || + cap->max_ets_tc > cap->max_num_tc || + cap->max_pfc_tc > cap->max_num_tc) { + dev_err(&adapter->pdev->dev, "Invalid DCB configuration\n"); + return -EINVAL; + } + + return err; +} + +static int qlcnic_82xx_dcb_get_hw_capability(struct qlcnic_adapter *adapter) +{ + struct qlcnic_dcb_cfg *cfg = adapter->dcb->cfg; + struct qlcnic_dcb_capability *cap; + u32 mbx_out; + int err; + + err = __qlcnic_dcb_get_capability(adapter, &mbx_out); + if (err) + return err; + + cap = &cfg->capability; + cap->dcb_capability = DCB_CAP_DCBX_VER_CEE | DCB_CAP_DCBX_LLD_MANAGED; + + if (cap->dcb_capability && cap->tsa_capability && cap->ets_capability) + set_bit(__QLCNIC_DCB_STATE, &adapter->state); + + return err; +} + +static int qlcnic_82xx_dcb_query_cee_param(struct qlcnic_adapter *adapter, + char *buf, u8 type) +{ + u16 size = sizeof(struct qlcnic_82xx_dcb_param_mbx_le); + struct qlcnic_82xx_dcb_param_mbx_le *prsp_le; + struct device *dev = &adapter->pdev->dev; + dma_addr_t cardrsp_phys_addr; + struct qlcnic_dcb_param rsp; + struct qlcnic_cmd_args cmd; + u64 phys_addr; + void *addr; + int err, i; + + switch (type) { + case QLC_DCB_LOCAL_PARAM_FWID: + case QLC_DCB_OPER_PARAM_FWID: + case QLC_DCB_PEER_PARAM_FWID: + break; + default: + dev_err(dev, "Invalid parameter type %d\n", type); + return -EINVAL; + } + + addr = dma_alloc_coherent(&adapter->pdev->dev, size, &cardrsp_phys_addr, + GFP_KERNEL); + if (addr == NULL) + return -ENOMEM; + + prsp_le = addr; + + err = qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_DCB_QUERY_PARAM); + if (err) + goto out_free_rsp; + + phys_addr = cardrsp_phys_addr; + cmd.req.arg[1] = size | (type << 16); + cmd.req.arg[2] = MSD(phys_addr); + cmd.req.arg[3] = LSD(phys_addr); + + err = qlcnic_issue_cmd(adapter, &cmd); + if (err) { + dev_err(dev, "Failed to query DCBX parameter, err %d\n", err); + goto out; 
+ } + + memset(&rsp, 0, sizeof(struct qlcnic_dcb_param)); + rsp.hdr_prio_pfc_map[0] = le32_to_cpu(prsp_le->hdr_prio_pfc_map[0]); + rsp.hdr_prio_pfc_map[1] = le32_to_cpu(prsp_le->hdr_prio_pfc_map[1]); + rsp.prio_pg_map[0] = le32_to_cpu(prsp_le->prio_pg_map[0]); + rsp.prio_pg_map[1] = le32_to_cpu(prsp_le->prio_pg_map[1]); + rsp.pg_bw_map[0] = le32_to_cpu(prsp_le->pg_bw_map[0]); + rsp.pg_bw_map[1] = le32_to_cpu(prsp_le->pg_bw_map[1]); + rsp.pg_tsa_map[0] = le32_to_cpu(prsp_le->pg_tsa_map[0]); + rsp.pg_tsa_map[1] = le32_to_cpu(prsp_le->pg_tsa_map[1]); + + for (i = 0; i < QLC_DCB_MAX_APP; i++) + rsp.app[i] = le32_to_cpu(prsp_le->app[i]); + + if (buf) + memcpy(buf, &rsp, size); +out: + qlcnic_free_mbx_args(&cmd); + +out_free_rsp: + dma_free_coherent(&adapter->pdev->dev, size, addr, cardrsp_phys_addr); + + return err; +} + +static int qlcnic_82xx_dcb_get_cee_cfg(struct qlcnic_adapter *adapter) +{ + struct qlcnic_dcb_mbx_params *mbx; + int err; + + mbx = adapter->dcb->param; + if (!mbx) + return 0; + + err = qlcnic_dcb_query_cee_param(adapter, (char *)&mbx->type[0], + QLC_DCB_LOCAL_PARAM_FWID); + if (err) + return err; + + err = qlcnic_dcb_query_cee_param(adapter, (char *)&mbx->type[1], + QLC_DCB_OPER_PARAM_FWID); + if (err) + return err; + + err = qlcnic_dcb_query_cee_param(adapter, (char *)&mbx->type[2], + QLC_DCB_PEER_PARAM_FWID); + if (err) + return err; + + mbx->prio_tc_map = QLC_82XX_DCB_PRIO_TC_MAP; + + qlcnic_dcb_data_cee_param_map(adapter); + + return err; +} + +static void qlcnic_dcb_aen_work(struct work_struct *work) +{ + struct qlcnic_adapter *adapter; + struct qlcnic_dcb *dcb; + + dcb = container_of(work, struct qlcnic_dcb, aen_work.work); + adapter = dcb->adapter; + + qlcnic_dcb_get_cee_cfg(adapter); + clear_bit(__QLCNIC_DCB_IN_AEN, &adapter->state); +} + +static void qlcnic_82xx_dcb_handle_aen(struct qlcnic_adapter *adapter, + void *data) +{ + struct qlcnic_dcb *dcb = adapter->dcb; + + if (test_and_set_bit(__QLCNIC_DCB_IN_AEN, &adapter->state)) + return; + + queue_delayed_work(dcb->wq, &dcb->aen_work, 0); +} + +static int qlcnic_83xx_dcb_get_hw_capability(struct qlcnic_adapter *adapter) +{ + struct qlcnic_dcb_capability *cap = &adapter->dcb->cfg->capability; + u32 mbx_out; + int err; + + err = __qlcnic_dcb_get_capability(adapter, &mbx_out); + if (err) + return err; + + if (mbx_out & BIT_2) + cap->dcb_capability = DCB_CAP_DCBX_VER_CEE; + if (mbx_out & BIT_3) + cap->dcb_capability |= DCB_CAP_DCBX_VER_IEEE; + if (cap->dcb_capability) + cap->dcb_capability |= DCB_CAP_DCBX_LLD_MANAGED; + + if (cap->dcb_capability && cap->tsa_capability && cap->ets_capability) + set_bit(__QLCNIC_DCB_STATE, &adapter->state); + + return err; +} + +static int qlcnic_83xx_dcb_query_cee_param(struct qlcnic_adapter *adapter, + char *buf, u8 idx) +{ + struct qlcnic_dcb_mbx_params mbx_out; + int err, i, j, k, max_app, size; + struct qlcnic_dcb_param *each; + struct qlcnic_cmd_args cmd; + u32 val; + char *p; + + size = 0; + memset(&mbx_out, 0, sizeof(struct qlcnic_dcb_mbx_params)); + memset(buf, 0, sizeof(struct qlcnic_dcb_mbx_params)); + + err = qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_DCB_QUERY_PARAM); + if (err) + return err; + + cmd.req.arg[0] |= QLC_DCB_FW_VER << 29; + err = qlcnic_issue_cmd(adapter, &cmd); + if (err) { + dev_err(&adapter->pdev->dev, + "Failed to query DCBX param, err %d\n", err); + goto out; + } + + mbx_out.prio_tc_map = cmd.rsp.arg[1]; + p = memcpy(buf, &mbx_out, sizeof(u32)); + k = 2; + p += sizeof(u32); + + for (j = 0; j < QLC_DCB_NUM_PARAM; j++) { + each = &mbx_out.type[j]; + + 
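+ /* Each CEE parameter block spans 16 mailbox words: the eight
+ * map words read below plus up to QLC_DCB_MAX_APP app words,
+ * so k jumps to word 18 and then 34 for the next block.
+ */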
each->hdr_prio_pfc_map[0] = cmd.rsp.arg[k++]; + each->hdr_prio_pfc_map[1] = cmd.rsp.arg[k++]; + each->prio_pg_map[0] = cmd.rsp.arg[k++]; + each->prio_pg_map[1] = cmd.rsp.arg[k++]; + each->pg_bw_map[0] = cmd.rsp.arg[k++]; + each->pg_bw_map[1] = cmd.rsp.arg[k++]; + each->pg_tsa_map[0] = cmd.rsp.arg[k++]; + each->pg_tsa_map[1] = cmd.rsp.arg[k++]; + val = each->hdr_prio_pfc_map[0]; + + max_app = qlcnic_dcb_get_num_app(adapter, val); + for (i = 0; i < max_app; i++) + each->app[i] = cmd.rsp.arg[i + k]; + + size = 16 * sizeof(u32); + memcpy(p, &each->hdr_prio_pfc_map[0], size); + p += size; + if (j == 0) + k = 18; + else + k = 34; + } +out: + qlcnic_free_mbx_args(&cmd); + + return err; +} + +static int qlcnic_83xx_dcb_get_cee_cfg(struct qlcnic_adapter *adapter) +{ + struct qlcnic_dcb *dcb = adapter->dcb; + int err; + + err = qlcnic_dcb_query_cee_param(adapter, (char *)dcb->param, 0); + if (err) + return err; + + qlcnic_dcb_data_cee_param_map(adapter); + + return err; +} + +static int qlcnic_83xx_dcb_register_aen(struct qlcnic_adapter *adapter, + bool flag) +{ + u8 val = (flag ? QLCNIC_CMD_INIT_NIC_FUNC : QLCNIC_CMD_STOP_NIC_FUNC); + struct qlcnic_cmd_args cmd; + int err; + + err = qlcnic_alloc_mbx_args(&cmd, adapter, val); + if (err) + return err; + + cmd.req.arg[1] = QLC_DCB_AEN_BIT; + + err = qlcnic_issue_cmd(adapter, &cmd); + if (err) + dev_err(&adapter->pdev->dev, "Failed to %s DCBX AEN, err %d\n", + (flag ? "register" : "unregister"), err); + + qlcnic_free_mbx_args(&cmd); + + return err; +} + +static void qlcnic_83xx_dcb_handle_aen(struct qlcnic_adapter *adapter, + void *data) +{ + struct qlcnic_dcb *dcb = adapter->dcb; + u32 *val = data; + + if (test_and_set_bit(__QLCNIC_DCB_IN_AEN, &adapter->state)) + return; + + if (*val & BIT_8) + set_bit(__QLCNIC_DCB_STATE, &adapter->state); + else + clear_bit(__QLCNIC_DCB_STATE, &adapter->state); + + queue_delayed_work(dcb->wq, &dcb->aen_work, 0); +} + +static void qlcnic_dcb_fill_cee_tc_params(struct qlcnic_dcb_mbx_params *mbx, + struct qlcnic_dcb_param *each, + struct qlcnic_dcb_cee *type) +{ + struct qlcnic_dcb_tc_cfg *tc_cfg; + u8 i, tc, pgid; + + for (i = 0; i < QLC_DCB_MAX_PRIO; i++) { + tc = QLC_DCB_GET_TC_PRIO(mbx->prio_tc_map, i); + tc_cfg = &type->tc_cfg[tc]; + tc_cfg->valid = true; + tc_cfg->up_tc_map |= QLC_DCB_GET_MAP(i); + + if (QLC_DCB_GET_PFC_PRIO(each->hdr_prio_pfc_map[1], i) && + type->pfc_mode_enable) { + tc_cfg->prio_cfg[i].valid = true; + tc_cfg->prio_cfg[i].pfc_type = QLC_PFC_FULL; + } + + if (i < 4) + pgid = QLC_DCB_GET_PGID_PRIO(each->prio_pg_map[0], i); + else + pgid = QLC_DCB_GET_PGID_PRIO(each->prio_pg_map[1], i); + + tc_cfg->pgid = pgid; + + tc_cfg->prio_type = QLC_PRIO_LINK; + type->pg_cfg[tc_cfg->pgid].prio_count++; + } +} + +static void qlcnic_dcb_fill_cee_pg_params(struct qlcnic_dcb_param *each, + struct qlcnic_dcb_cee *type) +{ + struct qlcnic_dcb_pg_cfg *pg_cfg; + u8 i, tsa, bw_per; + + for (i = 0; i < QLC_DCB_MAX_PG; i++) { + pg_cfg = &type->pg_cfg[i]; + pg_cfg->valid = true; + + if (i < 4) { + bw_per = QLC_DCB_GET_BWPER_PG(each->pg_bw_map[0], i); + tsa = QLC_DCB_GET_TSA_PG(each->pg_tsa_map[0], i); + } else { + bw_per = QLC_DCB_GET_BWPER_PG(each->pg_bw_map[1], i); + tsa = QLC_DCB_GET_TSA_PG(each->pg_tsa_map[1], i); + } + + pg_cfg->total_bw_percent = bw_per; + pg_cfg->tsa_type = tsa; + } +} + +static void +qlcnic_dcb_fill_cee_app_params(struct qlcnic_adapter *adapter, u8 idx, + struct qlcnic_dcb_param *each, + struct qlcnic_dcb_cee *type) +{ + struct qlcnic_dcb_app *app; + u8 i, num_app, map, cnt; + struct dcb_app 
new_app; + + num_app = qlcnic_dcb_get_num_app(adapter, each->hdr_prio_pfc_map[0]); + for (i = 0; i < num_app; i++) { + app = &type->app[i]; + app->valid = true; + + /* Only for CEE (-1) */ + app->selector = QLC_DCB_GET_SELECTOR_APP(each->app[i]) - 1; + new_app.selector = app->selector; + app->protocol = QLC_DCB_GET_PROTO_ID_APP(each->app[i]); + new_app.protocol = app->protocol; + map = qlcnic_dcb_get_prio_map_app(adapter, each->app[i]); + cnt = qlcnic_dcb_prio_count(map); + + if (cnt >= QLC_DCB_MAX_TC) + cnt = 0; + + app->priority = cnt; + new_app.priority = cnt; + + if (idx == QLC_DCB_OPER_IDX && adapter->netdev->dcbnl_ops) + dcb_setapp(adapter->netdev, &new_app); + } +} + +static void qlcnic_dcb_map_cee_params(struct qlcnic_adapter *adapter, u8 idx) +{ + struct qlcnic_dcb_mbx_params *mbx = adapter->dcb->param; + struct qlcnic_dcb_param *each = &mbx->type[idx]; + struct qlcnic_dcb_cfg *cfg = adapter->dcb->cfg; + struct qlcnic_dcb_cee *type = &cfg->type[idx]; + + type->tc_param_valid = false; + type->pfc_mode_enable = false; + memset(type->tc_cfg, 0, + sizeof(struct qlcnic_dcb_tc_cfg) * QLC_DCB_MAX_TC); + memset(type->pg_cfg, 0, + sizeof(struct qlcnic_dcb_pg_cfg) * QLC_DCB_MAX_TC); + + if (qlcnic_dcb_pfc_hdr_valid(adapter, each->hdr_prio_pfc_map[0]) && + cfg->capability.max_pfc_tc) + type->pfc_mode_enable = true; + + if (qlcnic_dcb_tsa_hdr_valid(adapter, each->hdr_prio_pfc_map[0]) && + cfg->capability.max_ets_tc) + type->tc_param_valid = true; + + qlcnic_dcb_fill_cee_tc_params(mbx, each, type); + qlcnic_dcb_fill_cee_pg_params(each, type); + qlcnic_dcb_fill_cee_app_params(adapter, idx, each, type); +} + +static void qlcnic_dcb_data_cee_param_map(struct qlcnic_adapter *adapter) +{ + int i; + + for (i = 0; i < QLC_DCB_NUM_PARAM; i++) + qlcnic_dcb_map_cee_params(adapter, i); + + dcbnl_cee_notify(adapter->netdev, RTM_GETDCB, DCB_CMD_CEE_GET, 0, 0); +} + +static u8 qlcnic_dcb_get_state(struct net_device *netdev) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + + return test_bit(__QLCNIC_DCB_STATE, &adapter->state); +} + +static void qlcnic_dcb_get_perm_hw_addr(struct net_device *netdev, u8 *addr) +{ + memcpy(addr, netdev->dev_addr, netdev->addr_len); +} + +static void +qlcnic_dcb_get_pg_tc_cfg_tx(struct net_device *netdev, int tc, u8 *prio, + u8 *pgid, u8 *bw_per, u8 *up_tc_map) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + struct qlcnic_dcb_tc_cfg *tc_cfg, *temp; + struct qlcnic_dcb_cee *type; + u8 i, cnt, pg; + + type = &adapter->dcb->cfg->type[QLC_DCB_OPER_IDX]; + *prio = *pgid = *bw_per = *up_tc_map = 0; + + if (!test_bit(__QLCNIC_DCB_STATE, &adapter->state) || + !type->tc_param_valid) + return; + + if (tc < 0 || (tc > QLC_DCB_MAX_TC)) + return; + + tc_cfg = &type->tc_cfg[tc]; + if (!tc_cfg->valid) + return; + + *pgid = tc_cfg->pgid; + *prio = tc_cfg->prio_type; + *up_tc_map = tc_cfg->up_tc_map; + pg = *pgid; + + for (i = 0, cnt = 0; i < QLC_DCB_MAX_TC; i++) { + temp = &type->tc_cfg[i]; + if (temp->valid && (pg == temp->pgid)) + cnt++; + } + + tc_cfg->bwg_percent = (100 / cnt); + *bw_per = tc_cfg->bwg_percent; +} + +static void qlcnic_dcb_get_pg_bwg_cfg_tx(struct net_device *netdev, int pgid, + u8 *bw_pct) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + struct qlcnic_dcb_pg_cfg *pgcfg; + struct qlcnic_dcb_cee *type; + + *bw_pct = 0; + type = &adapter->dcb->cfg->type[QLC_DCB_OPER_IDX]; + + if (!test_bit(__QLCNIC_DCB_STATE, &adapter->state) || + !type->tc_param_valid) + return; + + if (pgid < 0 || pgid > QLC_DCB_MAX_PG) + return; + + pgcfg = 
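+	/*
+	 * Note on qlcnic_dcb_get_pg_tc_cfg_tx() above: bandwidth is split
+	 * evenly across the TCs sharing a priority group using integer
+	 * division, so the per-TC percentages need not sum to 100 (e.g.
+	 * three TCs in one PG each report 33%, dropping the remainder).
+	 */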
&type->pg_cfg[pgid]; + if (!pgcfg->valid) + return; + + *bw_pct = pgcfg->total_bw_percent; +} + +static void qlcnic_dcb_get_pfc_cfg(struct net_device *netdev, int prio, + u8 *setting) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + struct qlcnic_dcb_tc_cfg *tc_cfg; + u8 val = QLC_DCB_GET_MAP(prio); + struct qlcnic_dcb_cee *type; + u8 i; + + *setting = 0; + type = &adapter->dcb->cfg->type[QLC_DCB_OPER_IDX]; + + if (!test_bit(__QLCNIC_DCB_STATE, &adapter->state) || + !type->pfc_mode_enable) + return; + + for (i = 0; i < QLC_DCB_MAX_TC; i++) { + tc_cfg = &type->tc_cfg[i]; + if (!tc_cfg->valid) + continue; + + if ((val & tc_cfg->up_tc_map) && (tc_cfg->prio_cfg[prio].valid)) + *setting = tc_cfg->prio_cfg[prio].pfc_type; + } +} + +static u8 qlcnic_dcb_get_capability(struct net_device *netdev, int capid, + u8 *cap) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + + if (!test_bit(__QLCNIC_DCB_STATE, &adapter->state)) + return 0; + + switch (capid) { + case DCB_CAP_ATTR_PG: + case DCB_CAP_ATTR_UP2TC: + case DCB_CAP_ATTR_PFC: + case DCB_CAP_ATTR_GSP: + *cap = true; + break; + case DCB_CAP_ATTR_PG_TCS: + case DCB_CAP_ATTR_PFC_TCS: + *cap = 0x80; /* 8 priorities for PGs */ + break; + case DCB_CAP_ATTR_DCBX: + *cap = adapter->dcb->cfg->capability.dcb_capability; + break; + default: + *cap = false; + } + + return 0; +} + +static int qlcnic_dcb_get_num_tcs(struct net_device *netdev, int attr, u8 *num) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + struct qlcnic_dcb_cfg *cfg = adapter->dcb->cfg; + + if (!test_bit(__QLCNIC_DCB_STATE, &adapter->state)) + return -EINVAL; + + switch (attr) { + case DCB_NUMTCS_ATTR_PG: + *num = cfg->capability.max_ets_tc; + return 0; + case DCB_NUMTCS_ATTR_PFC: + *num = cfg->capability.max_pfc_tc; + return 0; + default: + return -EINVAL; + } +} + +static u8 qlcnic_dcb_get_app(struct net_device *netdev, u8 idtype, u16 id) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + struct dcb_app app = { + .selector = idtype, + .protocol = id, + }; + + if (!test_bit(__QLCNIC_DCB_STATE, &adapter->state)) + return 0; + + return dcb_getapp(netdev, &app); +} + +static u8 qlcnic_dcb_get_pfc_state(struct net_device *netdev) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + struct qlcnic_dcb *dcb = adapter->dcb; + + if (!test_bit(__QLCNIC_DCB_STATE, &adapter->state)) + return 0; + + return dcb->cfg->type[QLC_DCB_OPER_IDX].pfc_mode_enable; +} + +static u8 qlcnic_dcb_get_dcbx(struct net_device *netdev) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + struct qlcnic_dcb_cfg *cfg = adapter->dcb->cfg; + + if (!test_bit(__QLCNIC_DCB_STATE, &adapter->state)) + return 0; + + return cfg->capability.dcb_capability; +} + +static u8 qlcnic_dcb_get_feat_cfg(struct net_device *netdev, int fid, u8 *flag) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + struct qlcnic_dcb_cee *type; + + if (!test_bit(__QLCNIC_DCB_STATE, &adapter->state)) + return 1; + + type = &adapter->dcb->cfg->type[QLC_DCB_OPER_IDX]; + *flag = 0; + + switch (fid) { + case DCB_FEATCFG_ATTR_PG: + if (type->tc_param_valid) + *flag |= DCB_FEATCFG_ENABLE; + else + *flag |= DCB_FEATCFG_ERROR; + break; + case DCB_FEATCFG_ATTR_PFC: + if (type->pfc_mode_enable) { + if (type->tc_cfg[0].prio_cfg[0].pfc_type) + *flag |= DCB_FEATCFG_ENABLE; + } else { + *flag |= DCB_FEATCFG_ERROR; + } + break; + case DCB_FEATCFG_ATTR_APP: + *flag |= DCB_FEATCFG_ENABLE; + break; + default: + netdev_err(netdev, "Invalid Feature ID %d\n", fid); + return 1; + } + + return 0; +} + +static 
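+/*
+ * DCB_CAP_ATTR_PG_TCS/PFC_TCS above return 0x80 because dcbnl encodes the
+ * supported traffic-class count as a bit mask (bit 7 => 8 TCs). The
+ * RX-direction PG callbacks below are deliberate stubs that report zeros:
+ * the firmware only exposes TX-side PG parameters, but dcbnl still expects
+ * the callbacks to be present.
+ */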
inline void +qlcnic_dcb_get_pg_tc_cfg_rx(struct net_device *netdev, int prio, u8 *prio_type, + u8 *pgid, u8 *bw_pct, u8 *up_map) +{ + *prio_type = *pgid = *bw_pct = *up_map = 0; +} + +static inline void +qlcnic_dcb_get_pg_bwg_cfg_rx(struct net_device *netdev, int pgid, u8 *bw_pct) +{ + *bw_pct = 0; +} + +static int qlcnic_dcb_peer_app_info(struct net_device *netdev, + struct dcb_peer_app_info *info, + u16 *app_count) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + struct qlcnic_dcb_cee *peer; + int i; + + *app_count = 0; + + if (!test_bit(__QLCNIC_DCB_STATE, &adapter->state)) + return 0; + + peer = &adapter->dcb->cfg->type[QLC_DCB_PEER_IDX]; + + for (i = 0; i < QLC_DCB_MAX_APP; i++) { + if (peer->app[i].valid) + (*app_count)++; + } + + return 0; +} + +static int qlcnic_dcb_peer_app_table(struct net_device *netdev, + struct dcb_app *table) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + struct qlcnic_dcb_cee *peer; + struct qlcnic_dcb_app *app; + int i, j; + + if (!test_bit(__QLCNIC_DCB_STATE, &adapter->state)) + return 0; + + peer = &adapter->dcb->cfg->type[QLC_DCB_PEER_IDX]; + + for (i = 0, j = 0; i < QLC_DCB_MAX_APP; i++) { + app = &peer->app[i]; + if (!app->valid) + continue; + + table[j].selector = app->selector; + table[j].priority = app->priority; + table[j++].protocol = app->protocol; + } + + return 0; +} + +static int qlcnic_dcb_cee_peer_get_pg(struct net_device *netdev, + struct cee_pg *pg) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + struct qlcnic_dcb_cee *peer; + u8 i, j, k, map; + + if (!test_bit(__QLCNIC_DCB_STATE, &adapter->state)) + return 0; + + peer = &adapter->dcb->cfg->type[QLC_DCB_PEER_IDX]; + + for (i = 0, j = 0; i < QLC_DCB_MAX_PG; i++) { + if (!peer->pg_cfg[i].valid) + continue; + + pg->pg_bw[j] = peer->pg_cfg[i].total_bw_percent; + + for (k = 0; k < QLC_DCB_MAX_TC; k++) { + if (peer->tc_cfg[i].valid && + (peer->tc_cfg[i].pgid == i)) { + map = peer->tc_cfg[i].up_tc_map; + pg->prio_pg[j++] = map; + break; + } + } + } + + return 0; +} + +static int qlcnic_dcb_cee_peer_get_pfc(struct net_device *netdev, + struct cee_pfc *pfc) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + struct qlcnic_dcb_cfg *cfg = adapter->dcb->cfg; + struct qlcnic_dcb_tc_cfg *tc; + struct qlcnic_dcb_cee *peer; + u8 i, setting, prio; + + pfc->pfc_en = 0; + + if (!test_bit(__QLCNIC_DCB_STATE, &adapter->state)) + return 0; + + peer = &cfg->type[QLC_DCB_PEER_IDX]; + + for (i = 0; i < QLC_DCB_MAX_TC; i++) { + tc = &peer->tc_cfg[i]; + prio = qlcnic_dcb_prio_count(tc->up_tc_map); + + setting = 0; + qlcnic_dcb_get_pfc_cfg(netdev, prio, &setting); + if (setting) + pfc->pfc_en |= QLC_DCB_GET_MAP(i); + } + + pfc->tcs_supported = cfg->capability.max_pfc_tc; + + return 0; +} + +static const struct dcbnl_rtnl_ops qlcnic_dcbnl_ops = { + .getstate = qlcnic_dcb_get_state, + .getpermhwaddr = qlcnic_dcb_get_perm_hw_addr, + .getpgtccfgtx = qlcnic_dcb_get_pg_tc_cfg_tx, + .getpgbwgcfgtx = qlcnic_dcb_get_pg_bwg_cfg_tx, + .getpfccfg = qlcnic_dcb_get_pfc_cfg, + .getcap = qlcnic_dcb_get_capability, + .getnumtcs = qlcnic_dcb_get_num_tcs, + .getapp = qlcnic_dcb_get_app, + .getpfcstate = qlcnic_dcb_get_pfc_state, + .getdcbx = qlcnic_dcb_get_dcbx, + .getfeatcfg = qlcnic_dcb_get_feat_cfg, + + .getpgtccfgrx = qlcnic_dcb_get_pg_tc_cfg_rx, + .getpgbwgcfgrx = qlcnic_dcb_get_pg_bwg_cfg_rx, + + .peer_getappinfo = qlcnic_dcb_peer_app_info, + .peer_getapptable = qlcnic_dcb_peer_app_table, + .cee_peer_getpg = qlcnic_dcb_cee_peer_get_pg, + .cee_peer_getpfc = 
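+/*
+ * This table is presumably wired up by the init_dcbnl_ops handler, i.e.
+ * something like netdev->dcbnl_ops = &qlcnic_dcbnl_ops once DCB state is
+ * established (an assumption - the hook body is not in this hunk).
+ * Userspace then reaches these getters over the dcbnl netlink interface,
+ * e.g. through lldpad/dcbtool.
+ */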
qlcnic_dcb_cee_peer_get_pfc, +}; diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_dcb.h b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_dcb.h new file mode 100644 index 0000000..b87ce9f --- /dev/null +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_dcb.h @@ -0,0 +1,41 @@ +/* + * QLogic qlcnic NIC Driver + * Copyright (c) 2009-2013 QLogic Corporation + * + * See LICENSE.qlcnic for copyright and licensing details. + */ + +#ifndef __QLCNIC_DCBX_H +#define __QLCNIC_DCBX_H + +void qlcnic_clear_dcb_ops(struct qlcnic_adapter *); + +#ifdef CONFIG_QLCNIC_DCB +int __qlcnic_register_dcb(struct qlcnic_adapter *); +#else +static inline int __qlcnic_register_dcb(struct qlcnic_adapter *adapter) +{ return 0; } +#endif + +struct qlcnic_dcb_ops { + void (*init_dcbnl_ops) (struct qlcnic_adapter *); + void (*free) (struct qlcnic_adapter *); + int (*attach) (struct qlcnic_adapter *); + int (*query_hw_capability) (struct qlcnic_adapter *, char *); + int (*get_hw_capability) (struct qlcnic_adapter *); + void (*get_info) (struct qlcnic_adapter *); + int (*query_cee_param) (struct qlcnic_adapter *, char *, u8); + int (*get_cee_cfg) (struct qlcnic_adapter *); + int (*register_aen) (struct qlcnic_adapter *, bool); + void (*handle_aen) (struct qlcnic_adapter *, void *); +}; + +struct qlcnic_dcb { + struct qlcnic_dcb_mbx_params *param; + struct qlcnic_adapter *adapter; + struct delayed_work aen_work; + struct workqueue_struct *wq; + struct qlcnic_dcb_ops *ops; + struct qlcnic_dcb_cfg *cfg; +}; +#endif diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ethtool.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ethtool.c index 79a5855..7b0c90e 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ethtool.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ethtool.c @@ -125,6 +125,14 @@ static const char qlcnic_83xx_mac_stats_strings[][ETH_GSTRING_LEN] = { }; #define QLCNIC_STATS_LEN ARRAY_SIZE(qlcnic_gstrings_stats) + +static const char qlcnic_tx_ring_stats_strings[][ETH_GSTRING_LEN] = { + "xmit_on", + "xmit_off", + "xmit_called", + "xmit_finished", +}; + static const char qlcnic_83xx_rx_stats_strings[][ETH_GSTRING_LEN] = { "ctx_rx_bytes", "ctx_rx_pkts", @@ -630,15 +638,15 @@ qlcnic_set_ringparam(struct net_device *dev, static void qlcnic_get_channels(struct net_device *dev, struct ethtool_channels *channel) { - int min; struct qlcnic_adapter *adapter = netdev_priv(dev); + int min; min = min_t(int, adapter->ahw->max_rx_ques, num_online_cpus()); channel->max_rx = rounddown_pow_of_two(min); - channel->max_tx = adapter->ahw->max_tx_ques; + channel->max_tx = min_t(int, QLCNIC_MAX_TX_RINGS, num_online_cpus()); channel->rx_count = adapter->max_sds_rings; - channel->tx_count = adapter->ahw->max_tx_ques; + channel->tx_count = adapter->max_drv_tx_rings; } static int qlcnic_set_channels(struct net_device *dev, @@ -646,18 +654,27 @@ static int qlcnic_set_channels(struct net_device *dev, { struct qlcnic_adapter *adapter = netdev_priv(dev); int err; + int txq = 0; - if (channel->other_count || channel->combined_count || - channel->tx_count != channel->max_tx) + if (channel->other_count || channel->combined_count) return -EINVAL; - err = qlcnic_validate_max_rss(adapter, channel->rx_count); - if (err) - return err; + if (channel->rx_count) { + err = qlcnic_validate_max_rss(adapter, channel->rx_count); + if (err) + return err; + } + + if (channel->tx_count) { + err = qlcnic_validate_max_tx_rings(adapter, channel->tx_count); + if (err) + return err; + txq = channel->tx_count; + } - err = qlcnic_set_max_rss(adapter, channel->rx_count, 0); 
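+	/*
+	 * With rx and tx now validated independently, a count of 0 means
+	 * "leave that direction unchanged", so either side can be resized
+	 * on its own. Usage sketch (interface name hypothetical):
+	 *
+	 *   ethtool -L eth0 tx 4        # resize only the TX rings
+	 *   ethtool -L eth0 rx 4 tx 4   # resize both
+	 */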
- netdev_info(dev, "allocated 0x%x sds rings\n", - adapter->max_sds_rings); + err = qlcnic_set_max_rss(adapter, channel->rx_count, txq); + netdev_info(dev, "allocated 0x%x sds rings and 0x%x tx rings\n", + adapter->max_sds_rings, adapter->max_drv_tx_rings); return err; } @@ -893,6 +910,7 @@ free_diag_res: clear_diag_irq: adapter->max_sds_rings = max_sds_rings; clear_bit(__QLCNIC_RESETTING, &adapter->state); + return ret; } @@ -966,6 +984,7 @@ int qlcnic_do_lb_test(struct qlcnic_adapter *adapter, u8 mode) int qlcnic_loopback_test(struct net_device *netdev, u8 mode) { struct qlcnic_adapter *adapter = netdev_priv(netdev); + int max_drv_tx_rings = adapter->max_drv_tx_rings; int max_sds_rings = adapter->max_sds_rings; struct qlcnic_host_sds_ring *sds_ring; struct qlcnic_hardware_context *ahw = adapter->ahw; @@ -1025,6 +1044,7 @@ int qlcnic_loopback_test(struct net_device *netdev, u8 mode) clear_it: adapter->max_sds_rings = max_sds_rings; + adapter->max_drv_tx_rings = max_drv_tx_rings; clear_bit(__QLCNIC_RESETTING, &adapter->state); return ret; } @@ -1077,11 +1097,21 @@ qlcnic_get_strings(struct net_device *dev, u32 stringset, u8 *data) QLCNIC_TEST_LEN * ETH_GSTRING_LEN); break; case ETH_SS_STATS: + num_stats = ARRAY_SIZE(qlcnic_tx_ring_stats_strings); + for (i = 0; i < adapter->max_drv_tx_rings; i++) { + for (index = 0; index < num_stats; index++) { + sprintf(data, "tx_ring_%d %s", i, + qlcnic_tx_ring_stats_strings[index]); + data += ETH_GSTRING_LEN; + } + } + for (index = 0; index < QLCNIC_STATS_LEN; index++) { memcpy(data + index * ETH_GSTRING_LEN, qlcnic_gstrings_stats[index].stat_string, ETH_GSTRING_LEN); } + if (qlcnic_83xx_check(adapter)) { num_stats = ARRAY_SIZE(qlcnic_83xx_tx_stats_strings); for (i = 0; i < num_stats; i++, index++) @@ -1173,11 +1203,22 @@ static void qlcnic_get_ethtool_stats(struct net_device *dev, struct ethtool_stats *stats, u64 *data) { struct qlcnic_adapter *adapter = netdev_priv(dev); + struct qlcnic_host_tx_ring *tx_ring; struct qlcnic_esw_statistics port_stats; struct qlcnic_mac_statistics mac_stats; - int index, ret, length, size; + int index, ret, length, size, ring; char *p; + memset(data, 0, adapter->max_drv_tx_rings * 4 * sizeof(u64)); + for (ring = 0, index = 0; ring < adapter->max_drv_tx_rings; ring++) { + if (test_bit(__QLCNIC_DEV_UP, &adapter->state)) { + tx_ring = &adapter->tx_ring[ring]; + *data++ = tx_ring->xmit_on; + *data++ = tx_ring->xmit_off; + *data++ = tx_ring->xmit_called; + *data++ = tx_ring->xmit_finished; + } + } memset(data, 0, stats->n_stats * sizeof(u64)); length = QLCNIC_STATS_LEN; for (index = 0; index < length; index++) { diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_hw.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_hw.c index 4d5f59b..f8adc7b 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_hw.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_hw.c @@ -387,7 +387,7 @@ qlcnic_send_cmd_descs(struct qlcnic_adapter *adapter, if (!test_bit(__QLCNIC_FW_ATTACHED, &adapter->state)) return -EIO; - tx_ring = adapter->tx_ring; + tx_ring = &adapter->tx_ring[0]; __netif_tx_lock_bh(tx_ring->txq); producer = tx_ring->producer; @@ -740,6 +740,22 @@ int qlcnic_82xx_clear_lb_mode(struct qlcnic_adapter *adapter, u8 mode) return 0; } +int qlcnic_82xx_read_phys_port_id(struct qlcnic_adapter *adapter) +{ + u8 mac[ETH_ALEN]; + int ret; + + ret = qlcnic_get_mac_address(adapter, mac, + adapter->ahw->physical_port); + if (ret) + return ret; + + memcpy(adapter->ahw->phys_port_id, mac, ETH_ALEN); + adapter->flags |= QLCNIC_HAS_PHYS_PORT_ID; + + 
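+	/*
+	 * The port MAC address doubles as the physical port identifier;
+	 * caching it here lets ndo_get_phys_port_id() report which netdevs
+	 * share a physical port without further firmware queries.
+	 */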
return 0; +} + /* * Send the interrupt coalescing parameter set by ethtool to the card. */ diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_hw.h b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_hw.h index 4a71b28..272c356 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_hw.h +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_hw.h @@ -85,8 +85,11 @@ enum qlcnic_regs { #define QLCNIC_CMD_GET_TEMP_HDR 0x30 #define QLCNIC_CMD_BC_EVENT_SETUP 0x31 #define QLCNIC_CMD_CONFIG_VPORT 0x32 +#define QLCNIC_CMD_DCB_QUERY_CAP 0x34 +#define QLCNIC_CMD_DCB_QUERY_PARAM 0x35 #define QLCNIC_CMD_GET_MAC_STATS 0x37 #define QLCNIC_CMD_82XX_SET_DRV_VER 0x38 +#define QLCNIC_CMD_MQ_TX_CONFIG_INTR 0x39 #define QLCNIC_CMD_GET_LED_STATUS 0x3C #define QLCNIC_CMD_CONFIGURE_RSS 0x41 #define QLCNIC_CMD_CONFIG_INTR_COAL 0x43 @@ -122,6 +125,7 @@ enum qlcnic_regs { #define QLCNIC_MBX_COMP_EVENT 0x8100 #define QLCNIC_MBX_REQUEST_EVENT 0x8101 #define QLCNIC_MBX_TIME_EXTEND_EVENT 0x8102 +#define QLCNIC_MBX_DCBX_CONFIG_CHANGE_EVENT 0x8110 #define QLCNIC_MBX_SFP_INSERT_EVENT 0x8130 #define QLCNIC_MBX_SFP_REMOVE_EVENT 0x8131 @@ -149,7 +153,6 @@ struct ethtool_stats; struct pci_device_id; struct qlcnic_host_sds_ring; struct qlcnic_host_tx_ring; -struct qlcnic_host_tx_ring; struct qlcnic_hardware_context; struct qlcnic_adapter; @@ -173,10 +176,12 @@ int qlcnic_82xx_set_lb_mode(struct qlcnic_adapter *, u8); void qlcnic_82xx_write_crb(struct qlcnic_adapter *, char *, loff_t, size_t); void qlcnic_82xx_read_crb(struct qlcnic_adapter *, char *, loff_t, size_t); void qlcnic_82xx_dev_request_reset(struct qlcnic_adapter *, u32); -int qlcnic_82xx_setup_intr(struct qlcnic_adapter *, u8); +int qlcnic_82xx_setup_intr(struct qlcnic_adapter *, u8, int); irqreturn_t qlcnic_82xx_clear_legacy_intr(struct qlcnic_adapter *); int qlcnic_82xx_issue_cmd(struct qlcnic_adapter *adapter, struct qlcnic_cmd_args *); +int qlcnic_82xx_mq_intrpt(struct qlcnic_adapter *, int); +int qlcnic_82xx_config_intrpt(struct qlcnic_adapter *, u8); int qlcnic_82xx_fw_cmd_create_rx_ctx(struct qlcnic_adapter *); int qlcnic_82xx_fw_cmd_create_tx_ctx(struct qlcnic_adapter *, struct qlcnic_host_tx_ring *tx_ring, int); @@ -184,7 +189,7 @@ void qlcnic_82xx_fw_cmd_del_rx_ctx(struct qlcnic_adapter *); void qlcnic_82xx_fw_cmd_del_tx_ctx(struct qlcnic_adapter *, struct qlcnic_host_tx_ring *); int qlcnic_82xx_sre_macaddr_change(struct qlcnic_adapter *, u8 *, u16, u8); -int qlcnic_82xx_get_mac_address(struct qlcnic_adapter *, u8*); +int qlcnic_82xx_get_mac_address(struct qlcnic_adapter *, u8*, u8); int qlcnic_82xx_get_nic_info(struct qlcnic_adapter *, struct qlcnic_info *, u8); int qlcnic_82xx_set_nic_info(struct qlcnic_adapter *, struct qlcnic_info *); int qlcnic_82xx_get_pci_info(struct qlcnic_adapter *, struct qlcnic_pci_info*); diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_init.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_init.c index 974d626..66c26cf 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_init.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_init.c @@ -127,12 +127,12 @@ void qlcnic_reset_rx_buffers_list(struct qlcnic_adapter *adapter) } } -void qlcnic_release_tx_buffers(struct qlcnic_adapter *adapter) +void qlcnic_release_tx_buffers(struct qlcnic_adapter *adapter, + struct qlcnic_host_tx_ring *tx_ring) { struct qlcnic_cmd_buffer *cmd_buf; struct qlcnic_skb_frag *buffrag; int i, j; - struct qlcnic_host_tx_ring *tx_ring = adapter->tx_ring; cmd_buf = tx_ring->cmd_buf_arr; for (i = 0; i < tx_ring->num_desc; i++) { @@ -241,7 +241,13 @@ int 
qlcnic_alloc_sw_resources(struct qlcnic_adapter *adapter) sds_ring->irq = adapter->msix_entries[ring].vector; sds_ring->adapter = adapter; sds_ring->num_desc = adapter->num_rxd; - + if (qlcnic_82xx_check(adapter)) { + if (qlcnic_check_multi_tx(adapter) && + !adapter->ahw->diag_test) + sds_ring->tx_ring = &adapter->tx_ring[ring]; + else + sds_ring->tx_ring = &adapter->tx_ring[0]; + } for (i = 0; i < NUM_RCV_DESC_RINGS; i++) INIT_LIST_HEAD(&sds_ring->free_list[i]); } diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c index f807f3b..8d06f88 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c @@ -127,6 +127,23 @@ struct sk_buff *qlcnic_process_rxbuf(struct qlcnic_adapter *, struct qlcnic_host_rds_ring *, u16, u16); +inline void qlcnic_enable_tx_intr(struct qlcnic_adapter *adapter, + struct qlcnic_host_tx_ring *tx_ring) +{ + if (qlcnic_check_multi_tx(adapter) && + !adapter->ahw->diag_test) + writel(0x0, tx_ring->crb_intr_mask); +} + + +static inline void qlcnic_disable_tx_int(struct qlcnic_adapter *adapter, + struct qlcnic_host_tx_ring *tx_ring) +{ + if (qlcnic_check_multi_tx(adapter) && + !adapter->ahw->diag_test) + writel(1, tx_ring->crb_intr_mask); +} + inline void qlcnic_83xx_enable_tx_intr(struct qlcnic_adapter *adapter, struct qlcnic_host_tx_ring *tx_ring) { @@ -147,10 +164,7 @@ static inline u8 qlcnic_mac_hash(u64 mac) static inline u32 qlcnic_get_ref_handle(struct qlcnic_adapter *adapter, u16 handle, u8 ring_id) { - unsigned short device = adapter->pdev->device; - - if ((device == PCI_DEVICE_ID_QLOGIC_QLE834X) || - (device == PCI_DEVICE_ID_QLOGIC_VF_QLE834X)) + if (qlcnic_83xx_check(adapter)) return handle | (ring_id << 15); else return handle; @@ -357,14 +371,14 @@ static void qlcnic_send_filter(struct qlcnic_adapter *adapter, } static int qlcnic_tx_pkt(struct qlcnic_adapter *adapter, - struct cmd_desc_type0 *first_desc, struct sk_buff *skb) + struct cmd_desc_type0 *first_desc, struct sk_buff *skb, + struct qlcnic_host_tx_ring *tx_ring) { u8 l4proto, opcode = 0, hdr_len = 0; u16 flags = 0, vlan_tci = 0; int copied, offset, copy_len, size; struct cmd_desc_type0 *hwdesc; struct vlan_ethhdr *vh; - struct qlcnic_host_tx_ring *tx_ring = adapter->tx_ring; u16 protocol = ntohs(skb->protocol); u32 producer = tx_ring->producer; @@ -547,7 +561,7 @@ static inline void qlcnic_clear_cmddesc(u64 *desc) netdev_tx_t qlcnic_xmit_frame(struct sk_buff *skb, struct net_device *netdev) { struct qlcnic_adapter *adapter = netdev_priv(netdev); - struct qlcnic_host_tx_ring *tx_ring = adapter->tx_ring; + struct qlcnic_host_tx_ring *tx_ring; struct qlcnic_cmd_buffer *pbuf; struct qlcnic_skb_frag *buffrag; struct cmd_desc_type0 *hwdesc, *first_desc; @@ -556,10 +570,8 @@ netdev_tx_t qlcnic_xmit_frame(struct sk_buff *skb, struct net_device *netdev) int i, k, frag_count, delta = 0; u32 producer, num_txd; - num_txd = tx_ring->num_desc; - if (!test_bit(__QLCNIC_DEV_UP, &adapter->state)) { - netif_stop_queue(netdev); + netif_tx_stop_all_queues(netdev); return NETDEV_TX_BUSY; } @@ -569,7 +581,14 @@ netdev_tx_t qlcnic_xmit_frame(struct sk_buff *skb, struct net_device *netdev) goto drop_packet; } + if (qlcnic_check_multi_tx(adapter)) + tx_ring = &adapter->tx_ring[skb_get_queue_mapping(skb)]; + else + tx_ring = &adapter->tx_ring[0]; + num_txd = tx_ring->num_desc; + frag_count = skb_shinfo(skb)->nr_frags + 1; + /* 14 frags supported for normal packet and * 32 frags supported for TSO packet */ @@ -584,11 
+603,12 @@ netdev_tx_t qlcnic_xmit_frame(struct sk_buff *skb, struct net_device *netdev) } if (unlikely(qlcnic_tx_avail(tx_ring) <= TX_STOP_THRESH)) { - netif_stop_queue(netdev); + netif_tx_stop_queue(tx_ring->txq); if (qlcnic_tx_avail(tx_ring) > TX_STOP_THRESH) { - netif_start_queue(netdev); + netif_tx_start_queue(tx_ring->txq); } else { adapter->stats.xmit_off++; + tx_ring->xmit_off++; return NETDEV_TX_BUSY; } } @@ -643,7 +663,7 @@ netdev_tx_t qlcnic_xmit_frame(struct sk_buff *skb, struct net_device *netdev) tx_ring->producer = get_next_index(producer, num_txd); smp_mb(); - if (unlikely(qlcnic_tx_pkt(adapter, first_desc, skb))) + if (unlikely(qlcnic_tx_pkt(adapter, first_desc, skb, tx_ring))) goto unwind_buff; if (adapter->drv_mac_learn) @@ -651,6 +671,7 @@ netdev_tx_t qlcnic_xmit_frame(struct sk_buff *skb, struct net_device *netdev) adapter->stats.txbytes += skb->len; adapter->stats.xmitcalled++; + tx_ring->xmit_called++; qlcnic_update_cmd_producer(tx_ring); @@ -673,7 +694,7 @@ void qlcnic_advert_link_change(struct qlcnic_adapter *adapter, int linkup) adapter->ahw->linkup = 0; if (netif_running(netdev)) { netif_carrier_off(netdev); - netif_stop_queue(netdev); + netif_tx_stop_all_queues(netdev); } } else if (!adapter->ahw->linkup && linkup) { netdev_info(netdev, "NIC Link is up\n"); @@ -768,9 +789,6 @@ static int qlcnic_process_cmd_ring(struct qlcnic_adapter *adapter, struct net_device *netdev = adapter->netdev; struct qlcnic_skb_frag *frag; - if (!spin_trylock(&adapter->tx_clean_lock)) - return 1; - sw_consumer = tx_ring->sw_consumer; hw_consumer = le32_to_cpu(*(tx_ring->hw_consumer)); @@ -788,6 +806,7 @@ static int qlcnic_process_cmd_ring(struct qlcnic_adapter *adapter, frag->dma = 0ULL; } adapter->stats.xmitfinished++; + tx_ring->xmit_finished++; dev_kfree_skb_any(buffer->skb); buffer->skb = NULL; } @@ -800,10 +819,12 @@ static int qlcnic_process_cmd_ring(struct qlcnic_adapter *adapter, if (count && netif_running(netdev)) { tx_ring->sw_consumer = sw_consumer; smp_mb(); - if (netif_queue_stopped(netdev) && netif_carrier_ok(netdev)) { + if (netif_tx_queue_stopped(tx_ring->txq) && + netif_carrier_ok(netdev)) { if (qlcnic_tx_avail(tx_ring) > TX_STOP_THRESH) { - netif_wake_queue(netdev); + netif_tx_wake_queue(tx_ring->txq); adapter->stats.xmit_on++; + tx_ring->xmit_on++; } } adapter->tx_timeo_cnt = 0; @@ -823,7 +844,6 @@ static int qlcnic_process_cmd_ring(struct qlcnic_adapter *adapter, */ hw_consumer = le32_to_cpu(*(tx_ring->hw_consumer)); done = (sw_consumer == hw_consumer); - spin_unlock(&adapter->tx_clean_lock); return done; } @@ -833,16 +853,40 @@ static int qlcnic_poll(struct napi_struct *napi, int budget) int tx_complete, work_done; struct qlcnic_host_sds_ring *sds_ring; struct qlcnic_adapter *adapter; + struct qlcnic_host_tx_ring *tx_ring; sds_ring = container_of(napi, struct qlcnic_host_sds_ring, napi); adapter = sds_ring->adapter; - tx_complete = qlcnic_process_cmd_ring(adapter, adapter->tx_ring, + tx_ring = sds_ring->tx_ring; + + tx_complete = qlcnic_process_cmd_ring(adapter, tx_ring, budget); work_done = qlcnic_process_rcv_ring(sds_ring, budget); if ((work_done < budget) && tx_complete) { napi_complete(&sds_ring->napi); - if (test_bit(__QLCNIC_DEV_UP, &adapter->state)) + if (test_bit(__QLCNIC_DEV_UP, &adapter->state)) { qlcnic_enable_int(sds_ring); + qlcnic_enable_tx_intr(adapter, tx_ring); + } + } + + return work_done; +} + +static int qlcnic_tx_poll(struct napi_struct *napi, int budget) +{ + struct qlcnic_host_tx_ring *tx_ring; + struct qlcnic_adapter *adapter; + int 
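+/*
+ * Dedicated TX NAPI poller, used when multi-TX is active under MSI-X; in
+ * the shared-vector case qlcnic_poll() instead services the TX ring that
+ * qlcnic_alloc_sw_resources() paired with each SDS ring. The stop/wake
+ * protocol in the xmit and completion paths relies on the smp_mb()
+ * barriers to avoid a lost wakeup when a queue stops just as the last
+ * completion is reaped.
+ */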
work_done; + + tx_ring = container_of(napi, struct qlcnic_host_tx_ring, napi); + adapter = tx_ring->adapter; + + work_done = qlcnic_process_cmd_ring(adapter, tx_ring, budget); + if (work_done) { + napi_complete(&tx_ring->napi); + if (test_bit(__QLCNIC_DEV_UP, &adapter->state)) + qlcnic_enable_tx_intr(adapter, tx_ring); } return work_done; @@ -966,6 +1010,9 @@ static void qlcnic_handle_fw_message(int desc_cnt, int index, break; } break; + case QLCNIC_C2H_OPCODE_GET_DCB_AEN: + qlcnic_dcb_handle_aen(adapter, (void *)&msg); + break; default: break; } @@ -1414,6 +1461,7 @@ int qlcnic_82xx_napi_add(struct qlcnic_adapter *adapter, int ring, max_sds_rings; struct qlcnic_host_sds_ring *sds_ring; struct qlcnic_recv_context *recv_ctx = adapter->recv_ctx; + struct qlcnic_host_tx_ring *tx_ring; if (qlcnic_alloc_sds_rings(recv_ctx, adapter->max_sds_rings)) return -ENOMEM; @@ -1422,12 +1470,22 @@ int qlcnic_82xx_napi_add(struct qlcnic_adapter *adapter, for (ring = 0; ring < adapter->max_sds_rings; ring++) { sds_ring = &recv_ctx->sds_rings[ring]; - if (ring == adapter->max_sds_rings - 1) - netif_napi_add(netdev, &sds_ring->napi, qlcnic_poll, - QLCNIC_NETDEV_WEIGHT / max_sds_rings); - else + if (qlcnic_check_multi_tx(adapter) && + !adapter->ahw->diag_test && + (adapter->max_drv_tx_rings > 1)) { netif_napi_add(netdev, &sds_ring->napi, qlcnic_rx_poll, - QLCNIC_NETDEV_WEIGHT*2); + QLCNIC_NETDEV_WEIGHT * 2); + } else { + if (ring == (adapter->max_sds_rings - 1)) + netif_napi_add(netdev, &sds_ring->napi, + qlcnic_poll, + QLCNIC_NETDEV_WEIGHT / + max_sds_rings); + else + netif_napi_add(netdev, &sds_ring->napi, + qlcnic_rx_poll, + QLCNIC_NETDEV_WEIGHT * 2); + } } if (qlcnic_alloc_tx_rings(adapter, netdev)) { @@ -1435,6 +1493,14 @@ int qlcnic_82xx_napi_add(struct qlcnic_adapter *adapter, return -ENOMEM; } + if (qlcnic_check_multi_tx(adapter) && !adapter->ahw->diag_test) { + for (ring = 0; ring < adapter->max_drv_tx_rings; ring++) { + tx_ring = &adapter->tx_ring[ring]; + netif_napi_add(netdev, &tx_ring->napi, qlcnic_tx_poll, + QLCNIC_NETDEV_WEIGHT); + } + } + return 0; } @@ -1443,6 +1509,7 @@ void qlcnic_82xx_napi_del(struct qlcnic_adapter *adapter) int ring; struct qlcnic_host_sds_ring *sds_ring; struct qlcnic_recv_context *recv_ctx = adapter->recv_ctx; + struct qlcnic_host_tx_ring *tx_ring; for (ring = 0; ring < adapter->max_sds_rings; ring++) { sds_ring = &recv_ctx->sds_rings[ring]; @@ -1450,6 +1517,14 @@ void qlcnic_82xx_napi_del(struct qlcnic_adapter *adapter) } qlcnic_free_sds_rings(adapter->recv_ctx); + + if (qlcnic_check_multi_tx(adapter) && !adapter->ahw->diag_test) { + for (ring = 0; ring < adapter->max_drv_tx_rings; ring++) { + tx_ring = &adapter->tx_ring[ring]; + netif_napi_del(&tx_ring->napi); + } + } + qlcnic_free_tx_rings(adapter); } @@ -1457,6 +1532,7 @@ void qlcnic_82xx_napi_enable(struct qlcnic_adapter *adapter) { int ring; struct qlcnic_host_sds_ring *sds_ring; + struct qlcnic_host_tx_ring *tx_ring; struct qlcnic_recv_context *recv_ctx = adapter->recv_ctx; if (adapter->is_up != QLCNIC_ADAPTER_UP_MAGIC) @@ -1467,12 +1543,24 @@ void qlcnic_82xx_napi_enable(struct qlcnic_adapter *adapter) napi_enable(&sds_ring->napi); qlcnic_enable_int(sds_ring); } + + if (qlcnic_check_multi_tx(adapter) && + (adapter->flags & QLCNIC_MSIX_ENABLED) && + !adapter->ahw->diag_test && + (adapter->max_drv_tx_rings > 1)) { + for (ring = 0; ring < adapter->max_drv_tx_rings; ring++) { + tx_ring = &adapter->tx_ring[ring]; + napi_enable(&tx_ring->napi); + qlcnic_enable_tx_intr(adapter, tx_ring); + } + } } void 
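+/*
+ * Teardown below mirrors enable in reverse: mask the ring's interrupt
+ * first so no new poll can be scheduled, napi_synchronize() to let an
+ * in-flight poll finish, then napi_disable().
+ */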
qlcnic_82xx_napi_disable(struct qlcnic_adapter *adapter) { int ring; struct qlcnic_host_sds_ring *sds_ring; + struct qlcnic_host_tx_ring *tx_ring; struct qlcnic_recv_context *recv_ctx = adapter->recv_ctx; if (adapter->is_up != QLCNIC_ADAPTER_UP_MAGIC) @@ -1484,6 +1572,17 @@ void qlcnic_82xx_napi_disable(struct qlcnic_adapter *adapter) napi_synchronize(&sds_ring->napi); napi_disable(&sds_ring->napi); } + + if ((adapter->flags & QLCNIC_MSIX_ENABLED) && + !adapter->ahw->diag_test && + qlcnic_check_multi_tx(adapter)) { + for (ring = 0; ring < adapter->max_drv_tx_rings; ring++) { + tx_ring = &adapter->tx_ring[ring]; + qlcnic_disable_tx_int(adapter, tx_ring); + napi_synchronize(&tx_ring->napi); + napi_disable(&tx_ring->napi); + } + } } #define QLC_83XX_NORMAL_LB_PKT (1ULL << 36) diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c index a780b73..7dde3ba 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c @@ -100,6 +100,8 @@ static DEFINE_PCI_DEVICE_TABLE(qlcnic_pci_tbl) = { ENTRY(PCI_DEVICE_ID_QLOGIC_QLE824X), ENTRY(PCI_DEVICE_ID_QLOGIC_QLE834X), ENTRY(PCI_DEVICE_ID_QLOGIC_VF_QLE834X), + ENTRY(PCI_DEVICE_ID_QLOGIC_QLE844X), + ENTRY(PCI_DEVICE_ID_QLOGIC_VF_QLE844X), {0,} }; @@ -146,6 +148,11 @@ static const u32 qlcnic_reg_tbl[] = { static const struct qlcnic_board_info qlcnic_boards[] = { { PCI_VENDOR_ID_QLOGIC, + PCI_DEVICE_ID_QLOGIC_QLE844X, + 0x0, + 0x0, + "8400 series 10GbE Converged Network Adapter (TCP/IP Networking)" }, + { PCI_VENDOR_ID_QLOGIC, PCI_DEVICE_ID_QLOGIC_QLE834X, PCI_VENDOR_ID_QLOGIC, 0x24e, @@ -254,7 +261,6 @@ static const struct qlcnic_board_info qlcnic_boards[] = { }; #define NUM_SUPPORTED_BOARDS ARRAY_SIZE(qlcnic_boards) -#define QLC_MAX_SDS_RINGS 8 static const struct qlcnic_legacy_intr_set legacy_intr[] = QLCNIC_LEGACY_INTR_CONFIG; @@ -278,12 +284,15 @@ void qlcnic_free_sds_rings(struct qlcnic_recv_context *recv_ctx) int qlcnic_read_mac_addr(struct qlcnic_adapter *adapter) { - u8 mac_addr[ETH_ALEN]; struct net_device *netdev = adapter->netdev; struct pci_dev *pdev = adapter->pdev; + u8 mac_addr[ETH_ALEN]; + int ret; - if (qlcnic_get_mac_address(adapter, mac_addr) != 0) - return -EIO; + ret = qlcnic_get_mac_address(adapter, mac_addr, + adapter->ahw->pci_func); + if (ret) + return ret; memcpy(netdev->dev_addr, mac_addr, ETH_ALEN); memcpy(adapter->mac_addr, netdev->dev_addr, netdev->addr_len); @@ -425,6 +434,21 @@ static void qlcnic_82xx_cancel_idc_work(struct qlcnic_adapter *adapter) cancel_delayed_work_sync(&adapter->fw_work); } +static int qlcnic_get_phys_port_id(struct net_device *netdev, + struct netdev_phys_port_id *ppid) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + struct qlcnic_hardware_context *ahw = adapter->ahw; + + if (!(adapter->flags & QLCNIC_HAS_PHYS_PORT_ID)) + return -EOPNOTSUPP; + + ppid->id_len = sizeof(ahw->phys_port_id); + memcpy(ppid->id, ahw->phys_port_id, ppid->id_len); + + return 0; +} + static const struct net_device_ops qlcnic_netdev_ops = { .ndo_open = qlcnic_open, .ndo_stop = qlcnic_close, @@ -442,6 +466,7 @@ static const struct net_device_ops qlcnic_netdev_ops = { .ndo_fdb_add = qlcnic_fdb_add, .ndo_fdb_del = qlcnic_fdb_del, .ndo_fdb_dump = qlcnic_fdb_dump, + .ndo_get_phys_port_id = qlcnic_get_phys_port_id, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = qlcnic_poll_controller, #endif @@ -514,13 +539,33 @@ static struct qlcnic_hardware_ops qlcnic_hw_ops = { .get_board_info = 
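+	/*
+	 * Once QLCNIC_HAS_PHYS_PORT_ID is set, the id becomes visible to
+	 * userspace through the new .ndo_get_phys_port_id hook; a recent
+	 * enough iproute2 prints it as "portid" in "ip -d link show".
+	 */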
qlcnic_82xx_get_board_info, .set_mac_filter_count = qlcnic_82xx_set_mac_filter_count, .free_mac_list = qlcnic_82xx_free_mac_list, + .read_phys_port_id = qlcnic_82xx_read_phys_port_id, }; +static void qlcnic_get_multiq_capability(struct qlcnic_adapter *adapter) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + int num_tx_q; + + if (ahw->msix_supported && + (ahw->extra_capability[0] & QLCNIC_FW_CAPABILITY_2_MULTI_TX)) { + num_tx_q = min_t(int, QLCNIC_DEF_NUM_TX_RINGS, + num_online_cpus()); + if (num_tx_q > 1) { + test_and_set_bit(__QLCNIC_MULTI_TX_UNIQUE, + &adapter->state); + adapter->max_drv_tx_rings = num_tx_q; + } + } else { + adapter->max_drv_tx_rings = 1; + } +} + int qlcnic_enable_msix(struct qlcnic_adapter *adapter, u32 num_msix) { struct pci_dev *pdev = adapter->pdev; + int max_tx_rings, max_sds_rings, tx_vector; int err = -1, i; - int max_tx_rings, tx_vector; if (adapter->flags & QLCNIC_TX_INTR_SHARED) { max_tx_rings = 0; @@ -554,7 +599,15 @@ int qlcnic_enable_msix(struct qlcnic_adapter *adapter, u32 num_msix) adapter->max_sds_rings = num_msix - max_tx_rings - 1; } else { - adapter->max_sds_rings = num_msix; + adapter->ahw->num_msix = num_msix; + if (qlcnic_check_multi_tx(adapter) && + !adapter->ahw->diag_test && + (adapter->max_drv_tx_rings > 1)) + max_sds_rings = num_msix - max_tx_rings; + else + max_sds_rings = num_msix; + + adapter->max_sds_rings = max_sds_rings; } dev_info(&pdev->dev, "using msi-x interrupts\n"); return err; @@ -570,6 +623,8 @@ int qlcnic_enable_msix(struct qlcnic_adapter *adapter, u32 num_msix) num_msix += (max_tx_rings + 1); } else { num_msix = rounddown_pow_of_two(err); + if (qlcnic_check_multi_tx(adapter)) + num_msix += max_tx_rings; } if (num_msix) { @@ -605,6 +660,7 @@ static int qlcnic_enable_msi_legacy(struct qlcnic_adapter *adapter) adapter->msix_entries[0].vector = pdev->irq; return err; } + if (qlcnic_use_msi || qlcnic_use_msi_x) return -EOPNOTSUPP; @@ -621,28 +677,69 @@ static int qlcnic_enable_msi_legacy(struct qlcnic_adapter *adapter) return err; } -int qlcnic_82xx_setup_intr(struct qlcnic_adapter *adapter, u8 num_intr) +int qlcnic_82xx_setup_intr(struct qlcnic_adapter *adapter, u8 num_intr, int txq) { + struct qlcnic_hardware_context *ahw = adapter->ahw; int num_msix, err = 0; if (!num_intr) num_intr = QLCNIC_DEF_NUM_STS_DESC_RINGS; - if (adapter->ahw->msix_supported) + if (ahw->msix_supported) { num_msix = rounddown_pow_of_two(min_t(int, num_online_cpus(), num_intr)); - else + if (qlcnic_check_multi_tx(adapter)) { + if (txq) + adapter->max_drv_tx_rings = txq; + num_msix += adapter->max_drv_tx_rings; + } + } else { num_msix = 1; + } err = qlcnic_enable_msix(adapter, num_msix); - if (err == -ENOMEM || !err) + if (err == -ENOMEM) return err; - err = qlcnic_enable_msi_legacy(adapter); - if (!err) + if (!(adapter->flags & QLCNIC_MSIX_ENABLED)) { + qlcnic_disable_multi_tx(adapter); + + err = qlcnic_enable_msi_legacy(adapter); + if (!err) + return err; + } + + return 0; +} + +int qlcnic_82xx_mq_intrpt(struct qlcnic_adapter *adapter, int op_type) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + int err, i; + + if (qlcnic_check_multi_tx(adapter) && + !ahw->diag_test && + (adapter->flags & QLCNIC_MSIX_ENABLED)) { + ahw->intr_tbl = vzalloc(ahw->num_msix * + sizeof(struct qlcnic_intrpt_config)); + if (!ahw->intr_tbl) + return -ENOMEM; + + for (i = 0; i < ahw->num_msix; i++) { + ahw->intr_tbl[i].type = QLCNIC_INTRPT_MSIX; + ahw->intr_tbl[i].id = i; + ahw->intr_tbl[i].src = 0; + } + + err = qlcnic_82xx_config_intrpt(adapter, 1); + if (err) 
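+		/*
+		 * One qlcnic_intrpt_config entry is built per MSI-X vector and
+		 * handed to firmware via config_intrpt(). Vector budget
+		 * example (assuming the default of 4 status rings): on a
+		 * 6-CPU box, qlcnic_82xx_setup_intr() requests
+		 * rounddown_pow_of_two(min(6, 4)) = 4 RX vectors plus
+		 * max_drv_tx_rings TX vectors.
+		 */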
+ dev_err(&adapter->pdev->dev, + "Failed to configure Interrupt for %d vector\n", + ahw->num_msix); return err; + } - return -EIO; + return 0; } void qlcnic_teardown_intr(struct qlcnic_adapter *adapter) @@ -829,7 +926,9 @@ static void qlcnic_get_bar_length(u32 dev_id, ulong *bar) *bar = QLCNIC_82XX_BAR0_LENGTH; break; case PCI_DEVICE_ID_QLOGIC_QLE834X: + case PCI_DEVICE_ID_QLOGIC_QLE844X: case PCI_DEVICE_ID_QLOGIC_VF_QLE834X: + case PCI_DEVICE_ID_QLOGIC_VF_QLE844X: *bar = QLCNIC_83XX_BAR0_LENGTH; break; default: @@ -1413,6 +1512,7 @@ qlcnic_request_irq(struct qlcnic_adapter *adapter) for (ring = 0; ring < num_sds_rings; ring++) { sds_ring = &recv_ctx->sds_rings[ring]; if (qlcnic_82xx_check(adapter) && + !qlcnic_check_multi_tx(adapter) && (ring == (num_sds_rings - 1))) { if (!(adapter->flags & QLCNIC_MSIX_ENABLED)) @@ -1436,9 +1536,11 @@ qlcnic_request_irq(struct qlcnic_adapter *adapter) return err; } } - if (qlcnic_83xx_check(adapter) && - (adapter->flags & QLCNIC_MSIX_ENABLED) && - !(adapter->flags & QLCNIC_TX_INTR_SHARED)) { + if ((qlcnic_82xx_check(adapter) && + qlcnic_check_multi_tx(adapter)) || + (qlcnic_83xx_check(adapter) && + (adapter->flags & QLCNIC_MSIX_ENABLED) && + !(adapter->flags & QLCNIC_TX_INTR_SHARED))) { handler = qlcnic_msix_tx_intr; for (ring = 0; ring < adapter->max_drv_tx_rings; ring++) { @@ -1473,8 +1575,10 @@ qlcnic_free_irq(struct qlcnic_adapter *adapter) free_irq(sds_ring->irq, sds_ring); } } - if (qlcnic_83xx_check(adapter) && - !(adapter->flags & QLCNIC_TX_INTR_SHARED)) { + if ((qlcnic_83xx_check(adapter) && + !(adapter->flags & QLCNIC_TX_INTR_SHARED)) || + (qlcnic_82xx_check(adapter) && + qlcnic_check_multi_tx(adapter))) { for (ring = 0; ring < adapter->max_drv_tx_rings; ring++) { tx_ring = &adapter->tx_ring[ring]; @@ -1510,8 +1614,10 @@ int __qlcnic_up(struct qlcnic_adapter *adapter, struct net_device *netdev) if (test_bit(__QLCNIC_DEV_UP, &adapter->state)) return 0; + if (qlcnic_set_eswitch_port_config(adapter)) return -EIO; + qlcnic_get_lro_mss_capability(adapter); if (qlcnic_fw_create_ctx(adapter)) @@ -1558,6 +1664,8 @@ int qlcnic_up(struct qlcnic_adapter *adapter, struct net_device *netdev) void __qlcnic_down(struct qlcnic_adapter *adapter, struct net_device *netdev) { + int ring; + if (adapter->is_up != QLCNIC_ADAPTER_UP_MAGIC) return; @@ -1567,7 +1675,6 @@ void __qlcnic_down(struct qlcnic_adapter *adapter, struct net_device *netdev) if (qlcnic_sriov_vf_check(adapter)) qlcnic_sriov_cleanup_async_list(&adapter->ahw->sriov->bc); smp_mb(); - spin_lock(&adapter->tx_clean_lock); netif_carrier_off(netdev); adapter->ahw->linkup = 0; netif_tx_disable(netdev); @@ -1585,8 +1692,9 @@ void __qlcnic_down(struct qlcnic_adapter *adapter, struct net_device *netdev) adapter->flags &= ~QLCNIC_FW_LRO_MSS_CAP; qlcnic_reset_rx_buffers_list(adapter); - qlcnic_release_tx_buffers(adapter); - spin_unlock(&adapter->tx_clean_lock); + + for (ring = 0; ring < adapter->max_drv_tx_rings; ring++) + qlcnic_release_tx_buffers(adapter, &adapter->tx_ring[ring]); } /* Usage: During suspend and firmware recovery module */ @@ -1666,6 +1774,7 @@ void qlcnic_diag_free_res(struct net_device *netdev, int max_sds_rings) { struct qlcnic_adapter *adapter = netdev_priv(netdev); struct qlcnic_host_sds_ring *sds_ring; + int max_tx_rings = adapter->max_drv_tx_rings; int ring; clear_bit(__QLCNIC_DEV_UP, &adapter->state); @@ -1682,6 +1791,7 @@ void qlcnic_diag_free_res(struct net_device *netdev, int max_sds_rings) adapter->ahw->diag_test = 0; adapter->max_sds_rings = max_sds_rings; + 
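+	/*
+	 * Diagnostic mode runs on a single SDS/TX ring pair, so both counts
+	 * saved on entry are restored here. Dropping the old adapter-wide
+	 * tx_clean_lock (see __qlcnic_down() above) appears safe because TX
+	 * buffers are now released only after every ring's NAPI context has
+	 * been disabled.
+	 */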
adapter->max_drv_tx_rings = max_tx_rings; if (qlcnic_attach(adapter)) goto out; @@ -1750,6 +1860,7 @@ int qlcnic_diag_alloc_res(struct net_device *netdev, int test) adapter->max_sds_rings = 1; adapter->ahw->diag_test = test; adapter->ahw->linkup = 0; + adapter->max_drv_tx_rings = 1; ret = qlcnic_attach(adapter); if (ret) { @@ -1907,12 +2018,18 @@ qlcnic_setup_netdev(struct qlcnic_adapter *adapter, struct net_device *netdev, netdev->priv_flags |= IFF_UNICAST_FLT; netdev->irq = adapter->msix_entries[0].vector; + err = qlcnic_set_real_num_queues(adapter, netdev); + if (err) + return err; + err = register_netdev(netdev); if (err) { dev_err(&pdev->dev, "failed to register net device\n"); return err; } + qlcnic_dcb_init_dcbnl_ops(adapter); + return 0; } @@ -1975,7 +2092,8 @@ int qlcnic_alloc_tx_rings(struct qlcnic_adapter *adapter, tx_ring->cmd_buf_arr = cmd_buf_arr; } - if (qlcnic_83xx_check(adapter)) { + if (qlcnic_83xx_check(adapter) || + (qlcnic_82xx_check(adapter) && qlcnic_check_multi_tx(adapter))) { for (ring = 0; ring < adapter->max_drv_tx_rings; ring++) { tx_ring = &adapter->tx_ring[ring]; tx_ring->adapter = adapter; @@ -1986,6 +2104,7 @@ int qlcnic_alloc_tx_rings(struct qlcnic_adapter *adapter, } } } + return 0; } @@ -2004,6 +2123,17 @@ void qlcnic_set_drv_version(struct qlcnic_adapter *adapter) qlcnic_fw_cmd_set_drv_version(adapter, fw_cmd); } +static int qlcnic_register_dcb(struct qlcnic_adapter *adapter) +{ + return __qlcnic_register_dcb(adapter); +} + +void qlcnic_clear_dcb_ops(struct qlcnic_adapter *adapter) +{ + kfree(adapter->dcb); + adapter->dcb = NULL; +} + static int qlcnic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) { @@ -2048,9 +2178,11 @@ qlcnic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) ahw->reg_tbl = (u32 *) qlcnic_reg_tbl; break; case PCI_DEVICE_ID_QLOGIC_QLE834X: + case PCI_DEVICE_ID_QLOGIC_QLE844X: qlcnic_83xx_register_map(ahw); break; case PCI_DEVICE_ID_QLOGIC_VF_QLE834X: + case PCI_DEVICE_ID_QLOGIC_VF_QLE844X: qlcnic_sriov_vf_register_map(ahw); break; default: @@ -2061,7 +2193,8 @@ qlcnic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (err) goto err_out_free_hw_res; - netdev = alloc_etherdev(sizeof(struct qlcnic_adapter)); + netdev = alloc_etherdev_mq(sizeof(struct qlcnic_adapter), + QLCNIC_MAX_TX_RINGS); if (!netdev) { err = -ENOMEM; goto err_out_iounmap; @@ -2091,14 +2224,14 @@ qlcnic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) adapter->fdb_mac_learn = true; else if (qlcnic_mac_learn == DRV_MAC_LEARN) adapter->drv_mac_learn = true; - adapter->max_drv_tx_rings = 1; rwlock_init(&adapter->ahw->crb_lock); mutex_init(&adapter->ahw->mem_lock); - spin_lock_init(&adapter->tx_clean_lock); INIT_LIST_HEAD(&adapter->mac_list); + qlcnic_register_dcb(adapter); + if (qlcnic_82xx_check(adapter)) { qlcnic_check_vf(adapter, ent); adapter->portnum = adapter->ahw->pci_func; @@ -2108,12 +2241,31 @@ qlcnic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) goto err_out_free_hw; } + qlcnic_get_multiq_capability(adapter); + + if ((adapter->ahw->act_pci_func > 2) && + qlcnic_check_multi_tx(adapter)) { + adapter->max_drv_tx_rings = QLCNIC_DEF_NUM_TX_RINGS; + dev_info(&adapter->pdev->dev, + "vNIC mode enabled, Set max TX rings = %d\n", + adapter->max_drv_tx_rings); + } + + if (!qlcnic_check_multi_tx(adapter)) { + clear_bit(__QLCNIC_MULTI_TX_UNIQUE, &adapter->state); + adapter->max_drv_tx_rings = 1; + } err = qlcnic_setup_idc_param(adapter); if (err) goto err_out_free_hw; adapter->flags |= QLCNIC_NEED_FLR; + + if 
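+	/*
+	 * alloc_etherdev_mq() above sizes the netdev for QLCNIC_MAX_TX_RINGS
+	 * up front; the effective queue count is then set later without a
+	 * reallocation, along the lines of (sketch - this is presumably what
+	 * qlcnic_set_real_num_queues() does):
+	 *
+	 *   netdev = alloc_etherdev_mq(sizeof(*adapter), QLCNIC_MAX_TX_RINGS);
+	 *   ...
+	 *   err = netif_set_real_num_tx_queues(netdev,
+	 *                                      adapter->max_drv_tx_rings);
+	 */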
(adapter->dcb && qlcnic_dcb_attach(adapter)) + qlcnic_clear_dcb_ops(adapter); + } else if (qlcnic_83xx_check(adapter)) { + adapter->max_drv_tx_rings = 1; qlcnic_83xx_check_vf(adapter, ent); adapter->portnum = adapter->ahw->pci_func; err = qlcnic_83xx_init(adapter, pci_using_dac); @@ -2132,6 +2284,8 @@ qlcnic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (qlcnic_read_mac_addr(adapter)) dev_warn(&pdev->dev, "failed to read mac addr\n"); + qlcnic_read_phys_port_id(adapter); + if (adapter->portnum == 0) { qlcnic_get_board_name(adapter, board_name); @@ -2146,7 +2300,7 @@ qlcnic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) "Device does not support MSI interrupts\n"); if (qlcnic_82xx_check(adapter)) { - err = qlcnic_setup_intr(adapter, 0); + err = qlcnic_setup_intr(adapter, 0, 0); if (err) { dev_err(&pdev->dev, "Failed to setup interrupt\n"); goto err_out_disable_msi; @@ -2234,6 +2388,8 @@ static void qlcnic_remove(struct pci_dev *pdev) qlcnic_cancel_idc_work(adapter); ahw = adapter->ahw; + qlcnic_dcb_free(adapter); + unregister_netdev(netdev); qlcnic_sriov_cleanup(adapter); @@ -2276,6 +2432,7 @@ static void qlcnic_remove(struct pci_dev *pdev) destroy_workqueue(adapter->qlcnic_wq); adapter->qlcnic_wq = NULL; } + qlcnic_free_adapter_resources(adapter); kfree(ahw); free_netdev(netdev); @@ -2334,7 +2491,7 @@ static int qlcnic_open(struct net_device *netdev) if (err) goto err_out; - netif_start_queue(netdev); + netif_tx_start_all_queues(netdev); return 0; @@ -2466,6 +2623,8 @@ int qlcnic_check_temp(struct qlcnic_adapter *adapter) static void qlcnic_tx_timeout(struct net_device *netdev) { struct qlcnic_adapter *adapter = netdev_priv(netdev); + struct qlcnic_host_tx_ring *tx_ring; + int ring; if (test_bit(__QLCNIC_RESETTING, &adapter->state)) return; @@ -2479,6 +2638,25 @@ static void qlcnic_tx_timeout(struct net_device *netdev) QLCNIC_FORCE_FW_DUMP_KEY); } else { netdev_info(netdev, "Tx timeout, reset adapter context.\n"); + if (qlcnic_82xx_check(adapter)) { + for (ring = 0; ring < adapter->max_drv_tx_rings; + ring++) { + tx_ring = &adapter->tx_ring[ring]; + dev_info(&netdev->dev, "ring=%d\n", ring); + dev_info(&netdev->dev, "crb_intr_mask=%d\n", + readl(tx_ring->crb_intr_mask)); + dev_info(&netdev->dev, "producer=%d\n", + readl(tx_ring->crb_cmd_producer)); + dev_info(&netdev->dev, "sw_consumer = %d\n", + tx_ring->sw_consumer); + dev_info(&netdev->dev, "hw_consumer = %d\n", + le32_to_cpu(*(tx_ring->hw_consumer))); + dev_info(&netdev->dev, "xmit-on=%llu\n", + tx_ring->xmit_on); + dev_info(&netdev->dev, "xmit-off=%llu\n", + tx_ring->xmit_off); + } + } adapter->ahw->reset_context = 1; } } @@ -3072,6 +3250,8 @@ qlcnic_attach_work(struct work_struct *work) return; } attach: + qlcnic_dcb_get_info(adapter); + if (netif_running(netdev)) { if (qlcnic_up(adapter, netdev)) goto done; @@ -3243,7 +3423,7 @@ static int qlcnic_attach_func(struct pci_dev *pdev) qlcnic_clr_drv_state(adapter); kfree(adapter->msix_entries); adapter->msix_entries = NULL; - err = qlcnic_setup_intr(adapter, 0); + err = qlcnic_setup_intr(adapter, 0, 0); if (err) { kfree(adapter->msix_entries); @@ -3368,16 +3548,65 @@ qlcnicvf_start_firmware(struct qlcnic_adapter *adapter) return err; } +int qlcnic_validate_max_tx_rings(struct qlcnic_adapter *adapter, u32 txq) +{ + struct net_device *netdev = adapter->netdev; + u8 max_hw = QLCNIC_MAX_TX_RINGS; + u32 max_allowed; + + if (!qlcnic_82xx_check(adapter)) { + netdev_err(netdev, "No Multi TX-Q support\n"); + return -EINVAL; + } + + if (!qlcnic_use_msi_x && 
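+	/*
+	 * A requested TX ring count must be a power of two and no larger
+	 * than rounddown_pow_of_two(min(QLCNIC_MAX_TX_RINGS,
+	 * num_online_cpus())): e.g. assuming QLCNIC_MAX_TX_RINGS is 8, a
+	 * 6-CPU system accepts at most 4 rings.
+	 */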
!qlcnic_use_msi) { + netdev_err(netdev, "No Multi TX-Q support in INT-x mode\n"); + return -EINVAL; + } + + if (!qlcnic_check_multi_tx(adapter)) { + netdev_err(netdev, "No Multi TX-Q support\n"); + return -EINVAL; + } + + if (txq > QLCNIC_MAX_TX_RINGS) { + netdev_err(netdev, "Invalid ring count\n"); + return -EINVAL; + } + + max_allowed = rounddown_pow_of_two(min_t(int, max_hw, + num_online_cpus())); + if ((txq > max_allowed) || !is_power_of_2(txq)) { + if (!is_power_of_2(txq)) + netdev_err(netdev, + "TX queue should be a power of 2\n"); + if (txq > num_online_cpus()) + netdev_err(netdev, + "Tx queue should not be higher than [%u], number of online CPUs in the system\n", + num_online_cpus()); + netdev_err(netdev, "Unable to configure %u Tx rings\n", txq); + return -EINVAL; + } + + return 0; +} + int qlcnic_validate_max_rss(struct qlcnic_adapter *adapter, - __u32 val) + __u32 val) { struct net_device *netdev = adapter->netdev; u8 max_hw = adapter->ahw->max_rx_ques; u32 max_allowed; - if (val > QLC_MAX_SDS_RINGS) { + if (qlcnic_82xx_check(adapter) && !qlcnic_use_msi_x && + !qlcnic_use_msi) { + netdev_err(netdev, "No RSS support in INT-x mode\n"); + return -EINVAL; + } + + if (val > QLCNIC_MAX_SDS_RINGS) { netdev_err(netdev, "RSS value should not be higher than %u\n", - QLC_MAX_SDS_RINGS); + QLCNIC_MAX_SDS_RINGS); return -EINVAL; } @@ -3407,27 +3636,48 @@ int qlcnic_validate_max_rss(struct qlcnic_adapter *adapter, return 0; } -int qlcnic_set_max_rss(struct qlcnic_adapter *adapter, u8 data, size_t len) +int qlcnic_set_max_rss(struct qlcnic_adapter *adapter, u8 data, int txq) { int err; struct net_device *netdev = adapter->netdev; + int num_msix; if (test_bit(__QLCNIC_RESETTING, &adapter->state)) return -EBUSY; + if (qlcnic_82xx_check(adapter) && !qlcnic_use_msi_x && + !qlcnic_use_msi) { + netdev_err(netdev, "No RSS support in INT-x mode\n"); + return -EINVAL; + } + netif_device_detach(netdev); if (netif_running(netdev)) __qlcnic_down(adapter, netdev); qlcnic_detach(adapter); + if (qlcnic_82xx_check(adapter)) { + if (txq != 0) + adapter->max_drv_tx_rings = txq; + + if (qlcnic_check_multi_tx(adapter) && + (txq > adapter->max_drv_tx_rings)) + num_msix = adapter->max_drv_tx_rings; + else + num_msix = data; + } + if (qlcnic_83xx_check(adapter)) { qlcnic_83xx_free_mbx_intr(adapter); qlcnic_83xx_enable_mbx_poll(adapter); } + netif_set_real_num_tx_queues(netdev, adapter->max_drv_tx_rings); + qlcnic_teardown_intr(adapter); - err = qlcnic_setup_intr(adapter, data); + + err = qlcnic_setup_intr(adapter, data, txq); if (err) { kfree(adapter->msix_entries); netdev_err(netdev, "failed to setup interrupt\n"); @@ -3455,8 +3705,7 @@ int qlcnic_set_max_rss(struct qlcnic_adapter *adapter, u8 data, size_t len) goto done; qlcnic_restore_indev_addr(netdev, NETDEV_UP); } - err = len; - done: +done: netif_device_attach(netdev); clear_bit(__QLCNIC_RESETTING, &adapter->state); return err; diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_common.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_common.c index dc24979..26f9aa6 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_common.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_common.c @@ -398,10 +398,14 @@ int qlcnic_sriov_get_vf_vport_info(struct qlcnic_adapter *adapter, } static int qlcnic_sriov_set_pvid_mode(struct qlcnic_adapter *adapter, - struct qlcnic_cmd_args *cmd) + struct qlcnic_cmd_args *cmd, u32 cap) { - adapter->rx_pvid = (cmd->rsp.arg[1] >> 16) & 0xffff; - adapter->flags &= ~QLCNIC_TAGGING_ENABLED; + if (cap & 
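+	/*
+	 * Interpretation inferred from the capability name: firmware that
+	 * advertises PVID stripping removes the port VLAN tag itself, so
+	 * the VF leaves rx_pvid at 0; otherwise the PVID from the response
+	 * is recorded and tagging is turned off for the driver to handle.
+	 */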
QLC_83XX_PVID_STRIP_CAPABILITY) { + adapter->rx_pvid = 0; + } else { + adapter->rx_pvid = (cmd->rsp.arg[1] >> 16) & 0xffff; + adapter->flags &= ~QLCNIC_TAGGING_ENABLED; + } return 0; } @@ -432,12 +436,14 @@ static int qlcnic_sriov_set_guest_vlan_mode(struct qlcnic_adapter *adapter, return 0; } -static int qlcnic_sriov_get_vf_acl(struct qlcnic_adapter *adapter) +static int qlcnic_sriov_get_vf_acl(struct qlcnic_adapter *adapter, + struct qlcnic_info *info) { struct qlcnic_sriov *sriov = adapter->ahw->sriov; struct qlcnic_cmd_args cmd; - int ret; + int ret, cap; + cap = info->capabilities; ret = qlcnic_sriov_alloc_bc_mbx_args(&cmd, QLCNIC_BC_CMD_GET_ACL); if (ret) return ret; @@ -453,7 +459,7 @@ static int qlcnic_sriov_get_vf_acl(struct qlcnic_adapter *adapter) ret = qlcnic_sriov_set_guest_vlan_mode(adapter, &cmd); break; case QLC_PVID_MODE: - ret = qlcnic_sriov_set_pvid_mode(adapter, &cmd); + ret = qlcnic_sriov_set_pvid_mode(adapter, &cmd, cap); break; } } @@ -476,7 +482,7 @@ static int qlcnic_sriov_vf_init_driver(struct qlcnic_adapter *adapter) if (err) return -EIO; - err = qlcnic_sriov_get_vf_acl(adapter); + err = qlcnic_sriov_get_vf_acl(adapter, &nic_info); if (err) return err; @@ -506,7 +512,7 @@ static int qlcnic_sriov_setup_vf(struct qlcnic_adapter *adapter, dev_warn(&adapter->pdev->dev, "Device does not support MSI interrupts\n"); - err = qlcnic_setup_intr(adapter, 1); + err = qlcnic_setup_intr(adapter, 1, 0); if (err) { dev_err(&adapter->pdev->dev, "Failed to setup interrupt\n"); goto err_out_disable_msi; @@ -532,6 +538,9 @@ static int qlcnic_sriov_setup_vf(struct qlcnic_adapter *adapter, if (err) goto err_out_send_channel_term; + if (adapter->dcb && qlcnic_dcb_attach(adapter)) + qlcnic_clear_dcb_ops(adapter); + err = qlcnic_setup_netdev(adapter, adapter->netdev, pci_using_dac); if (err) goto err_out_send_channel_term; @@ -539,6 +548,7 @@ static int qlcnic_sriov_setup_vf(struct qlcnic_adapter *adapter, pci_set_drvdata(adapter->pdev, adapter); dev_info(&adapter->pdev->dev, "%s: XGbE port initialized\n", adapter->netdev->name); + qlcnic_schedule_work(adapter, qlcnic_sriov_vf_poll_dev_state, adapter->ahw->idc.delay); return 0; @@ -1571,6 +1581,8 @@ static int qlcnic_sriov_vf_reinit_driver(struct qlcnic_adapter *adapter) if (err) goto err_out_term_channel; + qlcnic_dcb_get_info(adapter); + return 0; err_out_term_channel: diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_pf.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_pf.c index eb49cd6..2d6faf0 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_pf.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_pf.c @@ -1284,6 +1284,10 @@ static const int qlcnic_pf_passthru_supp_cmds[] = { QLCNIC_CMD_GET_STATISTICS, QLCNIC_CMD_GET_PORT_CONFIG, QLCNIC_CMD_GET_LINK_STATUS, + QLCNIC_CMD_DCB_QUERY_CAP, + QLCNIC_CMD_DCB_QUERY_PARAM, + QLCNIC_CMD_INIT_NIC_FUNC, + QLCNIC_CMD_STOP_NIC_FUNC, }; static const struct qlcnic_sriov_cmd_handler qlcnic_pf_bc_cmd_hdlr[] = { diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c index b5eb419..6f87f2c 100644 --- a/drivers/net/ethernet/realtek/r8169.c +++ b/drivers/net/ethernet/realtek/r8169.c @@ -1897,12 +1897,13 @@ static void rtl8169_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *p) { struct rtl8169_private *tp = netdev_priv(dev); - - if (regs->len > R8169_REGS_SIZE) - regs->len = R8169_REGS_SIZE; + u32 __iomem *data = tp->mmio_addr; + u32 *dw = p; + int i; rtl_lock_work(tp); - memcpy_fromio(p, tp->mmio_addr, regs->len); + for (i = 
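+	/*
+	 * Dump registers one dword at a time: memcpy_fromio() over the
+	 * whole window may issue sub-dword MMIO reads, which some chips in
+	 * this family reportedly do not decode correctly. R8169_REGS_SIZE
+	 * must be a multiple of 4 for this loop to cover the window exactly.
+	 */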
0; i < R8169_REGS_SIZE; i += 4) + memcpy_fromio(dw++, data++, 4); rtl_unlock_work(tp); } @@ -7088,7 +7089,7 @@ rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) RTL_W8(Cfg9346, Cfg9346_Unlock); RTL_W8(Config1, RTL_R8(Config1) | PMEnable); - RTL_W8(Config5, RTL_R8(Config5) & PMEStatus); + RTL_W8(Config5, RTL_R8(Config5) & (BWF | MWF | UWF | LanWake | PMEStatus)); if ((RTL_R8(Config3) & (LinkUp | MagicPacket)) != 0) tp->features |= RTL_FEATURE_WOL; if ((RTL_R8(Config5) & (UWF | BWF | MWF)) != 0) diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c index 9e2afe8..c3570764 100644 --- a/drivers/net/ethernet/renesas/sh_eth.c +++ b/drivers/net/ethernet/renesas/sh_eth.c @@ -378,6 +378,8 @@ static struct sh_eth_cpu_data r8a777x_data = { .set_duplex = sh_eth_set_duplex, .set_rate = sh_eth_set_rate_r8a777x, + .register_type = SH_ETH_REG_FAST_RCAR, + .ecsr_value = ECSR_PSRTO | ECSR_LCHNG | ECSR_ICD, .ecsipr_value = ECSIPR_PSRTOIP | ECSIPR_LCHNGIP | ECSIPR_ICDIP, .eesipr_value = 0x01ff009f, @@ -398,6 +400,8 @@ static struct sh_eth_cpu_data r8a7790_data = { .set_duplex = sh_eth_set_duplex, .set_rate = sh_eth_set_rate_r8a777x, + .register_type = SH_ETH_REG_FAST_RCAR, + .ecsr_value = ECSR_PSRTO | ECSR_LCHNG | ECSR_ICD, .ecsipr_value = ECSIPR_PSRTOIP | ECSIPR_LCHNGIP | ECSIPR_ICDIP, .eesipr_value = 0x01ff009f, @@ -435,6 +439,8 @@ static struct sh_eth_cpu_data sh7724_data = { .set_duplex = sh_eth_set_duplex, .set_rate = sh_eth_set_rate_sh7724, + .register_type = SH_ETH_REG_FAST_SH4, + .ecsr_value = ECSR_PSRTO | ECSR_LCHNG | ECSR_ICD, .ecsipr_value = ECSIPR_PSRTOIP | ECSIPR_LCHNGIP | ECSIPR_ICDIP, .eesipr_value = 0x01ff009f, @@ -473,6 +479,8 @@ static struct sh_eth_cpu_data sh7757_data = { .set_duplex = sh_eth_set_duplex, .set_rate = sh_eth_set_rate_sh7757, + .register_type = SH_ETH_REG_FAST_SH4, + .eesipr_value = DMAC_M_RFRMER | DMAC_M_ECI | 0x003fffff, .rmcr_value = 0x00000001, @@ -541,6 +549,8 @@ static struct sh_eth_cpu_data sh7757_data_giga = { .set_duplex = sh_eth_set_duplex, .set_rate = sh_eth_set_rate_giga, + .register_type = SH_ETH_REG_GIGABIT, + .ecsr_value = ECSR_ICD | ECSR_MPD, .ecsipr_value = ECSIPR_LCHNGIP | ECSIPR_ICDIP | ECSIPR_MPDIP, .eesipr_value = DMAC_M_RFRMER | DMAC_M_ECI | 0x003fffff, @@ -599,6 +609,8 @@ static struct sh_eth_cpu_data sh7734_data = { .set_duplex = sh_eth_set_duplex, .set_rate = sh_eth_set_rate_gether, + .register_type = SH_ETH_REG_GIGABIT, + .ecsr_value = ECSR_ICD | ECSR_MPD, .ecsipr_value = ECSIPR_LCHNGIP | ECSIPR_ICDIP | ECSIPR_MPDIP, .eesipr_value = DMAC_M_RFRMER | DMAC_M_ECI | 0x003fffff, @@ -626,6 +638,8 @@ static struct sh_eth_cpu_data sh7763_data = { .set_duplex = sh_eth_set_duplex, .set_rate = sh_eth_set_rate_gether, + .register_type = SH_ETH_REG_GIGABIT, + .ecsr_value = ECSR_ICD | ECSR_MPD, .ecsipr_value = ECSIPR_LCHNGIP | ECSIPR_ICDIP | ECSIPR_MPDIP, .eesipr_value = DMAC_M_RFRMER | DMAC_M_ECI | 0x003fffff, @@ -663,6 +677,8 @@ static struct sh_eth_cpu_data r8a7740_data = { .set_duplex = sh_eth_set_duplex, .set_rate = sh_eth_set_rate_gether, + .register_type = SH_ETH_REG_GIGABIT, + .ecsr_value = ECSR_ICD | ECSR_MPD, .ecsipr_value = ECSIPR_LCHNGIP | ECSIPR_ICDIP | ECSIPR_MPDIP, .eesipr_value = DMAC_M_RFRMER | DMAC_M_ECI | 0x003fffff, @@ -685,6 +701,8 @@ static struct sh_eth_cpu_data r8a7740_data = { }; static struct sh_eth_cpu_data sh7619_data = { + .register_type = SH_ETH_REG_FAST_SH3_SH2, + .eesipr_value = DMAC_M_RFRMER | DMAC_M_ECI | 0x003fffff, .apr = 1, @@ -694,6 +712,8 @@ static struct sh_eth_cpu_data 
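+/*
+ * register_type now lives in the per-SoC cpu_data rather than in platform
+ * data, so the register layout follows the matched device id; probe
+ * (below) derives reg_offset from mdp->cd->register_type only after the
+ * cpu data pointer has been set.
+ */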
sh7619_data = { }; static struct sh_eth_cpu_data sh771x_data = { + .register_type = SH_ETH_REG_FAST_SH3_SH2, + .eesipr_value = DMAC_M_RFRMER | DMAC_M_ECI | 0x003fffff, .tsu = 1, }; @@ -2643,10 +2663,10 @@ static int sh_eth_drv_probe(struct platform_device *pdev) mdp->edmac_endian = pd->edmac_endian; mdp->no_ether_link = pd->no_ether_link; mdp->ether_link_active_low = pd->ether_link_active_low; - mdp->reg_offset = sh_eth_get_register_offset(pd->register_type); /* set cpu data */ mdp->cd = (struct sh_eth_cpu_data *)id->driver_data; + mdp->reg_offset = sh_eth_get_register_offset(mdp->cd->register_type); sh_eth_set_default_cpu_data(mdp->cd); /* set function */ diff --git a/drivers/net/ethernet/renesas/sh_eth.h b/drivers/net/ethernet/renesas/sh_eth.h index da93f5c..a0db02c 100644 --- a/drivers/net/ethernet/renesas/sh_eth.h +++ b/drivers/net/ethernet/renesas/sh_eth.h @@ -157,6 +157,13 @@ enum { SH_ETH_MAX_REGISTER_OFFSET, }; +enum { + SH_ETH_REG_GIGABIT, + SH_ETH_REG_FAST_RCAR, + SH_ETH_REG_FAST_SH4, + SH_ETH_REG_FAST_SH3_SH2 +}; + /* Driver's parameters */ #if defined(CONFIG_CPU_SH4) || defined(CONFIG_ARCH_SHMOBILE) #define SH4_SKB_RX_ALIGN 32 @@ -454,6 +461,7 @@ struct sh_eth_cpu_data { void (*set_rate)(struct net_device *ndev); /* mandatory initialize value */ + int register_type; unsigned long eesipr_value; /* optional initialize value */ diff --git a/drivers/net/ethernet/sis/sis190.c b/drivers/net/ethernet/sis/sis190.c index 02df089..ee18e6f 100644 --- a/drivers/net/ethernet/sis/sis190.c +++ b/drivers/net/ethernet/sis/sis190.c @@ -1770,9 +1770,6 @@ static void sis190_get_regs(struct net_device *dev, struct ethtool_regs *regs, struct sis190_private *tp = netdev_priv(dev); unsigned long flags; - if (regs->len > SIS190_REGS_SIZE) - regs->len = SIS190_REGS_SIZE; - spin_lock_irqsave(&tp->lock, flags); memcpy_fromio(p, tp->mmio_addr, regs->len); spin_unlock_irqrestore(&tp->lock, flags); diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h index c922fde..f16a9bd 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h @@ -70,7 +70,6 @@ struct stmmac_priv { struct net_device *dev; struct device *device; struct mac_device_info *hw; - int no_csum_insertion; spinlock_t lock; struct phy_device *phydev ____cacheline_aligned_in_smp; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 0a9bb9d..be40691 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -1224,8 +1224,7 @@ static void free_dma_desc_resources(struct stmmac_priv *priv) */ static void stmmac_dma_operation_mode(struct stmmac_priv *priv) { - if (likely(priv->plat->force_sf_dma_mode || - ((priv->plat->tx_coe) && (!priv->no_csum_insertion)))) { + if (priv->plat->force_sf_dma_mode || priv->plat->tx_coe) { /* * In case of GMAC, SF mode can be enabled * to perform the TX COE in HW. 
This depends on: diff --git a/drivers/net/ethernet/sun/sunbmac.c b/drivers/net/ethernet/sun/sunbmac.c index 0d43fa9..7217ee5 100644 --- a/drivers/net/ethernet/sun/sunbmac.c +++ b/drivers/net/ethernet/sun/sunbmac.c @@ -1239,7 +1239,7 @@ static int bigmac_sbus_probe(struct platform_device *op) static int bigmac_sbus_remove(struct platform_device *op) { - struct bigmac *bp = dev_get_drvdata(&op->dev); + struct bigmac *bp = platform_get_drvdata(op); struct device *parent = op->dev.parent; struct net_device *net_dev = bp->dev; struct platform_device *qec_op; @@ -1259,8 +1259,6 @@ static int bigmac_sbus_remove(struct platform_device *op) free_netdev(net_dev); - dev_set_drvdata(&op->dev, NULL); - return 0; } diff --git a/drivers/net/ethernet/sun/sunhme.c b/drivers/net/ethernet/sun/sunhme.c index 171f5b0..c67e683 100644 --- a/drivers/net/ethernet/sun/sunhme.c +++ b/drivers/net/ethernet/sun/sunhme.c @@ -3231,7 +3231,7 @@ static int hme_sbus_probe(struct platform_device *op) static int hme_sbus_remove(struct platform_device *op) { - struct happy_meal *hp = dev_get_drvdata(&op->dev); + struct happy_meal *hp = platform_get_drvdata(op); struct net_device *net_dev = hp->dev; unregister_netdev(net_dev); @@ -3250,8 +3250,6 @@ static int hme_sbus_remove(struct platform_device *op) free_netdev(net_dev); - dev_set_drvdata(&op->dev, NULL); - return 0; } diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index 0fcf212..79974e3 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -34,9 +34,9 @@ #include <linux/of_device.h> #include <linux/if_vlan.h> -#include <linux/platform_data/cpsw.h> #include <linux/pinctrl/consumer.h> +#include "cpsw.h" #include "cpsw_ale.h" #include "cpts.h" #include "davinci_cpdma.h" @@ -1640,6 +1640,29 @@ static int cpsw_set_settings(struct net_device *ndev, struct ethtool_cmd *ecmd) return -EOPNOTSUPP; } +static void cpsw_get_wol(struct net_device *ndev, struct ethtool_wolinfo *wol) +{ + struct cpsw_priv *priv = netdev_priv(ndev); + int slave_no = cpsw_slave_index(priv); + + wol->supported = 0; + wol->wolopts = 0; + + if (priv->slaves[slave_no].phy) + phy_ethtool_get_wol(priv->slaves[slave_no].phy, wol); +} + +static int cpsw_set_wol(struct net_device *ndev, struct ethtool_wolinfo *wol) +{ + struct cpsw_priv *priv = netdev_priv(ndev); + int slave_no = cpsw_slave_index(priv); + + if (priv->slaves[slave_no].phy) + return phy_ethtool_set_wol(priv->slaves[slave_no].phy, wol); + else + return -EOPNOTSUPP; +} + static const struct ethtool_ops cpsw_ethtool_ops = { .get_drvinfo = cpsw_get_drvinfo, .get_msglevel = cpsw_get_msglevel, @@ -1653,6 +1676,8 @@ static const struct ethtool_ops cpsw_ethtool_ops = { .get_sset_count = cpsw_get_sset_count, .get_strings = cpsw_get_strings, .get_ethtool_stats = cpsw_get_ethtool_stats, + .get_wol = cpsw_get_wol, + .set_wol = cpsw_set_wol, }; static void cpsw_slave_init(struct cpsw_slave *slave, struct cpsw_priv *priv, diff --git a/include/linux/platform_data/cpsw.h b/drivers/net/ethernet/ti/cpsw.h index bb3cd58..eb3e101 100644 --- a/include/linux/platform_data/cpsw.h +++ b/drivers/net/ethernet/ti/cpsw.h @@ -1,11 +1,10 @@ -/* - * Texas Instruments Ethernet Switch Driver +/* Texas Instruments Ethernet Switch Driver * - * Copyright (C) 2012 Texas Instruments + * Copyright (C) 2013 Texas Instruments * * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation version 2. 
+ * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. * * This program is distributed "as is" WITHOUT ANY WARRANTY of any * kind, whether express or implied; without even the implied warranty @@ -22,14 +21,13 @@ struct cpsw_slave_data { int phy_if; u8 mac_addr[ETH_ALEN]; u16 dual_emac_res_vlan; /* Reserved VLAN for DualEMAC */ - }; struct cpsw_platform_data { + struct cpsw_slave_data *slave_data; u32 ss_reg_ofs; /* Subsystem control register offset */ u32 channels; /* number of cpdma channels (symmetric) */ u32 slaves; /* number of slave cpgmac ports */ - struct cpsw_slave_data *slave_data; u32 active_slave; /* time stamping, ethtool and SIOCGMIIPHY slave */ u32 cpts_clock_mult; /* convert input clock ticks to nanoseconds */ u32 cpts_clock_shift; /* convert input clock ticks to nanoseconds */ diff --git a/drivers/net/ethernet/ti/davinci_cpdma.c b/drivers/net/ethernet/ti/davinci_cpdma.c index 031ebc8..90a7946 100644 --- a/drivers/net/ethernet/ti/davinci_cpdma.c +++ b/drivers/net/ethernet/ti/davinci_cpdma.c @@ -591,6 +591,7 @@ int cpdma_chan_get_stats(struct cpdma_chan *chan, spin_unlock_irqrestore(&chan->lock, flags); return 0; } +EXPORT_SYMBOL_GPL(cpdma_chan_get_stats); int cpdma_chan_dump(struct cpdma_chan *chan) { diff --git a/drivers/net/ethernet/ti/davinci_mdio.c b/drivers/net/ethernet/ti/davinci_mdio.c index 16ddfc3..7f85143 100644 --- a/drivers/net/ethernet/ti/davinci_mdio.c +++ b/drivers/net/ethernet/ti/davinci_mdio.c @@ -421,8 +421,7 @@ bail_out: static int davinci_mdio_remove(struct platform_device *pdev) { - struct device *dev = &pdev->dev; - struct davinci_mdio_data *data = dev_get_drvdata(dev); + struct davinci_mdio_data *data = platform_get_drvdata(pdev); if (data->bus) { mdiobus_unregister(data->bus); @@ -434,8 +433,6 @@ static int davinci_mdio_remove(struct platform_device *pdev) pm_runtime_put_sync(&pdev->dev); pm_runtime_disable(&pdev->dev); - dev_set_drvdata(dev, NULL); - kfree(data); return 0; diff --git a/drivers/net/ethernet/xilinx/xilinx_emaclite.c b/drivers/net/ethernet/xilinx/xilinx_emaclite.c index fd4dbda..4c619ea 100644 --- a/drivers/net/ethernet/xilinx/xilinx_emaclite.c +++ b/drivers/net/ethernet/xilinx/xilinx_emaclite.c @@ -1230,8 +1230,7 @@ error: */ static int xemaclite_of_remove(struct platform_device *of_dev) { - struct device *dev = &of_dev->dev; - struct net_device *ndev = dev_get_drvdata(dev); + struct net_device *ndev = platform_get_drvdata(of_dev); struct net_local *lp = netdev_priv(ndev); @@ -1250,7 +1249,6 @@ static int xemaclite_of_remove(struct platform_device *of_dev) lp->phy_node = NULL; xemaclite_remove_ndev(ndev, of_dev); - dev_set_drvdata(dev, NULL); return 0; } diff --git a/drivers/net/irda/pxaficp_ir.c b/drivers/net/irda/pxaficp_ir.c index 964b116..3eeaaf8 100644 --- a/drivers/net/irda/pxaficp_ir.c +++ b/drivers/net/irda/pxaficp_ir.c @@ -915,7 +915,7 @@ static int pxa_irda_probe(struct platform_device *pdev) err = register_netdev(dev); if (err == 0) - dev_set_drvdata(&pdev->dev, dev); + platform_set_drvdata(pdev, dev); if (err) { if (si->pdata->shutdown) diff --git a/drivers/net/irda/via-ircc.c b/drivers/net/irda/via-ircc.c index 51f2bc3..2dcc60f 100644 --- a/drivers/net/irda/via-ircc.c +++ b/drivers/net/irda/via-ircc.c @@ -210,8 +210,7 @@ static int via_init_one(struct pci_dev *pcidev, const struct pci_device_id *id) pci_write_config_byte(pcidev,0x42,(bTmp | 0xf0)); pci_write_config_byte(pcidev,0x5a,0xc0); WriteLPCReg(0x28, 0x70 ); - if (via_ircc_open(pcidev, &info, 
0x3076) == 0) - rc=0; + rc = via_ircc_open(pcidev, &info, 0x3076); } else rc = -ENODEV; //IR not turn on } else { //Not VT1211 @@ -249,8 +248,7 @@ static int via_init_one(struct pci_dev *pcidev, const struct pci_device_id *id) info.irq=FirIRQ; info.dma=FirDRQ1; info.dma2=FirDRQ0; - if (via_ircc_open(pcidev, &info, 0x3096) == 0) - rc=0; + rc = via_ircc_open(pcidev, &info, 0x3096); } else rc = -ENODEV; //IR not turn on !!!!! }//Not VT1211 diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index 1c6e111..9dccb1e 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -68,6 +68,8 @@ static const struct proto_ops macvtap_socket_ops; #define TUN_OFFLOADS (NETIF_F_HW_CSUM | NETIF_F_TSO_ECN | NETIF_F_TSO | \ NETIF_F_TSO6 | NETIF_F_UFO) #define RX_OFFLOADS (NETIF_F_GRO | NETIF_F_LRO) +#define TAP_FEATURES (NETIF_F_GSO | NETIF_F_SG) + /* * RCU usage: * The macvtap_queue and the macvlan_dev are loosely coupled, the @@ -278,7 +280,8 @@ static int macvtap_forward(struct net_device *dev, struct sk_buff *skb) { struct macvlan_dev *vlan = netdev_priv(dev); struct macvtap_queue *q = macvtap_get_queue(dev, skb); - netdev_features_t features; + netdev_features_t features = TAP_FEATURES; + if (!q) goto drop; @@ -287,9 +290,11 @@ static int macvtap_forward(struct net_device *dev, struct sk_buff *skb) skb->dev = dev; /* Apply the forward feature mask so that we perform segmentation - * according to users wishes. + * according to the user's wishes. This only works if VNET_HDR is + * enabled. */ - features = netif_skb_features(skb) & vlan->tap_features; + if (q->flags & IFF_VNET_HDR) + features |= vlan->tap_features; if (netif_needs_gso(skb, features)) { struct sk_buff *segs = __skb_gso_segment(skb, features, false); @@ -961,8 +966,7 @@ static int set_offload(struct macvtap_queue *q, unsigned long arg) /* tap_features are the same as features on tun/tap and * reflect user expectations.
*/ - vlan->tap_features = vlan->dev->features & - (feature_mask | ~TUN_OFFLOADS); + vlan->tap_features = feature_mask; vlan->set_features = features; netdev_update_features(vlan->dev); @@ -1058,10 +1062,6 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd, TUN_F_TSO_ECN | TUN_F_UFO)) return -EINVAL; - /* TODO: only accept frames with the features that - got enabled for forwarded frames */ - if (!(q->flags & IFF_VNET_HDR)) - return -EINVAL; rtnl_lock(); ret = set_offload(q, arg); rtnl_unlock(); diff --git a/drivers/net/phy/mdio-octeon.c b/drivers/net/phy/mdio-octeon.c index b51fa1f..7f18f80 100644 --- a/drivers/net/phy/mdio-octeon.c +++ b/drivers/net/phy/mdio-octeon.c @@ -222,7 +222,7 @@ static int octeon_mdiobus_probe(struct platform_device *pdev) bus->mii_bus->read = octeon_mdiobus_read; bus->mii_bus->write = octeon_mdiobus_write; - dev_set_drvdata(&pdev->dev, bus); + platform_set_drvdata(pdev, bus); err = of_mdiobus_register(bus->mii_bus, pdev->dev.of_node); if (err) diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index 9ca4945..c31aad0 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -25,6 +25,7 @@ #include <linux/module.h> #include <linux/phy.h> #include <linux/micrel_phy.h> +#include <linux/of.h> /* Operation Mode Strap Override */ #define MII_KSZPHY_OMSO 0x16 @@ -53,6 +54,20 @@ #define KS8737_CTRL_INT_ACTIVE_HIGH (1 << 14) #define KSZ8051_RMII_50MHZ_CLK (1 << 7) +/* Write/read to/from extended registers */ +#define MII_KSZPHY_EXTREG 0x0b +#define KSZPHY_EXTREG_WRITE 0x8000 + +#define MII_KSZPHY_EXTREG_WRITE 0x0c +#define MII_KSZPHY_EXTREG_READ 0x0d + +/* Extended registers */ +#define MII_KSZPHY_CLK_CONTROL_PAD_SKEW 0x104 +#define MII_KSZPHY_RX_DATA_PAD_SKEW 0x105 +#define MII_KSZPHY_TX_DATA_PAD_SKEW 0x106 + +#define PS_TO_REG 200 + static int ksz_config_flags(struct phy_device *phydev) { int regval; @@ -65,6 +80,20 @@ static int ksz_config_flags(struct phy_device *phydev) return 0; } +static int kszphy_extended_write(struct phy_device *phydev, + u32 regnum, u16 val) +{ + phy_write(phydev, MII_KSZPHY_EXTREG, KSZPHY_EXTREG_WRITE | regnum); + return phy_write(phydev, MII_KSZPHY_EXTREG_WRITE, val); +} + +static int kszphy_extended_read(struct phy_device *phydev, + u32 regnum) +{ + phy_write(phydev, MII_KSZPHY_EXTREG, regnum); + return phy_read(phydev, MII_KSZPHY_EXTREG_READ); +} + static int kszphy_ack_interrupt(struct phy_device *phydev) { /* bit[7..0] int status, which is a read and clear register. */ @@ -141,6 +170,78 @@ static int ks8051_config_init(struct phy_device *phydev) return rc < 0 ? 
rc : 0; } +static int ksz9021_load_values_from_of(struct phy_device *phydev, + struct device_node *of_node, u16 reg, + char *field1, char *field2, + char *field3, char *field4) +{ + int val1 = -1; + int val2 = -2; + int val3 = -3; + int val4 = -4; + int newval; + int matches = 0; + + if (!of_property_read_u32(of_node, field1, &val1)) + matches++; + + if (!of_property_read_u32(of_node, field2, &val2)) + matches++; + + if (!of_property_read_u32(of_node, field3, &val3)) + matches++; + + if (!of_property_read_u32(of_node, field4, &val4)) + matches++; + + if (!matches) + return 0; + + if (matches < 4) + newval = kszphy_extended_read(phydev, reg); + else + newval = 0; + + if (val1 != -1) + newval = ((newval & 0xfff0) | ((val1 / PS_TO_REG) & 0xf) << 0); + + if (val2 != -2) + newval = ((newval & 0xff0f) | ((val2 / PS_TO_REG) & 0xf) << 4); + + if (val3 != -3) + newval = ((newval & 0xf0ff) | ((val3 / PS_TO_REG) & 0xf) << 8); + + if (val4 != -4) + newval = ((newval & 0x0fff) | ((val4 / PS_TO_REG) & 0xf) << 12); + + return kszphy_extended_write(phydev, reg, newval); +} + +static int ksz9021_config_init(struct phy_device *phydev) +{ + struct device *dev = &phydev->dev; + struct device_node *of_node = dev->of_node; + + if (!of_node && dev->parent->of_node) + of_node = dev->parent->of_node; + + if (of_node) { + ksz9021_load_values_from_of(phydev, of_node, + MII_KSZPHY_CLK_CONTROL_PAD_SKEW, + "txen-skew-ps", "txc-skew-ps", + "rxdv-skew-ps", "rxc-skew-ps"); + ksz9021_load_values_from_of(phydev, of_node, + MII_KSZPHY_RX_DATA_PAD_SKEW, + "rxd0-skew-ps", "rxd1-skew-ps", + "rxd2-skew-ps", "rxd3-skew-ps"); + ksz9021_load_values_from_of(phydev, of_node, + MII_KSZPHY_TX_DATA_PAD_SKEW, + "txd0-skew-ps", "txd1-skew-ps", + "txd2-skew-ps", "txd3-skew-ps"); + } + return 0; +} + #define KSZ8873MLL_GLOBAL_CONTROL_4 0x06 #define KSZ8873MLL_GLOBAL_CONTROL_4_DUPLEX (1 << 6) #define KSZ8873MLL_GLOBAL_CONTROL_4_SPEED (1 << 4) @@ -281,7 +382,7 @@ static struct phy_driver ksphy_driver[] = { .name = "Micrel KSZ9021 Gigabit PHY", .features = (PHY_GBIT_FEATURES | SUPPORTED_Pause), .flags = PHY_HAS_MAGICANEG | PHY_HAS_INTERRUPT, - .config_init = kszphy_config_init, + .config_init = ksz9021_config_init, .config_aneg = genphy_config_aneg, .read_status = genphy_read_status, .ack_interrupt = kszphy_ack_interrupt,
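
The skew handling above packs each device-tree value into a 4-bit field of one 16-bit extended register, in steps of 200 ps (PS_TO_REG); properties absent from the device tree leave the corresponding field at the value read back from the PHY. A minimal standalone sketch of the same packing (the helper name and sample values are made up for illustration, not part of the driver):

	#define PS_TO_REG	200

	/* Pack up to four skew values (picoseconds, 0-3000) into one
	 * 16-bit pad-skew register, 4 bits per pad; a negative value
	 * means "not given in the DT", so the old field is kept. */
	static u16 ksz9021_pack_skew(u16 old, int v0, int v1, int v2, int v3)
	{
		u16 reg = old;

		if (v0 >= 0)
			reg = (reg & 0xfff0) | (((v0 / PS_TO_REG) & 0xf) << 0);
		if (v1 >= 0)
			reg = (reg & 0xff0f) | (((v1 / PS_TO_REG) & 0xf) << 4);
		if (v2 >= 0)
			reg = (reg & 0xf0ff) | (((v2 / PS_TO_REG) & 0xf) << 8);
		if (v3 >= 0)
			reg = (reg & 0x0fff) | (((v3 / PS_TO_REG) & 0xf) << 12);
		return reg;
	}

	/* e.g. txen = 0 ps, txc = 3000 ps, rxdv = 0 ps, rxc = 3000 ps
	 * yields 0xf0f0 for the clock control pad skew register. */
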
diff --git a/drivers/net/phy/realtek.c b/drivers/net/phy/realtek.c index 8e7af83..138de83 100644 --- a/drivers/net/phy/realtek.c +++ b/drivers/net/phy/realtek.c @@ -23,7 +23,7 @@ #define RTL821x_INER_INIT 0x6400 #define RTL821x_INSR 0x13 -#define RTL8211E_INER_LINK_STAT 0x10 +#define RTL8211E_INER_LINK_STATUS 0x400 MODULE_DESCRIPTION("Realtek PHY driver"); MODULE_AUTHOR("Johnson Leung"); @@ -57,7 +57,7 @@ static int rtl8211e_config_intr(struct phy_device *phydev) if (phydev->interrupts == PHY_INTERRUPT_ENABLED) err = phy_write(phydev, RTL821x_INER, - RTL8211E_INER_LINK_STAT); + RTL8211E_INER_LINK_STATUS); else err = phy_write(phydev, RTL821x_INER, 0); diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 7ed13cc..60a1e93 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -138,7 +138,10 @@ struct tun_file { struct fasync_struct *fasync; /* only used for fasync */ unsigned int flags; - u16 queue_index; + union { + u16 queue_index; + unsigned int ifindex; + }; struct list_head next; struct tun_struct *detached; }; @@ -498,7 +501,7 @@ static void tun_detach_all(struct net_device *dev) module_put(THIS_MODULE); } -static int tun_attach(struct tun_struct *tun, struct file *file) +static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filter) { struct tun_file *tfile = file->private_data; int err; @@ -523,7 +526,7 @@ static int tun_attach(struct tun_struct *tun, struct file *file) err = 0; /* Re-attach the filter to persist device */ - if (tun->filter_attached == true) { + if (!skip_filter && (tun->filter_attached == true)) { err = sk_attach_filter(&tun->fprog, tfile->socket.sk); if (!err) goto out; @@ -1554,7 +1557,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) if (err < 0) return err; - err = tun_attach(tun, file); + err = tun_attach(tun, file, ifr->ifr_flags & IFF_NOFILTER); if (err < 0) return err; @@ -1601,6 +1604,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) dev_net_set(dev, net); dev->rtnl_link_ops = &tun_link_ops; + dev->ifindex = tfile->ifindex; tun = netdev_priv(dev); tun->dev = dev; @@ -1627,7 +1631,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) dev->vlan_features = dev->features; INIT_LIST_HEAD(&tun->disabled); - err = tun_attach(tun, file); + err = tun_attach(tun, file, false); if (err < 0) goto err_free_dev; @@ -1791,7 +1795,7 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr) ret = security_tun_dev_attach_queue(tun->security); if (ret < 0) goto unlock; - ret = tun_attach(tun, file); + ret = tun_attach(tun, file, false); } else if (ifr->ifr_flags & IFF_DETACH_QUEUE) { tun = rtnl_dereference(tfile->tun); if (!tun || !(tun->flags & TUN_TAP_MQ) || tfile->detached) @@ -1817,6 +1821,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, kgid_t group; int sndbuf; int vnet_hdr_sz; + unsigned int ifindex; int ret; if (cmd == TUNSETIFF || cmd == TUNSETQUEUE || _IOC_TYPE(cmd) == 0x89) { @@ -1851,6 +1856,19 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, ret = -EFAULT; goto unlock; } + if (cmd == TUNSETIFINDEX) { + ret = -EPERM; + if (tun) + goto unlock; + + ret = -EFAULT; + if (copy_from_user(&ifindex, argp, sizeof(ifindex))) + goto unlock; + + ret = 0; + tfile->ifindex = ifindex; + goto unlock; + } ret = -EBADFD; if (!tun) @@ -1863,6 +1881,11 @@ case TUNGETIFF: tun_get_iff(current->nsproxy->net_ns, tun, &ifr); + if (tfile->detached) + ifr.ifr_flags |= IFF_DETACH_QUEUE; + if (!tfile->socket.sk->sk_filter) + ifr.ifr_flags |= IFF_NOFILTER; + if (copy_to_user(argp, &ifr, ifreq_len)) ret = -EFAULT; break; @@ -2019,6 +2042,16 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, tun_detach_filter(tun, tun->numqueues); break; + case TUNGETFILTER: + ret = -EINVAL; + if ((tun->flags & TUN_TYPE_MASK) != TUN_TAP_DEV) + break; + ret = -EFAULT; + if (copy_to_user(argp, &tun->fprog, sizeof(tun->fprog))) + break; + ret = 0; + break; + default: ret = -EINVAL; break; } @@ -2099,6 +2132,7 @@ static int tun_chr_open(struct inode *inode, struct file * file) rcu_assign_pointer(tfile->tun, NULL); tfile->net = get_net(current->nsproxy->net_ns); tfile->flags = 0; + tfile->ifindex = 0; rcu_assign_pointer(tfile->socket.wq, &tfile->wq); init_waitqueue_head(&tfile->wq.wait);
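
TUNSETIFINDEX above is only honoured on a file descriptor that is not yet attached to a device (note the -EPERM path when tun is already set), so it has to be issued before TUNSETIFF. A hedged userspace sketch, with error handling omitted and the device name made up:

	#include <fcntl.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/if.h>
	#include <linux/if_tun.h>

	/* Sketch: create a tap device with a caller-chosen ifindex.
	 * TUNSETIFINDEX must come before TUNSETIFF; "tap0" is a
	 * hypothetical name. */
	static int tap_open_with_ifindex(unsigned int ifindex)
	{
		struct ifreq ifr;
		int fd = open("/dev/net/tun", O_RDWR);

		if (fd < 0)
			return -1;
		ioctl(fd, TUNSETIFINDEX, &ifindex);

		memset(&ifr, 0, sizeof(ifr));
		ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
		strncpy(ifr.ifr_name, "tap0", IFNAMSIZ);
		ioctl(fd, TUNSETIFF, &ifr);
		return fd;
	}
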
diff --git a/drivers/net/usb/hso.c b/drivers/net/usb/hso.c index cba1d46..86292e6 100644 --- a/drivers/net/usb/hso.c +++ b/drivers/net/usb/hso.c @@ -2816,13 +2816,16 @@ exit: static int hso_get_config_data(struct usb_interface *interface) { struct usb_device *usbdev = interface_to_usbdev(interface); - u8 config_data[17]; + u8 *config_data = kmalloc(17, GFP_KERNEL); u32 if_num = interface->altsetting->desc.bInterfaceNumber; s32 result; + if (!config_data) + return -ENOMEM; if (usb_control_msg(usbdev, usb_rcvctrlpipe(usbdev, 0), 0x86, 0xC0, 0, 0, config_data, 17, USB_CTRL_SET_TIMEOUT) != 0x11) { + kfree(config_data); return -EIO; } @@ -2873,6 +2876,7 @@ static int hso_get_config_data(struct usb_interface *interface) if (config_data[16] & 0x1) result |= HSO_INFO_CRC_BUG; + kfree(config_data); return result; } @@ -2886,6 +2890,11 @@ static int hso_probe(struct usb_interface *interface, struct hso_shared_int *shared_int; struct hso_device *tmp_dev = NULL; + if (interface->cur_altsetting->desc.bInterfaceClass != 0xFF) { + dev_err(&interface->dev, "Not our interface\n"); + return -ENODEV; + } + if_num = interface->altsetting->desc.bInterfaceNumber; /* Get the interface/port specification from either driver_info or from @@ -2895,10 +2904,6 @@ static int hso_probe(struct usb_interface *interface, else port_spec = hso_get_config_data(interface); - if (interface->cur_altsetting->desc.bInterfaceClass != 0xFF) { - dev_err(&interface->dev, "Not our interface\n"); - return -ENODEV; - } /* Check if we need to switch to alt interfaces prior to port * configuration */ if (interface->num_altsetting > 1) diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c index 55a62ca..7e2788c 100644 --- a/drivers/net/vmxnet3/vmxnet3_drv.c +++ b/drivers/net/vmxnet3/vmxnet3_drv.c @@ -313,10 +313,10 @@ vmxnet3_unmap_tx_buf(struct vmxnet3_tx_buf_info *tbi, struct pci_dev *pdev) { if (tbi->map_type == VMXNET3_MAP_SINGLE) - pci_unmap_single(pdev, tbi->dma_addr, tbi->len, + dma_unmap_single(&pdev->dev, tbi->dma_addr, tbi->len, PCI_DMA_TODEVICE); else if (tbi->map_type == VMXNET3_MAP_PAGE) - pci_unmap_page(pdev, tbi->dma_addr, tbi->len, + dma_unmap_page(&pdev->dev, tbi->dma_addr, tbi->len, PCI_DMA_TODEVICE); else BUG_ON(tbi->map_type != VMXNET3_MAP_NONE); @@ -429,25 +429,29 @@ vmxnet3_tq_destroy(struct vmxnet3_tx_queue *tq, struct vmxnet3_adapter *adapter) { if (tq->tx_ring.base) { - pci_free_consistent(adapter->pdev, tq->tx_ring.size * - sizeof(struct Vmxnet3_TxDesc), - tq->tx_ring.base, tq->tx_ring.basePA); + dma_free_coherent(&adapter->pdev->dev, tq->tx_ring.size * + sizeof(struct Vmxnet3_TxDesc), + tq->tx_ring.base, tq->tx_ring.basePA); tq->tx_ring.base = NULL; } if (tq->data_ring.base) { - pci_free_consistent(adapter->pdev, tq->data_ring.size * - sizeof(struct Vmxnet3_TxDataDesc), - tq->data_ring.base, tq->data_ring.basePA); + dma_free_coherent(&adapter->pdev->dev, tq->data_ring.size * + sizeof(struct Vmxnet3_TxDataDesc), + tq->data_ring.base, tq->data_ring.basePA); tq->data_ring.base = NULL; } if (tq->comp_ring.base) { - pci_free_consistent(adapter->pdev, tq->comp_ring.size * - sizeof(struct Vmxnet3_TxCompDesc), - tq->comp_ring.base, tq->comp_ring.basePA); + dma_free_coherent(&adapter->pdev->dev, tq->comp_ring.size * + sizeof(struct Vmxnet3_TxCompDesc), + tq->comp_ring.base, tq->comp_ring.basePA); tq->comp_ring.base = NULL; } - kfree(tq->buf_info); - tq->buf_info = NULL; + if (tq->buf_info) { + dma_free_coherent(&adapter->pdev->dev, + tq->tx_ring.size * sizeof(tq->buf_info[0]), + tq->buf_info, tq->buf_info_pa); + tq->buf_info = NULL; + } } @@ -496,37 +500,38 @@ static int vmxnet3_tq_create(struct vmxnet3_tx_queue *tq, struct vmxnet3_adapter *adapter) { + size_t sz; + BUG_ON(tq->tx_ring.base || tq->data_ring.base || tq->comp_ring.base || tq->buf_info); - tq->tx_ring.base = pci_alloc_consistent(adapter->pdev, tq->tx_ring.size - * sizeof(struct Vmxnet3_TxDesc), -
&tq->tx_ring.basePA); + tq->tx_ring.base = dma_alloc_coherent(&adapter->pdev->dev, + tq->tx_ring.size * sizeof(struct Vmxnet3_TxDesc), + &tq->tx_ring.basePA, GFP_KERNEL); if (!tq->tx_ring.base) { netdev_err(adapter->netdev, "failed to allocate tx ring\n"); goto err; } - tq->data_ring.base = pci_alloc_consistent(adapter->pdev, - tq->data_ring.size * - sizeof(struct Vmxnet3_TxDataDesc), - &tq->data_ring.basePA); + tq->data_ring.base = dma_alloc_coherent(&adapter->pdev->dev, + tq->data_ring.size * sizeof(struct Vmxnet3_TxDataDesc), + &tq->data_ring.basePA, GFP_KERNEL); if (!tq->data_ring.base) { netdev_err(adapter->netdev, "failed to allocate data ring\n"); goto err; } - tq->comp_ring.base = pci_alloc_consistent(adapter->pdev, - tq->comp_ring.size * - sizeof(struct Vmxnet3_TxCompDesc), - &tq->comp_ring.basePA); + tq->comp_ring.base = dma_alloc_coherent(&adapter->pdev->dev, + tq->comp_ring.size * sizeof(struct Vmxnet3_TxCompDesc), + &tq->comp_ring.basePA, GFP_KERNEL); if (!tq->comp_ring.base) { netdev_err(adapter->netdev, "failed to allocate tx comp ring\n"); goto err; } - tq->buf_info = kcalloc(tq->tx_ring.size, sizeof(tq->buf_info[0]), - GFP_KERNEL); + sz = tq->tx_ring.size * sizeof(tq->buf_info[0]); + tq->buf_info = dma_zalloc_coherent(&adapter->pdev->dev, sz, + &tq->buf_info_pa, GFP_KERNEL); if (!tq->buf_info) goto err; @@ -578,7 +583,8 @@ vmxnet3_rq_alloc_rx_buf(struct vmxnet3_rx_queue *rq, u32 ring_idx, break; } - rbi->dma_addr = pci_map_single(adapter->pdev, + rbi->dma_addr = dma_map_single( + &adapter->pdev->dev, rbi->skb->data, rbi->len, PCI_DMA_FROMDEVICE); } else { @@ -595,7 +601,8 @@ vmxnet3_rq_alloc_rx_buf(struct vmxnet3_rx_queue *rq, u32 ring_idx, rq->stats.rx_buf_alloc_failure++; break; } - rbi->dma_addr = pci_map_page(adapter->pdev, + rbi->dma_addr = dma_map_page( + &adapter->pdev->dev, rbi->page, 0, PAGE_SIZE, PCI_DMA_FROMDEVICE); } else { @@ -705,7 +712,7 @@ vmxnet3_map_pkt(struct sk_buff *skb, struct vmxnet3_tx_ctx *ctx, tbi = tq->buf_info + tq->tx_ring.next2fill; tbi->map_type = VMXNET3_MAP_SINGLE; - tbi->dma_addr = pci_map_single(adapter->pdev, + tbi->dma_addr = dma_map_single(&adapter->pdev->dev, skb->data + buf_offset, buf_size, PCI_DMA_TODEVICE); @@ -1221,7 +1228,8 @@ vmxnet3_rq_rx_complete(struct vmxnet3_rx_queue *rq, goto rcd_done; } - pci_unmap_single(adapter->pdev, rbi->dma_addr, rbi->len, + dma_unmap_single(&adapter->pdev->dev, rbi->dma_addr, + rbi->len, PCI_DMA_FROMDEVICE); #ifdef VMXNET3_RSS @@ -1233,7 +1241,7 @@ vmxnet3_rq_rx_complete(struct vmxnet3_rx_queue *rq, /* Immediate refill */ rbi->skb = new_skb; - rbi->dma_addr = pci_map_single(adapter->pdev, + rbi->dma_addr = dma_map_single(&adapter->pdev->dev, rbi->skb->data, rbi->len, PCI_DMA_FROMDEVICE); rxd->addr = cpu_to_le64(rbi->dma_addr); @@ -1267,7 +1275,7 @@ vmxnet3_rq_rx_complete(struct vmxnet3_rx_queue *rq, } if (rcd->len) { - pci_unmap_page(adapter->pdev, + dma_unmap_page(&adapter->pdev->dev, rbi->dma_addr, rbi->len, PCI_DMA_FROMDEVICE); @@ -1276,7 +1284,8 @@ vmxnet3_rq_rx_complete(struct vmxnet3_rx_queue *rq, /* Immediate refill */ rbi->page = new_page; - rbi->dma_addr = pci_map_page(adapter->pdev, rbi->page, + rbi->dma_addr = dma_map_page(&adapter->pdev->dev, + rbi->page, 0, PAGE_SIZE, PCI_DMA_FROMDEVICE); rxd->addr = cpu_to_le64(rbi->dma_addr); @@ -1352,13 +1361,13 @@ vmxnet3_rq_cleanup(struct vmxnet3_rx_queue *rq, if (rxd->btype == VMXNET3_RXD_BTYPE_HEAD && rq->buf_info[ring_idx][i].skb) { - pci_unmap_single(adapter->pdev, rxd->addr, + dma_unmap_single(&adapter->pdev->dev, rxd->addr, rxd->len, 
PCI_DMA_FROMDEVICE); dev_kfree_skb(rq->buf_info[ring_idx][i].skb); rq->buf_info[ring_idx][i].skb = NULL; } else if (rxd->btype == VMXNET3_RXD_BTYPE_BODY && rq->buf_info[ring_idx][i].page) { - pci_unmap_page(adapter->pdev, rxd->addr, + dma_unmap_page(&adapter->pdev->dev, rxd->addr, rxd->len, PCI_DMA_FROMDEVICE); put_page(rq->buf_info[ring_idx][i].page); rq->buf_info[ring_idx][i].page = NULL; @@ -1400,25 +1409,31 @@ static void vmxnet3_rq_destroy(struct vmxnet3_rx_queue *rq, } - kfree(rq->buf_info[0]); - for (i = 0; i < 2; i++) { if (rq->rx_ring[i].base) { - pci_free_consistent(adapter->pdev, rq->rx_ring[i].size - * sizeof(struct Vmxnet3_RxDesc), - rq->rx_ring[i].base, - rq->rx_ring[i].basePA); + dma_free_coherent(&adapter->pdev->dev, + rq->rx_ring[i].size + * sizeof(struct Vmxnet3_RxDesc), + rq->rx_ring[i].base, + rq->rx_ring[i].basePA); rq->rx_ring[i].base = NULL; } rq->buf_info[i] = NULL; } if (rq->comp_ring.base) { - pci_free_consistent(adapter->pdev, rq->comp_ring.size * - sizeof(struct Vmxnet3_RxCompDesc), - rq->comp_ring.base, rq->comp_ring.basePA); + dma_free_coherent(&adapter->pdev->dev, rq->comp_ring.size + * sizeof(struct Vmxnet3_RxCompDesc), + rq->comp_ring.base, rq->comp_ring.basePA); rq->comp_ring.base = NULL; } + + if (rq->buf_info[0]) { + size_t sz = sizeof(struct vmxnet3_rx_buf_info) * + (rq->rx_ring[0].size + rq->rx_ring[1].size); + dma_free_coherent(&adapter->pdev->dev, sz, rq->buf_info[0], + rq->buf_info_pa); + } } @@ -1503,8 +1518,10 @@ vmxnet3_rq_create(struct vmxnet3_rx_queue *rq, struct vmxnet3_adapter *adapter) for (i = 0; i < 2; i++) { sz = rq->rx_ring[i].size * sizeof(struct Vmxnet3_RxDesc); - rq->rx_ring[i].base = pci_alloc_consistent(adapter->pdev, sz, - &rq->rx_ring[i].basePA); + rq->rx_ring[i].base = dma_alloc_coherent( + &adapter->pdev->dev, sz, + &rq->rx_ring[i].basePA, + GFP_KERNEL); if (!rq->rx_ring[i].base) { netdev_err(adapter->netdev, "failed to allocate rx ring %d\n", i); @@ -1513,8 +1530,9 @@ vmxnet3_rq_create(struct vmxnet3_rx_queue *rq, struct vmxnet3_adapter *adapter) } sz = rq->comp_ring.size * sizeof(struct Vmxnet3_RxCompDesc); - rq->comp_ring.base = pci_alloc_consistent(adapter->pdev, sz, - &rq->comp_ring.basePA); + rq->comp_ring.base = dma_alloc_coherent(&adapter->pdev->dev, sz, + &rq->comp_ring.basePA, + GFP_KERNEL); if (!rq->comp_ring.base) { netdev_err(adapter->netdev, "failed to allocate rx comp ring\n"); goto err; @@ -1522,7 +1540,8 @@ vmxnet3_rq_create(struct vmxnet3_rx_queue *rq, struct vmxnet3_adapter *adapter) sz = sizeof(struct vmxnet3_rx_buf_info) * (rq->rx_ring[0].size + rq->rx_ring[1].size); - bi = kzalloc(sz, GFP_KERNEL); + bi = dma_zalloc_coherent(&adapter->pdev->dev, sz, &rq->buf_info_pa, + GFP_KERNEL); if (!bi) goto err; @@ -2005,6 +2024,7 @@ vmxnet3_set_mc(struct net_device *netdev) struct Vmxnet3_RxFilterConf *rxConf = &adapter->shared->devRead.rxFilterConf; u8 *new_table = NULL; + dma_addr_t new_table_pa = 0; u32 new_mode = VMXNET3_RXM_UCAST; if (netdev->flags & IFF_PROMISC) { @@ -2028,8 +2048,12 @@ vmxnet3_set_mc(struct net_device *netdev) new_mode |= VMXNET3_RXM_MCAST; rxConf->mfTableLen = cpu_to_le16( netdev_mc_count(netdev) * ETH_ALEN); - rxConf->mfTablePA = cpu_to_le64(virt_to_phys( - new_table)); + new_table_pa = dma_map_single( + &adapter->pdev->dev, + new_table, + rxConf->mfTableLen, + PCI_DMA_TODEVICE); + rxConf->mfTablePA = cpu_to_le64(new_table_pa); } else { netdev_info(netdev, "failed to copy mcast list" ", setting ALL_MULTI\n"); @@ -2056,7 +2080,11 @@ vmxnet3_set_mc(struct net_device *netdev) 
VMXNET3_CMD_UPDATE_MAC_FILTERS); spin_unlock_irqrestore(&adapter->cmd_lock, flags); - kfree(new_table); + if (new_table) { + dma_unmap_single(&adapter->pdev->dev, new_table_pa, + rxConf->mfTableLen, PCI_DMA_TODEVICE); + kfree(new_table); + } } void @@ -2096,7 +2124,7 @@ vmxnet3_setup_driver_shared(struct vmxnet3_adapter *adapter) devRead->misc.driverInfo.vmxnet3RevSpt = cpu_to_le32(1); devRead->misc.driverInfo.uptVerSpt = cpu_to_le32(1); - devRead->misc.ddPA = cpu_to_le64(virt_to_phys(adapter)); + devRead->misc.ddPA = cpu_to_le64(adapter->adapter_pa); devRead->misc.ddLen = cpu_to_le32(sizeof(struct vmxnet3_adapter)); /* set up feature flags */ @@ -2125,7 +2153,7 @@ vmxnet3_setup_driver_shared(struct vmxnet3_adapter *adapter) tqc->txRingBasePA = cpu_to_le64(tq->tx_ring.basePA); tqc->dataRingBasePA = cpu_to_le64(tq->data_ring.basePA); tqc->compRingBasePA = cpu_to_le64(tq->comp_ring.basePA); - tqc->ddPA = cpu_to_le64(virt_to_phys(tq->buf_info)); + tqc->ddPA = cpu_to_le64(tq->buf_info_pa); tqc->txRingSize = cpu_to_le32(tq->tx_ring.size); tqc->dataRingSize = cpu_to_le32(tq->data_ring.size); tqc->compRingSize = cpu_to_le32(tq->comp_ring.size); @@ -2143,8 +2171,7 @@ vmxnet3_setup_driver_shared(struct vmxnet3_adapter *adapter) rqc->rxRingBasePA[0] = cpu_to_le64(rq->rx_ring[0].basePA); rqc->rxRingBasePA[1] = cpu_to_le64(rq->rx_ring[1].basePA); rqc->compRingBasePA = cpu_to_le64(rq->comp_ring.basePA); - rqc->ddPA = cpu_to_le64(virt_to_phys( - rq->buf_info)); + rqc->ddPA = cpu_to_le64(rq->buf_info_pa); rqc->rxRingSize[0] = cpu_to_le32(rq->rx_ring[0].size); rqc->rxRingSize[1] = cpu_to_le32(rq->rx_ring[1].size); rqc->compRingSize = cpu_to_le32(rq->comp_ring.size); @@ -2184,8 +2211,9 @@ vmxnet3_setup_driver_shared(struct vmxnet3_adapter *adapter) i, adapter->num_rx_queues); devRead->rssConfDesc.confVer = 1; - devRead->rssConfDesc.confLen = sizeof(*rssConf); - devRead->rssConfDesc.confPA = virt_to_phys(rssConf); + devRead->rssConfDesc.confLen = cpu_to_le32(sizeof(*rssConf)); + devRead->rssConfDesc.confPA = + cpu_to_le64(adapter->rss_conf_pa); } #endif /* VMXNET3_RSS */ @@ -2948,9 +2976,13 @@ vmxnet3_probe_device(struct pci_dev *pdev, adapter->pdev = pdev; spin_lock_init(&adapter->cmd_lock); - adapter->shared = pci_alloc_consistent(adapter->pdev, - sizeof(struct Vmxnet3_DriverShared), - &adapter->shared_pa); + adapter->adapter_pa = dma_map_single(&adapter->pdev->dev, adapter, + sizeof(struct vmxnet3_adapter), + PCI_DMA_TODEVICE); + adapter->shared = dma_alloc_coherent( + &adapter->pdev->dev, + sizeof(struct Vmxnet3_DriverShared), + &adapter->shared_pa, GFP_KERNEL); if (!adapter->shared) { dev_err(&pdev->dev, "Failed to allocate memory\n"); err = -ENOMEM; @@ -2963,8 +2995,9 @@ vmxnet3_probe_device(struct pci_dev *pdev, size = sizeof(struct Vmxnet3_TxQueueDesc) * adapter->num_tx_queues; size += sizeof(struct Vmxnet3_RxQueueDesc) * adapter->num_rx_queues; - adapter->tqd_start = pci_alloc_consistent(adapter->pdev, size, - &adapter->queue_desc_pa); + adapter->tqd_start = dma_alloc_coherent(&adapter->pdev->dev, size, + &adapter->queue_desc_pa, + GFP_KERNEL); if (!adapter->tqd_start) { dev_err(&pdev->dev, "Failed to allocate memory\n"); @@ -2974,7 +3007,10 @@ vmxnet3_probe_device(struct pci_dev *pdev, adapter->rqd_start = (struct Vmxnet3_RxQueueDesc *)(adapter->tqd_start + adapter->num_tx_queues); - adapter->pm_conf = kmalloc(sizeof(struct Vmxnet3_PMConf), GFP_KERNEL); + adapter->pm_conf = dma_alloc_coherent(&adapter->pdev->dev, + sizeof(struct Vmxnet3_PMConf), + &adapter->pm_conf_pa, + GFP_KERNEL); if 
(adapter->pm_conf == NULL) { err = -ENOMEM; goto err_alloc_pm; } #ifdef VMXNET3_RSS - adapter->rss_conf = kmalloc(sizeof(struct UPT1_RSSConf), GFP_KERNEL); + adapter->rss_conf = dma_alloc_coherent(&adapter->pdev->dev, + sizeof(struct UPT1_RSSConf), + &adapter->rss_conf_pa, + GFP_KERNEL); if (adapter->rss_conf == NULL) { err = -ENOMEM; goto err_alloc_rss; } @@ -3077,17 +3116,22 @@ err_ver: vmxnet3_free_pci_resources(adapter); err_alloc_pci: #ifdef VMXNET3_RSS - kfree(adapter->rss_conf); + dma_free_coherent(&adapter->pdev->dev, sizeof(struct UPT1_RSSConf), + adapter->rss_conf, adapter->rss_conf_pa); err_alloc_rss: #endif - kfree(adapter->pm_conf); + dma_free_coherent(&adapter->pdev->dev, sizeof(struct Vmxnet3_PMConf), + adapter->pm_conf, adapter->pm_conf_pa); err_alloc_pm: - pci_free_consistent(adapter->pdev, size, adapter->tqd_start, - adapter->queue_desc_pa); + dma_free_coherent(&adapter->pdev->dev, size, adapter->tqd_start, + adapter->queue_desc_pa); err_alloc_queue_desc: - pci_free_consistent(adapter->pdev, sizeof(struct Vmxnet3_DriverShared), - adapter->shared, adapter->shared_pa); + dma_free_coherent(&adapter->pdev->dev, + sizeof(struct Vmxnet3_DriverShared), + adapter->shared, adapter->shared_pa); err_alloc_shared: + dma_unmap_single(&adapter->pdev->dev, adapter->adapter_pa, + sizeof(struct vmxnet3_adapter), PCI_DMA_TODEVICE); pci_set_drvdata(pdev, NULL); free_netdev(netdev); return err; @@ -3118,16 +3162,21 @@ vmxnet3_remove_device(struct pci_dev *pdev) vmxnet3_free_intr_resources(adapter); vmxnet3_free_pci_resources(adapter); #ifdef VMXNET3_RSS - kfree(adapter->rss_conf); + dma_free_coherent(&adapter->pdev->dev, sizeof(struct UPT1_RSSConf), + adapter->rss_conf, adapter->rss_conf_pa); #endif - kfree(adapter->pm_conf); + dma_free_coherent(&adapter->pdev->dev, sizeof(struct Vmxnet3_PMConf), + adapter->pm_conf, adapter->pm_conf_pa); size = sizeof(struct Vmxnet3_TxQueueDesc) * adapter->num_tx_queues; size += sizeof(struct Vmxnet3_RxQueueDesc) * num_rx_queues; - pci_free_consistent(adapter->pdev, size, adapter->tqd_start, - adapter->queue_desc_pa); - pci_free_consistent(adapter->pdev, sizeof(struct Vmxnet3_DriverShared), - adapter->shared, adapter->shared_pa); + dma_free_coherent(&adapter->pdev->dev, size, adapter->tqd_start, + adapter->queue_desc_pa); + dma_free_coherent(&adapter->pdev->dev, + sizeof(struct Vmxnet3_DriverShared), + adapter->shared, adapter->shared_pa); + dma_unmap_single(&adapter->pdev->dev, adapter->adapter_pa, + sizeof(struct vmxnet3_adapter), PCI_DMA_TODEVICE); free_netdev(netdev); } @@ -3227,8 +3276,8 @@ skip_arp: adapter->shared->devRead.pmConfDesc.confVer = cpu_to_le32(1); adapter->shared->devRead.pmConfDesc.confLen = cpu_to_le32(sizeof( *pmConf)); - adapter->shared->devRead.pmConfDesc.confPA = cpu_to_le64(virt_to_phys( - pmConf)); + adapter->shared->devRead.pmConfDesc.confPA = + cpu_to_le64(adapter->pm_conf_pa); spin_lock_irqsave(&adapter->cmd_lock, flags); VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD, @@ -3265,8 +3314,8 @@ vmxnet3_resume(struct device *device) adapter->shared->devRead.pmConfDesc.confVer = cpu_to_le32(1); adapter->shared->devRead.pmConfDesc.confLen = cpu_to_le32(sizeof( *pmConf)); - adapter->shared->devRead.pmConfDesc.confPA = cpu_to_le64(virt_to_phys( - pmConf)); + adapter->shared->devRead.pmConfDesc.confPA = + cpu_to_le64(adapter->pm_conf_pa); netif_device_attach(netdev); pci_set_power_state(pdev, PCI_D0);
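
The vmxnet3 changes above replace the legacy pci_alloc_consistent()/kmalloc()-plus-virt_to_phys() pattern with the generic DMA API, so every bus address handed to the device comes from dma_alloc_coherent() or dma_map_single() and can be translated by an IOMMU. A minimal sketch of the idiom (the structure below is a placeholder, not a vmxnet3 type):

	#include <linux/dma-mapping.h>
	#include <linux/pci.h>

	struct demo_shared {		/* placeholder device-visible struct */
		u32 status;
	};

	/* Allocate device-coherent memory and obtain its bus address in
	 * one step, instead of kmalloc() + virt_to_phys(), which bypasses
	 * the IOMMU and is not valid on all platforms. */
	static struct demo_shared *demo_alloc(struct pci_dev *pdev,
					      dma_addr_t *pa)
	{
		return dma_alloc_coherent(&pdev->dev,
					  sizeof(struct demo_shared),
					  pa, GFP_KERNEL);
	}

	static void demo_free(struct pci_dev *pdev, struct demo_shared *p,
			      dma_addr_t pa)
	{
		dma_free_coherent(&pdev->dev, sizeof(struct demo_shared),
				  p, pa);
	}
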
diff --git a/drivers/net/vmxnet3/vmxnet3_int.h b/drivers/net/vmxnet3/vmxnet3_int.h index 3541814..a03f358 100644 --- a/drivers/net/vmxnet3/vmxnet3_int.h +++ b/drivers/net/vmxnet3/vmxnet3_int.h @@ -70,10 +70,10 @@ /* * Version numbers */ -#define VMXNET3_DRIVER_VERSION_STRING "1.1.30.0-k" +#define VMXNET3_DRIVER_VERSION_STRING "1.2.0.0-k" /* a 32-bit int, each byte encodes a version number in VMXNET3_DRIVER_VERSION */ -#define VMXNET3_DRIVER_VERSION_NUM 0x01011E00 +#define VMXNET3_DRIVER_VERSION_NUM 0x01020000 #if defined(CONFIG_PCI_MSI) /* RSS only makes sense if MSI-X is supported. */ @@ -229,6 +229,7 @@ struct vmxnet3_tx_queue { spinlock_t tx_lock; struct vmxnet3_cmd_ring tx_ring; struct vmxnet3_tx_buf_info *buf_info; + dma_addr_t buf_info_pa; struct vmxnet3_tx_data_ring data_ring; struct vmxnet3_comp_ring comp_ring; struct Vmxnet3_TxQueueCtrl *shared; @@ -277,6 +278,7 @@ struct vmxnet3_rx_queue { u32 qid; /* rqID in RCD for buffer from 1st ring */ u32 qid2; /* rqID in RCD for buffer from 2nd ring */ struct vmxnet3_rx_buf_info *buf_info[2]; + dma_addr_t buf_info_pa; struct Vmxnet3_RxQueueCtrl *shared; struct vmxnet3_rq_driver_stats stats; } __attribute__((__aligned__(SMP_CACHE_BYTES))); @@ -353,6 +355,10 @@ struct vmxnet3_adapter { unsigned long state; /* VMXNET3_STATE_BIT_xxx */ int share_intr; + + dma_addr_t adapter_pa; + dma_addr_t pm_conf_pa; + dma_addr_t rss_conf_pa; }; #define VMXNET3_WRITE_BAR0_REG(adapter, reg, val) \ diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index b9401b5..3b21aca 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -542,12 +542,6 @@ static int vxlan_fdb_create(struct vxlan_dev *vxlan, return 0; } -static void vxlan_fdb_free_rdst(struct rcu_head *head) -{ - struct vxlan_rdst *rd = container_of(head, struct vxlan_rdst, rcu); - kfree(rd); -} - static void vxlan_fdb_free(struct rcu_head *head) { struct vxlan_fdb *f = container_of(head, struct vxlan_fdb, rcu); @@ -687,7 +681,7 @@ static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], */ if (rd && !list_is_singular(&f->remotes)) { list_del_rcu(&rd->list); - call_rcu(&rd->rcu, vxlan_fdb_free_rdst); + kfree_rcu(rd, rcu); goto out; } diff --git a/drivers/net/wireless/hostap/hostap_ioctl.c b/drivers/net/wireless/hostap/hostap_ioctl.c index ac07473..e509030 100644 --- a/drivers/net/wireless/hostap/hostap_ioctl.c +++ b/drivers/net/wireless/hostap/hostap_ioctl.c @@ -523,9 +523,9 @@ static int prism2_ioctl_giwaplist(struct net_device *dev, data->length = prism2_ap_get_sta_qual(local, addr, qual, IW_MAX_AP, 1); - memcpy(extra, &addr, sizeof(struct sockaddr) * data->length); + memcpy(extra, addr, sizeof(struct sockaddr) * data->length); data->flags = 1; /* has quality information */ - memcpy(extra + sizeof(struct sockaddr) * data->length, &qual, + memcpy(extra + sizeof(struct sockaddr) * data->length, qual, sizeof(struct iw_quality) * data->length); kfree(addr); diff --git a/drivers/net/wireless/iwlwifi/dvm/mac80211.c b/drivers/net/wireless/iwlwifi/dvm/mac80211.c index f0a2c95..cae4d31 100644 --- a/drivers/net/wireless/iwlwifi/dvm/mac80211.c +++ b/drivers/net/wireless/iwlwifi/dvm/mac80211.c @@ -1024,7 +1024,10 @@ void iwl_chswitch_done(struct iwl_priv *priv, bool is_success) if (test_bit(STATUS_EXIT_PENDING, &priv->status)) return; - if (test_and_clear_bit(STATUS_CHANNEL_SWITCH_PENDING, &priv->status)) + if (!test_and_clear_bit(STATUS_CHANNEL_SWITCH_PENDING, &priv->status)) + return; + + if (ctx->vif) ieee80211_chswitch_done(ctx->vif, is_success); } diff --git a/drivers/net/wireless/iwlwifi/iwl-prph.h
b/drivers/net/wireless/iwlwifi/iwl-prph.h index a70c7b9..ff8cc75 100644 --- a/drivers/net/wireless/iwlwifi/iwl-prph.h +++ b/drivers/net/wireless/iwlwifi/iwl-prph.h @@ -97,8 +97,6 @@ #define APMG_PCIDEV_STT_VAL_L1_ACT_DIS (0x00000800) -#define APMG_RTC_INT_STT_RFKILL (0x10000000) - /* Device system time */ #define DEVICE_SYSTEM_TIME_REG 0xA0206C diff --git a/drivers/net/wireless/iwlwifi/mvm/time-event.c b/drivers/net/wireless/iwlwifi/mvm/time-event.c index ad9bbca..7fd6fbf 100644 --- a/drivers/net/wireless/iwlwifi/mvm/time-event.c +++ b/drivers/net/wireless/iwlwifi/mvm/time-event.c @@ -138,6 +138,20 @@ static void iwl_mvm_roc_finished(struct iwl_mvm *mvm) schedule_work(&mvm->roc_done_wk); } +static bool iwl_mvm_te_check_disconnect(struct iwl_mvm *mvm, + struct ieee80211_vif *vif, + const char *errmsg) +{ + if (vif->type != NL80211_IFTYPE_STATION) + return false; + if (vif->bss_conf.assoc && vif->bss_conf.dtim_period) + return false; + if (errmsg) + IWL_ERR(mvm, "%s\n", errmsg); + ieee80211_connection_loss(vif); + return true; +} + /* * Handles a FW notification for an event that is known to the driver. * @@ -163,8 +177,13 @@ static void iwl_mvm_te_handle_notif(struct iwl_mvm *mvm, * P2P Device discoverability, while there are other higher priority * events in the system). */ - WARN_ONCE(!le32_to_cpu(notif->status), - "Failed to schedule time event\n"); + if (WARN_ONCE(!le32_to_cpu(notif->status), + "Failed to schedule time event\n")) { + if (iwl_mvm_te_check_disconnect(mvm, te_data->vif, NULL)) { + iwl_mvm_te_clear_data(mvm, te_data); + return; + } + } if (le32_to_cpu(notif->action) & TE_NOTIF_HOST_EVENT_END) { IWL_DEBUG_TE(mvm, @@ -180,14 +199,8 @@ static void iwl_mvm_te_handle_notif(struct iwl_mvm *mvm, * By now, we should have finished association * and know the dtim period. */ - if (te_data->vif->type == NL80211_IFTYPE_STATION && - (!te_data->vif->bss_conf.assoc || - !te_data->vif->bss_conf.dtim_period)) { - IWL_ERR(mvm, - "No assocation and the time event is over already...\n"); - ieee80211_connection_loss(te_data->vif); - } - + iwl_mvm_te_check_disconnect(mvm, te_data->vif, + "No association and the time event is over already..."); iwl_mvm_te_clear_data(mvm, te_data); } else if (le32_to_cpu(notif->action) & TE_NOTIF_HOST_EVENT_START) { te_data->running = true; diff --git a/drivers/net/wireless/iwlwifi/pcie/rx.c b/drivers/net/wireless/iwlwifi/pcie/rx.c index 68837d4..5fdb4ee 100644 --- a/drivers/net/wireless/iwlwifi/pcie/rx.c +++ b/drivers/net/wireless/iwlwifi/pcie/rx.c @@ -888,14 +888,6 @@ irqreturn_t iwl_pcie_irq_handler(int irq, void *dev_id) iwl_op_mode_hw_rf_kill(trans->op_mode, hw_rfkill); if (hw_rfkill) { - /* - * Clear the interrupt in APMG if the NIC is going down. - * Note that when the NIC exits RFkill (else branch), we - * can't access prph and the NIC will be reset in - * start_hw anyway. - */ - iwl_write_prph(trans, APMG_RTC_INT_STT_REG, - APMG_RTC_INT_STT_RFKILL); set_bit(STATUS_RFKILL, &trans_pcie->status); if (test_and_clear_bit(STATUS_HCMD_ACTIVE, &trans_pcie->status)) diff --git a/drivers/net/wireless/iwlwifi/pcie/trans.c b/drivers/net/wireless/iwlwifi/pcie/trans.c index e52d1ce..eca4429 100644 --- a/drivers/net/wireless/iwlwifi/pcie/trans.c +++ b/drivers/net/wireless/iwlwifi/pcie/trans.c @@ -1416,6 +1416,11 @@ struct iwl_trans *iwl_trans_pcie_alloc(struct pci_dev *pdev, goto out_no_pci; } + /* W/A - seems to solve weird behavior. We need to remove this if we + * don't want to stay in L1 all the time.
This wastes a lot of power */ + pci_disable_link_state(pdev, PCIE_LINK_STATE_L0S | PCIE_LINK_STATE_L1 | + PCIE_LINK_STATE_CLKPM); + pci_set_master(pdev); err = pci_set_dma_mask(pdev, DMA_BIT_MASK(36)); diff --git a/drivers/net/wireless/zd1201.c b/drivers/net/wireless/zd1201.c index 4941f20..b8ba1f9 100644 --- a/drivers/net/wireless/zd1201.c +++ b/drivers/net/wireless/zd1201.c @@ -98,10 +98,12 @@ static int zd1201_fw_upload(struct usb_device *dev, int apfw) goto exit; err = usb_control_msg(dev, usb_rcvctrlpipe(dev, 0), 0x4, - USB_DIR_IN | 0x40, 0,0, &ret, sizeof(ret), ZD1201_FW_TIMEOUT); + USB_DIR_IN | 0x40, 0, 0, buf, sizeof(ret), ZD1201_FW_TIMEOUT); if (err < 0) goto exit; + memcpy(&ret, buf, sizeof(ret)); + if (ret & 0x80) { err = -EIO; goto exit; } diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h index 8a4d77e..a197743 100644 --- a/drivers/net/xen-netback/common.h +++ b/drivers/net/xen-netback/common.h @@ -45,31 +45,109 @@ #include <xen/grant_table.h> #include <xen/xenbus.h> -struct xen_netbk; +typedef unsigned int pending_ring_idx_t; +#define INVALID_PENDING_RING_IDX (~0U) + +/* For the head field in pending_tx_info: it is used to indicate + * whether this tx info is the head of one or more coalesced requests. + * + * When head != INVALID_PENDING_RING_IDX, it marks the start of a new + * queue of tx requests and the end of the previous one. + * + * An example sequence of head fields (I = INVALID_PENDING_RING_IDX): + * + * ...|0 I I I|5 I|9 I I I|... + * -->|<-INUSE---------------- + * + * After consuming the first slot(s) we have: + * + * ...|V V V V|5 I|9 I I I|... + * -----FREE->|<-INUSE-------- + * + * where V stands for "valid pending ring index": these entries are + * considered free and may contain any number other than + * INVALID_PENDING_RING_IDX; in practice we use 0. + * + * An in-use non-INVALID_PENDING_RING_IDX value (0, 5 and 9 in the + * example above) is an index into the pending_tx_info and + * mmap_pages arrays. + */ +struct pending_tx_info { + struct xen_netif_tx_request req; /* coalesced tx request */ + pending_ring_idx_t head; /* head != INVALID_PENDING_RING_IDX + * if it is head of one or more tx + * reqs + */ +}; + +#define XEN_NETIF_TX_RING_SIZE __CONST_RING_SIZE(xen_netif_tx, PAGE_SIZE) +#define XEN_NETIF_RX_RING_SIZE __CONST_RING_SIZE(xen_netif_rx, PAGE_SIZE) + +struct xenvif_rx_meta { + int id; + int size; + int gso_size; +}; + +/* Discriminate from any valid pending_idx value. */ +#define INVALID_PENDING_IDX 0xFFFF + +#define MAX_BUFFER_OFFSET PAGE_SIZE + +#define MAX_PENDING_REQS 256 struct xenvif { /* Unique identifier for this interface. */ domid_t domid; unsigned int handle; - /* Reference to netback processing backend. */ - struct xen_netbk *netbk; + /* Use NAPI for guest TX */ + struct napi_struct napi; + /* When feature-split-event-channels = 0, tx_irq = rx_irq. */ + unsigned int tx_irq; + /* Only used when feature-split-event-channels = 1 */ + char tx_irq_name[IFNAMSIZ+4]; /* DEVNAME-tx */ + struct xen_netif_tx_back_ring tx; + struct sk_buff_head tx_queue; + struct page *mmap_pages[MAX_PENDING_REQS]; + pending_ring_idx_t pending_prod; + pending_ring_idx_t pending_cons; + u16 pending_ring[MAX_PENDING_REQS]; + struct pending_tx_info pending_tx_info[MAX_PENDING_REQS]; + + /* Coalescing tx requests before copying makes the number of grant + * copy ops greater than or equal to the number of slots required. In + * the worst case a tx request consumes 2 gnttab_copy.
+ */ + struct gnttab_copy tx_copy_ops[2*MAX_PENDING_REQS]; - u8 fe_dev_addr[6]; + /* Use kthread for guest RX */ + struct task_struct *task; + wait_queue_head_t wq; /* When feature-split-event-channels = 0, tx_irq = rx_irq. */ - unsigned int tx_irq; unsigned int rx_irq; /* Only used when feature-split-event-channels = 1 */ - char tx_irq_name[IFNAMSIZ+4]; /* DEVNAME-tx */ char rx_irq_name[IFNAMSIZ+4]; /* DEVNAME-rx */ + struct xen_netif_rx_back_ring rx; + struct sk_buff_head rx_queue; - /* List of frontends to notify after a batch of frames sent. */ - struct list_head notify_list; + /* Allow xenvif_start_xmit() to peek ahead in the rx request + * ring. This is a prediction of what rx_req_cons will be + * once all queued skbs are put on the ring. + */ + RING_IDX rx_req_cons_peek; + + /* Given MAX_BUFFER_OFFSET of 4096 the worst case is that each + * head/fragment page uses 2 copy operations because it + * straddles two buffers in the frontend. + */ + struct gnttab_copy grant_copy_op[2*XEN_NETIF_RX_RING_SIZE]; + struct xenvif_rx_meta meta[2*XEN_NETIF_RX_RING_SIZE]; - /* The shared rings and indexes. */ - struct xen_netif_tx_back_ring tx; - struct xen_netif_rx_back_ring rx; + + u8 fe_dev_addr[6]; /* Frontend feature information. */ u8 can_sg:1; @@ -80,13 +158,6 @@ struct xenvif { /* Internal feature information. */ u8 can_queue:1; /* can queue packets for receiver? */ - /* - * Allow xenvif_start_xmit() to peek ahead in the rx request - * ring. This is a prediction of what rx_req_cons will be - * once all queued skbs are put on the ring. - */ - RING_IDX rx_req_cons_peek; - /* Transmit shaping: allow 'credit_bytes' every 'credit_usec'. */ unsigned long credit_bytes; unsigned long credit_usec; @@ -97,11 +168,7 @@ struct xenvif { unsigned long rx_gso_checksum_fixup; /* Miscellaneous private stuff. */ - struct list_head schedule_list; - atomic_t refcnt; struct net_device *dev; - - wait_queue_head_t waiting_to_free; }; static inline struct xenbus_device *xenvif_to_xenbus_device(struct xenvif *vif) @@ -109,9 +176,6 @@ static inline struct xenbus_device *xenvif_to_xenbus_device(struct xenvif *vif) return to_xenbus_device(vif->dev->dev.parent); } -#define XEN_NETIF_TX_RING_SIZE __CONST_RING_SIZE(xen_netif_tx, PAGE_SIZE) -#define XEN_NETIF_RX_RING_SIZE __CONST_RING_SIZE(xen_netif_rx, PAGE_SIZE) - struct xenvif *xenvif_alloc(struct device *parent, domid_t domid, unsigned int handle); @@ -121,39 +185,26 @@ int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref, unsigned int rx_evtchn); void xenvif_disconnect(struct xenvif *vif); -void xenvif_get(struct xenvif *vif); -void xenvif_put(struct xenvif *vif); - int xenvif_xenbus_init(void); void xenvif_xenbus_fini(void); int xenvif_schedulable(struct xenvif *vif); -int xen_netbk_rx_ring_full(struct xenvif *vif); +int xenvif_rx_ring_full(struct xenvif *vif); -int xen_netbk_must_stop_queue(struct xenvif *vif); +int xenvif_must_stop_queue(struct xenvif *vif); /* (Un)Map communication rings. */ -void xen_netbk_unmap_frontend_rings(struct xenvif *vif); -int xen_netbk_map_frontend_rings(struct xenvif *vif, - grant_ref_t tx_ring_ref, - grant_ref_t rx_ring_ref); - -/* (De)Register a xenvif with the netback backend. 
*/ -void xen_netbk_add_xenvif(struct xenvif *vif); -void xen_netbk_remove_xenvif(struct xenvif *vif); - -/* (De)Schedule backend processing for a xenvif */ -void xen_netbk_schedule_xenvif(struct xenvif *vif); -void xen_netbk_deschedule_xenvif(struct xenvif *vif); +void xenvif_unmap_frontend_rings(struct xenvif *vif); +int xenvif_map_frontend_rings(struct xenvif *vif, + grant_ref_t tx_ring_ref, + grant_ref_t rx_ring_ref); /* Check for SKBs from frontend and schedule backend processing */ -void xen_netbk_check_rx_xenvif(struct xenvif *vif); -/* Receive an SKB from the frontend */ -void xenvif_receive_skb(struct xenvif *vif, struct sk_buff *skb); +void xenvif_check_rx_xenvif(struct xenvif *vif); /* Queue an SKB for transmission to the frontend */ -void xen_netbk_queue_tx_skb(struct xenvif *vif, struct sk_buff *skb); +void xenvif_queue_tx_skb(struct xenvif *vif, struct sk_buff *skb); /* Notify xenvif that ring now has space to send an skb to the frontend */ void xenvif_notify_tx_completion(struct xenvif *vif); @@ -161,7 +212,12 @@ void xenvif_notify_tx_completion(struct xenvif *vif); void xenvif_carrier_off(struct xenvif *vif); /* Returns number of ring slots required to send an skb to the frontend */ -unsigned int xen_netbk_count_skb_slots(struct xenvif *vif, struct sk_buff *skb); +unsigned int xenvif_count_skb_slots(struct xenvif *vif, struct sk_buff *skb); + +int xenvif_tx_action(struct xenvif *vif, int budget); +void xenvif_rx_action(struct xenvif *vif); + +int xenvif_kthread(void *data); extern bool separate_tx_rx_irq;
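
Given the head-field convention documented in the comment above, the slots occupied by one coalesced request can be found by scanning forward until the next slot whose head is not INVALID_PENDING_RING_IDX. The helper below is illustrative only and is not part of the patch:

	/* Illustrative sketch: count the slots of the coalesced request
	 * whose head lives at 'first'. Continuation slots are marked
	 * INVALID_PENDING_RING_IDX; both the next head and free slots
	 * carry valid numbers, so either one terminates the scan. */
	static unsigned int coalesced_slots(struct xenvif *vif, u16 first)
	{
		unsigned int n = 1;

		while (first + n < MAX_PENDING_REQS &&
		       vif->pending_tx_info[first + n].head ==
				INVALID_PENDING_RING_IDX)
			n++;
		return n;
	}
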
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c index 087d2db..625c6f4 100644 --- a/drivers/net/xen-netback/interface.c +++ b/drivers/net/xen-netback/interface.c @@ -30,6 +30,7 @@ #include "common.h" +#include <linux/kthread.h> #include <linux/ethtool.h> #include <linux/rtnetlink.h> #include <linux/if_vlan.h> @@ -38,17 +39,7 @@ #include <asm/xen/hypercall.h> #define XENVIF_QUEUE_LENGTH 32 - -void xenvif_get(struct xenvif *vif) -{ - atomic_inc(&vif->refcnt); -} - -void xenvif_put(struct xenvif *vif) -{ - if (atomic_dec_and_test(&vif->refcnt)) - wake_up(&vif->waiting_to_free); -} +#define XENVIF_NAPI_WEIGHT 64 int xenvif_schedulable(struct xenvif *vif) { @@ -57,28 +48,62 @@ int xenvif_schedulable(struct xenvif *vif) static int xenvif_rx_schedulable(struct xenvif *vif) { - return xenvif_schedulable(vif) && !xen_netbk_rx_ring_full(vif); + return xenvif_schedulable(vif) && !xenvif_rx_ring_full(vif); } static irqreturn_t xenvif_tx_interrupt(int irq, void *dev_id) { struct xenvif *vif = dev_id; - if (vif->netbk == NULL) - return IRQ_HANDLED; - - xen_netbk_schedule_xenvif(vif); + if (RING_HAS_UNCONSUMED_REQUESTS(&vif->tx)) + napi_schedule(&vif->napi); return IRQ_HANDLED; } +static int xenvif_poll(struct napi_struct *napi, int budget) +{ + struct xenvif *vif = container_of(napi, struct xenvif, napi); + int work_done; + + work_done = xenvif_tx_action(vif, budget); + + if (work_done < budget) { + int more_to_do = 0; + unsigned long flags; + + /* It is necessary to disable IRQ before calling + * RING_HAS_UNCONSUMED_REQUESTS. Otherwise we might + * lose an event from the frontend. + * + * Consider: + * RING_HAS_UNCONSUMED_REQUESTS + * <frontend generates event to trigger napi_schedule> + * __napi_complete + * + * This handler is still in scheduled state so the + * event has no effect at all. After __napi_complete + * this handler is descheduled and cannot get + * scheduled again. We lose the event in this case and the ring + * will be completely stalled. + */ + local_irq_save(flags); + RING_FINAL_CHECK_FOR_REQUESTS(&vif->tx, more_to_do); + if (!more_to_do) + __napi_complete(napi); + local_irq_restore(flags); + } + return work_done; +} + static irqreturn_t xenvif_rx_interrupt(int irq, void *dev_id) { struct xenvif *vif = dev_id; - if (vif->netbk == NULL) - return IRQ_HANDLED; - if (xenvif_rx_schedulable(vif)) netif_wake_queue(vif->dev); @@ -99,7 +124,8 @@ static int xenvif_start_xmit(struct sk_buff *skb, struct net_device *dev) BUG_ON(skb->dev != dev); - if (vif->netbk == NULL) + /* Drop the packet if vif is not ready */ + if (vif->task == NULL) goto drop; /* Drop the packet if the target domain has no receive buffers. */ @@ -107,13 +133,12 @@ static int xenvif_start_xmit(struct sk_buff *skb, struct net_device *dev) goto drop; /* Reserve ring slots for the worst-case number of fragments. */ - vif->rx_req_cons_peek += xen_netbk_count_skb_slots(vif, skb); - xenvif_get(vif); + vif->rx_req_cons_peek += xenvif_count_skb_slots(vif, skb); - if (vif->can_queue && xen_netbk_must_stop_queue(vif)) + if (vif->can_queue && xenvif_must_stop_queue(vif)) netif_stop_queue(dev); - xen_netbk_queue_tx_skb(vif, skb); + xenvif_queue_tx_skb(vif, skb); return NETDEV_TX_OK; @@ -123,11 +148,6 @@ static int xenvif_start_xmit(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; } -void xenvif_receive_skb(struct xenvif *vif, struct sk_buff *skb) -{ - netif_rx_ni(skb); -} - void xenvif_notify_tx_completion(struct xenvif *vif) { if (netif_queue_stopped(vif->dev) && xenvif_rx_schedulable(vif)) @@ -142,21 +162,20 @@ static struct net_device_stats *xenvif_get_stats(struct net_device *dev) static void xenvif_up(struct xenvif *vif) { - xen_netbk_add_xenvif(vif); + napi_enable(&vif->napi); enable_irq(vif->tx_irq); if (vif->tx_irq != vif->rx_irq) enable_irq(vif->rx_irq); - xen_netbk_check_rx_xenvif(vif); + xenvif_check_rx_xenvif(vif); } static void xenvif_down(struct xenvif *vif) { + napi_disable(&vif->napi); disable_irq(vif->tx_irq); if (vif->tx_irq != vif->rx_irq) disable_irq(vif->rx_irq); del_timer_sync(&vif->credit_timeout); - xen_netbk_deschedule_xenvif(vif); - xen_netbk_remove_xenvif(vif); } static int xenvif_open(struct net_device *dev) @@ -272,11 +291,12 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid, struct net_device *dev; struct xenvif *vif; char name[IFNAMSIZ] = {}; + int i; snprintf(name, IFNAMSIZ - 1, "vif%u.%u", domid, handle); dev = alloc_netdev(sizeof(struct xenvif), name, ether_setup); if (dev == NULL) { - pr_warn("Could not allocate netdev\n"); + pr_warn("Could not allocate netdev for %s\n", name); return ERR_PTR(-ENOMEM); } @@ -285,14 +305,9 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid, vif = netdev_priv(dev); vif->domid = domid; vif->handle = handle; - vif->netbk = NULL; vif->can_sg = 1; vif->csum = 1; - atomic_set(&vif->refcnt, 1); - init_waitqueue_head(&vif->waiting_to_free); vif->dev = dev; - INIT_LIST_HEAD(&vif->schedule_list); - INIT_LIST_HEAD(&vif->notify_list); vif->credit_bytes = vif->remaining_credit = ~0UL; vif->credit_usec = 0UL; @@ -307,6 +322,16 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid, dev->tx_queue_len = XENVIF_QUEUE_LENGTH; + skb_queue_head_init(&vif->rx_queue); + skb_queue_head_init(&vif->tx_queue); + + vif->pending_cons = 0; + vif->pending_prod = MAX_PENDING_REQS; + for (i = 0; i < MAX_PENDING_REQS; i++) + vif->pending_ring[i] = i; + for (i = 0; i < MAX_PENDING_REQS; i++) + vif->mmap_pages[i] = NULL; + /* * Initialise a dummy MAC address.
We choose the numerically * largest non-broadcast address to prevent the address getting @@ -316,6 +341,8 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid, memset(dev->dev_addr, 0xFF, ETH_ALEN); dev->dev_addr[0] &= ~0x01; + netif_napi_add(dev, &vif->napi, xenvif_poll, XENVIF_NAPI_WEIGHT); + netif_carrier_off(dev); err = register_netdev(dev); @@ -341,7 +368,7 @@ int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref, __module_get(THIS_MODULE); - err = xen_netbk_map_frontend_rings(vif, tx_ring_ref, rx_ring_ref); + err = xenvif_map_frontend_rings(vif, tx_ring_ref, rx_ring_ref); if (err < 0) goto err; @@ -377,7 +404,14 @@ int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref, disable_irq(vif->rx_irq); } - xenvif_get(vif); + init_waitqueue_head(&vif->wq); + vif->task = kthread_create(xenvif_kthread, + (void *)vif, vif->dev->name); + if (IS_ERR(vif->task)) { + pr_warn("Could not allocate kthread for %s\n", vif->dev->name); + err = PTR_ERR(vif->task); + goto err_rx_unbind; + } rtnl_lock(); if (!vif->can_sg && vif->dev->mtu > ETH_DATA_LEN) @@ -388,12 +422,18 @@ int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref, xenvif_up(vif); rtnl_unlock(); + wake_up_process(vif->task); + return 0; + +err_rx_unbind: + unbind_from_irqhandler(vif->rx_irq, vif); + vif->rx_irq = 0; err_tx_unbind: unbind_from_irqhandler(vif->tx_irq, vif); vif->tx_irq = 0; err_unmap: - xen_netbk_unmap_frontend_rings(vif); + xenvif_unmap_frontend_rings(vif); err: module_put(THIS_MODULE); return err; @@ -408,7 +448,6 @@ void xenvif_carrier_off(struct xenvif *vif) if (netif_running(dev)) xenvif_down(vif); rtnl_unlock(); - xenvif_put(vif); } void xenvif_disconnect(struct xenvif *vif) @@ -422,9 +461,6 @@ void xenvif_disconnect(struct xenvif *vif) if (netif_carrier_ok(vif->dev)) xenvif_carrier_off(vif); - atomic_dec(&vif->refcnt); - wait_event(vif->waiting_to_free, atomic_read(&vif->refcnt) == 0); - if (vif->tx_irq) { if (vif->tx_irq == vif->rx_irq) unbind_from_irqhandler(vif->tx_irq, vif); @@ -438,9 +474,14 @@ void xenvif_disconnect(struct xenvif *vif) need_module_put = 1; } + if (vif->task) + kthread_stop(vif->task); + + netif_napi_del(&vif->napi); + unregister_netdev(vif->dev); - xen_netbk_unmap_frontend_rings(vif); + xenvif_unmap_frontend_rings(vif); free_netdev(vif->dev); diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index 64828de..956130c 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -70,131 +70,26 @@ module_param(fatal_skb_slots, uint, 0444); */ #define XEN_NETBK_LEGACY_SLOTS_MAX XEN_NETIF_NR_SLOTS_MIN -typedef unsigned int pending_ring_idx_t; -#define INVALID_PENDING_RING_IDX (~0U) - -struct pending_tx_info { - struct xen_netif_tx_request req; /* coalesced tx request */ - struct xenvif *vif; - pending_ring_idx_t head; /* head != INVALID_PENDING_RING_IDX - * if it is head of one or more tx - * reqs - */ -}; - -struct netbk_rx_meta { - int id; - int size; - int gso_size; -}; - -#define MAX_PENDING_REQS 256 - -/* Discriminate from any valid pending_idx value. 
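
The dummy MAC initialisation in xenvif_alloc() earlier in this hunk is worth unpacking: all-ones with the I/G bit of the first octet cleared gives FE:FF:FF:FF:FF:FF, the numerically largest unicast address. A standalone sketch of the same two steps (assuming dev->dev_addr is directly writable, as it is in this era):

    #include <linux/string.h>
    #include <linux/etherdevice.h>

    static void set_dummy_mac(struct net_device *dev)
    {
            memset(dev->dev_addr, 0xFF, ETH_ALEN);  /* FF:FF:FF:FF:FF:FF */
            dev->dev_addr[0] &= ~0x01;              /* clear I/G bit -> FE:... */
    }

Clearing bit 0 of the first octet is what keeps an Ethernet bridge from treating the address as multicast/broadcast and stealing it for STP.
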
*/ -#define INVALID_PENDING_IDX 0xFFFF - -#define MAX_BUFFER_OFFSET PAGE_SIZE - -/* extra field used in struct page */ -union page_ext { - struct { -#if BITS_PER_LONG < 64 -#define IDX_WIDTH 8 -#define GROUP_WIDTH (BITS_PER_LONG - IDX_WIDTH) - unsigned int group:GROUP_WIDTH; - unsigned int idx:IDX_WIDTH; -#else - unsigned int group, idx; -#endif - } e; - void *mapping; -}; - -struct xen_netbk { - wait_queue_head_t wq; - struct task_struct *task; - - struct sk_buff_head rx_queue; - struct sk_buff_head tx_queue; - - struct timer_list net_timer; - - struct page *mmap_pages[MAX_PENDING_REQS]; - - pending_ring_idx_t pending_prod; - pending_ring_idx_t pending_cons; - struct list_head net_schedule_list; - - /* Protect the net_schedule_list in netif. */ - spinlock_t net_schedule_list_lock; - - atomic_t netfront_count; - - struct pending_tx_info pending_tx_info[MAX_PENDING_REQS]; - /* Coalescing tx requests before copying makes number of grant - * copy ops greater or equal to number of slots required. In - * worst case a tx request consumes 2 gnttab_copy. - */ - struct gnttab_copy tx_copy_ops[2*MAX_PENDING_REQS]; - - u16 pending_ring[MAX_PENDING_REQS]; - - /* - * Given MAX_BUFFER_OFFSET of 4096 the worst case is that each - * head/fragment page uses 2 copy operations because it - * straddles two buffers in the frontend. - */ - struct gnttab_copy grant_copy_op[2*XEN_NETIF_RX_RING_SIZE]; - struct netbk_rx_meta meta[2*XEN_NETIF_RX_RING_SIZE]; -}; - -static struct xen_netbk *xen_netbk; -static int xen_netbk_group_nr; - /* * If head != INVALID_PENDING_RING_IDX, it means this tx request is head of * one or more merged tx requests, otherwise it is the continuation of * previous tx request. */ -static inline int pending_tx_is_head(struct xen_netbk *netbk, RING_IDX idx) -{ - return netbk->pending_tx_info[idx].head != INVALID_PENDING_RING_IDX; -} - -void xen_netbk_add_xenvif(struct xenvif *vif) +static inline int pending_tx_is_head(struct xenvif *vif, RING_IDX idx) { - int i; - int min_netfront_count; - int min_group = 0; - struct xen_netbk *netbk; - - min_netfront_count = atomic_read(&xen_netbk[0].netfront_count); - for (i = 0; i < xen_netbk_group_nr; i++) { - int netfront_count = atomic_read(&xen_netbk[i].netfront_count); - if (netfront_count < min_netfront_count) { - min_group = i; - min_netfront_count = netfront_count; - } - } - - netbk = &xen_netbk[min_group]; - - vif->netbk = netbk; - atomic_inc(&netbk->netfront_count); + return vif->pending_tx_info[idx].head != INVALID_PENDING_RING_IDX; } -void xen_netbk_remove_xenvif(struct xenvif *vif) -{ - struct xen_netbk *netbk = vif->netbk; - vif->netbk = NULL; - atomic_dec(&netbk->netfront_count); -} +static void xenvif_idx_release(struct xenvif *vif, u16 pending_idx, + u8 status); -static void xen_netbk_idx_release(struct xen_netbk *netbk, u16 pending_idx, - u8 status); static void make_tx_response(struct xenvif *vif, struct xen_netif_tx_request *txp, s8 st); + +static inline int tx_work_todo(struct xenvif *vif); +static inline int rx_work_todo(struct xenvif *vif); + static struct xen_netif_rx_response *make_rx_response(struct xenvif *vif, u16 id, s8 st, @@ -202,55 +97,16 @@ static struct xen_netif_rx_response *make_rx_response(struct xenvif *vif, u16 size, u16 flags); -static inline unsigned long idx_to_pfn(struct xen_netbk *netbk, +static inline unsigned long idx_to_pfn(struct xenvif *vif, u16 idx) { - return page_to_pfn(netbk->mmap_pages[idx]); + return page_to_pfn(vif->mmap_pages[idx]); } -static inline unsigned long idx_to_kaddr(struct xen_netbk *netbk, 
+static inline unsigned long idx_to_kaddr(struct xenvif *vif, u16 idx) { - return (unsigned long)pfn_to_kaddr(idx_to_pfn(netbk, idx)); -} - -/* extra field used in struct page */ -static inline void set_page_ext(struct page *pg, struct xen_netbk *netbk, - unsigned int idx) -{ - unsigned int group = netbk - xen_netbk; - union page_ext ext = { .e = { .group = group + 1, .idx = idx } }; - - BUILD_BUG_ON(sizeof(ext) > sizeof(ext.mapping)); - pg->mapping = ext.mapping; -} - -static int get_page_ext(struct page *pg, - unsigned int *pgroup, unsigned int *pidx) -{ - union page_ext ext = { .mapping = pg->mapping }; - struct xen_netbk *netbk; - unsigned int group, idx; - - group = ext.e.group - 1; - - if (group < 0 || group >= xen_netbk_group_nr) - return 0; - - netbk = &xen_netbk[group]; - - idx = ext.e.idx; - - if ((idx < 0) || (idx >= MAX_PENDING_REQS)) - return 0; - - if (netbk->mmap_pages[idx] != pg) - return 0; - - *pgroup = group; - *pidx = idx; - - return 1; + return (unsigned long)pfn_to_kaddr(idx_to_pfn(vif, idx)); } /* @@ -278,15 +134,10 @@ static inline pending_ring_idx_t pending_index(unsigned i) return i & (MAX_PENDING_REQS-1); } -static inline pending_ring_idx_t nr_pending_reqs(struct xen_netbk *netbk) +static inline pending_ring_idx_t nr_pending_reqs(struct xenvif *vif) { return MAX_PENDING_REQS - - netbk->pending_prod + netbk->pending_cons; -} - -static void xen_netbk_kick_thread(struct xen_netbk *netbk) -{ - wake_up(&netbk->wq); + vif->pending_prod + vif->pending_cons; } static int max_required_rx_slots(struct xenvif *vif) @@ -300,7 +151,7 @@ static int max_required_rx_slots(struct xenvif *vif) return max; } -int xen_netbk_rx_ring_full(struct xenvif *vif) +int xenvif_rx_ring_full(struct xenvif *vif) { RING_IDX peek = vif->rx_req_cons_peek; RING_IDX needed = max_required_rx_slots(vif); @@ -309,16 +160,16 @@ int xen_netbk_rx_ring_full(struct xenvif *vif) ((vif->rx.rsp_prod_pvt + XEN_NETIF_RX_RING_SIZE - peek) < needed); } -int xen_netbk_must_stop_queue(struct xenvif *vif) +int xenvif_must_stop_queue(struct xenvif *vif) { - if (!xen_netbk_rx_ring_full(vif)) + if (!xenvif_rx_ring_full(vif)) return 0; vif->rx.sring->req_event = vif->rx_req_cons_peek + max_required_rx_slots(vif); mb(); /* request notification /then/ check the queue */ - return xen_netbk_rx_ring_full(vif); + return xenvif_rx_ring_full(vif); } /* @@ -364,9 +215,9 @@ static bool start_new_rx_buffer(int offset, unsigned long size, int head) /* * Figure out how many ring slots we're going to need to send @skb to * the guest. This function is essentially a dry run of - * netbk_gop_frag_copy. + * xenvif_gop_frag_copy. 
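
The pending-ring arithmetic above is compact enough to deserve a worked example. The ring stores indices of free request slots; pending_cons counts slots taken and pending_prod slots returned, so in-flight requests are MAX_PENDING_REQS - pending_prod + pending_cons, and the power-of-two size lets pending_index() wrap with a plain mask. A self-contained sketch, illustrative only:

    #define MAX_PENDING_REQS 256            /* must be a power of two */

    typedef unsigned int pending_ring_idx_t;

    static inline pending_ring_idx_t pending_index(unsigned int i)
    {
            return i & (MAX_PENDING_REQS - 1);
    }

    /* Start: prod = 256, cons = 0   -> 256 - 256 + 0  = 0 in flight.
     * Take 10 slots (cons = 10)     -> 256 - 256 + 10 = 10 in flight.
     * Release 3 (prod = 259)        -> 256 - 259 + 10 = 7 in flight.
     * Both counters only ever increase; pending_index() does the wrapping.
     */
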
*/ -unsigned int xen_netbk_count_skb_slots(struct xenvif *vif, struct sk_buff *skb) +unsigned int xenvif_count_skb_slots(struct xenvif *vif, struct sk_buff *skb) { unsigned int count; int i, copy_off; @@ -418,15 +269,15 @@ struct netrx_pending_operations { unsigned copy_prod, copy_cons; unsigned meta_prod, meta_cons; struct gnttab_copy *copy; - struct netbk_rx_meta *meta; + struct xenvif_rx_meta *meta; int copy_off; grant_ref_t copy_gref; }; -static struct netbk_rx_meta *get_next_rx_buffer(struct xenvif *vif, - struct netrx_pending_operations *npo) +static struct xenvif_rx_meta *get_next_rx_buffer(struct xenvif *vif, + struct netrx_pending_operations *npo) { - struct netbk_rx_meta *meta; + struct xenvif_rx_meta *meta; struct xen_netif_rx_request *req; req = RING_GET_REQUEST(&vif->rx, vif->rx.req_cons++); @@ -446,19 +297,13 @@ static struct netbk_rx_meta *get_next_rx_buffer(struct xenvif *vif, * Set up the grant operations for this fragment. If it's a flipping * interface, we also set up the unmap request from here. */ -static void netbk_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb, - struct netrx_pending_operations *npo, - struct page *page, unsigned long size, - unsigned long offset, int *head) +static void xenvif_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb, + struct netrx_pending_operations *npo, + struct page *page, unsigned long size, + unsigned long offset, int *head) { struct gnttab_copy *copy_gop; - struct netbk_rx_meta *meta; - /* - * These variables are used iff get_page_ext returns true, - * in which case they are guaranteed to be initialized. - */ - unsigned int uninitialized_var(group), uninitialized_var(idx); - int foreign = get_page_ext(page, &group, &idx); + struct xenvif_rx_meta *meta; unsigned long bytes; /* Data must not cross a page boundary. */ @@ -494,26 +339,15 @@ static void netbk_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb, copy_gop = npo->copy + npo->copy_prod++; copy_gop->flags = GNTCOPY_dest_gref; - if (foreign) { - struct xen_netbk *netbk = &xen_netbk[group]; - struct pending_tx_info *src_pend; - - src_pend = &netbk->pending_tx_info[idx]; + copy_gop->len = bytes; - copy_gop->source.domid = src_pend->vif->domid; - copy_gop->source.u.ref = src_pend->req.gref; - copy_gop->flags |= GNTCOPY_source_gref; - } else { - void *vaddr = page_address(page); - copy_gop->source.domid = DOMID_SELF; - copy_gop->source.u.gmfn = virt_to_mfn(vaddr); - } + copy_gop->source.domid = DOMID_SELF; + copy_gop->source.u.gmfn = virt_to_mfn(page_address(page)); copy_gop->source.offset = offset; - copy_gop->dest.domid = vif->domid; + copy_gop->dest.domid = vif->domid; copy_gop->dest.offset = npo->copy_off; copy_gop->dest.u.ref = npo->copy_gref; - copy_gop->len = bytes; npo->copy_off += bytes; meta->size += bytes; @@ -549,14 +383,14 @@ static void netbk_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb, * zero GSO descriptors (for non-GSO packets) or one descriptor (for * frontend-side LRO). 
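
The "dry run" idea behind xenvif_count_skb_slots() earlier in this hunk can be approximated as: walk the linear head plus each fragment and count the page-sized receive buffers touched. A deliberately simplified sketch; the real function also models copy_off packing across buffers, so treat this as an estimate, not its exact logic:

    #include <linux/kernel.h>
    #include <linux/mm.h>
    #include <linux/skbuff.h>

    static unsigned int count_slots_rough(struct sk_buff *skb)
    {
            unsigned int count = DIV_ROUND_UP(skb_headlen(skb), PAGE_SIZE);
            int i;

            for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
                    skb_frag_t *f = &skb_shinfo(skb)->frags[i];
                    unsigned long off = f->page_offset & ~PAGE_MASK;

                    /* a fragment that starts mid-page can straddle one
                     * more buffer than its size alone would suggest */
                    count += DIV_ROUND_UP(off + skb_frag_size(f), PAGE_SIZE);
            }

            return count;
    }
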
*/ -static int netbk_gop_skb(struct sk_buff *skb, - struct netrx_pending_operations *npo) +static int xenvif_gop_skb(struct sk_buff *skb, + struct netrx_pending_operations *npo) { struct xenvif *vif = netdev_priv(skb->dev); int nr_frags = skb_shinfo(skb)->nr_frags; int i; struct xen_netif_rx_request *req; - struct netbk_rx_meta *meta; + struct xenvif_rx_meta *meta; unsigned char *data; int head = 1; int old_meta_prod; @@ -593,30 +427,30 @@ static int netbk_gop_skb(struct sk_buff *skb, if (data + len > skb_tail_pointer(skb)) len = skb_tail_pointer(skb) - data; - netbk_gop_frag_copy(vif, skb, npo, - virt_to_page(data), len, offset, &head); + xenvif_gop_frag_copy(vif, skb, npo, + virt_to_page(data), len, offset, &head); data += len; } for (i = 0; i < nr_frags; i++) { - netbk_gop_frag_copy(vif, skb, npo, - skb_frag_page(&skb_shinfo(skb)->frags[i]), - skb_frag_size(&skb_shinfo(skb)->frags[i]), - skb_shinfo(skb)->frags[i].page_offset, - &head); + xenvif_gop_frag_copy(vif, skb, npo, + skb_frag_page(&skb_shinfo(skb)->frags[i]), + skb_frag_size(&skb_shinfo(skb)->frags[i]), + skb_shinfo(skb)->frags[i].page_offset, + &head); } return npo->meta_prod - old_meta_prod; } /* - * This is a twin to netbk_gop_skb. Assume that netbk_gop_skb was + * This is a twin to xenvif_gop_skb. Assume that xenvif_gop_skb was * used to set up the operations on the top of * netrx_pending_operations, which have since been done. Check that * they didn't give any errors and advance over them. */ -static int netbk_check_gop(struct xenvif *vif, int nr_meta_slots, - struct netrx_pending_operations *npo) +static int xenvif_check_gop(struct xenvif *vif, int nr_meta_slots, + struct netrx_pending_operations *npo) { struct gnttab_copy *copy_op; int status = XEN_NETIF_RSP_OKAY; @@ -635,9 +469,9 @@ static int netbk_check_gop(struct xenvif *vif, int nr_meta_slots, return status; } -static void netbk_add_frag_responses(struct xenvif *vif, int status, - struct netbk_rx_meta *meta, - int nr_meta_slots) +static void xenvif_add_frag_responses(struct xenvif *vif, int status, + struct xenvif_rx_meta *meta, + int nr_meta_slots) { int i; unsigned long offset; @@ -665,9 +499,13 @@ struct skb_cb_overlay { int meta_slots_used; }; -static void xen_netbk_rx_action(struct xen_netbk *netbk) +static void xenvif_kick_thread(struct xenvif *vif) +{ + wake_up(&vif->wq); +} + +void xenvif_rx_action(struct xenvif *vif) { - struct xenvif *vif = NULL, *tmp; s8 status; u16 flags; struct xen_netif_rx_response *resp; @@ -679,22 +517,23 @@ static void xen_netbk_rx_action(struct xen_netbk *netbk) int count; unsigned long offset; struct skb_cb_overlay *sco; + int need_to_notify = 0; struct netrx_pending_operations npo = { - .copy = netbk->grant_copy_op, - .meta = netbk->meta, + .copy = vif->grant_copy_op, + .meta = vif->meta, }; skb_queue_head_init(&rxq); count = 0; - while ((skb = skb_dequeue(&netbk->rx_queue)) != NULL) { + while ((skb = skb_dequeue(&vif->rx_queue)) != NULL) { vif = netdev_priv(skb->dev); nr_frags = skb_shinfo(skb)->nr_frags; sco = (struct skb_cb_overlay *)skb->cb; - sco->meta_slots_used = netbk_gop_skb(skb, &npo); + sco->meta_slots_used = xenvif_gop_skb(skb, &npo); count += nr_frags + 1; @@ -706,27 +545,27 @@ static void xen_netbk_rx_action(struct xen_netbk *netbk) break; } - BUG_ON(npo.meta_prod > ARRAY_SIZE(netbk->meta)); + BUG_ON(npo.meta_prod > ARRAY_SIZE(vif->meta)); if (!npo.copy_prod) return; - BUG_ON(npo.copy_prod > ARRAY_SIZE(netbk->grant_copy_op)); - gnttab_batch_copy(netbk->grant_copy_op, npo.copy_prod); + BUG_ON(npo.copy_prod > 
ARRAY_SIZE(vif->grant_copy_op)); + gnttab_batch_copy(vif->grant_copy_op, npo.copy_prod); while ((skb = __skb_dequeue(&rxq)) != NULL) { sco = (struct skb_cb_overlay *)skb->cb; vif = netdev_priv(skb->dev); - if (netbk->meta[npo.meta_cons].gso_size && vif->gso_prefix) { + if (vif->meta[npo.meta_cons].gso_size && vif->gso_prefix) { resp = RING_GET_RESPONSE(&vif->rx, - vif->rx.rsp_prod_pvt++); + vif->rx.rsp_prod_pvt++); resp->flags = XEN_NETRXF_gso_prefix | XEN_NETRXF_more_data; - resp->offset = netbk->meta[npo.meta_cons].gso_size; - resp->id = netbk->meta[npo.meta_cons].id; + resp->offset = vif->meta[npo.meta_cons].gso_size; + resp->id = vif->meta[npo.meta_cons].id; resp->status = sco->meta_slots_used; npo.meta_cons++; @@ -737,7 +576,7 @@ static void xen_netbk_rx_action(struct xen_netbk *netbk) vif->dev->stats.tx_bytes += skb->len; vif->dev->stats.tx_packets++; - status = netbk_check_gop(vif, sco->meta_slots_used, &npo); + status = xenvif_check_gop(vif, sco->meta_slots_used, &npo); if (sco->meta_slots_used == 1) flags = 0; @@ -751,12 +590,12 @@ static void xen_netbk_rx_action(struct xen_netbk *netbk) flags |= XEN_NETRXF_data_validated; offset = 0; - resp = make_rx_response(vif, netbk->meta[npo.meta_cons].id, + resp = make_rx_response(vif, vif->meta[npo.meta_cons].id, status, offset, - netbk->meta[npo.meta_cons].size, + vif->meta[npo.meta_cons].size, flags); - if (netbk->meta[npo.meta_cons].gso_size && !vif->gso_prefix) { + if (vif->meta[npo.meta_cons].gso_size && !vif->gso_prefix) { struct xen_netif_extra_info *gso = (struct xen_netif_extra_info *) RING_GET_RESPONSE(&vif->rx, @@ -764,7 +603,7 @@ static void xen_netbk_rx_action(struct xen_netbk *netbk) resp->flags |= XEN_NETRXF_extra_info; - gso->u.gso.size = netbk->meta[npo.meta_cons].gso_size; + gso->u.gso.size = vif->meta[npo.meta_cons].gso_size; gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4; gso->u.gso.pad = 0; gso->u.gso.features = 0; @@ -773,123 +612,44 @@ static void xen_netbk_rx_action(struct xen_netbk *netbk) gso->flags = 0; } - netbk_add_frag_responses(vif, status, - netbk->meta + npo.meta_cons + 1, - sco->meta_slots_used); + xenvif_add_frag_responses(vif, status, + vif->meta + npo.meta_cons + 1, + sco->meta_slots_used); RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&vif->rx, ret); + if (ret) + need_to_notify = 1; + xenvif_notify_tx_completion(vif); - if (ret && list_empty(&vif->notify_list)) - list_add_tail(&vif->notify_list, ¬ify); - else - xenvif_put(vif); npo.meta_cons += sco->meta_slots_used; dev_kfree_skb(skb); } - list_for_each_entry_safe(vif, tmp, ¬ify, notify_list) { + if (need_to_notify) notify_remote_via_irq(vif->rx_irq); - list_del_init(&vif->notify_list); - xenvif_put(vif); - } /* More work to do? 
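
The xenvif_kick_thread()/rx_queue pairing introduced here is a standard queue-and-wake handoff: producers append to an skb queue and wake a per-vif waitqueue, and a kthread (xenvif_kthread, later in this file) drains it. A minimal sketch with invented names:

    #include <linux/skbuff.h>
    #include <linux/kthread.h>
    #include <linux/sched.h>
    #include <linux/wait.h>

    struct vif_sketch {
            struct sk_buff_head rx_queue;   /* skb_queue_head_init() at setup */
            wait_queue_head_t wq;           /* init_waitqueue_head() at setup */
    };

    static void queue_and_kick(struct vif_sketch *v, struct sk_buff *skb)
    {
            skb_queue_tail(&v->rx_queue, skb);      /* takes the queue lock */
            wake_up(&v->wq);
    }

    static int drain_kthread(void *data)
    {
            struct vif_sketch *v = data;

            while (!kthread_should_stop()) {
                    wait_event_interruptible(v->wq,
                                    !skb_queue_empty(&v->rx_queue) ||
                                    kthread_should_stop());
                    if (kthread_should_stop())
                            break;

                    /* drain v->rx_queue: build grant copies, push responses */

                    cond_resched();
            }

            return 0;
    }
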
*/ - if (!skb_queue_empty(&netbk->rx_queue) && - !timer_pending(&netbk->net_timer)) - xen_netbk_kick_thread(netbk); -} - -void xen_netbk_queue_tx_skb(struct xenvif *vif, struct sk_buff *skb) -{ - struct xen_netbk *netbk = vif->netbk; - - skb_queue_tail(&netbk->rx_queue, skb); - - xen_netbk_kick_thread(netbk); -} - -static void xen_netbk_alarm(unsigned long data) -{ - struct xen_netbk *netbk = (struct xen_netbk *)data; - xen_netbk_kick_thread(netbk); -} - -static int __on_net_schedule_list(struct xenvif *vif) -{ - return !list_empty(&vif->schedule_list); -} - -/* Must be called with net_schedule_list_lock held */ -static void remove_from_net_schedule_list(struct xenvif *vif) -{ - if (likely(__on_net_schedule_list(vif))) { - list_del_init(&vif->schedule_list); - xenvif_put(vif); - } -} - -static struct xenvif *poll_net_schedule_list(struct xen_netbk *netbk) -{ - struct xenvif *vif = NULL; - - spin_lock_irq(&netbk->net_schedule_list_lock); - if (list_empty(&netbk->net_schedule_list)) - goto out; - - vif = list_first_entry(&netbk->net_schedule_list, - struct xenvif, schedule_list); - if (!vif) - goto out; - - xenvif_get(vif); - - remove_from_net_schedule_list(vif); -out: - spin_unlock_irq(&netbk->net_schedule_list_lock); - return vif; + if (!skb_queue_empty(&vif->rx_queue)) + xenvif_kick_thread(vif); } -void xen_netbk_schedule_xenvif(struct xenvif *vif) +void xenvif_queue_tx_skb(struct xenvif *vif, struct sk_buff *skb) { - unsigned long flags; - struct xen_netbk *netbk = vif->netbk; - - if (__on_net_schedule_list(vif)) - goto kick; - - spin_lock_irqsave(&netbk->net_schedule_list_lock, flags); - if (!__on_net_schedule_list(vif) && - likely(xenvif_schedulable(vif))) { - list_add_tail(&vif->schedule_list, &netbk->net_schedule_list); - xenvif_get(vif); - } - spin_unlock_irqrestore(&netbk->net_schedule_list_lock, flags); - -kick: - smp_mb(); - if ((nr_pending_reqs(netbk) < (MAX_PENDING_REQS/2)) && - !list_empty(&netbk->net_schedule_list)) - xen_netbk_kick_thread(netbk); -} + skb_queue_tail(&vif->rx_queue, skb); -void xen_netbk_deschedule_xenvif(struct xenvif *vif) -{ - struct xen_netbk *netbk = vif->netbk; - spin_lock_irq(&netbk->net_schedule_list_lock); - remove_from_net_schedule_list(vif); - spin_unlock_irq(&netbk->net_schedule_list_lock); + xenvif_kick_thread(vif); } -void xen_netbk_check_rx_xenvif(struct xenvif *vif) +void xenvif_check_rx_xenvif(struct xenvif *vif) { int more_to_do; RING_FINAL_CHECK_FOR_REQUESTS(&vif->tx, more_to_do); if (more_to_do) - xen_netbk_schedule_xenvif(vif); + napi_schedule(&vif->napi); } static void tx_add_credit(struct xenvif *vif) @@ -916,11 +676,11 @@ static void tx_credit_callback(unsigned long data) { struct xenvif *vif = (struct xenvif *)data; tx_add_credit(vif); - xen_netbk_check_rx_xenvif(vif); + xenvif_check_rx_xenvif(vif); } -static void netbk_tx_err(struct xenvif *vif, - struct xen_netif_tx_request *txp, RING_IDX end) +static void xenvif_tx_err(struct xenvif *vif, + struct xen_netif_tx_request *txp, RING_IDX end) { RING_IDX cons = vif->tx.req_cons; @@ -931,21 +691,18 @@ static void netbk_tx_err(struct xenvif *vif, txp = RING_GET_REQUEST(&vif->tx, cons++); } while (1); vif->tx.req_cons = cons; - xen_netbk_check_rx_xenvif(vif); - xenvif_put(vif); } -static void netbk_fatal_tx_err(struct xenvif *vif) +static void xenvif_fatal_tx_err(struct xenvif *vif) { netdev_err(vif->dev, "fatal error; disabling device\n"); xenvif_carrier_off(vif); - xenvif_put(vif); } -static int netbk_count_requests(struct xenvif *vif, - struct xen_netif_tx_request *first, - struct 
xen_netif_tx_request *txp, - int work_to_do) +static int xenvif_count_requests(struct xenvif *vif, + struct xen_netif_tx_request *first, + struct xen_netif_tx_request *txp, + int work_to_do) { RING_IDX cons = vif->tx.req_cons; int slots = 0; @@ -962,7 +719,7 @@ static int netbk_count_requests(struct xenvif *vif, netdev_err(vif->dev, "Asked for %d slots but exceeds this limit\n", work_to_do); - netbk_fatal_tx_err(vif); + xenvif_fatal_tx_err(vif); return -ENODATA; } @@ -973,7 +730,7 @@ static int netbk_count_requests(struct xenvif *vif, netdev_err(vif->dev, "Malicious frontend using %d slots, threshold %u\n", slots, fatal_skb_slots); - netbk_fatal_tx_err(vif); + xenvif_fatal_tx_err(vif); return -E2BIG; } @@ -1021,7 +778,7 @@ static int netbk_count_requests(struct xenvif *vif, if (unlikely((txp->offset + txp->size) > PAGE_SIZE)) { netdev_err(vif->dev, "Cross page boundary, txp->offset: %x, size: %u\n", txp->offset, txp->size); - netbk_fatal_tx_err(vif); + xenvif_fatal_tx_err(vif); return -EINVAL; } @@ -1033,30 +790,30 @@ static int netbk_count_requests(struct xenvif *vif, } while (more_data); if (drop_err) { - netbk_tx_err(vif, first, cons + slots); + xenvif_tx_err(vif, first, cons + slots); return drop_err; } return slots; } -static struct page *xen_netbk_alloc_page(struct xen_netbk *netbk, - u16 pending_idx) +static struct page *xenvif_alloc_page(struct xenvif *vif, + u16 pending_idx) { struct page *page; - page = alloc_page(GFP_KERNEL|__GFP_COLD); + + page = alloc_page(GFP_ATOMIC|__GFP_COLD); if (!page) return NULL; - set_page_ext(page, netbk, pending_idx); - netbk->mmap_pages[pending_idx] = page; + vif->mmap_pages[pending_idx] = page; + return page; } -static struct gnttab_copy *xen_netbk_get_requests(struct xen_netbk *netbk, - struct xenvif *vif, - struct sk_buff *skb, - struct xen_netif_tx_request *txp, - struct gnttab_copy *gop) +static struct gnttab_copy *xenvif_get_requests(struct xenvif *vif, + struct sk_buff *skb, + struct xen_netif_tx_request *txp, + struct gnttab_copy *gop) { struct skb_shared_info *shinfo = skb_shinfo(skb); skb_frag_t *frags = shinfo->frags; @@ -1079,14 +836,14 @@ static struct gnttab_copy *xen_netbk_get_requests(struct xen_netbk *netbk, /* Coalesce tx requests, at this point the packet passed in * should be <= 64K. Any packets larger than 64K have been - * handled in netbk_count_requests(). + * handled in xenvif_count_requests(). */ for (shinfo->nr_frags = slot = start; slot < nr_slots; shinfo->nr_frags++) { struct pending_tx_info *pending_tx_info = - netbk->pending_tx_info; + vif->pending_tx_info; - page = alloc_page(GFP_KERNEL|__GFP_COLD); + page = alloc_page(GFP_ATOMIC|__GFP_COLD); if (!page) goto err; @@ -1121,21 +878,18 @@ static struct gnttab_copy *xen_netbk_get_requests(struct xen_netbk *netbk, gop->len = txp->size; dst_offset += gop->len; - index = pending_index(netbk->pending_cons++); + index = pending_index(vif->pending_cons++); - pending_idx = netbk->pending_ring[index]; + pending_idx = vif->pending_ring[index]; memcpy(&pending_tx_info[pending_idx].req, txp, sizeof(*txp)); - xenvif_get(vif); - - pending_tx_info[pending_idx].vif = vif; /* Poison these fields, corresponding * fields for head tx req will be set * to correct values after the loop. 
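
Note the allocation-flag change in xenvif_alloc_page() above: GFP_KERNEL became GFP_ATOMIC because TX processing now runs from NAPI poll, i.e. softirq context, where sleeping is forbidden. A sketch of the distinction (the helper itself is illustrative):

    #include <linux/gfp.h>
    #include <linux/mm.h>

    static struct page *grab_page(bool atomic_context)
    {
            /* GFP_KERNEL may sleep to reclaim memory; GFP_ATOMIC may not,
             * so it fails more readily and callers must handle NULL.
             * __GFP_COLD (as above) hints that a cache-cold page is fine. */
            gfp_t flags = atomic_context ? GFP_ATOMIC : GFP_KERNEL;

            return alloc_page(flags | __GFP_COLD);
    }
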
*/ - netbk->mmap_pages[pending_idx] = (void *)(~0UL); + vif->mmap_pages[pending_idx] = (void *)(~0UL); pending_tx_info[pending_idx].head = INVALID_PENDING_RING_IDX; @@ -1155,8 +909,7 @@ static struct gnttab_copy *xen_netbk_get_requests(struct xen_netbk *netbk, first->req.offset = 0; first->req.size = dst_offset; first->head = start_idx; - set_page_ext(page, netbk, head_idx); - netbk->mmap_pages[head_idx] = page; + vif->mmap_pages[head_idx] = page; frag_set_pending_idx(&frags[shinfo->nr_frags], head_idx); } @@ -1166,20 +919,20 @@ static struct gnttab_copy *xen_netbk_get_requests(struct xen_netbk *netbk, err: /* Unwind, freeing all pages and sending error responses. */ while (shinfo->nr_frags-- > start) { - xen_netbk_idx_release(netbk, + xenvif_idx_release(vif, frag_get_pending_idx(&frags[shinfo->nr_frags]), XEN_NETIF_RSP_ERROR); } /* The head too, if necessary. */ if (start) - xen_netbk_idx_release(netbk, pending_idx, XEN_NETIF_RSP_ERROR); + xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_ERROR); return NULL; } -static int xen_netbk_tx_check_gop(struct xen_netbk *netbk, - struct sk_buff *skb, - struct gnttab_copy **gopp) +static int xenvif_tx_check_gop(struct xenvif *vif, + struct sk_buff *skb, + struct gnttab_copy **gopp) { struct gnttab_copy *gop = *gopp; u16 pending_idx = *((u16 *)skb->data); @@ -1192,7 +945,7 @@ static int xen_netbk_tx_check_gop(struct xen_netbk *netbk, /* Check status of header. */ err = gop->status; if (unlikely(err)) - xen_netbk_idx_release(netbk, pending_idx, XEN_NETIF_RSP_ERROR); + xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_ERROR); /* Skip first skb fragment if it is on same page as header fragment. */ start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx); @@ -1202,7 +955,7 @@ static int xen_netbk_tx_check_gop(struct xen_netbk *netbk, pending_ring_idx_t head; pending_idx = frag_get_pending_idx(&shinfo->frags[i]); - tx_info = &netbk->pending_tx_info[pending_idx]; + tx_info = &vif->pending_tx_info[pending_idx]; head = tx_info->head; /* Check error status: if okay then remember grant handle. */ @@ -1210,18 +963,19 @@ static int xen_netbk_tx_check_gop(struct xen_netbk *netbk, newerr = (++gop)->status; if (newerr) break; - peek = netbk->pending_ring[pending_index(++head)]; - } while (!pending_tx_is_head(netbk, peek)); + peek = vif->pending_ring[pending_index(++head)]; + } while (!pending_tx_is_head(vif, peek)); if (likely(!newerr)) { /* Had a previous error? Invalidate this fragment. */ if (unlikely(err)) - xen_netbk_idx_release(netbk, pending_idx, XEN_NETIF_RSP_OKAY); + xenvif_idx_release(vif, pending_idx, + XEN_NETIF_RSP_OKAY); continue; } /* Error on this fragment: respond to client with an error. */ - xen_netbk_idx_release(netbk, pending_idx, XEN_NETIF_RSP_ERROR); + xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_ERROR); /* Not the first error? Preceding frags already invalidated. */ if (err) @@ -1229,10 +983,11 @@ static int xen_netbk_tx_check_gop(struct xen_netbk *netbk, /* First error: invalidate header and preceding fragments. */ pending_idx = *((u16 *)skb->data); - xen_netbk_idx_release(netbk, pending_idx, XEN_NETIF_RSP_OKAY); + xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_OKAY); for (j = start; j < i; j++) { pending_idx = frag_get_pending_idx(&shinfo->frags[j]); - xen_netbk_idx_release(netbk, pending_idx, XEN_NETIF_RSP_OKAY); + xenvif_idx_release(vif, pending_idx, + XEN_NETIF_RSP_OKAY); } /* Remember the error: invalidate all subsequent fragments. 
*/ @@ -1243,7 +998,7 @@ static int xen_netbk_tx_check_gop(struct xen_netbk *netbk, return err; } -static void xen_netbk_fill_frags(struct xen_netbk *netbk, struct sk_buff *skb) +static void xenvif_fill_frags(struct xenvif *vif, struct sk_buff *skb) { struct skb_shared_info *shinfo = skb_shinfo(skb); int nr_frags = shinfo->nr_frags; @@ -1257,20 +1012,20 @@ static void xen_netbk_fill_frags(struct xen_netbk *netbk, struct sk_buff *skb) pending_idx = frag_get_pending_idx(frag); - txp = &netbk->pending_tx_info[pending_idx].req; - page = virt_to_page(idx_to_kaddr(netbk, pending_idx)); + txp = &vif->pending_tx_info[pending_idx].req; + page = virt_to_page(idx_to_kaddr(vif, pending_idx)); __skb_fill_page_desc(skb, i, page, txp->offset, txp->size); skb->len += txp->size; skb->data_len += txp->size; skb->truesize += txp->size; - /* Take an extra reference to offset xen_netbk_idx_release */ - get_page(netbk->mmap_pages[pending_idx]); - xen_netbk_idx_release(netbk, pending_idx, XEN_NETIF_RSP_OKAY); + /* Take an extra reference to offset xenvif_idx_release */ + get_page(vif->mmap_pages[pending_idx]); + xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_OKAY); } } -static int xen_netbk_get_extras(struct xenvif *vif, +static int xenvif_get_extras(struct xenvif *vif, struct xen_netif_extra_info *extras, int work_to_do) { @@ -1280,7 +1035,7 @@ static int xen_netbk_get_extras(struct xenvif *vif, do { if (unlikely(work_to_do-- <= 0)) { netdev_err(vif->dev, "Missing extra info\n"); - netbk_fatal_tx_err(vif); + xenvif_fatal_tx_err(vif); return -EBADR; } @@ -1291,7 +1046,7 @@ static int xen_netbk_get_extras(struct xenvif *vif, vif->tx.req_cons = ++cons; netdev_err(vif->dev, "Invalid extra type: %d\n", extra.type); - netbk_fatal_tx_err(vif); + xenvif_fatal_tx_err(vif); return -EINVAL; } @@ -1302,20 +1057,20 @@ static int xen_netbk_get_extras(struct xenvif *vif, return work_to_do; } -static int netbk_set_skb_gso(struct xenvif *vif, - struct sk_buff *skb, - struct xen_netif_extra_info *gso) +static int xenvif_set_skb_gso(struct xenvif *vif, + struct sk_buff *skb, + struct xen_netif_extra_info *gso) { if (!gso->u.gso.size) { netdev_err(vif->dev, "GSO size must not be zero.\n"); - netbk_fatal_tx_err(vif); + xenvif_fatal_tx_err(vif); return -EINVAL; } /* Currently only TCPv4 S.O. is supported. */ if (gso->u.gso.type != XEN_NETIF_GSO_TYPE_TCPV4) { netdev_err(vif->dev, "Bad GSO type %d.\n", gso->u.gso.type); - netbk_fatal_tx_err(vif); + xenvif_fatal_tx_err(vif); return -EINVAL; } @@ -1426,16 +1181,14 @@ static bool tx_credit_exceeded(struct xenvif *vif, unsigned size) return false; } -static unsigned xen_netbk_tx_build_gops(struct xen_netbk *netbk) +static unsigned xenvif_tx_build_gops(struct xenvif *vif) { - struct gnttab_copy *gop = netbk->tx_copy_ops, *request_gop; + struct gnttab_copy *gop = vif->tx_copy_ops, *request_gop; struct sk_buff *skb; int ret; - while ((nr_pending_reqs(netbk) + XEN_NETBK_LEGACY_SLOTS_MAX - < MAX_PENDING_REQS) && - !list_empty(&netbk->net_schedule_list)) { - struct xenvif *vif; + while ((nr_pending_reqs(vif) + XEN_NETBK_LEGACY_SLOTS_MAX + < MAX_PENDING_REQS)) { struct xen_netif_tx_request txreq; struct xen_netif_tx_request txfrags[XEN_NETBK_LEGACY_SLOTS_MAX]; struct page *page; @@ -1446,16 +1199,6 @@ static unsigned xen_netbk_tx_build_gops(struct xen_netbk *netbk) unsigned int data_len; pending_ring_idx_t index; - /* Get a netif from the list with work to do. 
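
xenvif_set_skb_gso() above only shows the validation side in this hunk; applying an accepted TCPv4 GSO descriptor to the skb typically looks like the following (an illustrative completion, not a quote from the patch):

    #include <linux/skbuff.h>

    static void apply_gso_sketch(struct sk_buff *skb, u16 gso_size)
    {
            skb_shinfo(skb)->gso_size = gso_size;
            skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;

            /* leaving gso_segs at zero lets the stack compute the
             * segment count when the skb is actually segmented */
            skb_shinfo(skb)->gso_segs = 0;
    }
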
*/ - vif = poll_net_schedule_list(netbk); - /* This can sometimes happen because the test of - * list_empty(net_schedule_list) at the top of the - * loop is unlocked. Just go back and have another - * look. - */ - if (!vif) - continue; - if (vif->tx.sring->req_prod - vif->tx.req_cons > XEN_NETIF_TX_RING_SIZE) { netdev_err(vif->dev, @@ -1463,15 +1206,13 @@ static unsigned xen_netbk_tx_build_gops(struct xen_netbk *netbk) "req_prod %d, req_cons %d, size %ld\n", vif->tx.sring->req_prod, vif->tx.req_cons, XEN_NETIF_TX_RING_SIZE); - netbk_fatal_tx_err(vif); + xenvif_fatal_tx_err(vif); continue; } RING_FINAL_CHECK_FOR_REQUESTS(&vif->tx, work_to_do); - if (!work_to_do) { - xenvif_put(vif); - continue; - } + if (!work_to_do) + break; idx = vif->tx.req_cons; rmb(); /* Ensure that we see the request before we copy it. */ @@ -1479,10 +1220,8 @@ static unsigned xen_netbk_tx_build_gops(struct xen_netbk *netbk) /* Credit-based scheduling. */ if (txreq.size > vif->remaining_credit && - tx_credit_exceeded(vif, txreq.size)) { - xenvif_put(vif); - continue; - } + tx_credit_exceeded(vif, txreq.size)) + break; vif->remaining_credit -= txreq.size; @@ -1491,24 +1230,24 @@ static unsigned xen_netbk_tx_build_gops(struct xen_netbk *netbk) memset(extras, 0, sizeof(extras)); if (txreq.flags & XEN_NETTXF_extra_info) { - work_to_do = xen_netbk_get_extras(vif, extras, - work_to_do); + work_to_do = xenvif_get_extras(vif, extras, + work_to_do); idx = vif->tx.req_cons; if (unlikely(work_to_do < 0)) - continue; + break; } - ret = netbk_count_requests(vif, &txreq, txfrags, work_to_do); + ret = xenvif_count_requests(vif, &txreq, txfrags, work_to_do); if (unlikely(ret < 0)) - continue; + break; idx += ret; if (unlikely(txreq.size < ETH_HLEN)) { netdev_dbg(vif->dev, "Bad packet size: %d\n", txreq.size); - netbk_tx_err(vif, &txreq, idx); - continue; + xenvif_tx_err(vif, &txreq, idx); + break; } /* No crossing a page as the payload mustn't fragment. */ @@ -1517,12 +1256,12 @@ static unsigned xen_netbk_tx_build_gops(struct xen_netbk *netbk) "txreq.offset: %x, size: %u, end: %lu\n", txreq.offset, txreq.size, (txreq.offset&~PAGE_MASK) + txreq.size); - netbk_fatal_tx_err(vif); - continue; + xenvif_fatal_tx_err(vif); + break; } - index = pending_index(netbk->pending_cons); - pending_idx = netbk->pending_ring[index]; + index = pending_index(vif->pending_cons); + pending_idx = vif->pending_ring[index]; data_len = (txreq.size > PKT_PROT_LEN && ret < XEN_NETBK_LEGACY_SLOTS_MAX) ? @@ -1533,7 +1272,7 @@ static unsigned xen_netbk_tx_build_gops(struct xen_netbk *netbk) if (unlikely(skb == NULL)) { netdev_dbg(vif->dev, "Can't allocate a skb in start_xmit.\n"); - netbk_tx_err(vif, &txreq, idx); + xenvif_tx_err(vif, &txreq, idx); break; } @@ -1544,19 +1283,19 @@ static unsigned xen_netbk_tx_build_gops(struct xen_netbk *netbk) struct xen_netif_extra_info *gso; gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1]; - if (netbk_set_skb_gso(vif, skb, gso)) { - /* Failure in netbk_set_skb_gso is fatal. */ + if (xenvif_set_skb_gso(vif, skb, gso)) { + /* Failure in xenvif_set_skb_gso is fatal. 
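
The credit check gating the loop above is a simple byte-budget (token-bucket) scheme: remaining_credit is spent per request and refilled by the credit timer via tx_add_credit()/tx_credit_callback() earlier in this file. A reduced sketch of the spend side (helper name invented):

    static bool credit_ok(unsigned long *remaining_credit, unsigned int size)
    {
            if (size > *remaining_credit)
                    return false;           /* defer until the timer refills */

            *remaining_credit -= size;
            return true;
    }

The real tx_credit_exceeded() also arms the credit timer before bailing out (not shown in this hunk), so a throttled vif is guaranteed to be revisited.
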
*/ kfree_skb(skb); - continue; + break; } } /* XXX could copy straight to head */ - page = xen_netbk_alloc_page(netbk, pending_idx); + page = xenvif_alloc_page(vif, pending_idx); if (!page) { kfree_skb(skb); - netbk_tx_err(vif, &txreq, idx); - continue; + xenvif_tx_err(vif, &txreq, idx); + break; } gop->source.u.ref = txreq.gref; @@ -1572,10 +1311,9 @@ static unsigned xen_netbk_tx_build_gops(struct xen_netbk *netbk) gop++; - memcpy(&netbk->pending_tx_info[pending_idx].req, + memcpy(&vif->pending_tx_info[pending_idx].req, &txreq, sizeof(txreq)); - netbk->pending_tx_info[pending_idx].vif = vif; - netbk->pending_tx_info[pending_idx].head = index; + vif->pending_tx_info[pending_idx].head = index; *((u16 *)skb->data) = pending_idx; __skb_put(skb, data_len); @@ -1590,46 +1328,45 @@ static unsigned xen_netbk_tx_build_gops(struct xen_netbk *netbk) INVALID_PENDING_IDX); } - netbk->pending_cons++; + vif->pending_cons++; - request_gop = xen_netbk_get_requests(netbk, vif, - skb, txfrags, gop); + request_gop = xenvif_get_requests(vif, skb, txfrags, gop); if (request_gop == NULL) { kfree_skb(skb); - netbk_tx_err(vif, &txreq, idx); - continue; + xenvif_tx_err(vif, &txreq, idx); + break; } gop = request_gop; - __skb_queue_tail(&netbk->tx_queue, skb); + __skb_queue_tail(&vif->tx_queue, skb); vif->tx.req_cons = idx; - xen_netbk_check_rx_xenvif(vif); - if ((gop-netbk->tx_copy_ops) >= ARRAY_SIZE(netbk->tx_copy_ops)) + if ((gop-vif->tx_copy_ops) >= ARRAY_SIZE(vif->tx_copy_ops)) break; } - return gop - netbk->tx_copy_ops; + return gop - vif->tx_copy_ops; } -static void xen_netbk_tx_submit(struct xen_netbk *netbk) + +static int xenvif_tx_submit(struct xenvif *vif, int budget) { - struct gnttab_copy *gop = netbk->tx_copy_ops; + struct gnttab_copy *gop = vif->tx_copy_ops; struct sk_buff *skb; + int work_done = 0; - while ((skb = __skb_dequeue(&netbk->tx_queue)) != NULL) { + while (work_done < budget && + (skb = __skb_dequeue(&vif->tx_queue)) != NULL) { struct xen_netif_tx_request *txp; - struct xenvif *vif; u16 pending_idx; unsigned data_len; pending_idx = *((u16 *)skb->data); - vif = netbk->pending_tx_info[pending_idx].vif; - txp = &netbk->pending_tx_info[pending_idx].req; + txp = &vif->pending_tx_info[pending_idx].req; /* Check the remap error code. */ - if (unlikely(xen_netbk_tx_check_gop(netbk, skb, &gop))) { + if (unlikely(xenvif_tx_check_gop(vif, skb, &gop))) { netdev_dbg(vif->dev, "netback grant failed.\n"); skb_shinfo(skb)->nr_frags = 0; kfree_skb(skb); @@ -1638,7 +1375,7 @@ static void xen_netbk_tx_submit(struct xen_netbk *netbk) data_len = skb->len; memcpy(skb->data, - (void *)(idx_to_kaddr(netbk, pending_idx)|txp->offset), + (void *)(idx_to_kaddr(vif, pending_idx)|txp->offset), data_len); if (data_len < txp->size) { /* Append the packet payload as a fragment. */ @@ -1646,7 +1383,8 @@ static void xen_netbk_tx_submit(struct xen_netbk *netbk) txp->size -= data_len; } else { /* Schedule a response immediately. 
*/ - xen_netbk_idx_release(netbk, pending_idx, XEN_NETIF_RSP_OKAY); + xenvif_idx_release(vif, pending_idx, + XEN_NETIF_RSP_OKAY); } if (txp->flags & XEN_NETTXF_csum_blank) @@ -1654,7 +1392,7 @@ static void xen_netbk_tx_submit(struct xen_netbk *netbk) else if (txp->flags & XEN_NETTXF_data_validated) skb->ip_summed = CHECKSUM_UNNECESSARY; - xen_netbk_fill_frags(netbk, skb); + xenvif_fill_frags(vif, skb); /* * If the initial fragment was < PKT_PROT_LEN then @@ -1682,53 +1420,61 @@ static void xen_netbk_tx_submit(struct xen_netbk *netbk) vif->dev->stats.rx_bytes += skb->len; vif->dev->stats.rx_packets++; - xenvif_receive_skb(vif, skb); + work_done++; + + netif_receive_skb(skb); } + + return work_done; } /* Called after netfront has transmitted */ -static void xen_netbk_tx_action(struct xen_netbk *netbk) +int xenvif_tx_action(struct xenvif *vif, int budget) { unsigned nr_gops; + int work_done; + + if (unlikely(!tx_work_todo(vif))) + return 0; - nr_gops = xen_netbk_tx_build_gops(netbk); + nr_gops = xenvif_tx_build_gops(vif); if (nr_gops == 0) - return; + return 0; - gnttab_batch_copy(netbk->tx_copy_ops, nr_gops); + gnttab_batch_copy(vif->tx_copy_ops, nr_gops); - xen_netbk_tx_submit(netbk); + work_done = xenvif_tx_submit(vif, nr_gops); + + return work_done; } -static void xen_netbk_idx_release(struct xen_netbk *netbk, u16 pending_idx, - u8 status) +static void xenvif_idx_release(struct xenvif *vif, u16 pending_idx, + u8 status) { - struct xenvif *vif; struct pending_tx_info *pending_tx_info; pending_ring_idx_t head; u16 peek; /* peek into next tx request */ - BUG_ON(netbk->mmap_pages[pending_idx] == (void *)(~0UL)); + BUG_ON(vif->mmap_pages[pending_idx] == (void *)(~0UL)); /* Already complete? */ - if (netbk->mmap_pages[pending_idx] == NULL) + if (vif->mmap_pages[pending_idx] == NULL) return; - pending_tx_info = &netbk->pending_tx_info[pending_idx]; + pending_tx_info = &vif->pending_tx_info[pending_idx]; - vif = pending_tx_info->vif; head = pending_tx_info->head; - BUG_ON(!pending_tx_is_head(netbk, head)); - BUG_ON(netbk->pending_ring[pending_index(head)] != pending_idx); + BUG_ON(!pending_tx_is_head(vif, head)); + BUG_ON(vif->pending_ring[pending_index(head)] != pending_idx); do { pending_ring_idx_t index; pending_ring_idx_t idx = pending_index(head); - u16 info_idx = netbk->pending_ring[idx]; + u16 info_idx = vif->pending_ring[idx]; - pending_tx_info = &netbk->pending_tx_info[info_idx]; + pending_tx_info = &vif->pending_tx_info[info_idx]; make_tx_response(vif, &pending_tx_info->req, status); /* Setting any number other than @@ -1737,18 +1483,15 @@ static void xen_netbk_idx_release(struct xen_netbk *netbk, u16 pending_idx, */ pending_tx_info->head = 0; - index = pending_index(netbk->pending_prod++); - netbk->pending_ring[index] = netbk->pending_ring[info_idx]; - - xenvif_put(vif); + index = pending_index(vif->pending_prod++); + vif->pending_ring[index] = vif->pending_ring[info_idx]; - peek = netbk->pending_ring[pending_index(++head)]; + peek = vif->pending_ring[pending_index(++head)]; - } while (!pending_tx_is_head(netbk, peek)); + } while (!pending_tx_is_head(vif, peek)); - netbk->mmap_pages[pending_idx]->mapping = 0; - put_page(netbk->mmap_pages[pending_idx]); - netbk->mmap_pages[pending_idx] = NULL; + put_page(vif->mmap_pages[pending_idx]); + vif->mmap_pages[pending_idx] = NULL; } @@ -1796,46 +1539,23 @@ static struct xen_netif_rx_response *make_rx_response(struct xenvif *vif, return resp; } -static inline int rx_work_todo(struct xen_netbk *netbk) +static inline int rx_work_todo(struct xenvif 
*vif) { - return !skb_queue_empty(&netbk->rx_queue); + return !skb_queue_empty(&vif->rx_queue); } -static inline int tx_work_todo(struct xen_netbk *netbk) +static inline int tx_work_todo(struct xenvif *vif) { - if ((nr_pending_reqs(netbk) + XEN_NETBK_LEGACY_SLOTS_MAX - < MAX_PENDING_REQS) && - !list_empty(&netbk->net_schedule_list)) + if (likely(RING_HAS_UNCONSUMED_REQUESTS(&vif->tx)) && + (nr_pending_reqs(vif) + XEN_NETBK_LEGACY_SLOTS_MAX + < MAX_PENDING_REQS)) return 1; return 0; } -static int xen_netbk_kthread(void *data) -{ - struct xen_netbk *netbk = data; - while (!kthread_should_stop()) { - wait_event_interruptible(netbk->wq, - rx_work_todo(netbk) || - tx_work_todo(netbk) || - kthread_should_stop()); - cond_resched(); - - if (kthread_should_stop()) - break; - - if (rx_work_todo(netbk)) - xen_netbk_rx_action(netbk); - - if (tx_work_todo(netbk)) - xen_netbk_tx_action(netbk); - } - - return 0; -} - -void xen_netbk_unmap_frontend_rings(struct xenvif *vif) +void xenvif_unmap_frontend_rings(struct xenvif *vif) { if (vif->tx.sring) xenbus_unmap_ring_vfree(xenvif_to_xenbus_device(vif), @@ -1845,9 +1565,9 @@ void xen_netbk_unmap_frontend_rings(struct xenvif *vif) vif->rx.sring); } -int xen_netbk_map_frontend_rings(struct xenvif *vif, - grant_ref_t tx_ring_ref, - grant_ref_t rx_ring_ref) +int xenvif_map_frontend_rings(struct xenvif *vif, + grant_ref_t tx_ring_ref, + grant_ref_t rx_ring_ref) { void *addr; struct xen_netif_tx_sring *txs; @@ -1876,15 +1596,33 @@ int xen_netbk_map_frontend_rings(struct xenvif *vif, return 0; err: - xen_netbk_unmap_frontend_rings(vif); + xenvif_unmap_frontend_rings(vif); return err; } +int xenvif_kthread(void *data) +{ + struct xenvif *vif = data; + + while (!kthread_should_stop()) { + wait_event_interruptible(vif->wq, + rx_work_todo(vif) || + kthread_should_stop()); + if (kthread_should_stop()) + break; + + if (rx_work_todo(vif)) + xenvif_rx_action(vif); + + cond_resched(); + } + + return 0; +} + static int __init netback_init(void) { - int i; int rc = 0; - int group; if (!xen_domain()) return -ENODEV; @@ -1895,48 +1633,6 @@ static int __init netback_init(void) fatal_skb_slots = XEN_NETBK_LEGACY_SLOTS_MAX; } - xen_netbk_group_nr = num_online_cpus(); - xen_netbk = vzalloc(sizeof(struct xen_netbk) * xen_netbk_group_nr); - if (!xen_netbk) - return -ENOMEM; - - for (group = 0; group < xen_netbk_group_nr; group++) { - struct xen_netbk *netbk = &xen_netbk[group]; - skb_queue_head_init(&netbk->rx_queue); - skb_queue_head_init(&netbk->tx_queue); - - init_timer(&netbk->net_timer); - netbk->net_timer.data = (unsigned long)netbk; - netbk->net_timer.function = xen_netbk_alarm; - - netbk->pending_cons = 0; - netbk->pending_prod = MAX_PENDING_REQS; - for (i = 0; i < MAX_PENDING_REQS; i++) - netbk->pending_ring[i] = i; - - init_waitqueue_head(&netbk->wq); - netbk->task = kthread_create(xen_netbk_kthread, - (void *)netbk, - "netback/%u", group); - - if (IS_ERR(netbk->task)) { - pr_alert("kthread_create() fails at netback\n"); - del_timer(&netbk->net_timer); - rc = PTR_ERR(netbk->task); - goto failed_init; - } - - kthread_bind(netbk->task, group); - - INIT_LIST_HEAD(&netbk->net_schedule_list); - - spin_lock_init(&netbk->net_schedule_list_lock); - - atomic_set(&netbk->netfront_count, 0); - - wake_up_process(netbk->task); - } - rc = xenvif_xenbus_init(); if (rc) goto failed_init; @@ -1944,35 +1640,14 @@ static int __init netback_init(void) return 0; failed_init: - while (--group >= 0) { - struct xen_netbk *netbk = &xen_netbk[group]; - del_timer(&netbk->net_timer); - 
kthread_stop(netbk->task); - } - vfree(xen_netbk); return rc; - } module_init(netback_init); static void __exit netback_fini(void) { - int i, j; - xenvif_xenbus_fini(); - - for (i = 0; i < xen_netbk_group_nr; i++) { - struct xen_netbk *netbk = &xen_netbk[i]; - del_timer_sync(&netbk->net_timer); - kthread_stop(netbk->task); - for (j = 0; j < MAX_PENDING_REQS; j++) { - if (netbk->mmap_pages[j]) - __free_page(netbk->mmap_pages[j]); - } - } - - vfree(xen_netbk); } module_exit(netback_fini); diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index 6bb7cf2..b10ba00 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -392,6 +392,8 @@ static void __unflatten_device_tree(struct boot_param_header *blob, mem = (unsigned long) dt_alloc(size + 4, __alignof__(struct device_node)); + memset((void *)mem, 0, size); + ((__be32 *)mem)[size / 4] = cpu_to_be32(0xdeadbeef); pr_debug(" unflattening %lx...\n", mem); diff --git a/drivers/pinctrl/pinctrl-sunxi.c b/drivers/pinctrl/pinctrl-sunxi.c index c47fd1e..94716c7 100644 --- a/drivers/pinctrl/pinctrl-sunxi.c +++ b/drivers/pinctrl/pinctrl-sunxi.c @@ -278,6 +278,7 @@ static int sunxi_pconf_group_set(struct pinctrl_dev *pctldev, { struct sunxi_pinctrl *pctl = pinctrl_dev_get_drvdata(pctldev); struct sunxi_pinctrl_group *g = &pctl->groups[group]; + unsigned long flags; u32 val, mask; u16 strength; u8 dlevel; @@ -295,22 +296,35 @@ static int sunxi_pconf_group_set(struct pinctrl_dev *pctldev, * 3: 40mA */ dlevel = strength / 10 - 1; + + spin_lock_irqsave(&pctl->lock, flags); + val = readl(pctl->membase + sunxi_dlevel_reg(g->pin)); mask = DLEVEL_PINS_MASK << sunxi_dlevel_offset(g->pin); writel((val & ~mask) | dlevel << sunxi_dlevel_offset(g->pin), pctl->membase + sunxi_dlevel_reg(g->pin)); + + spin_unlock_irqrestore(&pctl->lock, flags); break; case PIN_CONFIG_BIAS_PULL_UP: + spin_lock_irqsave(&pctl->lock, flags); + val = readl(pctl->membase + sunxi_pull_reg(g->pin)); mask = PULL_PINS_MASK << sunxi_pull_offset(g->pin); writel((val & ~mask) | 1 << sunxi_pull_offset(g->pin), pctl->membase + sunxi_pull_reg(g->pin)); + + spin_unlock_irqrestore(&pctl->lock, flags); break; case PIN_CONFIG_BIAS_PULL_DOWN: + spin_lock_irqsave(&pctl->lock, flags); + val = readl(pctl->membase + sunxi_pull_reg(g->pin)); mask = PULL_PINS_MASK << sunxi_pull_offset(g->pin); writel((val & ~mask) | 2 << sunxi_pull_offset(g->pin), pctl->membase + sunxi_pull_reg(g->pin)); + + spin_unlock_irqrestore(&pctl->lock, flags); break; default: break; @@ -360,11 +374,17 @@ static void sunxi_pmx_set(struct pinctrl_dev *pctldev, u8 config) { struct sunxi_pinctrl *pctl = pinctrl_dev_get_drvdata(pctldev); + unsigned long flags; + u32 val, mask; + + spin_lock_irqsave(&pctl->lock, flags); - u32 val = readl(pctl->membase + sunxi_mux_reg(pin)); - u32 mask = MUX_PINS_MASK << sunxi_mux_offset(pin); + val = readl(pctl->membase + sunxi_mux_reg(pin)); + mask = MUX_PINS_MASK << sunxi_mux_offset(pin); writel((val & ~mask) | config << sunxi_mux_offset(pin), pctl->membase + sunxi_mux_reg(pin)); + + spin_unlock_irqrestore(&pctl->lock, flags); } static int sunxi_pmx_enable(struct pinctrl_dev *pctldev, @@ -464,8 +484,21 @@ static void sunxi_pinctrl_gpio_set(struct gpio_chip *chip, struct sunxi_pinctrl *pctl = dev_get_drvdata(chip->dev); u32 reg = sunxi_data_reg(offset); u8 index = sunxi_data_offset(offset); + unsigned long flags; + u32 regval; + + spin_lock_irqsave(&pctl->lock, flags); + + regval = readl(pctl->membase + reg); - writel((value & DATA_PINS_MASK) << index, pctl->membase + reg); + if (value) + regval |= BIT(index); + 
else + regval &= ~(BIT(index)); + + writel(regval, pctl->membase + reg); + + spin_unlock_irqrestore(&pctl->lock, flags); } static int sunxi_pinctrl_gpio_of_xlate(struct gpio_chip *gc, @@ -526,6 +559,8 @@ static int sunxi_pinctrl_irq_set_type(struct irq_data *d, struct sunxi_pinctrl *pctl = irq_data_get_irq_chip_data(d); u32 reg = sunxi_irq_cfg_reg(d->hwirq); u8 index = sunxi_irq_cfg_offset(d->hwirq); + unsigned long flags; + u32 regval; u8 mode; switch (type) { @@ -548,7 +583,13 @@ static int sunxi_pinctrl_irq_set_type(struct irq_data *d, return -EINVAL; } - writel((mode & IRQ_CFG_IRQ_MASK) << index, pctl->membase + reg); + spin_lock_irqsave(&pctl->lock, flags); + + regval = readl(pctl->membase + reg); + regval &= ~IRQ_CFG_IRQ_MASK; + writel(regval | (mode << index), pctl->membase + reg); + + spin_unlock_irqrestore(&pctl->lock, flags); return 0; } @@ -560,14 +601,19 @@ static void sunxi_pinctrl_irq_mask_ack(struct irq_data *d) u8 ctrl_idx = sunxi_irq_ctrl_offset(d->hwirq); u32 status_reg = sunxi_irq_status_reg(d->hwirq); u8 status_idx = sunxi_irq_status_offset(d->hwirq); + unsigned long flags; u32 val; + spin_lock_irqsave(&pctl->lock, flags); + /* Mask the IRQ */ val = readl(pctl->membase + ctrl_reg); writel(val & ~(1 << ctrl_idx), pctl->membase + ctrl_reg); /* Clear the IRQ */ writel(1 << status_idx, pctl->membase + status_reg); + + spin_unlock_irqrestore(&pctl->lock, flags); } static void sunxi_pinctrl_irq_mask(struct irq_data *d) @@ -575,11 +621,16 @@ static void sunxi_pinctrl_irq_mask(struct irq_data *d) struct sunxi_pinctrl *pctl = irq_data_get_irq_chip_data(d); u32 reg = sunxi_irq_ctrl_reg(d->hwirq); u8 idx = sunxi_irq_ctrl_offset(d->hwirq); + unsigned long flags; u32 val; + spin_lock_irqsave(&pctl->lock, flags); + /* Mask the IRQ */ val = readl(pctl->membase + reg); writel(val & ~(1 << idx), pctl->membase + reg); + + spin_unlock_irqrestore(&pctl->lock, flags); } static void sunxi_pinctrl_irq_unmask(struct irq_data *d) @@ -588,6 +639,7 @@ static void sunxi_pinctrl_irq_unmask(struct irq_data *d) struct sunxi_desc_function *func; u32 reg = sunxi_irq_ctrl_reg(d->hwirq); u8 idx = sunxi_irq_ctrl_offset(d->hwirq); + unsigned long flags; u32 val; func = sunxi_pinctrl_desc_find_function_by_pin(pctl, @@ -597,9 +649,13 @@ static void sunxi_pinctrl_irq_unmask(struct irq_data *d) /* Change muxing to INT mode */ sunxi_pmx_set(pctl->pctl_dev, pctl->irq_array[d->hwirq], func->muxval); + spin_lock_irqsave(&pctl->lock, flags); + /* Unmask the IRQ */ val = readl(pctl->membase + reg); writel(val | (1 << idx), pctl->membase + reg); + + spin_unlock_irqrestore(&pctl->lock, flags); } static struct irq_chip sunxi_pinctrl_irq_chip = { @@ -752,6 +808,8 @@ static int sunxi_pinctrl_probe(struct platform_device *pdev) return -ENOMEM; platform_set_drvdata(pdev, pctl); + spin_lock_init(&pctl->lock); + pctl->membase = of_iomap(node, 0); if (!pctl->membase) return -ENOMEM; diff --git a/drivers/pinctrl/pinctrl-sunxi.h b/drivers/pinctrl/pinctrl-sunxi.h index d68047d..01c494f 100644 --- a/drivers/pinctrl/pinctrl-sunxi.h +++ b/drivers/pinctrl/pinctrl-sunxi.h @@ -14,6 +14,7 @@ #define __PINCTRL_SUNXI_H #include <linux/kernel.h> +#include <linux/spinlock.h> #define PA_BASE 0 #define PB_BASE 32 @@ -407,6 +408,7 @@ struct sunxi_pinctrl { unsigned ngroups; int irq; int irq_array[SUNXI_IRQ_NUMBER]; + spinlock_t lock; struct pinctrl_dev *pctl_dev; }; diff --git a/drivers/platform/olpc/olpc-ec.c b/drivers/platform/olpc/olpc-ec.c index 0f9f859..f911952 100644 --- a/drivers/platform/olpc/olpc-ec.c +++ 
b/drivers/platform/olpc/olpc-ec.c @@ -330,7 +330,7 @@ static int __init olpc_ec_init_module(void) return platform_driver_register(&olpc_ec_plat_driver); } -module_init(olpc_ec_init_module); +arch_initcall(olpc_ec_init_module); MODULE_AUTHOR("Andres Salomon <dilinger@queued.net>"); MODULE_LICENSE("GPL"); diff --git a/drivers/platform/x86/hp-wmi.c b/drivers/platform/x86/hp-wmi.c index 97bb05e..d6970f4 100644 --- a/drivers/platform/x86/hp-wmi.c +++ b/drivers/platform/x86/hp-wmi.c @@ -53,7 +53,6 @@ MODULE_ALIAS("wmi:5FB7F034-2C63-45e9-BE91-3D44E2C707E4"); #define HPWMI_ALS_QUERY 0x3 #define HPWMI_HARDWARE_QUERY 0x4 #define HPWMI_WIRELESS_QUERY 0x5 -#define HPWMI_BIOS_QUERY 0x9 #define HPWMI_HOTKEY_QUERY 0xc #define HPWMI_WIRELESS2_QUERY 0x1b #define HPWMI_POSTCODEERROR_QUERY 0x2a @@ -293,19 +292,6 @@ static int hp_wmi_tablet_state(void) return (state & 0x4) ? 1 : 0; } -static int hp_wmi_enable_hotkeys(void) -{ - int ret; - int query = 0x6e; - - ret = hp_wmi_perform_query(HPWMI_BIOS_QUERY, 1, &query, sizeof(query), - 0); - - if (ret) - return -EINVAL; - return 0; -} - static int hp_wmi_set_block(void *data, bool blocked) { enum hp_wmi_radio r = (enum hp_wmi_radio) data; @@ -1009,8 +995,6 @@ static int __init hp_wmi_init(void) err = hp_wmi_input_setup(); if (err) return err; - - hp_wmi_enable_hotkeys(); } if (bios_capable) { diff --git a/drivers/platform/x86/sony-laptop.c b/drivers/platform/x86/sony-laptop.c index 2ac045f..3a1b6bf 100644 --- a/drivers/platform/x86/sony-laptop.c +++ b/drivers/platform/x86/sony-laptop.c @@ -2440,7 +2440,10 @@ static ssize_t sony_nc_gfx_switch_status_show(struct device *dev, if (pos < 0) return pos; - return snprintf(buffer, PAGE_SIZE, "%s\n", pos ? "speed" : "stamina"); + return snprintf(buffer, PAGE_SIZE, "%s\n", + pos == SPEED ? "speed" : + pos == STAMINA ? "stamina" : + pos == AUTO ? "auto" : "unknown"); } static int sony_nc_gfx_switch_setup(struct platform_device *pd, @@ -4320,7 +4323,8 @@ static int sony_pic_add(struct acpi_device *device) goto err_free_resources; } - if (sonypi_compat_init()) + result = sonypi_compat_init(); + if (result) goto err_remove_input; /* request io port */ diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index e58cf00..448efe0 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -13,7 +13,7 @@ #include <linux/eventfd.h> #include <linux/vhost.h> -#include <linux/socket.h> /* memcpy_fromiovec */ +#include <linux/uio.h> #include <linux/mm.h> #include <linux/mmu_context.h> #include <linux/miscdevice.h> diff --git a/drivers/xen/events.c b/drivers/xen/events.c index a58ac43..5e8be46 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -348,7 +348,7 @@ static void init_evtchn_cpu_bindings(void) for_each_possible_cpu(i) memset(per_cpu(cpu_evtchn_mask, i), - (i == 0) ? ~0 : 0, sizeof(*per_cpu(cpu_evtchn_mask, i))); + (i == 0) ? ~0 : 0, NR_EVENT_CHANNELS/8); } static inline void clear_evtchn(int port) @@ -1493,8 +1493,10 @@ void rebind_evtchn_irq(int evtchn, int irq) /* Rebind an evtchn so that it gets delivered to a specific cpu */ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu) { + struct shared_info *s = HYPERVISOR_shared_info; struct evtchn_bind_vcpu bind_vcpu; int evtchn = evtchn_from_irq(irq); + int masked; if (!VALID_EVTCHN(evtchn)) return -1; @@ -1511,6 +1513,12 @@ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu) bind_vcpu.vcpu = tcpu; /* + * Mask the event while changing the VCPU binding to prevent + * it being delivered on an unexpected VCPU. 
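
The comment just added describes a mask-around-update idiom: atomically set the mask bit while remembering whether it was already set, perform the racy rebind, then unmask only if we were the ones who masked. A generic sketch; all three helpers are hypothetical stand-ins for the sync_test_and_set_bit() on evtchn_mask, the EVTCHNOP_bind_vcpu hypercall, and unmask_evtchn() in the code that follows:

    #include <linux/types.h>

    static bool test_and_set_mask_sketch(int evtchn);          /* hypothetical */
    static void rebind_sketch(int evtchn, unsigned int tcpu);  /* hypothetical */
    static void unmask_sketch(int evtchn);                     /* hypothetical */

    static void rebind_safely(int evtchn, unsigned int tcpu)
    {
            bool was_masked = test_and_set_mask_sketch(evtchn);

            /* no event can be delivered while the channel is masked */
            rebind_sketch(evtchn, tcpu);

            if (!was_masked)                /* only undo a mask we set */
                    unmask_sketch(evtchn);
    }
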
+ */ + masked = sync_test_and_set_bit(evtchn, BM(s->evtchn_mask)); + + /* * If this fails, it usually just indicates that we're dealing with a * virq or IPI channel, which don't actually need to be rebound. Ignore * it, but don't do the xenlinux-level rebind in that case. @@ -1518,6 +1526,9 @@ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu) if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0) bind_evtchn_to_cpu(evtchn, tcpu); + if (!masked) + unmask_evtchn(evtchn); + return 0; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b577e45..0ab26fb 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2086,6 +2086,7 @@ extern int ext4_sync_inode(handle_t *, struct inode *); extern void ext4_dirty_inode(struct inode *, int); extern int ext4_change_inode_journal_flag(struct inode *, int); extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); +extern int ext4_inode_attach_jinode(struct inode *inode); extern int ext4_can_truncate(struct inode *inode); extern void ext4_truncate(struct inode *); extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length); diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 72a3600..17ac112 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -255,10 +255,10 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, set_buffer_prio(bh); if (ext4_handle_valid(handle)) { err = jbd2_journal_dirty_metadata(handle, bh); - if (err) { - /* Errors can only happen if there is a bug */ - handle->h_err = err; - __ext4_journal_stop(where, line, handle); + /* Errors can only happen if there is a bug */ + if (WARN_ON_ONCE(err)) { + ext4_journal_abort_handle(where, line, __func__, bh, + handle, err); } } else { if (inode) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 6f4cc56..319c9d2 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -219,7 +219,6 @@ static int ext4_file_open(struct inode * inode, struct file * filp) { struct super_block *sb = inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - struct ext4_inode_info *ei = EXT4_I(inode); struct vfsmount *mnt = filp->f_path.mnt; struct path path; char buf[64], *cp; @@ -259,22 +258,10 @@ static int ext4_file_open(struct inode * inode, struct file * filp) * Set up the jbd2_inode if we are opening the inode for * writing and the journal is present */ - if (sbi->s_journal && !ei->jinode && (filp->f_mode & FMODE_WRITE)) { - struct jbd2_inode *jinode = jbd2_alloc_inode(GFP_KERNEL); - - spin_lock(&inode->i_lock); - if (!ei->jinode) { - if (!jinode) { - spin_unlock(&inode->i_lock); - return -ENOMEM; - } - ei->jinode = jinode; - jbd2_journal_init_jbd_inode(ei->jinode, inode); - jinode = NULL; - } - spin_unlock(&inode->i_lock); - if (unlikely(jinode != NULL)) - jbd2_free_inode(jinode); + if (filp->f_mode & FMODE_WRITE) { + int ret = ext4_inode_attach_jinode(inode); + if (ret < 0) + return ret; } return dquot_file_open(inode, filp); } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index dd32a2e..c2ca04e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3533,6 +3533,18 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) offset; } + if (offset & (sb->s_blocksize - 1) || + (offset + length) & (sb->s_blocksize - 1)) { + /* + * Attach jinode to inode for jbd2 if we do any zeroing of + * partial block + */ + ret = ext4_inode_attach_jinode(inode); + if (ret < 0) + goto out_mutex; + + } + first_block_offset = round_up(offset, sb->s_blocksize); last_block_offset = round_down((offset + length), sb->s_blocksize) - 1; @@ 
-3601,6 +3613,31 @@ out_mutex: return ret; } +int ext4_inode_attach_jinode(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct jbd2_inode *jinode; + + if (ei->jinode || !EXT4_SB(inode->i_sb)->s_journal) + return 0; + + jinode = jbd2_alloc_inode(GFP_KERNEL); + spin_lock(&inode->i_lock); + if (!ei->jinode) { + if (!jinode) { + spin_unlock(&inode->i_lock); + return -ENOMEM; + } + ei->jinode = jinode; + jbd2_journal_init_jbd_inode(ei->jinode, inode); + jinode = NULL; + } + spin_unlock(&inode->i_lock); + if (unlikely(jinode != NULL)) + jbd2_free_inode(jinode); + return 0; +} + /* * ext4_truncate() * @@ -3661,6 +3698,12 @@ void ext4_truncate(struct inode *inode) return; } + /* If we zero-out tail of the page, we have to create jinode for jbd2 */ + if (inode->i_size & (inode->i_sb->s_blocksize - 1)) { + if (ext4_inode_attach_jinode(inode) < 0) + return; + } + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) credits = ext4_writepage_trans_blocks(inode); else diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 9435384..544a809 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1838,14 +1838,14 @@ int __init gfs2_glock_init(void) glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_FREEZABLE, 0); - if (IS_ERR(glock_workqueue)) - return PTR_ERR(glock_workqueue); + if (!glock_workqueue) + return -ENOMEM; gfs2_delete_workqueue = alloc_workqueue("delete_workqueue", WQ_MEM_RECLAIM | WQ_FREEZABLE, 0); - if (IS_ERR(gfs2_delete_workqueue)) { + if (!gfs2_delete_workqueue) { destroy_workqueue(glock_workqueue); - return PTR_ERR(gfs2_delete_workqueue); + return -ENOMEM; } register_shrinker(&glock_shrinker); diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 5f2e522..e2e0a90 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -47,7 +47,8 @@ static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh) * None of the buffers should be dirty, locked, or pinned. 
*/ -static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) +static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync, + unsigned int nr_revokes) { struct gfs2_sbd *sdp = gl->gl_sbd; struct list_head *head = &gl->gl_ail_list; @@ -57,7 +58,9 @@ static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) gfs2_log_lock(sdp); spin_lock(&sdp->sd_ail_lock); - list_for_each_entry_safe(bd, tmp, head, bd_ail_gl_list) { + list_for_each_entry_safe_reverse(bd, tmp, head, bd_ail_gl_list) { + if (nr_revokes == 0) + break; bh = bd->bd_bh; if (bh->b_state & b_state) { if (fsync) @@ -65,6 +68,7 @@ static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) gfs2_ail_error(gl, bh); } gfs2_trans_add_revoke(sdp, bd); + nr_revokes--; } GLOCK_BUG_ON(gl, !fsync && atomic_read(&gl->gl_ail_count)); spin_unlock(&sdp->sd_ail_lock); @@ -91,7 +95,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl) WARN_ON_ONCE(current->journal_info); current->journal_info = &tr; - __gfs2_ail_flush(gl, 0); + __gfs2_ail_flush(gl, 0, tr.tr_revokes); gfs2_trans_end(sdp); gfs2_log_flush(sdp, NULL); @@ -101,15 +105,19 @@ void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) { struct gfs2_sbd *sdp = gl->gl_sbd; unsigned int revokes = atomic_read(&gl->gl_ail_count); + unsigned int max_revokes = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / sizeof(u64); int ret; if (!revokes) return; - ret = gfs2_trans_begin(sdp, 0, revokes); + while (revokes > max_revokes) + max_revokes += (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header)) / sizeof(u64); + + ret = gfs2_trans_begin(sdp, 0, max_revokes); if (ret) return; - __gfs2_ail_flush(gl, fsync); + __gfs2_ail_flush(gl, fsync, max_revokes); gfs2_trans_end(sdp); gfs2_log_flush(sdp, NULL); } diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index bbb2715..64915ee 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -594,7 +594,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, } gfs2_glock_dq_uninit(ghs); if (IS_ERR(d)) - return PTR_RET(d); + return PTR_ERR(d); return error; } else if (error != -ENOENT) { goto fail_gunlock; @@ -1750,6 +1750,10 @@ static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name, struct gfs2_holder gh; int ret; + /* For selinux during lookup */ + if (gfs2_glock_is_locked_by_me(ip->i_gl)) + return generic_getxattr(dentry, name, data, size); + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); ret = gfs2_glock_nq(&gh); if (ret == 0) { diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index e04d0e0..7b0f504 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -155,7 +155,7 @@ static int __init init_gfs2_fs(void) goto fail_wq; gfs2_control_wq = alloc_workqueue("gfs2_control", - WQ_NON_REENTRANT | WQ_UNBOUND | WQ_FREEZABLE, 0); + WQ_UNBOUND | WQ_FREEZABLE, 0); if (!gfs2_control_wq) goto fail_recovery; diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index dc9a913..2d8be51 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -345,8 +345,7 @@ static void nilfs_end_bio_write(struct bio *bio, int err) if (err == -EOPNOTSUPP) { set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); - bio_put(bio); - /* to be detected by submit_seg_bio() */ + /* to be detected by nilfs_segbuf_submit_bio() */ } if (!uptodate) @@ -377,12 +376,12 @@ static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf, bio->bi_private = segbuf; bio_get(bio); submit_bio(mode, bio); + segbuf->sb_nbio++; if (bio_flagged(bio, BIO_EOPNOTSUPP)) { bio_put(bio); err = -EOPNOTSUPP; goto failed; } - segbuf->sb_nbio++; bio_put(bio); wi->bio = 
NULL; diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 94441a4..737e156 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -271,7 +271,7 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file, de = next; } while (de); spin_unlock(&proc_subdir_lock); - return 0; + return 1; } int proc_readdir(struct file *file, struct dir_context *ctx) diff --git a/fs/proc/root.c b/fs/proc/root.c index 229e366..e0a790d 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -205,7 +205,9 @@ static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentr static int proc_root_readdir(struct file *file, struct dir_context *ctx) { if (ctx->pos < FIRST_PROCESS_ENTRY) { - proc_readdir(file, ctx); + int error = proc_readdir(file, ctx); + if (unlikely(error <= 0)) + return error; ctx->pos = FIRST_PROCESS_ENTRY; } diff --git a/include/linux/can/platform/mcp251x.h b/include/linux/can/platform/mcp251x.h index 089fe43..dc029db 100644 --- a/include/linux/can/platform/mcp251x.h +++ b/include/linux/can/platform/mcp251x.h @@ -9,26 +9,13 @@ #include <linux/spi/spi.h> -/** +/* * struct mcp251x_platform_data - MCP251X SPI CAN controller platform data * @oscillator_frequency: - oscillator frequency in Hz - * @irq_flags: - IRQF configuration flags - * @board_specific_setup: - called before probing the chip (power,reset) - * @transceiver_enable: - called to power on/off the transceiver - * @power_enable: - called to power on/off the mcp *and* the - * transceiver - * - * Please note that you should define power_enable or transceiver_enable or - * none of them. Defining both of them is no use. - * */ struct mcp251x_platform_data { unsigned long oscillator_frequency; - unsigned long irq_flags; - int (*board_specific_setup)(struct spi_device *spi); - int (*transceiver_enable)(int enable); - int (*power_enable) (int enable); }; #endif /* __CAN_PLATFORM_MCP251X_H__ */ diff --git a/include/linux/fs_enet_pd.h b/include/linux/fs_enet_pd.h index 343d82a..efb0596 100644 --- a/include/linux/fs_enet_pd.h +++ b/include/linux/fs_enet_pd.h @@ -16,6 +16,7 @@ #ifndef FS_ENET_PD_H #define FS_ENET_PD_H +#include <linux/clk.h> #include <linux/string.h> #include <linux/of_mdio.h> #include <linux/if_ether.h> @@ -143,6 +144,8 @@ struct fs_platform_info { int use_rmii; /* use RMII mode */ int has_phy; /* if the network is phy container as well...*/ + + struct clk *clk_per; /* 'per' clock for register access */ }; struct fs_mii_fec_platform_info { u32 irq[32]; diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index c796ce2..79640e01 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -5,47 +5,13 @@ #include <linux/bitmap.h> #include <linux/if.h> +#include <linux/ip.h> #include <linux/netdevice.h> #include <linux/rcupdate.h> #include <linux/timer.h> #include <linux/sysctl.h> #include <linux/rtnetlink.h> -enum -{ - IPV4_DEVCONF_FORWARDING=1, - IPV4_DEVCONF_MC_FORWARDING, - IPV4_DEVCONF_PROXY_ARP, - IPV4_DEVCONF_ACCEPT_REDIRECTS, - IPV4_DEVCONF_SECURE_REDIRECTS, - IPV4_DEVCONF_SEND_REDIRECTS, - IPV4_DEVCONF_SHARED_MEDIA, - IPV4_DEVCONF_RP_FILTER, - IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE, - IPV4_DEVCONF_BOOTP_RELAY, - IPV4_DEVCONF_LOG_MARTIANS, - IPV4_DEVCONF_TAG, - IPV4_DEVCONF_ARPFILTER, - IPV4_DEVCONF_MEDIUM_ID, - IPV4_DEVCONF_FORCE_IGMP_VERSION, - IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL, - IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL, - IPV4_DEVCONF_NOXFRM, - IPV4_DEVCONF_NOPOLICY, - IPV4_DEVCONF_ARP_ANNOUNCE, - IPV4_DEVCONF_ARP_IGNORE, - 
IPV4_DEVCONF_PROMOTE_SECONDARIES, - IPV4_DEVCONF_ARP_ACCEPT, - IPV4_DEVCONF_ARP_NOTIFY, - IPV4_DEVCONF_ACCEPT_LOCAL, - IPV4_DEVCONF_SRC_VMARK, - IPV4_DEVCONF_PROXY_ARP_PVLAN, - IPV4_DEVCONF_ROUTE_LOCALNET, - __IPV4_DEVCONF_MAX -}; - -#define IPV4_DEVCONF_MAX (__IPV4_DEVCONF_MAX - 1) - struct ipv4_devconf { void *sysctl; int data[IPV4_DEVCONF_MAX]; diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 77a4784..9ac5047 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -103,6 +103,7 @@ struct inet6_skb_parm { #define IP6SKB_FORWARDED 2 #define IP6SKB_REROUTED 4 #define IP6SKB_ROUTERALERT 8 +#define IP6SKB_FRAGMENTED 16 }; #define IP6CB(skb) ((struct inet6_skb_parm*)((skb)->cb)) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index fb425aa..faf4b7c 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -332,6 +332,7 @@ struct mm_struct { unsigned long pgoff, unsigned long flags); #endif unsigned long mmap_base; /* base of mmap area */ + unsigned long mmap_legacy_base; /* base of mmap area in bottom-up allocations */ unsigned long task_size; /* size of task vm space */ unsigned long highest_vm_end; /* highest vma end address */ pgd_t * pgd; diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index de70f7b..708fe72ab9 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -314,25 +314,24 @@ nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family) #endif /*CONFIG_NETFILTER*/ #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) -extern void (*ip_ct_attach)(struct sk_buff *, struct sk_buff *) __rcu; -extern void nf_ct_attach(struct sk_buff *, struct sk_buff *); +extern void (*ip_ct_attach)(struct sk_buff *, const struct sk_buff *) __rcu; +extern void nf_ct_attach(struct sk_buff *, const struct sk_buff *); extern void (*nf_ct_destroy)(struct nf_conntrack *) __rcu; struct nf_conn; +enum ip_conntrack_info; struct nlattr; struct nfq_ct_hook { size_t (*build_size)(const struct nf_conn *ct); int (*build)(struct sk_buff *skb, struct nf_conn *ct); int (*parse)(const struct nlattr *attr, struct nf_conn *ct); -}; -extern struct nfq_ct_hook __rcu *nfq_ct_hook; - -struct nfq_ct_nat_hook { + int (*attach_expect)(const struct nlattr *attr, struct nf_conn *ct, + u32 portid, u32 report); void (*seq_adjust)(struct sk_buff *skb, struct nf_conn *ct, - u32 ctinfo, int off); + enum ip_conntrack_info ctinfo, s32 off); }; -extern struct nfq_ct_nat_hook __rcu *nfq_ct_nat_hook; +extern struct nfq_ct_hook __rcu *nfq_ct_hook; #else static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {} #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index e9995eb..078066d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -314,7 +314,6 @@ struct nsproxy; struct user_namespace; #ifdef CONFIG_MMU -extern unsigned long mmap_legacy_base(void); extern void arch_pick_mmap_layout(struct mm_struct *mm); extern unsigned long arch_get_unmapped_area(struct file *, unsigned long, unsigned long, diff --git a/include/linux/sh_eth.h b/include/linux/sh_eth.h index 6205eeb..90b5e30 100644 --- a/include/linux/sh_eth.h +++ b/include/linux/sh_eth.h @@ -5,17 +5,10 @@ #include <linux/if_ether.h> enum {EDMAC_LITTLE_ENDIAN, EDMAC_BIG_ENDIAN}; -enum { - SH_ETH_REG_GIGABIT, - SH_ETH_REG_FAST_RCAR, - SH_ETH_REG_FAST_SH4, - SH_ETH_REG_FAST_SH3_SH2 -}; struct sh_eth_plat_data { int phy; int edmac_endian; - int register_type; phy_interface_t phy_interface; void (*set_mdio_gate)(void 
*addr); diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 260f83f..f667248 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -135,6 +135,8 @@ extern void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, extern void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu); extern void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark); +extern void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif, + u32 mark); extern void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk); struct netlink_callback; diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 5fe5649..7bdff04 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -41,6 +41,7 @@ #define NEXTHDR_ICMP 58 /* ICMP for IPv6. */ #define NEXTHDR_NONE 59 /* No next header */ #define NEXTHDR_DEST 60 /* Destination options header. */ +#define NEXTHDR_SCTP 132 /* SCTP message. */ #define NEXTHDR_MOBILITY 135 /* Mobility header. */ #define NEXTHDR_MAX 255 diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 644d9c2..0c1288a 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -181,8 +181,7 @@ __nf_conntrack_find(struct net *net, u16 zone, const struct nf_conntrack_tuple *tuple); extern int nf_conntrack_hash_check_insert(struct nf_conn *ct); -extern void nf_ct_delete_from_lists(struct nf_conn *ct); -extern void nf_ct_dying_timeout(struct nf_conn *ct); +bool nf_ct_delete(struct nf_conn *ct, u32 pid, int report); extern void nf_conntrack_flush_report(struct net *net, u32 portid, int report); @@ -235,7 +234,7 @@ static inline bool nf_ct_kill(struct nf_conn *ct) } /* These are for NAT. Icky. */ -extern s16 (*nf_ct_nat_offset)(const struct nf_conn *ct, +extern s32 (*nf_ct_nat_offset)(const struct nf_conn *ct, enum ip_conntrack_dir dir, u32 seq); @@ -249,7 +248,9 @@ extern void nf_ct_untracked_status_or(unsigned long bits); /* Iterate over all conntracks: if iter returns true, it's deleted. 
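The nf_ct_iterate_cleanup() declaration that follows grows a portid/report pair, so deletions performed during the walk can be attributed in ctnetlink destroy events. A minimal caller sketch under the new signature (kill_all and flush_all_conntracks are illustrative names, not part of the patch):

    static int kill_all(struct nf_conn *i, void *data)
    {
            return 1;       /* nonzero: delete this conntrack entry */
    }

    static void flush_all_conntracks(struct net *net)
    {
            /* portid = 0, report = 0: no netlink attribution requested */
            nf_ct_iterate_cleanup(net, kill_all, NULL, 0, 0);
    }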
*/ extern void -nf_ct_iterate_cleanup(struct net *net, int (*iter)(struct nf_conn *i, void *data), void *data); +nf_ct_iterate_cleanup(struct net *net, + int (*iter)(struct nf_conn *i, void *data), + void *data, u32 portid, int report); extern void nf_conntrack_free(struct nf_conn *ct); extern struct nf_conn * nf_conntrack_alloc(struct net *net, u16 zone, diff --git a/include/net/netfilter/nf_conntrack_extend.h b/include/net/netfilter/nf_conntrack_extend.h index 977bc8a..ff95434 100644 --- a/include/net/netfilter/nf_conntrack_extend.h +++ b/include/net/netfilter/nf_conntrack_extend.h @@ -10,6 +10,7 @@ enum nf_ct_ext_id { #if defined(CONFIG_NF_NAT) || defined(CONFIG_NF_NAT_MODULE) NF_CT_EXT_NAT, #endif + NF_CT_EXT_SEQADJ, NF_CT_EXT_ACCT, #ifdef CONFIG_NF_CONNTRACK_EVENTS NF_CT_EXT_ECACHE, @@ -26,17 +27,22 @@ enum nf_ct_ext_id { #ifdef CONFIG_NF_CONNTRACK_LABELS NF_CT_EXT_LABELS, #endif +#if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY) + NF_CT_EXT_SYNPROXY, +#endif NF_CT_EXT_NUM, }; #define NF_CT_EXT_HELPER_TYPE struct nf_conn_help #define NF_CT_EXT_NAT_TYPE struct nf_conn_nat +#define NF_CT_EXT_SEQADJ_TYPE struct nf_conn_seqadj #define NF_CT_EXT_ACCT_TYPE struct nf_conn_counter #define NF_CT_EXT_ECACHE_TYPE struct nf_conntrack_ecache #define NF_CT_EXT_ZONE_TYPE struct nf_conntrack_zone #define NF_CT_EXT_TSTAMP_TYPE struct nf_conn_tstamp #define NF_CT_EXT_TIMEOUT_TYPE struct nf_conn_timeout #define NF_CT_EXT_LABELS_TYPE struct nf_conn_labels +#define NF_CT_EXT_SYNPROXY_TYPE struct nf_conn_synproxy /* Extensions: optional stuff which isn't permanently in struct. */ struct nf_ct_ext { diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index 914d8d9..b411d7b 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -148,17 +148,10 @@ extern int nf_ct_port_nlattr_tuple_size(void); extern const struct nla_policy nf_ct_port_nla_policy[]; #ifdef CONFIG_SYSCTL -#ifdef DEBUG_INVALID_PACKETS #define LOG_INVALID(net, proto) \ ((net)->ct.sysctl_log_invalid == (proto) || \ (net)->ct.sysctl_log_invalid == IPPROTO_RAW) #else -#define LOG_INVALID(net, proto) \ - (((net)->ct.sysctl_log_invalid == (proto) || \ - (net)->ct.sysctl_log_invalid == IPPROTO_RAW) \ - && net_ratelimit()) -#endif -#else static inline int LOG_INVALID(struct net *net, int proto) { return 0; } #endif /* CONFIG_SYSCTL */ diff --git a/include/net/netfilter/nf_conntrack_seqadj.h b/include/net/netfilter/nf_conntrack_seqadj.h new file mode 100644 index 0000000..f6177a5 --- /dev/null +++ b/include/net/netfilter/nf_conntrack_seqadj.h @@ -0,0 +1,51 @@ +#ifndef _NF_CONNTRACK_SEQADJ_H +#define _NF_CONNTRACK_SEQADJ_H + +#include <net/netfilter/nf_conntrack_extend.h> + +/** + * struct nf_ct_seqadj - sequence number adjustment information + * + * @correction_pos: position of the last TCP sequence number modification + * @offset_before: sequence number offset before last modification + * @offset_after: sequence number offset after last modification + */ +struct nf_ct_seqadj { + u32 correction_pos; + s32 offset_before; + s32 offset_after; +}; + +struct nf_conn_seqadj { + struct nf_ct_seqadj seq[IP_CT_DIR_MAX]; +}; + +static inline struct nf_conn_seqadj *nfct_seqadj(const struct nf_conn *ct) +{ + return nf_ct_ext_find(ct, NF_CT_EXT_SEQADJ); +} + +static inline struct nf_conn_seqadj *nfct_seqadj_ext_add(struct nf_conn *ct) +{ + return nf_ct_ext_add(ct, NF_CT_EXT_SEQADJ, GFP_ATOMIC); +} + +extern int nf_ct_seqadj_init(struct nf_conn *ct, enum 
ip_conntrack_info ctinfo, + s32 off); +extern int nf_ct_seqadj_set(struct nf_conn *ct, enum ip_conntrack_info ctinfo, + __be32 seq, s32 off); +extern void nf_ct_tcp_seqadj_set(struct sk_buff *skb, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + s32 off); + +extern int nf_ct_seq_adjust(struct sk_buff *skb, + struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff); +extern s32 nf_ct_seq_offset(const struct nf_conn *ct, enum ip_conntrack_dir, + u32 seq); + +extern int nf_conntrack_seqadj_init(void); +extern void nf_conntrack_seqadj_fini(void); + +#endif /* _NF_CONNTRACK_SEQADJ_H */ diff --git a/include/net/netfilter/nf_conntrack_synproxy.h b/include/net/netfilter/nf_conntrack_synproxy.h new file mode 100644 index 0000000..806f54a --- /dev/null +++ b/include/net/netfilter/nf_conntrack_synproxy.h @@ -0,0 +1,77 @@ +#ifndef _NF_CONNTRACK_SYNPROXY_H +#define _NF_CONNTRACK_SYNPROXY_H + +#include <net/netns/generic.h> + +struct nf_conn_synproxy { + u32 isn; + u32 its; + u32 tsoff; +}; + +static inline struct nf_conn_synproxy *nfct_synproxy(const struct nf_conn *ct) +{ +#if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY) + return nf_ct_ext_find(ct, NF_CT_EXT_SYNPROXY); +#else + return NULL; +#endif +} + +static inline struct nf_conn_synproxy *nfct_synproxy_ext_add(struct nf_conn *ct) +{ +#if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY) + return nf_ct_ext_add(ct, NF_CT_EXT_SYNPROXY, GFP_ATOMIC); +#else + return NULL; +#endif +} + +struct synproxy_stats { + unsigned int syn_received; + unsigned int cookie_invalid; + unsigned int cookie_valid; + unsigned int cookie_retrans; + unsigned int conn_reopened; +}; + +struct synproxy_net { + struct nf_conn *tmpl; + struct synproxy_stats __percpu *stats; +}; + +extern int synproxy_net_id; +static inline struct synproxy_net *synproxy_pernet(struct net *net) +{ + return net_generic(net, synproxy_net_id); +} + +struct synproxy_options { + u8 options; + u8 wscale; + u16 mss; + u32 tsval; + u32 tsecr; +}; + +struct tcphdr; +struct xt_synproxy_info; +extern void synproxy_parse_options(const struct sk_buff *skb, unsigned int doff, + const struct tcphdr *th, + struct synproxy_options *opts); +extern unsigned int synproxy_options_size(const struct synproxy_options *opts); +extern void synproxy_build_options(struct tcphdr *th, + const struct synproxy_options *opts); + +extern void synproxy_init_timestamp_cookie(const struct xt_synproxy_info *info, + struct synproxy_options *opts); +extern void synproxy_check_timestamp_cookie(struct synproxy_options *opts); + +extern unsigned int synproxy_tstamp_adjust(struct sk_buff *skb, + unsigned int protoff, + struct tcphdr *th, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + const struct nf_conn_synproxy *synproxy); + +#endif /* _NF_CONNTRACK_SYNPROXY_H */ diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h index ad14a79..59a1924 100644 --- a/include/net/netfilter/nf_nat.h +++ b/include/net/netfilter/nf_nat.h @@ -13,15 +13,6 @@ enum nf_nat_manip_type { #define HOOK2MANIP(hooknum) ((hooknum) != NF_INET_POST_ROUTING && \ (hooknum) != NF_INET_LOCAL_IN) -/* NAT sequence number modifications */ -struct nf_nat_seq { - /* position of the last TCP sequence number modification (if any) */ - u_int32_t correction_pos; - - /* sequence number offset before and after last modification */ - int16_t offset_before, offset_after; -}; - #include <linux/list.h> #include <linux/netfilter/nf_conntrack_pptp.h> #include <net/netfilter/nf_conntrack_extend.h> @@ -39,7 +30,6 @@ struct nf_conn; /* The structure 
embedded in the conntrack structure. */ struct nf_conn_nat { struct hlist_node bysource; - struct nf_nat_seq seq[IP_CT_DIR_MAX]; struct nf_conn *ct; union nf_conntrack_nat_help help; #if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \ diff --git a/include/net/netfilter/nf_nat_helper.h b/include/net/netfilter/nf_nat_helper.h index b4d6bfc..404324d 100644 --- a/include/net/netfilter/nf_nat_helper.h +++ b/include/net/netfilter/nf_nat_helper.h @@ -39,28 +39,9 @@ extern int nf_nat_mangle_udp_packet(struct sk_buff *skb, const char *rep_buffer, unsigned int rep_len); -extern void nf_nat_set_seq_adjust(struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - __be32 seq, s16 off); -extern int nf_nat_seq_adjust(struct sk_buff *skb, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - unsigned int protoff); -extern int (*nf_nat_seq_adjust_hook)(struct sk_buff *skb, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - unsigned int protoff); - /* Setup NAT on this expected conntrack so it follows master, but goes * to port ct->master->saved_proto. */ extern void nf_nat_follow_master(struct nf_conn *ct, struct nf_conntrack_expect *this); -extern s16 nf_nat_get_offset(const struct nf_conn *ct, - enum ip_conntrack_dir dir, - u32 seq); - -extern void nf_nat_tcp_seq_adjust(struct sk_buff *skb, struct nf_conn *ct, - u32 dir, int off); - #endif diff --git a/include/net/netfilter/nf_tproxy_core.h b/include/net/netfilter/nf_tproxy_core.h deleted file mode 100644 index 36d9379..0000000 --- a/include/net/netfilter/nf_tproxy_core.h +++ /dev/null @@ -1,210 +0,0 @@ -#ifndef _NF_TPROXY_CORE_H -#define _NF_TPROXY_CORE_H - -#include <linux/types.h> -#include <linux/in.h> -#include <linux/skbuff.h> -#include <net/sock.h> -#include <net/inet_hashtables.h> -#include <net/inet6_hashtables.h> -#include <net/tcp.h> - -#define NFT_LOOKUP_ANY 0 -#define NFT_LOOKUP_LISTENER 1 -#define NFT_LOOKUP_ESTABLISHED 2 - -/* look up and get a reference to a matching socket */ - - -/* This function is used by the 'TPROXY' target and the 'socket' - * match. The following lookups are supported: - * - * Explicit TProxy target rule - * =========================== - * - * This is used when the user wants to intercept a connection matching - * an explicit iptables rule. In this case the sockets are assumed - * matching in preference order: - * - * - match: if there's a fully established connection matching the - * _packet_ tuple, it is returned, assuming the redirection - * already took place and we process a packet belonging to an - * established connection - * - * - match: if there's a listening socket matching the redirection - * (e.g. on-port & on-ip of the connection), it is returned, - * regardless if it was bound to 0.0.0.0 or an explicit - * address. The reasoning is that if there's an explicit rule, it - * does not really matter if the listener is bound to an interface - * or to 0. The user already stated that he wants redirection - * (since he added the rule). - * - * "socket" match based redirection (no specific rule) - * =================================================== - * - * There are connections with dynamic endpoints (e.g. FTP data - * connection) that the user is unable to add explicit rules - * for. These are taken care of by a generic "socket" rule. It is - * assumed that the proxy application is trusted to open such - * connections without explicit iptables rule (except of course the - * generic 'socket' rule). 
In this case the following sockets are - * matched in preference order: - * - * - match: if there's a fully established connection matching the - * _packet_ tuple - * - * - match: if there's a non-zero bound listener (possibly with a - * non-local address) We don't accept zero-bound listeners, since - * then local services could intercept traffic going through the - * box. - * - * Please note that there's an overlap between what a TPROXY target - * and a socket match will match. Normally if you have both rules the - * "socket" match will be the first one, effectively all packets - * belonging to established connections going through that one. - */ -static inline struct sock * -nf_tproxy_get_sock_v4(struct net *net, const u8 protocol, - const __be32 saddr, const __be32 daddr, - const __be16 sport, const __be16 dport, - const struct net_device *in, int lookup_type) -{ - struct sock *sk; - - /* look up socket */ - switch (protocol) { - case IPPROTO_TCP: - switch (lookup_type) { - case NFT_LOOKUP_ANY: - sk = __inet_lookup(net, &tcp_hashinfo, - saddr, sport, daddr, dport, - in->ifindex); - break; - case NFT_LOOKUP_LISTENER: - sk = inet_lookup_listener(net, &tcp_hashinfo, - saddr, sport, - daddr, dport, - in->ifindex); - - /* NOTE: we return listeners even if bound to - * 0.0.0.0, those are filtered out in - * xt_socket, since xt_TPROXY needs 0 bound - * listeners too */ - - break; - case NFT_LOOKUP_ESTABLISHED: - sk = inet_lookup_established(net, &tcp_hashinfo, - saddr, sport, daddr, dport, - in->ifindex); - break; - default: - WARN_ON(1); - sk = NULL; - break; - } - break; - case IPPROTO_UDP: - sk = udp4_lib_lookup(net, saddr, sport, daddr, dport, - in->ifindex); - if (sk && lookup_type != NFT_LOOKUP_ANY) { - int connected = (sk->sk_state == TCP_ESTABLISHED); - int wildcard = (inet_sk(sk)->inet_rcv_saddr == 0); - - /* NOTE: we return listeners even if bound to - * 0.0.0.0, those are filtered out in - * xt_socket, since xt_TPROXY needs 0 bound - * listeners too */ - if ((lookup_type == NFT_LOOKUP_ESTABLISHED && (!connected || wildcard)) || - (lookup_type == NFT_LOOKUP_LISTENER && connected)) { - sock_put(sk); - sk = NULL; - } - } - break; - default: - WARN_ON(1); - sk = NULL; - } - - pr_debug("tproxy socket lookup: proto %u %08x:%u -> %08x:%u, lookup type: %d, sock %p\n", - protocol, ntohl(saddr), ntohs(sport), ntohl(daddr), ntohs(dport), lookup_type, sk); - - return sk; -} - -#if IS_ENABLED(CONFIG_IPV6) -static inline struct sock * -nf_tproxy_get_sock_v6(struct net *net, const u8 protocol, - const struct in6_addr *saddr, const struct in6_addr *daddr, - const __be16 sport, const __be16 dport, - const struct net_device *in, int lookup_type) -{ - struct sock *sk; - - /* look up socket */ - switch (protocol) { - case IPPROTO_TCP: - switch (lookup_type) { - case NFT_LOOKUP_ANY: - sk = inet6_lookup(net, &tcp_hashinfo, - saddr, sport, daddr, dport, - in->ifindex); - break; - case NFT_LOOKUP_LISTENER: - sk = inet6_lookup_listener(net, &tcp_hashinfo, - saddr, sport, - daddr, ntohs(dport), - in->ifindex); - - /* NOTE: we return listeners even if bound to - * 0.0.0.0, those are filtered out in - * xt_socket, since xt_TPROXY needs 0 bound - * listeners too */ - - break; - case NFT_LOOKUP_ESTABLISHED: - sk = __inet6_lookup_established(net, &tcp_hashinfo, - saddr, sport, daddr, ntohs(dport), - in->ifindex); - break; - default: - WARN_ON(1); - sk = NULL; - break; - } - break; - case IPPROTO_UDP: - sk = udp6_lib_lookup(net, saddr, sport, daddr, dport, - in->ifindex); - if (sk && lookup_type != NFT_LOOKUP_ANY) 
{ - int connected = (sk->sk_state == TCP_ESTABLISHED); - int wildcard = ipv6_addr_any(&inet6_sk(sk)->rcv_saddr); - - /* NOTE: we return listeners even if bound to - * 0.0.0.0, those are filtered out in - * xt_socket, since xt_TPROXY needs 0 bound - * listeners too */ - if ((lookup_type == NFT_LOOKUP_ESTABLISHED && (!connected || wildcard)) || - (lookup_type == NFT_LOOKUP_LISTENER && connected)) { - sock_put(sk); - sk = NULL; - } - } - break; - default: - WARN_ON(1); - sk = NULL; - } - - pr_debug("tproxy socket lookup: proto %u %pI6:%u -> %pI6:%u, lookup type: %d, sock %p\n", - protocol, saddr, ntohs(sport), daddr, ntohs(dport), lookup_type, sk); - - return sk; -} -#endif - -/* assign a socket to the skb -- consumes sk */ -void -nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk); - -#endif diff --git a/include/net/netfilter/nfnetlink_queue.h b/include/net/netfilter/nfnetlink_queue.h index 86267a5..aff88ba 100644 --- a/include/net/netfilter/nfnetlink_queue.h +++ b/include/net/netfilter/nfnetlink_queue.h @@ -15,6 +15,8 @@ int nfqnl_ct_put(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo); void nfqnl_ct_seq_adjust(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, int diff); +int nfqnl_attach_expect(struct nf_conn *ct, const struct nlattr *attr, + u32 portid, u32 report); #else inline struct nf_conn * nfqnl_ct_get(struct sk_buff *entskb, size_t *size, enum ip_conntrack_info *ctinfo) @@ -39,5 +41,11 @@ inline void nfqnl_ct_seq_adjust(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, int diff) { } + +inline int nfqnl_attach_expect(struct nf_conn *ct, const struct nlattr *attr, + u32 portid, u32 report) +{ + return 0; +} #endif /* NF_CONNTRACK */ #endif diff --git a/include/net/tcp.h b/include/net/tcp.h index 09cb5c1..dd5e16f 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -476,9 +476,13 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb); /* From syncookies.c */ extern __u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS]; +extern int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th, + u32 cookie); extern struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, struct ip_options *opt); #ifdef CONFIG_SYN_COOKIES +extern u32 __cookie_v4_init_sequence(const struct iphdr *iph, + const struct tcphdr *th, u16 *mssp); extern __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mss); #else @@ -495,8 +499,12 @@ extern bool cookie_check_timestamp(struct tcp_options_received *opt, struct net *net, bool *ecn_ok); /* From net/ipv6/syncookies.c */ +extern int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th, + u32 cookie); extern struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb); #ifdef CONFIG_SYN_COOKIES +extern u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph, + const struct tcphdr *th, u16 *mssp); extern __u32 cookie_v6_init_sequence(struct sock *sk, const struct sk_buff *skb, __u16 *mss); #else diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 94ce082..89d3d8a 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1548,7 +1548,7 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8, int dir, u32 int xfrm_policy_flush(struct net *net, u8 type, struct xfrm_audit *audit_info); u32 xfrm_get_acqseq(void); extern int xfrm_alloc_spi(struct xfrm_state *x, u32 minspi, u32 maxspi); -struct xfrm_state *xfrm_find_acq(struct net *net, struct xfrm_mark *mark, +struct xfrm_state *xfrm_find_acq(struct 
net *net, const struct xfrm_mark *mark, u8 mode, u32 reqid, u8 proto, const xfrm_address_t *daddr, const xfrm_address_t *saddr, int create, diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h index 1870ee2..e9502dd 100644 --- a/include/uapi/linux/if_tun.h +++ b/include/uapi/linux/if_tun.h @@ -56,6 +56,8 @@ #define TUNGETVNETHDRSZ _IOR('T', 215, int) #define TUNSETVNETHDRSZ _IOW('T', 216, int) #define TUNSETQUEUE _IOW('T', 217, int) +#define TUNSETIFINDEX _IOW('T', 218, unsigned int) +#define TUNGETFILTER _IOR('T', 219, struct sock_fprog) /* TUNSETIFF ifr flags */ #define IFF_TUN 0x0001 @@ -70,6 +72,7 @@ #define IFF_DETACH_QUEUE 0x0400 /* read-only flag */ #define IFF_PERSIST 0x0800 +#define IFF_NOFILTER 0x1000 /* Socket options */ #define TUN_TX_TIMESTAMP 1 diff --git a/include/uapi/linux/ip.h b/include/uapi/linux/ip.h index 6cf06bf..4119594 100644 --- a/include/uapi/linux/ip.h +++ b/include/uapi/linux/ip.h @@ -133,4 +133,40 @@ struct ip_beet_phdr { __u8 reserved; }; +/* index values for the variables in ipv4_devconf */ +enum +{ + IPV4_DEVCONF_FORWARDING=1, + IPV4_DEVCONF_MC_FORWARDING, + IPV4_DEVCONF_PROXY_ARP, + IPV4_DEVCONF_ACCEPT_REDIRECTS, + IPV4_DEVCONF_SECURE_REDIRECTS, + IPV4_DEVCONF_SEND_REDIRECTS, + IPV4_DEVCONF_SHARED_MEDIA, + IPV4_DEVCONF_RP_FILTER, + IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE, + IPV4_DEVCONF_BOOTP_RELAY, + IPV4_DEVCONF_LOG_MARTIANS, + IPV4_DEVCONF_TAG, + IPV4_DEVCONF_ARPFILTER, + IPV4_DEVCONF_MEDIUM_ID, + IPV4_DEVCONF_NOXFRM, + IPV4_DEVCONF_NOPOLICY, + IPV4_DEVCONF_FORCE_IGMP_VERSION, + IPV4_DEVCONF_ARP_ANNOUNCE, + IPV4_DEVCONF_ARP_IGNORE, + IPV4_DEVCONF_PROMOTE_SECONDARIES, + IPV4_DEVCONF_ARP_ACCEPT, + IPV4_DEVCONF_ARP_NOTIFY, + IPV4_DEVCONF_ACCEPT_LOCAL, + IPV4_DEVCONF_SRC_VMARK, + IPV4_DEVCONF_PROXY_ARP_PVLAN, + IPV4_DEVCONF_ROUTE_LOCALNET, + IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL, + IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL, + __IPV4_DEVCONF_MAX +}; + +#define IPV4_DEVCONF_MAX (__IPV4_DEVCONF_MAX - 1) + #endif /* _UAPI_LINUX_IP_H */ diff --git a/include/uapi/linux/netfilter/Kbuild b/include/uapi/linux/netfilter/Kbuild index 4111577..1749154 100644 --- a/include/uapi/linux/netfilter/Kbuild +++ b/include/uapi/linux/netfilter/Kbuild @@ -22,6 +22,7 @@ header-y += xt_CONNMARK.h header-y += xt_CONNSECMARK.h header-y += xt_CT.h header-y += xt_DSCP.h +header-y += xt_HMARK.h header-y += xt_IDLETIMER.h header-y += xt_LED.h header-y += xt_LOG.h @@ -68,6 +69,7 @@ header-y += xt_quota.h header-y += xt_rateest.h header-y += xt_realm.h header-y += xt_recent.h +header-y += xt_rpfilter.h header-y += xt_sctp.h header-y += xt_set.h header-y += xt_socket.h diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h index d69483f..8dd8038 100644 --- a/include/uapi/linux/netfilter/nf_conntrack_common.h +++ b/include/uapi/linux/netfilter/nf_conntrack_common.h @@ -99,7 +99,8 @@ enum ip_conntrack_events { IPCT_PROTOINFO, /* protocol information has changed */ IPCT_HELPER, /* new helper has been set */ IPCT_MARK, /* new mark has been set */ - IPCT_NATSEQADJ, /* NAT is doing sequence adjustment */ + IPCT_SEQADJ, /* sequence adjustment has changed */ + IPCT_NATSEQADJ = IPCT_SEQADJ, IPCT_SECMARK, /* new security mark has been set */ IPCT_LABEL, /* new connlabel has been set */ }; diff --git a/include/uapi/linux/netfilter/nfnetlink_conntrack.h b/include/uapi/linux/netfilter/nfnetlink_conntrack.h index 08fabc6..acad6c5 100644 --- a/include/uapi/linux/netfilter/nfnetlink_conntrack.h +++ 
b/include/uapi/linux/netfilter/nfnetlink_conntrack.h @@ -42,8 +42,10 @@ enum ctattr_type { CTA_ID, CTA_NAT_DST, CTA_TUPLE_MASTER, - CTA_NAT_SEQ_ADJ_ORIG, - CTA_NAT_SEQ_ADJ_REPLY, + CTA_SEQ_ADJ_ORIG, + CTA_NAT_SEQ_ADJ_ORIG = CTA_SEQ_ADJ_ORIG, + CTA_SEQ_ADJ_REPLY, + CTA_NAT_SEQ_ADJ_REPLY = CTA_SEQ_ADJ_REPLY, CTA_SECMARK, /* obsolete */ CTA_ZONE, CTA_SECCTX, @@ -165,6 +167,15 @@ enum ctattr_protonat { }; #define CTA_PROTONAT_MAX (__CTA_PROTONAT_MAX - 1) +enum ctattr_seqadj { + CTA_SEQADJ_UNSPEC, + CTA_SEQADJ_CORRECTION_POS, + CTA_SEQADJ_OFFSET_BEFORE, + CTA_SEQADJ_OFFSET_AFTER, + __CTA_SEQADJ_MAX +}; +#define CTA_SEQADJ_MAX (__CTA_SEQADJ_MAX - 1) + enum ctattr_natseq { CTA_NAT_SEQ_UNSPEC, CTA_NAT_SEQ_CORRECTION_POS, diff --git a/include/uapi/linux/netfilter/nfnetlink_queue.h b/include/uapi/linux/netfilter/nfnetlink_queue.h index 3a9b921..0132bad 100644 --- a/include/uapi/linux/netfilter/nfnetlink_queue.h +++ b/include/uapi/linux/netfilter/nfnetlink_queue.h @@ -46,6 +46,7 @@ enum nfqnl_attr_type { NFQA_CT_INFO, /* enum ip_conntrack_info */ NFQA_CAP_LEN, /* __u32 length of captured packet */ NFQA_SKB_INFO, /* __u32 skb meta information */ + NFQA_EXP, /* nf_conntrack_netlink.h */ __NFQA_MAX }; diff --git a/include/linux/netfilter/xt_HMARK.h b/include/uapi/linux/netfilter/xt_HMARK.h index 826fc58..826fc58 100644 --- a/include/linux/netfilter/xt_HMARK.h +++ b/include/uapi/linux/netfilter/xt_HMARK.h diff --git a/include/uapi/linux/netfilter/xt_SYNPROXY.h b/include/uapi/linux/netfilter/xt_SYNPROXY.h new file mode 100644 index 0000000..2d59fba --- /dev/null +++ b/include/uapi/linux/netfilter/xt_SYNPROXY.h @@ -0,0 +1,16 @@ +#ifndef _XT_SYNPROXY_H +#define _XT_SYNPROXY_H + +#define XT_SYNPROXY_OPT_MSS 0x01 +#define XT_SYNPROXY_OPT_WSCALE 0x02 +#define XT_SYNPROXY_OPT_SACK_PERM 0x04 +#define XT_SYNPROXY_OPT_TIMESTAMP 0x08 +#define XT_SYNPROXY_OPT_ECN 0x10 + +struct xt_synproxy_info { + __u8 options; + __u8 wscale; + __u16 mss; +}; + +#endif /* _XT_SYNPROXY_H */ diff --git a/include/linux/netfilter/xt_rpfilter.h b/include/uapi/linux/netfilter/xt_rpfilter.h index 8358d4f..8358d4f 100644 --- a/include/linux/netfilter/xt_rpfilter.h +++ b/include/uapi/linux/netfilter/xt_rpfilter.h diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 52490b0..a74d375 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -1,6 +1,6 @@ /* - * Copyright (c) 2007-2011 Nicira Networks. + * Copyright (c) 2007-2013 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -259,6 +259,7 @@ enum ovs_key_attr { OVS_KEY_ATTR_ND, /* struct ovs_key_nd */ OVS_KEY_ATTR_SKB_MARK, /* u32 skb mark */ OVS_KEY_ATTR_TUNNEL, /* Nested set of ovs_tunnel attributes */ + OVS_KEY_ATTR_SCTP, /* struct ovs_key_sctp */ #ifdef __KERNEL__ OVS_KEY_ATTR_IPV4_TUNNEL, /* struct ovs_key_ipv4_tunnel */ @@ -333,6 +334,11 @@ struct ovs_key_udp { __be16 udp_dst; }; +struct ovs_key_sctp { + __be16 sctp_src; + __be16 sctp_dst; +}; + struct ovs_key_icmp { __u8 icmp_type; __u8 icmp_code; @@ -379,6 +385,12 @@ struct ovs_key_nd { * @OVS_FLOW_ATTR_CLEAR: If present in a %OVS_FLOW_CMD_SET request, clears the * last-used time, accumulated TCP flags, and statistics for this flow. * Otherwise ignored in requests. Never present in notifications. + * @OVS_FLOW_ATTR_MASK: Nested %OVS_KEY_ATTR_* attributes specifying the + * mask bits for wildcarded flow match. 
Mask bit value '1' specifies exact + * match with corresponding flow key bit, while mask bit value '0' specifies + * a wildcarded match. Omitting attribute is treated as wildcarding all + * corresponding fields. Optional for all requests. If not present, + * all flow key bits are exact match bits. * * These attributes follow the &struct ovs_header within the Generic Netlink * payload for %OVS_FLOW_* commands. @@ -391,6 +403,7 @@ enum ovs_flow_attr { OVS_FLOW_ATTR_TCP_FLAGS, /* 8-bit OR'd TCP flags. */ OVS_FLOW_ATTR_USED, /* u64 msecs last used in monotonic time. */ OVS_FLOW_ATTR_CLEAR, /* Flag to clear stats, tcp_flags, used. */ + OVS_FLOW_ATTR_MASK, /* Sequence of OVS_KEY_ATTR_* attributes. */ __OVS_FLOW_ATTR_MAX }; diff --git a/init/Kconfig b/init/Kconfig index 247084b..fed81b5 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -955,7 +955,7 @@ config MEMCG_SWAP_ENABLED Memory Resource Controller Swap Extension comes with its price in a bigger memory consumption. General purpose distribution kernels which want to enable the feature but keep it disabled by default - and let the user enable it by swapaccount boot command line + and let the user enable it by swapaccount=1 boot command line parameter should have this option unselected. For those who want to have the feature enabled by default should select this option (if, for some reason, they need to disable it diff --git a/kernel/cpuset.c b/kernel/cpuset.c index e565778..010a008 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1608,11 +1608,13 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) { struct cpuset *cs = cgroup_cs(cgrp); cpuset_filetype_t type = cft->private; - int retval = -ENODEV; + int retval = 0; mutex_lock(&cpuset_mutex); - if (!is_cpuset_online(cs)) + if (!is_cpuset_online(cs)) { + retval = -ENODEV; goto out_unlock; + } switch (type) { case FILE_CPU_EXCLUSIVE: diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index a326f27..0b479a6 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -121,7 +121,7 @@ void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate) BUG_ON(bits > 32); WARN_ON(!irqs_disabled()); read_sched_clock = read; - sched_clock_mask = (1 << bits) - 1; + sched_clock_mask = (1ULL << bits) - 1; cd.rate = rate; /* calculate the mult/shift to convert counter ticks to ns. */ diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index e77edc9..e8a1516 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -182,7 +182,8 @@ static bool can_stop_full_tick(void) * Don't allow the user to think they can get * full NO_HZ with this machine. */ - WARN_ONCE(1, "NO_HZ FULL will not work with unstable sched clock"); + WARN_ONCE(have_nohz_full_mask, + "NO_HZ FULL will not work with unstable sched clock"); return false; } #endif @@ -343,8 +344,6 @@ static int tick_nohz_init_all(void) void __init tick_nohz_init(void) { - int cpu; - if (!have_nohz_full_mask) { if (tick_nohz_init_all() < 0) return; diff --git a/kernel/wait.c b/kernel/wait.c index dec68bd..d550920 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -363,8 +363,7 @@ EXPORT_SYMBOL(out_of_line_wait_on_atomic_t); /** * wake_up_atomic_t - Wake up a waiter on a atomic_t - * @word: The word being waited on, a kernel virtual address - * @bit: The bit of the word being waited on + * @p: The atomic_t being waited on, a kernel virtual address * * Wake up anyone waiting for the atomic_t to go to zero. 
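The kernel-doc fix above corrects the parameter description of wake_up_atomic_t(), which pairs with wait_on_atomic_t() to block until an atomic_t reaches zero. A minimal sketch of the pairing, assuming an object with an atomic_t field named users (illustrative, not from the patch):

    static int my_wait_action(atomic_t *p)
    {
            schedule();     /* sleep until woken */
            return 0;
    }

    /* waiter side: block until obj->users drops to zero */
    wait_on_atomic_t(&obj->users, my_wait_action, TASK_UNINTERRUPTIBLE);

    /* release side: the final put wakes any waiters */
    if (atomic_dec_and_test(&obj->users))
            wake_up_atomic_t(&obj->users);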
* diff --git a/lib/lz4/lz4_compress.c b/lib/lz4/lz4_compress.c index fd94058..28321d8 100644 --- a/lib/lz4/lz4_compress.c +++ b/lib/lz4/lz4_compress.c @@ -437,7 +437,7 @@ int lz4_compress(const unsigned char *src, size_t src_len, exit: return ret; } -EXPORT_SYMBOL_GPL(lz4_compress); +EXPORT_SYMBOL(lz4_compress); -MODULE_LICENSE("GPL"); +MODULE_LICENSE("Dual BSD/GPL"); MODULE_DESCRIPTION("LZ4 compressor"); diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c index d3414ea..411be80 100644 --- a/lib/lz4/lz4_decompress.c +++ b/lib/lz4/lz4_decompress.c @@ -299,7 +299,7 @@ exit_0: return ret; } #ifndef STATIC -EXPORT_SYMBOL_GPL(lz4_decompress); +EXPORT_SYMBOL(lz4_decompress); #endif int lz4_decompress_unknownoutputsize(const char *src, size_t src_len, @@ -319,8 +319,8 @@ exit_0: return ret; } #ifndef STATIC -EXPORT_SYMBOL_GPL(lz4_decompress_unknownoutputsize); +EXPORT_SYMBOL(lz4_decompress_unknownoutputsize); -MODULE_LICENSE("GPL"); +MODULE_LICENSE("Dual BSD/GPL"); MODULE_DESCRIPTION("LZ4 Decompressor"); #endif diff --git a/lib/lz4/lz4hc_compress.c b/lib/lz4/lz4hc_compress.c index eb1a74f..f344f76 100644 --- a/lib/lz4/lz4hc_compress.c +++ b/lib/lz4/lz4hc_compress.c @@ -533,7 +533,7 @@ int lz4hc_compress(const unsigned char *src, size_t src_len, exit: return ret; } -EXPORT_SYMBOL_GPL(lz4hc_compress); +EXPORT_SYMBOL(lz4hc_compress); -MODULE_LICENSE("GPL"); +MODULE_LICENSE("Dual BSD/GPL"); MODULE_DESCRIPTION("LZ4HC compressor"); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c5792a5..0878ff7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6969,7 +6969,6 @@ struct cgroup_subsys mem_cgroup_subsys = { #ifdef CONFIG_MEMCG_SWAP static int __init enable_swap_account(char *s) { - /* consider enabled if no parameter or 1 is given */ if (!strcmp(s, "1")) really_do_swap_account = 1; else if (!strcmp(s, "0")) diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 62da527..0a8a80c 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -28,6 +28,22 @@ #include "bat_algo.h" #include "network-coding.h" + +/** + * batadv_dup_status - duplicate status + * @BATADV_NO_DUP: the packet is a duplicate + * @BATADV_ORIG_DUP: OGM is a duplicate in the originator (but not for the + * neighbor) + * @BATADV_NEIGH_DUP: OGM is a duplicate for the neighbor + * @BATADV_PROTECTED: originator is currently protected (after reboot) + */ +enum batadv_dup_status { + BATADV_NO_DUP = 0, + BATADV_ORIG_DUP, + BATADV_NEIGH_DUP, + BATADV_PROTECTED, +}; + /** * batadv_ring_buffer_set - update the ring buffer with the given value * @lq_recv: pointer to the ring buffer @@ -71,21 +87,6 @@ static uint8_t batadv_ring_buffer_avg(const uint8_t lq_recv[]) return (uint8_t)(sum / count); } -/* - * batadv_dup_status - duplicate status - * @BATADV_NO_DUP: the packet is a duplicate - * @BATADV_ORIG_DUP: OGM is a duplicate in the originator (but not for the - * neighbor) - * @BATADV_NEIGH_DUP: OGM is a duplicate for the neighbor - * @BATADV_PROTECTED: originator is currently protected (after reboot) - */ -enum batadv_dup_status { - BATADV_NO_DUP = 0, - BATADV_ORIG_DUP, - BATADV_NEIGH_DUP, - BATADV_PROTECTED, -}; - static struct batadv_neigh_node * batadv_iv_ogm_neigh_new(struct batadv_hard_iface *hard_iface, const uint8_t *neigh_addr, @@ -478,6 +479,7 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff, kfree(forw_packet_aggr); goto out; } + forw_packet_aggr->skb->priority = TC_PRIO_CONTROL; skb_reserve(forw_packet_aggr->skb, ETH_HLEN); skb_buff = 
skb_put(forw_packet_aggr->skb, packet_len); diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index 7614af3..1ce4b87 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -190,6 +190,33 @@ next: return curr_gw; } +/** + * batadv_gw_check_client_stop - check if client mode has been switched off + * @bat_priv: the bat priv with all the soft interface information + * + * This function assumes the caller has checked that the gw state *is actually + * changing*. This function is not supposed to be called when there is no state + * change. + */ +void batadv_gw_check_client_stop(struct batadv_priv *bat_priv) +{ + struct batadv_gw_node *curr_gw; + + if (atomic_read(&bat_priv->gw_mode) != BATADV_GW_MODE_CLIENT) + return; + + curr_gw = batadv_gw_get_selected_gw_node(bat_priv); + if (!curr_gw) + return; + + /* if batman-adv is switching the gw client mode off and a gateway was + * already selected, send a DEL uevent + */ + batadv_throw_uevent(bat_priv, BATADV_UEV_GW, BATADV_UEV_DEL, NULL); + + batadv_gw_node_free_ref(curr_gw); +} + void batadv_gw_election(struct batadv_priv *bat_priv) { struct batadv_gw_node *curr_gw = NULL, *next_gw = NULL; diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h index 1037d75..ceef4eb 100644 --- a/net/batman-adv/gateway_client.h +++ b/net/batman-adv/gateway_client.h @@ -20,6 +20,7 @@ #ifndef _NET_BATMAN_ADV_GATEWAY_CLIENT_H_ #define _NET_BATMAN_ADV_GATEWAY_CLIENT_H_ +void batadv_gw_check_client_stop(struct batadv_priv *bat_priv); void batadv_gw_deselect(struct batadv_priv *bat_priv); void batadv_gw_election(struct batadv_priv *bat_priv); struct batadv_orig_node * diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c index b27508b..5a99bb4 100644 --- a/net/batman-adv/icmp_socket.c +++ b/net/batman-adv/icmp_socket.c @@ -183,6 +183,7 @@ static ssize_t batadv_socket_write(struct file *file, const char __user *buff, goto out; } + skb->priority = TC_PRIO_CONTROL; skb_reserve(skb, ETH_HLEN); icmp_packet = (struct batadv_icmp_packet_rr *)skb_put(skb, packet_len); diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 08125f3..c72d1bc 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -19,6 +19,10 @@ #include <linux/crc32c.h> #include <linux/highmem.h> +#include <linux/if_vlan.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/dsfield.h> #include "main.h" #include "sysfs.h" #include "debugfs.h" @@ -249,6 +253,60 @@ out: return primary_if; } +/** + * batadv_skb_set_priority - sets skb priority according to packet content + * @skb: the packet to be sent + * @offset: offset to the packet content + * + * This function sets a value between 256 and 263 (802.1d priority), which + * can be interpreted by the cfg80211 or other drivers. 
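As a worked example of the mapping described above and implemented below (the packet value is illustrative): an IPv4 packet carrying DSCP EF has dsfield 0xb8, so the 0xfc mask keeps the six DSCP bits and the shift keeps the top three:

    u32 prio = (0xb8 & 0xfc) >> 5;  /* = 5 */
    skb->priority = prio + 256;     /* = 261, i.e. 802.1d priority 5 */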
+ */ +void batadv_skb_set_priority(struct sk_buff *skb, int offset) +{ + struct iphdr ip_hdr_tmp, *ip_hdr; + struct ipv6hdr ip6_hdr_tmp, *ip6_hdr; + struct ethhdr ethhdr_tmp, *ethhdr; + struct vlan_ethhdr *vhdr, vhdr_tmp; + u32 prio; + + /* already set, do nothing */ + if (skb->priority >= 256 && skb->priority <= 263) + return; + + ethhdr = skb_header_pointer(skb, offset, sizeof(*ethhdr), &ethhdr_tmp); + if (!ethhdr) + return; + + switch (ethhdr->h_proto) { + case htons(ETH_P_8021Q): + vhdr = skb_header_pointer(skb, offset + sizeof(*vhdr), + sizeof(*vhdr), &vhdr_tmp); + if (!vhdr) + return; + prio = ntohs(vhdr->h_vlan_TCI) & VLAN_PRIO_MASK; + prio = prio >> VLAN_PRIO_SHIFT; + break; + case htons(ETH_P_IP): + ip_hdr = skb_header_pointer(skb, offset + sizeof(*ethhdr), + sizeof(*ip_hdr), &ip_hdr_tmp); + if (!ip_hdr) + return; + prio = (ipv4_get_dsfield(ip_hdr) & 0xfc) >> 5; + break; + case htons(ETH_P_IPV6): + ip6_hdr = skb_header_pointer(skb, offset + sizeof(*ethhdr), + sizeof(*ip6_hdr), &ip6_hdr_tmp); + if (!ip6_hdr) + return; + prio = (ipv6_get_dsfield(ip6_hdr) & 0xfc) >> 5; + break; + default: + return; + } + + skb->priority = prio + 256; +} + static int batadv_recv_unhandled_packet(struct sk_buff *skb, struct batadv_hard_iface *recv_if) { diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 5e9aebb..5d00f23 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -26,7 +26,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2013.3.0" +#define BATADV_SOURCE_VERSION "2013.4.0" #endif /* B.A.T.M.A.N. parameters */ @@ -184,6 +184,7 @@ void batadv_mesh_free(struct net_device *soft_iface); int batadv_is_my_mac(struct batadv_priv *bat_priv, const uint8_t *addr); struct batadv_hard_iface * batadv_seq_print_text_primary_if_get(struct seq_file *seq); +void batadv_skb_set_priority(struct sk_buff *skb, int offset); int batadv_batman_skb_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *ptype, struct net_device *orig_dev); diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index 2f0bd3f..0439395 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -775,7 +775,7 @@ static int batadv_route_unicast_packet(struct sk_buff *skb, struct batadv_neigh_node *neigh_node = NULL; struct batadv_unicast_packet *unicast_packet; struct ethhdr *ethhdr = eth_hdr(skb); - int res, ret = NET_RX_DROP; + int res, hdr_len, ret = NET_RX_DROP; struct sk_buff *new_skb; unicast_packet = (struct batadv_unicast_packet *)skb->data; @@ -835,6 +835,22 @@ static int batadv_route_unicast_packet(struct sk_buff *skb, /* decrement ttl */ unicast_packet->header.ttl--; + switch (unicast_packet->header.packet_type) { + case BATADV_UNICAST_4ADDR: + hdr_len = sizeof(struct batadv_unicast_4addr_packet); + break; + case BATADV_UNICAST: + hdr_len = sizeof(struct batadv_unicast_packet); + break; + default: + /* other packet types not supported - yet */ + hdr_len = -1; + break; + } + + if (hdr_len > 0) + batadv_skb_set_priority(skb, hdr_len); + res = batadv_send_skb_to_orig(skb, orig_node, recv_if); /* translate transmit result into receive result */ @@ -1193,6 +1209,8 @@ int batadv_recv_bcast_packet(struct sk_buff *skb, if (batadv_bla_check_bcast_duplist(bat_priv, skb)) goto out; + batadv_skb_set_priority(skb, sizeof(struct batadv_bcast_packet)); + /* rebroadcast packet */ batadv_add_bcast_packet_to_list(bat_priv, skb, 1); diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index e9ff8d8..0266edd 100644 ---
a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -67,7 +67,6 @@ int batadv_send_skb_packet(struct sk_buff *skb, ethhdr->h_proto = __constant_htons(ETH_P_BATMAN); skb_set_network_header(skb, ETH_HLEN); - skb->priority = TC_PRIO_CONTROL; skb->protocol = __constant_htons(ETH_P_BATMAN); skb->dev = hard_iface->net_dev; diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 0f04e1c..4493913 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -229,6 +229,8 @@ static int batadv_interface_tx(struct sk_buff *skb, */ } + batadv_skb_set_priority(skb, 0); + /* ethernet packet should be broadcasted */ if (do_bcast) { primary_if = batadv_primary_if_get_selected(bat_priv); diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c index 929e304..4114b96 100644 --- a/net/batman-adv/sysfs.c +++ b/net/batman-adv/sysfs.c @@ -385,6 +385,10 @@ static ssize_t batadv_store_gw_mode(struct kobject *kobj, curr_gw_mode_str, buff); batadv_gw_deselect(bat_priv); + /* always call batadv_gw_check_client_stop() before changing the gateway + * state + */ + batadv_gw_check_client_stop(bat_priv); atomic_set(&bat_priv->gw_mode, (unsigned int)gw_mode_tmp); return count; } diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 429aeef..34510f3 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -1626,6 +1626,7 @@ batadv_tt_response_fill_table(uint16_t tt_len, uint8_t ttvn, if (!skb) goto out; + skb->priority = TC_PRIO_CONTROL; skb_reserve(skb, ETH_HLEN); tt_response = (struct batadv_tt_query_packet *)skb_put(skb, len); tt_response->ttvn = ttvn; @@ -1691,6 +1692,7 @@ static int batadv_send_tt_request(struct batadv_priv *bat_priv, if (!skb) goto out; + skb->priority = TC_PRIO_CONTROL; skb_reserve(skb, ETH_HLEN); tt_req_len = sizeof(*tt_request); @@ -1788,6 +1790,7 @@ batadv_send_other_tt_response(struct batadv_priv *bat_priv, if (!skb) goto unlock; + skb->priority = TC_PRIO_CONTROL; skb_reserve(skb, ETH_HLEN); packet_pos = skb_put(skb, len); tt_response = (struct batadv_tt_query_packet *)packet_pos; @@ -1906,6 +1909,7 @@ batadv_send_my_tt_response(struct batadv_priv *bat_priv, if (!skb) goto unlock; + skb->priority = TC_PRIO_CONTROL; skb_reserve(skb, ETH_HLEN); packet_pos = skb_put(skb, len); tt_response = (struct batadv_tt_query_packet *)packet_pos; @@ -2240,6 +2244,7 @@ static void batadv_send_roam_adv(struct batadv_priv *bat_priv, uint8_t *client, if (!skb) goto out; + skb->priority = TC_PRIO_CONTROL; skb_reserve(skb, ETH_HLEN); roam_adv_packet = (struct batadv_roam_adv_packet *)skb_put(skb, len); diff --git a/net/batman-adv/unicast.c b/net/batman-adv/unicast.c index 688a041..48b31d3 100644 --- a/net/batman-adv/unicast.c +++ b/net/batman-adv/unicast.c @@ -242,6 +242,8 @@ int batadv_frag_send_skb(struct sk_buff *skb, struct batadv_priv *bat_priv, frag_skb = dev_alloc_skb(data_len - (data_len / 2) + ucf_hdr_len); if (!frag_skb) goto dropped; + + skb->priority = TC_PRIO_CONTROL; skb_reserve(frag_skb, ucf_hdr_len); unicast_packet = (struct batadv_unicast_packet *)skb->data; @@ -432,12 +434,16 @@ find_router: switch (packet_type) { case BATADV_UNICAST: - batadv_unicast_prepare_skb(skb, orig_node); + if (!batadv_unicast_prepare_skb(skb, orig_node)) + goto out; + header_len = sizeof(struct batadv_unicast_packet); break; case BATADV_UNICAST_4ADDR: - batadv_unicast_4addr_prepare_skb(bat_priv, skb, orig_node, - packet_subtype); + if (!batadv_unicast_4addr_prepare_skb(bat_priv, skb, 
orig_node, + packet_subtype)) + goto out; + header_len = sizeof(struct batadv_unicast_4addr_packet); break; default: diff --git a/net/batman-adv/vis.c b/net/batman-adv/vis.c index 4983340..d8ea31a 100644 --- a/net/batman-adv/vis.c +++ b/net/batman-adv/vis.c @@ -397,6 +397,7 @@ batadv_add_packet(struct batadv_priv *bat_priv, kfree(info); return NULL; } + info->skb_packet->priority = TC_PRIO_CONTROL; skb_reserve(info->skb_packet, ETH_HLEN); packet = (struct batadv_vis_packet *)skb_put(info->skb_packet, len); @@ -861,6 +862,7 @@ int batadv_vis_init(struct batadv_priv *bat_priv) if (!bat_priv->vis.my_info->skb_packet) goto free_info; + bat_priv->vis.my_info->skb_packet->priority = TC_PRIO_CONTROL; skb_reserve(bat_priv->vis.my_info->skb_packet, ETH_HLEN); tmp_skb = bat_priv->vis.my_info->skb_packet; packet = (struct batadv_vis_packet *)skb_put(tmp_skb, sizeof(*packet)); diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 60aca91..ffd5874 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -161,7 +161,7 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr) if (!pv) return; - for_each_set_bit_from(vid, pv->vlan_bitmap, BR_VLAN_BITMAP_LEN) { + for_each_set_bit_from(vid, pv->vlan_bitmap, VLAN_N_VID) { f = __br_fdb_get(br, br->dev->dev_addr, vid); if (f && f->is_local && !f->dst) fdb_delete(br, f); @@ -730,7 +730,7 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], /* VID was specified, so use it. */ err = __br_fdb_add(ndm, p, addr, nlh_flags, vid); } else { - if (!pv || bitmap_empty(pv->vlan_bitmap, BR_VLAN_BITMAP_LEN)) { + if (!pv || bitmap_empty(pv->vlan_bitmap, VLAN_N_VID)) { err = __br_fdb_add(ndm, p, addr, nlh_flags, 0); goto out; } @@ -739,7 +739,7 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], * specify a VLAN. To be nice, add/update entry for every * vlan on this port. */ - for_each_set_bit(vid, pv->vlan_bitmap, BR_VLAN_BITMAP_LEN) { + for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) { err = __br_fdb_add(ndm, p, addr, nlh_flags, vid); if (err) goto out; @@ -817,7 +817,7 @@ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], err = __br_fdb_delete(p, addr, vid); } else { - if (!pv || bitmap_empty(pv->vlan_bitmap, BR_VLAN_BITMAP_LEN)) { + if (!pv || bitmap_empty(pv->vlan_bitmap, VLAN_N_VID)) { err = __br_fdb_delete(p, addr, 0); goto out; } @@ -827,7 +827,7 @@ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], * vlan on this port. 
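The bitmap-length substitutions in these bridge hunks all fix the same bug: assuming the declaration in br_private.h, BR_VLAN_BITMAP_LEN expands to BITS_TO_LONGS(VLAN_N_VID), a count of longs, while for_each_set_bit() and bitmap_empty() take their size argument in bits, so the old code only ever scanned the first BITS_PER_LONG VLAN IDs. A sketch of the distinction (the macro definition is reproduced from memory and should be treated as an assumption):

    #define BR_VLAN_BITMAP_LEN      BITS_TO_LONGS(VLAN_N_VID)
    unsigned long vlan_bitmap[BR_VLAN_BITMAP_LEN];

    /* wrong: nbits given in longs (64 on 64-bit), scans VIDs 0..63 only */
    for_each_set_bit(vid, vlan_bitmap, BR_VLAN_BITMAP_LEN)
            ;
    /* right: nbits given in bits, scans all VLAN_N_VID (4096) VIDs */
    for_each_set_bit(vid, vlan_bitmap, VLAN_N_VID)
            ;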
*/ err = -ENOENT; - for_each_set_bit(vid, pv->vlan_bitmap, BR_VLAN_BITMAP_LEN) { + for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) { err &= __br_fdb_delete(p, addr, vid); } } diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 1fc30ab..b9259ef 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -132,7 +132,7 @@ static int br_fill_ifinfo(struct sk_buff *skb, else pv = br_get_vlan_info(br); - if (!pv || bitmap_empty(pv->vlan_bitmap, BR_VLAN_BITMAP_LEN)) + if (!pv || bitmap_empty(pv->vlan_bitmap, VLAN_N_VID)) goto done; af = nla_nest_start(skb, IFLA_AF_SPEC); @@ -140,7 +140,7 @@ static int br_fill_ifinfo(struct sk_buff *skb, goto nla_put_failure; pvid = br_get_pvid(pv); - for_each_set_bit(vid, pv->vlan_bitmap, BR_VLAN_BITMAP_LEN) { + for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) { vinfo.vid = vid; vinfo.flags = 0; if (vid == pvid) diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index bd58b45..9a9ffe7 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -108,7 +108,7 @@ static int __vlan_del(struct net_port_vlans *v, u16 vid) clear_bit(vid, v->vlan_bitmap); v->num_vlans--; - if (bitmap_empty(v->vlan_bitmap, BR_VLAN_BITMAP_LEN)) { + if (bitmap_empty(v->vlan_bitmap, VLAN_N_VID)) { if (v->port_idx) rcu_assign_pointer(v->parent.port->vlan_info, NULL); else @@ -122,7 +122,7 @@ static void __vlan_flush(struct net_port_vlans *v) { smp_wmb(); v->pvid = 0; - bitmap_zero(v->vlan_bitmap, BR_VLAN_BITMAP_LEN); + bitmap_zero(v->vlan_bitmap, VLAN_N_VID); if (v->port_idx) rcu_assign_pointer(v->parent.port->vlan_info, NULL); else diff --git a/net/ieee802154/6lowpan.c b/net/ieee802154/6lowpan.c index 3b9d5f2..c85e71e 100644 --- a/net/ieee802154/6lowpan.c +++ b/net/ieee802154/6lowpan.c @@ -67,39 +67,6 @@ static const u8 lowpan_ttl_values[] = {0, 1, 64, 255}; static LIST_HEAD(lowpan_devices); -/* - * Uncompression of linklocal: - * 0 -> 16 bytes from packet - * 1 -> 2 bytes from prefix - bunch of zeroes and 8 from packet - * 2 -> 2 bytes from prefix - zeroes + 2 from packet - * 3 -> 2 bytes from prefix - infer 8 bytes from lladdr - * - * NOTE: => the uncompress function does change 0xf to 0x10 - * NOTE: 0x00 => no-autoconfig => unspecified - */ -static const u8 lowpan_unc_llconf[] = {0x0f, 0x28, 0x22, 0x20}; - -/* - * Uncompression of ctx-based: - * 0 -> 0 bits from packet [unspecified / reserved] - * 1 -> 8 bytes from prefix - bunch of zeroes and 8 from packet - * 2 -> 8 bytes from prefix - zeroes + 2 from packet - * 3 -> 8 bytes from prefix - infer 8 bytes from lladdr - */ -static const u8 lowpan_unc_ctxconf[] = {0x00, 0x88, 0x82, 0x80}; - -/* - * Uncompression of ctx-base - * 0 -> 0 bits from packet - * 1 -> 2 bytes from prefix - bunch of zeroes 5 from packet - * 2 -> 2 bytes from prefix - zeroes + 3 from packet - * 3 -> 2 bytes from prefix - infer 1 bytes from lladdr - */ -static const u8 lowpan_unc_mxconf[] = {0x0f, 0x25, 0x23, 0x21}; - -/* Link local prefix */ -static const u8 lowpan_llprefix[] = {0xfe, 0x80}; - /* private device info */ struct lowpan_dev_info { struct net_device *real_dev; /* real WPAN device ptr */ @@ -191,55 +158,177 @@ lowpan_compress_addr_64(u8 **hc06_ptr, u8 shift, const struct in6_addr *ipaddr, return rol8(val, shift); } -static void -lowpan_uip_ds6_set_addr_iid(struct in6_addr *ipaddr, unsigned char *lladdr) +/* + * Uncompress address function for source and + * destination address(non-multicast). + * + * address_mode is sam value or dam value. 
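The bridge fixes above correct a units bug: BR_VLAN_BITMAP_LEN is the bitmap's length in longs (BITS_TO_LONGS(VLAN_N_VID)), but for_each_set_bit() and bitmap_empty() take a size in bits, so only VIDs below BITS_PER_LONG were ever visited. Passing VLAN_N_VID (4096) restores the full range. A sketch of the corrected idiom:

#include <linux/bitops.h>
#include <linux/if_vlan.h>	/* VLAN_N_VID == 4096 */
#include <linux/printk.h>

static void walk_all_vids(const unsigned long *vlan_bitmap)
{
	u16 vid;

	/* the third argument is a bit count, not an array length */
	for_each_set_bit(vid, vlan_bitmap, VLAN_N_VID)
		pr_debug("VID %u is configured\n", vid);
}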
+ */ +static int +lowpan_uncompress_addr(struct sk_buff *skb, + struct in6_addr *ipaddr, + const u8 address_mode, + const struct ieee802154_addr *lladdr) { - memcpy(&ipaddr->s6_addr[8], lladdr, IEEE802154_ADDR_LEN); - /* second bit-flip (Universe/Local) is done according RFC2464 */ - ipaddr->s6_addr[8] ^= 0x02; + bool fail; + + switch (address_mode) { + case LOWPAN_IPHC_ADDR_00: + /* for global link addresses */ + fail = lowpan_fetch_skb(skb, ipaddr->s6_addr, 16); + break; + case LOWPAN_IPHC_ADDR_01: + /* fe:80::XXXX:XXXX:XXXX:XXXX */ + ipaddr->s6_addr[0] = 0xFE; + ipaddr->s6_addr[1] = 0x80; + fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[8], 8); + break; + case LOWPAN_IPHC_ADDR_02: + /* fe:80::ff:fe00:XXXX */ + ipaddr->s6_addr[0] = 0xFE; + ipaddr->s6_addr[1] = 0x80; + ipaddr->s6_addr[11] = 0xFF; + ipaddr->s6_addr[12] = 0xFE; + fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[14], 2); + break; + case LOWPAN_IPHC_ADDR_03: + fail = false; + switch (lladdr->addr_type) { + case IEEE802154_ADDR_LONG: + /* fe:80::XXXX:XXXX:XXXX:XXXX + * \_________________/ + * hwaddr + */ + ipaddr->s6_addr[0] = 0xFE; + ipaddr->s6_addr[1] = 0x80; + memcpy(&ipaddr->s6_addr[8], lladdr->hwaddr, + IEEE802154_ADDR_LEN); + /* second bit-flip (Universe/Local) + * is done according RFC2464 + */ + ipaddr->s6_addr[8] ^= 0x02; + break; + case IEEE802154_ADDR_SHORT: + /* fe:80::ff:fe00:XXXX + * \__/ + * short_addr + * + * Universe/Local bit is zero. + */ + ipaddr->s6_addr[0] = 0xFE; + ipaddr->s6_addr[1] = 0x80; + ipaddr->s6_addr[11] = 0xFF; + ipaddr->s6_addr[12] = 0xFE; + ipaddr->s6_addr16[7] = htons(lladdr->short_addr); + break; + default: + pr_debug("Invalid addr_type set\n"); + return -EINVAL; + } + break; + default: + pr_debug("Invalid address mode value: 0x%x\n", address_mode); + return -EINVAL; + } + + if (fail) { + pr_debug("Failed to fetch skb data\n"); + return -EIO; + } + + lowpan_raw_dump_inline(NULL, "Reconstructed ipv6 addr is:\n", + ipaddr->s6_addr, 16); + + return 0; } -/* - * Uncompress addresses based on a prefix and a postfix with zeroes in - * between. If the postfix is zero in length it will use the link address - * to configure the IP address (autoconf style). - * pref_post_count takes a byte where the first nibble specify prefix count - * and the second postfix count (NOTE: 15/0xf => 16 bytes copy). +/* Uncompress address function for source context + * based address(non-multicast). */ static int -lowpan_uncompress_addr(struct sk_buff *skb, struct in6_addr *ipaddr, - u8 const *prefix, u8 pref_post_count, unsigned char *lladdr) +lowpan_uncompress_context_based_src_addr(struct sk_buff *skb, + struct in6_addr *ipaddr, + const u8 sam) { - u8 prefcount = pref_post_count >> 4; - u8 postcount = pref_post_count & 0x0f; - - /* full nibble 15 => 16 */ - prefcount = (prefcount == 15 ? 16 : prefcount); - postcount = (postcount == 15 ? 
16 : postcount); - - if (lladdr) - lowpan_raw_dump_inline(__func__, "linklocal address", - lladdr, IEEE802154_ADDR_LEN); - if (prefcount > 0) - memcpy(ipaddr, prefix, prefcount); - - if (prefcount + postcount < 16) - memset(&ipaddr->s6_addr[prefcount], 0, - 16 - (prefcount + postcount)); - - if (postcount > 0) { - memcpy(&ipaddr->s6_addr[16 - postcount], skb->data, postcount); - skb_pull(skb, postcount); - } else if (prefcount > 0) { - if (lladdr == NULL) - return -EINVAL; + switch (sam) { + case LOWPAN_IPHC_ADDR_00: + /* unspec address :: + * Do nothing, address is already :: + */ + break; + case LOWPAN_IPHC_ADDR_01: + /* TODO */ + case LOWPAN_IPHC_ADDR_02: + /* TODO */ + case LOWPAN_IPHC_ADDR_03: + /* TODO */ + netdev_warn(skb->dev, "SAM value 0x%x not supported\n", sam); + return -EINVAL; + default: + pr_debug("Invalid sam value: 0x%x\n", sam); + return -EINVAL; + } + + lowpan_raw_dump_inline(NULL, + "Reconstructed context based ipv6 src addr is:\n", + ipaddr->s6_addr, 16); + + return 0; +} - /* no IID based configuration if no prefix and no data */ - lowpan_uip_ds6_set_addr_iid(ipaddr, lladdr); +/* Uncompress function for multicast destination address, + * when M bit is set. + */ +static int +lowpan_uncompress_multicast_daddr(struct sk_buff *skb, + struct in6_addr *ipaddr, + const u8 dam) +{ + bool fail; + + switch (dam) { + case LOWPAN_IPHC_DAM_00: + /* 00: 128 bits. The full address + * is carried in-line. + */ + fail = lowpan_fetch_skb(skb, ipaddr->s6_addr, 16); + break; + case LOWPAN_IPHC_DAM_01: + /* 01: 48 bits. The address takes + * the form ffXX::00XX:XXXX:XXXX. + */ + ipaddr->s6_addr[0] = 0xFF; + fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[1], 1); + fail |= lowpan_fetch_skb(skb, &ipaddr->s6_addr[11], 5); + break; + case LOWPAN_IPHC_DAM_10: + /* 10: 32 bits. The address takes + * the form ffXX::00XX:XXXX. + */ + ipaddr->s6_addr[0] = 0xFF; + fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[1], 1); + fail |= lowpan_fetch_skb(skb, &ipaddr->s6_addr[13], 3); + break; + case LOWPAN_IPHC_DAM_11: + /* 11: 8 bits. The address takes + * the form ff02::00XX. 
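The LOWPAN_IPHC_ADDR_03 branch above is RFC 6282's fully elided mode: nothing is carried in-line and the whole IPv6 address is derived from the link-layer address. Isolated as a standalone sketch for the EUI-64 case (the helper name is illustrative):

#include <linux/in6.h>
#include <linux/string.h>
#include <linux/types.h>

/* Rebuild fe80::/64 plus the interface identifier from an 8-byte
 * EUI-64, flipping the Universal/Local bit per RFC 2464/4291 - the
 * same steps as the IEEE802154_ADDR_LONG case above.
 */
static void lladdr_to_linklocal(struct in6_addr *ip, const u8 eui64[8])
{
	memset(ip, 0, sizeof(*ip));
	ip->s6_addr[0] = 0xFE;
	ip->s6_addr[1] = 0x80;
	memcpy(&ip->s6_addr[8], eui64, 8);
	ip->s6_addr[8] ^= 0x02;		/* invert the U/L bit */
}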
+ */ + ipaddr->s6_addr[0] = 0xFF; + ipaddr->s6_addr[1] = 0x02; + fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[15], 1); + break; + default: + pr_debug("DAM value has a wrong value: 0x%x\n", dam); + return -EINVAL; + } + + if (fail) { + pr_debug("Failed to fetch skb data\n"); + return -EIO; } - pr_debug("uncompressing %d + %d => ", prefcount, postcount); - lowpan_raw_dump_inline(NULL, NULL, ipaddr->s6_addr, 16); + lowpan_raw_dump_inline(NULL, "Reconstructed ipv6 multicast addr is:\n", + ipaddr->s6_addr, 16); return 0; } @@ -702,6 +791,12 @@ lowpan_alloc_new_frame(struct sk_buff *skb, u16 len, u16 tag) skb_reserve(frame->skb, sizeof(struct ipv6hdr)); skb_put(frame->skb, frame->length); + /* copy the first control block to keep a + * trace of the link-layer addresses in case + * of a link-local compressed address + */ + memcpy(frame->skb->cb, skb->cb, sizeof(skb->cb)); + init_timer(&frame->timer); /* time out is the same as for ipv6 - 60 sec */ frame->timer.expires = jiffies + LOWPAN_FRAG_TIMEOUT; @@ -723,9 +818,9 @@ frame_err: static int lowpan_process_data(struct sk_buff *skb) { - struct ipv6hdr hdr; + struct ipv6hdr hdr = {}; u8 tmp, iphc0, iphc1, num_context = 0; - u8 *_saddr, *_daddr; + const struct ieee802154_addr *_saddr, *_daddr; int err; lowpan_raw_dump_table(__func__, "raw skb data dump", skb->data, @@ -828,8 +923,8 @@ lowpan_process_data(struct sk_buff *skb) if (lowpan_fetch_skb_u8(skb, &iphc1)) goto drop; - _saddr = mac_cb(skb)->sa.hwaddr; - _daddr = mac_cb(skb)->da.hwaddr; + _saddr = &mac_cb(skb)->sa; + _daddr = &mac_cb(skb)->da; pr_debug("iphc0 = %02x, iphc1 = %02x\n", iphc0, iphc1); @@ -868,8 +963,6 @@ lowpan_process_data(struct sk_buff *skb) hdr.priority = ((tmp >> 2) & 0x0f); hdr.flow_lbl[0] = ((tmp << 6) & 0xC0) | ((tmp >> 2) & 0x30); - hdr.flow_lbl[1] = 0; - hdr.flow_lbl[2] = 0; break; /* * Flow Label carried in-line @@ -885,10 +978,6 @@ lowpan_process_data(struct sk_buff *skb) break; /* Traffic Class and Flow Label are elided */ case 3: /* 11b */ - hdr.priority = 0; - hdr.flow_lbl[0] = 0; - hdr.flow_lbl[1] = 0; - hdr.flow_lbl[2] = 0; break; default: break; @@ -915,10 +1004,18 @@ lowpan_process_data(struct sk_buff *skb) /* Extract SAM to the tmp variable */ tmp = ((iphc1 & LOWPAN_IPHC_SAM) >> LOWPAN_IPHC_SAM_BIT) & 0x03; - /* Source address uncompression */ - pr_debug("source address stateless compression\n"); - err = lowpan_uncompress_addr(skb, &hdr.saddr, lowpan_llprefix, - lowpan_unc_llconf[tmp], skb->data); + if (iphc1 & LOWPAN_IPHC_SAC) { + /* Source address context based uncompression */ + pr_debug("SAC bit is set. 
Handle context based source address.\n"); + err = lowpan_uncompress_context_based_src_addr( + skb, &hdr.saddr, tmp); + } else { + /* Source address uncompression */ + pr_debug("source address stateless compression\n"); + err = lowpan_uncompress_addr(skb, &hdr.saddr, tmp, _saddr); + } + + /* Check on error of previous branch */ if (err) goto drop; @@ -931,23 +1028,14 @@ lowpan_process_data(struct sk_buff *skb) pr_debug("dest: context-based mcast compression\n"); /* TODO: implement this */ } else { - u8 prefix[] = {0xff, 0x02}; - - pr_debug("dest: non context-based mcast compression\n"); - if (0 < tmp && tmp < 3) { - if (lowpan_fetch_skb_u8(skb, &prefix[1])) - goto drop; - } - - err = lowpan_uncompress_addr(skb, &hdr.daddr, prefix, - lowpan_unc_mxconf[tmp], NULL); + err = lowpan_uncompress_multicast_daddr( + skb, &hdr.daddr, tmp); if (err) goto drop; } } else { pr_debug("dest: stateless compression\n"); - err = lowpan_uncompress_addr(skb, &hdr.daddr, lowpan_llprefix, - lowpan_unc_llconf[tmp], skb->data); + err = lowpan_uncompress_addr(skb, &hdr.daddr, tmp, _daddr); if (err) goto drop; } diff --git a/net/ieee802154/6lowpan.h b/net/ieee802154/6lowpan.h index 4b8f917..2869c05 100644 --- a/net/ieee802154/6lowpan.h +++ b/net/ieee802154/6lowpan.h @@ -193,10 +193,12 @@ /* Values of fields within the IPHC encoding second byte */ #define LOWPAN_IPHC_CID 0x80 +#define LOWPAN_IPHC_ADDR_00 0x00 +#define LOWPAN_IPHC_ADDR_01 0x01 +#define LOWPAN_IPHC_ADDR_02 0x02 +#define LOWPAN_IPHC_ADDR_03 0x03 + #define LOWPAN_IPHC_SAC 0x40 -#define LOWPAN_IPHC_SAM_00 0x00 -#define LOWPAN_IPHC_SAM_01 0x10 -#define LOWPAN_IPHC_SAM_10 0x20 #define LOWPAN_IPHC_SAM 0x30 #define LOWPAN_IPHC_SAM_BIT 4 @@ -230,4 +232,16 @@ dest = 16 bit inline */ #define LOWPAN_NHC_UDP_CS_P_11 0xF3 /* source & dest = 0xF0B + 4bit inline */ +static inline bool lowpan_fetch_skb(struct sk_buff *skb, + void *data, const unsigned int len) +{ + if (unlikely(!pskb_may_pull(skb, len))) + return true; + + skb_copy_from_linear_data(skb, data, len); + skb_pull(skb, len); + + return false; +} + #endif /* __6LOWPAN_H__ */ diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index a4d9126..830de3f 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -857,13 +857,11 @@ int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id, /* FB netdevice is special: we have one, and only one per netns. * Allowing to move it to another netns is clearly unsafe. */ - itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL; + if (!IS_ERR(itn->fb_tunnel_dev)) + itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL; rtnl_unlock(); - if (IS_ERR(itn->fb_tunnel_dev)) - return PTR_ERR(itn->fb_tunnel_dev); - - return 0; + return PTR_RET(itn->fb_tunnel_dev); } EXPORT_SYMBOL_GPL(ip_tunnel_init_net); diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 4e90280..1657e39b 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -110,6 +110,19 @@ config IP_NF_TARGET_REJECT To compile it as a module, choose M here. If unsure, say N. +config IP_NF_TARGET_SYNPROXY + tristate "SYNPROXY target support" + depends on NF_CONNTRACK && NETFILTER_ADVANCED + select NETFILTER_SYNPROXY + select SYN_COOKIES + help + The SYNPROXY target allows you to intercept TCP connections and + establish them using syncookies before they are passed on to the + server. This avoids conntrack and server resource usage + during SYN-flood attacks. + + To compile it as a module, choose M here. If unsure, say N.
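Back in the 6LoWPAN header above, note the calling convention of the new lowpan_fetch_skb(): it returns true on failure, so a parser can OR several fetches together and test once at the end, exactly as lowpan_uncompress_multicast_daddr() does. A small usage sketch with made-up field names:

#include <linux/errno.h>
#include <linux/skbuff.h>

static int parse_example_header(struct sk_buff *skb, u8 *flags, __be16 *tag)
{
	bool fail;

	fail = lowpan_fetch_skb(skb, flags, sizeof(*flags));
	fail |= lowpan_fetch_skb(skb, tag, sizeof(*tag));

	return fail ? -EIO : 0;	/* one error path for any short packet */
}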
+ config IP_NF_TARGET_ULOG tristate "ULOG target support (obsolete)" default m if NETFILTER_ADVANCED=n diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 007b128..3622b24 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -46,6 +46,7 @@ obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o +obj-$(CONFIG_IP_NF_TARGET_SYNPROXY) += ipt_SYNPROXY.o obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o # generic ARP tables diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index 30e4de9..00352ce 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -118,7 +118,7 @@ static int masq_device_event(struct notifier_block *this, NF_CT_ASSERT(dev->ifindex != 0); nf_ct_iterate_cleanup(net, device_cmp, - (void *)(long)dev->ifindex); + (void *)(long)dev->ifindex, 0, 0); } return NOTIFY_DONE; diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index 04b18c1..b969131 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -119,7 +119,26 @@ static void send_reset(struct sk_buff *oldskb, int hook) nf_ct_attach(nskb, oldskb); - ip_local_out(nskb); +#ifdef CONFIG_BRIDGE_NETFILTER + /* If we use ip_local_out for bridged traffic, the MAC source on + * the RST will be ours, instead of the destination's. This confuses + * some routers/firewalls, and they drop the packet. So we need to + * build the eth header using the original destination's MAC as the + * source, and send the RST packet directly. + */ + if (oldskb->nf_bridge) { + struct ethhdr *oeth = eth_hdr(oldskb); + nskb->dev = oldskb->nf_bridge->physindev; + niph->tot_len = htons(nskb->len); + ip_send_check(niph); + if (dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol), + oeth->h_source, oeth->h_dest, nskb->len) < 0) + goto free_nskb; + dev_queue_xmit(nskb); + } else +#endif + ip_local_out(nskb); + return; free_nskb: diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c new file mode 100644 index 0000000..94371db --- /dev/null +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -0,0 +1,472 @@ +/* + * Copyright (c) 2013 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
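The ipt_REJECT change above (mirrored for IPv6 further down) bypasses the normal output path for bridged traffic. Reduced to its core, and with a hypothetical function name, the idea is: address the reset straight back to the original sender at layer 2 and queue it on the bridge port the offending packet arrived on:

#include <linux/etherdevice.h>
#include <linux/netdevice.h>

/* daddr is the original frame's source MAC and saddr its destination
 * MAC, so the RST appears to come from the real peer instead of
 * carrying the bridge's own address.
 */
static int xmit_reset_on_physindev(struct sk_buff *nskb,
				   struct net_device *physindev,
				   const struct ethhdr *oeth)
{
	nskb->dev = physindev;
	if (dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol),
			    oeth->h_source, oeth->h_dest, nskb->len) < 0)
		return -1;

	return dev_queue_xmit(nskb);
}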
+ */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <net/tcp.h> + +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_SYNPROXY.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_seqadj.h> +#include <net/netfilter/nf_conntrack_synproxy.h> + +static struct iphdr * +synproxy_build_ip(struct sk_buff *skb, u32 saddr, u32 daddr) +{ + struct iphdr *iph; + + skb_reset_network_header(skb); + iph = (struct iphdr *)skb_put(skb, sizeof(*iph)); + iph->version = 4; + iph->ihl = sizeof(*iph) / 4; + iph->tos = 0; + iph->id = 0; + iph->frag_off = htons(IP_DF); + iph->ttl = sysctl_ip_default_ttl; + iph->protocol = IPPROTO_TCP; + iph->check = 0; + iph->saddr = saddr; + iph->daddr = daddr; + + return iph; +} + +static void +synproxy_send_tcp(const struct sk_buff *skb, struct sk_buff *nskb, + struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo, + struct iphdr *niph, struct tcphdr *nth, + unsigned int tcp_hdr_size) +{ + nth->check = ~tcp_v4_check(tcp_hdr_size, niph->saddr, niph->daddr, 0); + nskb->ip_summed = CHECKSUM_PARTIAL; + nskb->csum_start = (unsigned char *)nth - nskb->head; + nskb->csum_offset = offsetof(struct tcphdr, check); + + skb_dst_set_noref(nskb, skb_dst(skb)); + nskb->protocol = htons(ETH_P_IP); + if (ip_route_me_harder(nskb, RTN_UNSPEC)) + goto free_nskb; + + if (nfct) { + nskb->nfct = nfct; + nskb->nfctinfo = ctinfo; + nf_conntrack_get(nfct); + } + + ip_local_out(nskb); + return; + +free_nskb: + kfree_skb(nskb); +} + +static void +synproxy_send_client_synack(const struct sk_buff *skb, const struct tcphdr *th, + const struct synproxy_options *opts) +{ + struct sk_buff *nskb; + struct iphdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + u16 mss = opts->mss; + + iph = ip_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (nskb == NULL) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip(nskb, iph->daddr, iph->saddr); + + skb_reset_transport_header(nskb); + nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); + nth->source = th->dest; + nth->dest = th->source; + nth->seq = htonl(__cookie_v4_init_sequence(iph, th, &mss)); + nth->ack_seq = htonl(ntohl(th->seq) + 1); + tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK; + if (opts->options & XT_SYNPROXY_OPT_ECN) + tcp_flag_word(nth) |= TCP_FLAG_ECE; + nth->doff = tcp_hdr_size / 4; + nth->window = 0; + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp(skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, + niph, nth, tcp_hdr_size); +} + +static void +synproxy_send_server_syn(const struct synproxy_net *snet, + const struct sk_buff *skb, const struct tcphdr *th, + const struct synproxy_options *opts, u32 recv_seq) +{ + struct sk_buff *nskb; + struct iphdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + + iph = ip_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (nskb == NULL) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip(nskb, iph->saddr, iph->daddr); + + skb_reset_transport_header(nskb); + nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); + nth->source = th->source; + nth->dest = th->dest; + nth->seq = htonl(recv_seq - 1); + /* ack_seq is used to relay our ISN to the synproxy hook to initialize 
+ * sequence number translation once a connection tracking entry exists. + */ + nth->ack_seq = htonl(ntohl(th->ack_seq) - 1); + tcp_flag_word(nth) = TCP_FLAG_SYN; + if (opts->options & XT_SYNPROXY_OPT_ECN) + tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR; + nth->doff = tcp_hdr_size / 4; + nth->window = th->window; + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp(skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW, + niph, nth, tcp_hdr_size); +} + +static void +synproxy_send_server_ack(const struct synproxy_net *snet, + const struct ip_ct_tcp *state, + const struct sk_buff *skb, const struct tcphdr *th, + const struct synproxy_options *opts) +{ + struct sk_buff *nskb; + struct iphdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + + iph = ip_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (nskb == NULL) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip(nskb, iph->daddr, iph->saddr); + + skb_reset_transport_header(nskb); + nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); + nth->source = th->dest; + nth->dest = th->source; + nth->seq = htonl(ntohl(th->ack_seq)); + nth->ack_seq = htonl(ntohl(th->seq) + 1); + tcp_flag_word(nth) = TCP_FLAG_ACK; + nth->doff = tcp_hdr_size / 4; + nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin); + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); +} + +static void +synproxy_send_client_ack(const struct synproxy_net *snet, + const struct sk_buff *skb, const struct tcphdr *th, + const struct synproxy_options *opts) +{ + struct sk_buff *nskb; + struct iphdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + + iph = ip_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (nskb == NULL) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip(nskb, iph->saddr, iph->daddr); + + skb_reset_transport_header(nskb); + nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); + nth->source = th->source; + nth->dest = th->dest; + nth->seq = htonl(ntohl(th->seq) + 1); + nth->ack_seq = th->ack_seq; + tcp_flag_word(nth) = TCP_FLAG_ACK; + nth->doff = tcp_hdr_size / 4; + nth->window = ntohs(htons(th->window) >> opts->wscale); + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); +} + +static bool +synproxy_recv_client_ack(const struct synproxy_net *snet, + const struct sk_buff *skb, const struct tcphdr *th, + struct synproxy_options *opts, u32 recv_seq) +{ + int mss; + + mss = __cookie_v4_check(ip_hdr(skb), th, ntohl(th->ack_seq) - 1); + if (mss == 0) { + this_cpu_inc(snet->stats->cookie_invalid); + return false; + } + + this_cpu_inc(snet->stats->cookie_valid); + opts->mss = mss; + + if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP) + synproxy_check_timestamp_cookie(opts); + + synproxy_send_server_syn(snet, skb, th, opts, recv_seq); + return true; +} + +static unsigned int +synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_synproxy_info *info = par->targinfo; + struct synproxy_net *snet = synproxy_pernet(dev_net(par->in)); + struct synproxy_options opts = {}; + struct tcphdr *th, _th; + + if (nf_ip_checksum(skb, 
par->hooknum, par->thoff, IPPROTO_TCP)) + return NF_DROP; + + th = skb_header_pointer(skb, par->thoff, sizeof(_th), &_th); + if (th == NULL) + return NF_DROP; + + synproxy_parse_options(skb, par->thoff, th, &opts); + + if (th->syn && !th->ack) { + /* Initial SYN from client */ + this_cpu_inc(snet->stats->syn_received); + + if (th->ece && th->cwr) + opts.options |= XT_SYNPROXY_OPT_ECN; + + opts.options &= info->options; + if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) + synproxy_init_timestamp_cookie(info, &opts); + else + opts.options &= ~(XT_SYNPROXY_OPT_WSCALE | + XT_SYNPROXY_OPT_SACK_PERM | + XT_SYNPROXY_OPT_ECN); + + synproxy_send_client_synack(skb, th, &opts); + } else if (th->ack && !(th->fin || th->rst)) + /* ACK from client */ + synproxy_recv_client_ack(snet, skb, th, &opts, ntohl(th->seq)); + + return NF_DROP; +} + +static unsigned int ipv4_synproxy_hook(unsigned int hooknum, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct synproxy_net *snet = synproxy_pernet(dev_net(in ? : out)); + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + struct nf_conn_synproxy *synproxy; + struct synproxy_options opts = {}; + const struct ip_ct_tcp *state; + struct tcphdr *th, _th; + unsigned int thoff; + + ct = nf_ct_get(skb, &ctinfo); + if (ct == NULL) + return NF_ACCEPT; + + synproxy = nfct_synproxy(ct); + if (synproxy == NULL) + return NF_ACCEPT; + + if (nf_is_loopback_packet(skb)) + return NF_ACCEPT; + + thoff = ip_hdrlen(skb); + th = skb_header_pointer(skb, thoff, sizeof(_th), &_th); + if (th == NULL) + return NF_DROP; + + state = &ct->proto.tcp; + switch (state->state) { + case TCP_CONNTRACK_CLOSE: + if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { + nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - + ntohl(th->seq) + 1); + break; + } + + if (!th->syn || th->ack || + CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) + break; + + /* Reopened connection - reset the sequence number and timestamp + * adjustments, they will get initialized once the connection is + * reestablished. + */ + nf_ct_seqadj_init(ct, ctinfo, 0); + synproxy->tsoff = 0; + this_cpu_inc(snet->stats->conn_reopened); + + /* fall through */ + case TCP_CONNTRACK_SYN_SENT: + synproxy_parse_options(skb, thoff, th, &opts); + + if (!th->syn && th->ack && + CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { + /* Keep-Alives are sent with SEG.SEQ = SND.NXT-1, + * therefore we need to add 1 to make the SYN sequence + * number match the one of first SYN. 
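synproxy_recv_client_ack() above relies on one identity: the SYN-ACK the proxy sent carried seq = cookie, so a genuine handshake-completing ACK must acknowledge cookie + 1, and ack_seq - 1 recovers the cookie. A minimal restatement of that check:

#include <linux/ip.h>
#include <linux/tcp.h>
#include <net/tcp.h>	/* __cookie_v4_check(), exported above */

/* Returns the MSS encoded in the cookie, or 0 if the ACK does not
 * complete one of our cookie handshakes.
 */
static int mss_from_client_ack(const struct iphdr *iph,
			       const struct tcphdr *th)
{
	u32 cookie = ntohl(th->ack_seq) - 1;	/* SYN-ACK seq was the cookie */

	return __cookie_v4_check(iph, th, cookie);
}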
+ */ + if (synproxy_recv_client_ack(snet, skb, th, &opts, + ntohl(th->seq) + 1)) + this_cpu_inc(snet->stats->cookie_retrans); + + return NF_DROP; + } + + synproxy->isn = ntohl(th->ack_seq); + if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) + synproxy->its = opts.tsecr; + break; + case TCP_CONNTRACK_SYN_RECV: + if (!th->syn || !th->ack) + break; + + synproxy_parse_options(skb, thoff, th, &opts); + if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) + synproxy->tsoff = opts.tsval - synproxy->its; + + opts.options &= ~(XT_SYNPROXY_OPT_MSS | + XT_SYNPROXY_OPT_WSCALE | + XT_SYNPROXY_OPT_SACK_PERM); + + swap(opts.tsval, opts.tsecr); + synproxy_send_server_ack(snet, state, skb, th, &opts); + + nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq)); + + swap(opts.tsval, opts.tsecr); + synproxy_send_client_ack(snet, skb, th, &opts); + + consume_skb(skb); + return NF_STOLEN; + default: + break; + } + + synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy); + return NF_ACCEPT; +} + +static int synproxy_tg4_check(const struct xt_tgchk_param *par) +{ + const struct ipt_entry *e = par->entryinfo; + + if (e->ip.proto != IPPROTO_TCP || + e->ip.invflags & XT_INV_PROTO) + return -EINVAL; + + return nf_ct_l3proto_try_module_get(par->family); +} + +static void synproxy_tg4_destroy(const struct xt_tgdtor_param *par) +{ + nf_ct_l3proto_module_put(par->family); +} + +static struct xt_target synproxy_tg4_reg __read_mostly = { + .name = "SYNPROXY", + .family = NFPROTO_IPV4, + .target = synproxy_tg4, + .targetsize = sizeof(struct xt_synproxy_info), + .checkentry = synproxy_tg4_check, + .destroy = synproxy_tg4_destroy, + .me = THIS_MODULE, +}; + +static struct nf_hook_ops ipv4_synproxy_ops[] __read_mostly = { + { + .hook = ipv4_synproxy_hook, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, + }, + { + .hook = ipv4_synproxy_hook, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, + }, +}; + +static int __init synproxy_tg4_init(void) +{ + int err; + + err = nf_register_hooks(ipv4_synproxy_ops, + ARRAY_SIZE(ipv4_synproxy_ops)); + if (err < 0) + goto err1; + + err = xt_register_target(&synproxy_tg4_reg); + if (err < 0) + goto err2; + + return 0; + +err2: + nf_unregister_hooks(ipv4_synproxy_ops, ARRAY_SIZE(ipv4_synproxy_ops)); +err1: + return err; +} + +static void __exit synproxy_tg4_exit(void) +{ + xt_unregister_target(&synproxy_tg4_reg); + nf_unregister_hooks(ipv4_synproxy_ops, ARRAY_SIZE(ipv4_synproxy_ops)); +} + +module_init(synproxy_tg4_init); +module_exit(synproxy_tg4_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 0a2e0e3..86f5b34 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -25,6 +25,7 @@ #include <net/netfilter/nf_conntrack_l3proto.h> #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/ipv4/nf_conntrack_ipv4.h> #include <net/netfilter/nf_nat_helper.h> #include <net/netfilter/ipv4/nf_defrag_ipv4.h> @@ -136,11 +137,7 @@ static unsigned int ipv4_confirm(unsigned int hooknum, /* adjust seqs for loopback traffic only in outgoing direction */ if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && !nf_is_loopback_packet(skb)) { - 
typeof(nf_nat_seq_adjust_hook) seq_adjust; - - seq_adjust = rcu_dereference(nf_nat_seq_adjust_hook); - if (!seq_adjust || - !seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) { + if (!nf_ct_seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) { NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); return NF_DROP; } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index e805481..727f436 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -112,7 +112,8 @@ #define RT_FL_TOS(oldflp4) \ ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)) -#define IP_MAX_MTU 0xFFF0 +/* IPv4 datagram length is stored into 16bit field (tot_len) */ +#define IP_MAX_MTU 0xFFFF #define RT_GC_TIMEOUT (300*HZ) @@ -1227,10 +1228,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst) mtu = 576; } - if (mtu > IP_MAX_MTU) - mtu = IP_MAX_MTU; - - return mtu; + return min_t(unsigned int, mtu, IP_MAX_MTU); } static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index b05c96e..14a15c4 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -160,26 +160,33 @@ static __u16 const msstab[] = { * Generate a syncookie. mssp points to the mss, which is returned * rounded down to the value encoded in the cookie. */ -__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp) +u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th, + u16 *mssp) { - const struct iphdr *iph = ip_hdr(skb); - const struct tcphdr *th = tcp_hdr(skb); int mssind; const __u16 mss = *mssp; - tcp_synq_overflow(sk); - for (mssind = ARRAY_SIZE(msstab) - 1; mssind ; mssind--) if (mss >= msstab[mssind]) break; *mssp = msstab[mssind]; - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT); - return secure_tcp_syn_cookie(iph->saddr, iph->daddr, th->source, th->dest, ntohl(th->seq), jiffies / (HZ * 60), mssind); } +EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence); + +__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp) +{ + const struct iphdr *iph = ip_hdr(skb); + const struct tcphdr *th = tcp_hdr(skb); + + tcp_synq_overflow(sk); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT); + + return __cookie_v4_init_sequence(iph, th, mssp); +} /* * This (misnamed) value is the age of syncookie which is permitted. @@ -192,10 +199,9 @@ __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp) * Check if a ack sequence number is a valid syncookie. * Return the decoded mss if it is, or 0 if not. */ -static inline int cookie_check(struct sk_buff *skb, __u32 cookie) +int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th, + u32 cookie) { - const struct iphdr *iph = ip_hdr(skb); - const struct tcphdr *th = tcp_hdr(skb); __u32 seq = ntohl(th->seq) - 1; __u32 mssind = check_tcp_syn_cookie(cookie, iph->saddr, iph->daddr, th->source, th->dest, seq, @@ -204,6 +210,7 @@ static inline int cookie_check(struct sk_buff *skb, __u32 cookie) return mssind < ARRAY_SIZE(msstab) ? 
msstab[mssind] : 0; } +EXPORT_SYMBOL_GPL(__cookie_v4_check); static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, struct request_sock *req, @@ -284,7 +291,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, goto out; if (tcp_synq_no_recent_overflow(sk) || - (mss = cookie_check(skb, cookie)) == 0) { + (mss = __cookie_v4_check(ip_hdr(skb), th, cookie)) == 0) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED); goto out; } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index ab64eea..4e42c03 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1117,6 +1117,13 @@ new_segment: goto wait_for_memory; /* + * All packets are restored as if they have + * already been sent. + */ + if (tp->repair) + TCP_SKB_CB(skb)->when = tcp_time_stamp; + + /* * Check whether we can use HW checksum. */ if (sk->sk_route_caps & NETIF_F_ALL_CSUM) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e965cc7..ec492ea 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2485,8 +2485,6 @@ static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked) if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { tcp_try_keep_open(sk); - if (inet_csk(sk)->icsk_ca_state != TCP_CA_Open) - tcp_moderate_cwnd(tp); } else { tcp_cwnd_reduction(sk, prior_unsacked, 0); } @@ -3128,11 +3126,24 @@ static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag) inet_csk(sk)->icsk_ca_state != TCP_CA_Open; } +/* Decide whether to run the increase function of congestion control. */ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag) { - const struct tcp_sock *tp = tcp_sk(sk); - return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) && - !tcp_in_cwnd_reduction(sk); + if (tcp_in_cwnd_reduction(sk)) + return false; + + /* If reordering is high then always grow cwnd whenever data is + * delivered regardless of its ordering. Otherwise stay conservative + * and only grow cwnd on in-order delivery in Open state, and retain + * cwnd in Disordered state (RFC5681). A stretched ACK with + * new SACK or ECE mark may first advance cwnd here and later reduce + * cwnd in tcp_fastretrans_alert() based on more states. + */ + if (tcp_sk(sk)->reordering > sysctl_tcp_reordering) + return flag & FLAG_FORWARD_PROGRESS; + + return inet_csk(sk)->icsk_ca_state == TCP_CA_Open && + flag & FLAG_DATA_ACKED; } /* Check that window update is acceptable. @@ -3352,18 +3363,15 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, sack_rtt); acked -= tp->packets_out; + /* Advance cwnd if state allows */ + if (tcp_may_raise_cwnd(sk, flag)) + tcp_cong_avoid(sk, ack, prior_in_flight); + if (tcp_ack_is_dubious(sk, flag)) { /* Advance CWND, if state allows this. 
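The syncookies.c split above turns cookie generation and validation into pure functions of the packet headers, so SYNPROXY can call them without a listening socket. The MSS handling is unchanged: only a small table index fits in the cookie, so the peer's MSS is rounded down to the nearest table entry. Sketched with an illustrative four-entry table (the actual msstab[] values live in syncookies.c):

#include <linux/kernel.h>	/* ARRAY_SIZE */
#include <linux/types.h>

static const __u16 mss_table[] = { 536, 1300, 1440, 1460 };

/* Pick the largest entry not exceeding the peer's MSS, mirroring the
 * lookup loop in __cookie_v4_init_sequence() above.
 */
static __u16 encode_mss(__u16 mss)
{
	int i;

	for (i = ARRAY_SIZE(mss_table) - 1; i; i--)
		if (mss >= mss_table[i])
			break;

	return mss_table[i];	/* e.g. 1400 rounds down to 1300 */
}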
*/ - if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag)) - tcp_cong_avoid(sk, ack, prior_in_flight); is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); tcp_fastretrans_alert(sk, acked, prior_unsacked, is_dupack, flag); - } else { - if (flag & FLAG_DATA_ACKED) - tcp_cong_avoid(sk, ack, prior_in_flight); } - if (tp->tlp_high_seq) tcp_process_tlp_ack(sk, ack, flag); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 05a3d45..09d45d7 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -821,8 +821,7 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, */ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, struct request_sock *req, - u16 queue_mapping, - bool nocache) + u16 queue_mapping) { const struct inet_request_sock *ireq = inet_rsk(req); struct flowi4 fl4; @@ -852,7 +851,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req) { - int res = tcp_v4_send_synack(sk, NULL, req, 0, false); + int res = tcp_v4_send_synack(sk, NULL, req, 0); if (!res) TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index d4943f6..622a437 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c @@ -46,6 +46,10 @@ static unsigned int bufsize __read_mostly = 4096; MODULE_PARM_DESC(bufsize, "Log buffer size in packets (4096)"); module_param(bufsize, uint, 0); +static unsigned int fwmark __read_mostly = 0; +MODULE_PARM_DESC(fwmark, "skb mark to match (0=no mark)"); +module_param(fwmark, uint, 0); + static int full __read_mostly; MODULE_PARM_DESC(full, "Full log (1=every ack packet received, 0=only cwnd changes)"); module_param(full, int, 0); @@ -54,12 +58,16 @@ static const char procname[] = "tcpprobe"; struct tcp_log { ktime_t tstamp; - __be32 saddr, daddr; - __be16 sport, dport; + union { + struct sockaddr raw; + struct sockaddr_in v4; + struct sockaddr_in6 v6; + } src, dst; u16 length; u32 snd_nxt; u32 snd_una; u32 snd_wnd; + u32 rcv_wnd; u32 snd_cwnd; u32 ssthresh; u32 srtt; @@ -86,19 +94,45 @@ static inline int tcp_probe_avail(void) return bufsize - tcp_probe_used() - 1; } +#define tcp_probe_copy_fl_to_si4(inet, si4, mem) \ + do { \ + si4.sin_family = AF_INET; \ + si4.sin_port = inet->inet_##mem##port; \ + si4.sin_addr.s_addr = inet->inet_##mem##addr; \ + } while (0) \ + +#if IS_ENABLED(CONFIG_IPV6) +#define tcp_probe_copy_fl_to_si6(inet, si6, mem) \ + do { \ + struct ipv6_pinfo *pi6 = inet->pinet6; \ + si6.sin6_family = AF_INET6; \ + si6.sin6_port = inet->inet_##mem##port; \ + si6.sin6_addr = pi6->mem##addr; \ + si6.sin6_flowinfo = 0; /* No need here. */ \ + si6.sin6_scope_id = 0; /* No need here. */ \ + } while (0) +#else +#define tcp_probe_copy_fl_to_si6(fl, si6, mem) \ + do { \ + memset(&si6, 0, sizeof(si6)); \ + } while (0) +#endif + /* * Hook inserted to be called before each receive packet. * Note: arguments must match tcp_rcv_established()! 
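Folding the v4 and v6 endpoints into one sockaddr union pays off in tcpprobe_sprint() above: the kernel's %pIS format specifier dispatches on sa_family, and its p and c suffixes append the port and compress IPv6. One call covers both families:

#include <linux/printk.h>
#include <linux/socket.h>

/* Prints "1.2.3.4:80" for AF_INET and "[2001:db8::1]:80" for AF_INET6. */
static void log_endpoint(const struct sockaddr *sa)
{
	pr_info("endpoint %pISpc\n", sa);
}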
*/ static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, - struct tcphdr *th, unsigned int len) + const struct tcphdr *th, unsigned int len) { const struct tcp_sock *tp = tcp_sk(sk); const struct inet_sock *inet = inet_sk(sk); - /* Only update if port matches */ - if ((port == 0 || ntohs(inet->inet_dport) == port || - ntohs(inet->inet_sport) == port) && + /* Only update if port or skb mark matches */ + if (((port == 0 && fwmark == 0) || + ntohs(inet->inet_dport) == port || + ntohs(inet->inet_sport) == port || + (fwmark > 0 && skb->mark == fwmark)) && (full || tp->snd_cwnd != tcp_probe.lastcwnd)) { spin_lock(&tcp_probe.lock); @@ -107,15 +141,25 @@ static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, struct tcp_log *p = tcp_probe.log + tcp_probe.head; p->tstamp = ktime_get(); - p->saddr = inet->inet_saddr; - p->sport = inet->inet_sport; - p->daddr = inet->inet_daddr; - p->dport = inet->inet_dport; + switch (sk->sk_family) { + case AF_INET: + tcp_probe_copy_fl_to_si4(inet, p->src.v4, s); + tcp_probe_copy_fl_to_si4(inet, p->dst.v4, d); + break; + case AF_INET6: + tcp_probe_copy_fl_to_si6(inet, p->src.v6, s); + tcp_probe_copy_fl_to_si6(inet, p->dst.v6, d); + break; + default: + BUG(); + } + p->length = skb->len; p->snd_nxt = tp->snd_nxt; p->snd_una = tp->snd_una; p->snd_cwnd = tp->snd_cwnd; p->snd_wnd = tp->snd_wnd; + p->rcv_wnd = tp->rcv_wnd; p->ssthresh = tcp_current_ssthresh(sk); p->srtt = tp->srtt >> 3; @@ -157,13 +201,11 @@ static int tcpprobe_sprint(char *tbuf, int n) = ktime_to_timespec(ktime_sub(p->tstamp, tcp_probe.start)); return scnprintf(tbuf, n, - "%lu.%09lu %pI4:%u %pI4:%u %d %#x %#x %u %u %u %u\n", + "%lu.%09lu %pISpc %pISpc %d %#x %#x %u %u %u %u %u\n", (unsigned long) tv.tv_sec, (unsigned long) tv.tv_nsec, - &p->saddr, ntohs(p->sport), - &p->daddr, ntohs(p->dport), - p->length, p->snd_nxt, p->snd_una, - p->snd_cwnd, p->ssthresh, p->snd_wnd, p->srtt); + &p->src, &p->dst, p->length, p->snd_nxt, p->snd_una, + p->snd_cwnd, p->ssthresh, p->snd_wnd, p->srtt, p->rcv_wnd); } static ssize_t tcpprobe_read(struct file *file, char __user *buf, @@ -223,6 +265,13 @@ static __init int tcpprobe_init(void) { int ret = -ENOMEM; + /* Warning: if the function signature of tcp_rcv_established, + * has been changed, you also have to change the signature of + * jtcp_rcv_established, otherwise you end up right here! + */ + BUILD_BUG_ON(__same_type(tcp_rcv_established, + jtcp_rcv_established) == 0); + init_waitqueue_head(&tcp_probe.wait); spin_lock_init(&tcp_probe.lock); @@ -241,7 +290,8 @@ static __init int tcpprobe_init(void) if (ret) goto err1; - pr_info("probe registered (port=%d) bufsize=%u\n", port, bufsize); + pr_info("probe registered (port=%d/fwmark=%u) bufsize=%u\n", + port, fwmark, bufsize); return 0; err1: remove_proc_entry(procname, init_net.proc_net); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index ad12f7c..2d6d179 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -99,9 +99,9 @@ #define ACONF_DEBUG 2 #if ACONF_DEBUG >= 3 -#define ADBG(x) printk x +#define ADBG(fmt, ...) printk(fmt, ##__VA_ARGS__) #else -#define ADBG(x) +#define ADBG(fmt, ...) 
do { if (0) printk(fmt, ##__VA_ARGS__); } while (0) #endif #define INFINITY_LIFE_TIME 0xFFFFFFFF @@ -374,9 +374,9 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) dev_hold(dev); if (snmp6_alloc_dev(ndev) < 0) { - ADBG((KERN_WARNING + ADBG(KERN_WARNING "%s: cannot allocate memory for statistics; dev=%s.\n", - __func__, dev->name)); + __func__, dev->name); neigh_parms_release(&nd_tbl, ndev->nd_parms); dev_put(dev); kfree(ndev); @@ -384,9 +384,9 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) } if (snmp6_register_dev(ndev) < 0) { - ADBG((KERN_WARNING + ADBG(KERN_WARNING "%s: cannot create /proc/net/dev_snmp6/%s\n", - __func__, dev->name)); + __func__, dev->name); neigh_parms_release(&nd_tbl, ndev->nd_parms); ndev->dead = 1; in6_dev_finish_destroy(ndev); @@ -849,7 +849,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, /* Ignore adding duplicate addresses on an interface */ if (ipv6_chk_same_addr(dev_net(idev->dev), addr, idev->dev)) { - ADBG(("ipv6_add_addr: already assigned\n")); + ADBG("ipv6_add_addr: already assigned\n"); err = -EEXIST; goto out; } @@ -857,7 +857,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, ifa = kzalloc(sizeof(struct inet6_ifaddr), GFP_ATOMIC); if (ifa == NULL) { - ADBG(("ipv6_add_addr: malloc failed\n")); + ADBG("ipv6_add_addr: malloc failed\n"); err = -ENOBUFS; goto out; } @@ -1131,12 +1131,10 @@ retry: if (ifp->flags & IFA_F_OPTIMISTIC) addr_flags |= IFA_F_OPTIMISTIC; - ift = !max_addresses || - ipv6_count_addresses(idev) < max_addresses ? - ipv6_add_addr(idev, &addr, NULL, tmp_plen, - ipv6_addr_scope(&addr), addr_flags, - tmp_valid_lft, tmp_prefered_lft) : NULL; - if (IS_ERR_OR_NULL(ift)) { + ift = ipv6_add_addr(idev, &addr, NULL, tmp_plen, + ipv6_addr_scope(&addr), addr_flags, + tmp_valid_lft, tmp_prefered_lft); + if (IS_ERR(ift)) { in6_ifa_put(ifp); in6_dev_put(idev); pr_info("%s: retry temporary address regeneration\n", __func__); @@ -1814,6 +1812,16 @@ static int addrconf_ifid_gre(u8 *eui, struct net_device *dev) return __ipv6_isatap_ifid(eui, *(__be32 *)dev->dev_addr); } +static int addrconf_ifid_ip6tnl(u8 *eui, struct net_device *dev) +{ + memcpy(eui, dev->perm_addr, 3); + memcpy(eui + 5, dev->perm_addr + 3, 3); + eui[3] = 0xFF; + eui[4] = 0xFE; + eui[0] ^= 2; + return 0; +} + static int ipv6_generate_eui64(u8 *eui, struct net_device *dev) { switch (dev->type) { @@ -1832,6 +1840,8 @@ static int ipv6_generate_eui64(u8 *eui, struct net_device *dev) return addrconf_ifid_eui64(eui, dev); case ARPHRD_IEEE1394: return addrconf_ifid_ieee1394(eui, dev); + case ARPHRD_TUNNEL6: + return addrconf_ifid_ip6tnl(eui, dev); } return -1; } @@ -2057,7 +2067,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) pinfo = (struct prefix_info *) opt; if (len < sizeof(struct prefix_info)) { - ADBG(("addrconf: prefix option too short\n")); + ADBG("addrconf: prefix option too short\n"); return; } @@ -2709,7 +2719,8 @@ static void addrconf_dev_config(struct net_device *dev) (dev->type != ARPHRD_ARCNET) && (dev->type != ARPHRD_INFINIBAND) && (dev->type != ARPHRD_IEEE802154) && - (dev->type != ARPHRD_IEEE1394)) { + (dev->type != ARPHRD_IEEE1394) && + (dev->type != ARPHRD_TUNNEL6)) { /* Alas, we support only Ethernet autoconfiguration. 
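The ADBG() conversion above does more than drop the double-parenthesis call style: the disabled variant still compiles its arguments inside if (0), so format strings stay type-checked and variables referenced only in debug output never become unused, while the branch is eliminated at compile time. The same pattern in isolation:

#include <linux/printk.h>

#define EXAMPLE_DEBUG 0

#if EXAMPLE_DEBUG
#define example_dbg(fmt, ...) printk(fmt, ##__VA_ARGS__)
#else
/* arguments are parsed and type-checked, but no code is emitted */
#define example_dbg(fmt, ...) \
	do { if (0) printk(fmt, ##__VA_ARGS__); } while (0)
#endif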
*/ return; } @@ -2795,44 +2806,6 @@ ipv6_inherit_linklocal(struct inet6_dev *idev, struct net_device *link_dev) return -1; } -static void ip6_tnl_add_linklocal(struct inet6_dev *idev) -{ - struct net_device *link_dev; - struct net *net = dev_net(idev->dev); - - /* first try to inherit the link-local address from the link device */ - if (idev->dev->iflink && - (link_dev = __dev_get_by_index(net, idev->dev->iflink))) { - if (!ipv6_inherit_linklocal(idev, link_dev)) - return; - } - /* then try to inherit it from any device */ - for_each_netdev(net, link_dev) { - if (!ipv6_inherit_linklocal(idev, link_dev)) - return; - } - pr_debug("init ip6-ip6: add_linklocal failed\n"); -} - -/* - * Autoconfigure tunnel with a link-local address so routing protocols, - * DHCPv6, MLD etc. can be run over the virtual link - */ - -static void addrconf_ip6_tnl_config(struct net_device *dev) -{ - struct inet6_dev *idev; - - ASSERT_RTNL(); - - idev = addrconf_add_dev(dev); - if (IS_ERR(idev)) { - pr_debug("init ip6-ip6: add_dev failed\n"); - return; - } - ip6_tnl_add_linklocal(idev); -} - static int addrconf_notify(struct notifier_block *this, unsigned long event, void *ptr) { @@ -2900,9 +2873,6 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, addrconf_gre_config(dev); break; #endif - case ARPHRD_TUNNEL6: - addrconf_ip6_tnl_config(dev); - break; case ARPHRD_LOOPBACK: init_loopback(dev); break; @@ -3637,8 +3607,8 @@ restart: if (time_before(next_sched, jiffies + ADDRCONF_TIMER_FUZZ_MAX)) next_sched = jiffies + ADDRCONF_TIMER_FUZZ_MAX; - ADBG((KERN_DEBUG "now = %lu, schedule = %lu, rounded schedule = %lu => %lu\n", - now, next, next_sec, next_sched)); + ADBG(KERN_DEBUG "now = %lu, schedule = %lu, rounded schedule = %lu => %lu\n", + now, next, next_sec, next_sched); addr_chk_timer.expires = next_sched; add_timer(&addr_chk_timer); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index cc3bb20..d6e00a3 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -41,6 +41,7 @@ #include <linux/netfilter_ipv6.h> #include <linux/slab.h> #include <linux/hash.h> +#include <linux/etherdevice.h> #include <asm/uaccess.h> #include <linux/atomic.h> @@ -1471,6 +1472,9 @@ static void ip6_tnl_dev_setup(struct net_device *dev) dev->flags |= IFF_NOARP; dev->addr_len = sizeof(struct in6_addr); dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + /* This perm addr will be used as interface identifier by IPv6 */ + dev->addr_assign_type = NET_ADDR_RANDOM; + eth_random_addr(dev->perm_addr); } diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 6c76df9..98ead2b 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -107,9 +107,12 @@ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, struct inet6_dev *idev); - #define MLD_QRV_DEFAULT 2 +/* RFC3810, 8.1 Query Version Distinctions */ +#define MLD_V1_QUERY_LEN 24 +#define MLD_V2_QUERY_LEN_MIN 28 + #define MLD_V1_SEEN(idev) (dev_net((idev)->dev)->ipv6.devconf_all->force_mld_version == 1 || \ (idev)->cnf.force_mld_version == 1 || \ ((idev)->mc_v1_seen && \ @@ -996,24 +999,24 @@ bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, static void mld_gq_start_timer(struct inet6_dev *idev) { - int tv = net_random() % idev->mc_maxdelay; + unsigned long tv = net_random() % idev->mc_maxdelay; idev->mc_gq_running = 1; if (!mod_timer(&idev->mc_gq_timer, jiffies+tv+2)) in6_dev_hold(idev); } -static void mld_ifc_start_timer(struct inet6_dev 
*idev, int delay) +static void mld_ifc_start_timer(struct inet6_dev *idev, unsigned long delay) { - int tv = net_random() % delay; + unsigned long tv = net_random() % delay; if (!mod_timer(&idev->mc_ifc_timer, jiffies+tv+2)) in6_dev_hold(idev); } -static void mld_dad_start_timer(struct inet6_dev *idev, int delay) +static void mld_dad_start_timer(struct inet6_dev *idev, unsigned long delay) { - int tv = net_random() % delay; + unsigned long tv = net_random() % delay; if (!mod_timer(&idev->mc_dad_timer, jiffies+tv+2)) in6_dev_hold(idev); @@ -1146,13 +1149,11 @@ int igmp6_event_query(struct sk_buff *skb) !(group_type&IPV6_ADDR_MULTICAST)) return -EINVAL; - if (len == 24) { + if (len == MLD_V1_QUERY_LEN) { int switchback; /* MLDv1 router present */ - /* Translate milliseconds to jiffies */ - max_delay = (ntohs(mld->mld_maxdelay)*HZ)/1000; - + max_delay = msecs_to_jiffies(ntohs(mld->mld_maxdelay)); switchback = (idev->mc_qrv + 1) * max_delay; idev->mc_v1_seen = jiffies + switchback; @@ -1162,17 +1163,18 @@ int igmp6_event_query(struct sk_buff *skb) __in6_dev_put(idev); /* clear deleted report items */ mld_clear_delrec(idev); - } else if (len >= 28) { + } else if (len >= MLD_V2_QUERY_LEN_MIN) { int srcs_offset = sizeof(struct mld2_query) - sizeof(struct icmp6hdr); if (!pskb_may_pull(skb, srcs_offset)) return -EINVAL; mlh2 = (struct mld2_query *)skb_transport_header(skb); - max_delay = (MLDV2_MRC(ntohs(mlh2->mld2q_mrc))*HZ)/1000; - if (!max_delay) - max_delay = 1; + + max_delay = max(msecs_to_jiffies(MLDV2_MRC(ntohs(mlh2->mld2q_mrc))), 1UL); + idev->mc_maxdelay = max_delay; + if (mlh2->mld2q_qrv) idev->mc_qrv = mlh2->mld2q_qrv; if (group_type == IPV6_ADDR_ANY) { /* general query */ diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 79aa965..04d31c2 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1369,8 +1369,10 @@ static void ndisc_redirect_rcv(struct sk_buff *skb) if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) return; - if (!ndopts.nd_opts_rh) + if (!ndopts.nd_opts_rh) { + ip6_redirect_no_header(skb, dev_net(skb->dev), 0, 0); return; + } hdr = (u8 *)ndopts.nd_opts_rh; hdr += 8; diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 4433ab40..a7f842b 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -153,6 +153,19 @@ config IP6_NF_TARGET_REJECT To compile it as a module, choose M here. If unsure, say N. +config IP6_NF_TARGET_SYNPROXY + tristate "SYNPROXY target support" + depends on NF_CONNTRACK && NETFILTER_ADVANCED + select NETFILTER_SYNPROXY + select SYN_COOKIES + help + The SYNPROXY target allows you to intercept TCP connections and + establish them using syncookies before they are passed on to the + server. This avoids conntrack and server resource usage + during SYN-flood attacks. + + To compile it as a module, choose M here. If unsure, say N.
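The mcast.c hunks above are about integer width as much as readability: RFC 3810's exponent-encoded Maximum Response Code allows delays up to 8387584 ms, and with HZ=1000 the old open-coded (ms*HZ)/1000 conversion computes roughly 8.4e9 in 32-bit arithmetic and wraps, whereas msecs_to_jiffies() returns unsigned long. The corrected conversion, as a one-liner:

#include <linux/jiffies.h>
#include <linux/kernel.h>	/* max() */

/* Clamp to at least one jiffy so a tiny delay cannot become a zero
 * timeout, matching the max(..., 1UL) in igmp6_event_query() above.
 */
static unsigned long mrc_to_jiffies(unsigned int ms)
{
	return max(msecs_to_jiffies(ms), 1UL);
}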
+ config IP6_NF_MANGLE tristate "Packet mangling" default m if NETFILTER_ADVANCED=n diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index 2d11fcc..2b53738 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -14,7 +14,7 @@ obj-$(CONFIG_NF_NAT_IPV6) += ip6table_nat.o nf_conntrack_ipv6-y := nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o # l3 independent conntrack -obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o nf_defrag_ipv6.o +obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o nf_nat_ipv6-y := nf_nat_l3proto_ipv6.o nf_nat_proto_icmpv6.o obj-$(CONFIG_NF_NAT_IPV6) += nf_nat_ipv6.o @@ -37,3 +37,4 @@ obj-$(CONFIG_IP6_NF_MATCH_RT) += ip6t_rt.o obj-$(CONFIG_IP6_NF_TARGET_MASQUERADE) += ip6t_MASQUERADE.o obj-$(CONFIG_IP6_NF_TARGET_NPT) += ip6t_NPT.o obj-$(CONFIG_IP6_NF_TARGET_REJECT) += ip6t_REJECT.o +obj-$(CONFIG_IP6_NF_TARGET_SYNPROXY) += ip6t_SYNPROXY.o diff --git a/net/ipv6/netfilter/ip6t_MASQUERADE.c b/net/ipv6/netfilter/ip6t_MASQUERADE.c index 47bff61..3e4e92d 100644 --- a/net/ipv6/netfilter/ip6t_MASQUERADE.c +++ b/net/ipv6/netfilter/ip6t_MASQUERADE.c @@ -76,7 +76,7 @@ static int masq_device_event(struct notifier_block *this, if (event == NETDEV_DOWN) nf_ct_iterate_cleanup(net, device_cmp, - (void *)(long)dev->ifindex); + (void *)(long)dev->ifindex, 0, 0); return NOTIFY_DONE; } diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c index 70f9abc..56eef30 100644 --- a/net/ipv6/netfilter/ip6t_REJECT.c +++ b/net/ipv6/netfilter/ip6t_REJECT.c @@ -169,7 +169,25 @@ static void send_reset(struct net *net, struct sk_buff *oldskb) nf_ct_attach(nskb, oldskb); - ip6_local_out(nskb); +#ifdef CONFIG_BRIDGE_NETFILTER + /* If we use ip6_local_out for bridged traffic, the MAC source on + * the RST will be ours, instead of the destination's. This confuses + * some routers/firewalls, and they drop the packet. So we need to + * build the eth header using the original destination's MAC as the + * source, and send the RST packet directly. + */ + if (oldskb->nf_bridge) { + struct ethhdr *oeth = eth_hdr(oldskb); + nskb->dev = oldskb->nf_bridge->physindev; + nskb->protocol = htons(ETH_P_IPV6); + ip6h->payload_len = htons(sizeof(struct tcphdr)); + if (dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol), + oeth->h_source, oeth->h_dest, nskb->len) < 0) + return; + dev_queue_xmit(nskb); + } else +#endif + ip6_local_out(nskb); } static inline void diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c new file mode 100644 index 0000000..4270a9b --- /dev/null +++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c @@ -0,0 +1,495 @@ +/* + * Copyright (c) 2013 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <net/ip6_checksum.h> +#include <net/ip6_route.h> +#include <net/tcp.h> + +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_SYNPROXY.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_seqadj.h> +#include <net/netfilter/nf_conntrack_synproxy.h> + +static struct ipv6hdr * +synproxy_build_ip(struct sk_buff *skb, const struct in6_addr *saddr, + const struct in6_addr *daddr) +{ + struct ipv6hdr *iph; + + skb_reset_network_header(skb); + iph = (struct ipv6hdr *)skb_put(skb, sizeof(*iph)); + ip6_flow_hdr(iph, 0, 0); + iph->hop_limit = 64; //XXX + iph->nexthdr = IPPROTO_TCP; + iph->saddr = *saddr; + iph->daddr = *daddr; + + return iph; +} + +static void +synproxy_send_tcp(const struct sk_buff *skb, struct sk_buff *nskb, + struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo, + struct ipv6hdr *niph, struct tcphdr *nth, + unsigned int tcp_hdr_size) +{ + struct net *net = nf_ct_net((struct nf_conn *)nfct); + struct dst_entry *dst; + struct flowi6 fl6; + + nth->check = ~tcp_v6_check(tcp_hdr_size, &niph->saddr, &niph->daddr, 0); + nskb->ip_summed = CHECKSUM_PARTIAL; + nskb->csum_start = (unsigned char *)nth - nskb->head; + nskb->csum_offset = offsetof(struct tcphdr, check); + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_TCP; + fl6.saddr = niph->saddr; + fl6.daddr = niph->daddr; + fl6.fl6_sport = nth->source; + fl6.fl6_dport = nth->dest; + security_skb_classify_flow((struct sk_buff *)skb, flowi6_to_flowi(&fl6)); + dst = ip6_route_output(net, NULL, &fl6); + if (dst == NULL || dst->error) { + dst_release(dst); + goto free_nskb; + } + dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0); + if (IS_ERR(dst)) + goto free_nskb; + + skb_dst_set(nskb, dst); + + if (nfct) { + nskb->nfct = nfct; + nskb->nfctinfo = ctinfo; + nf_conntrack_get(nfct); + } + + ip6_local_out(nskb); + return; + +free_nskb: + kfree_skb(nskb); +} + +static void +synproxy_send_client_synack(const struct sk_buff *skb, const struct tcphdr *th, + const struct synproxy_options *opts) +{ + struct sk_buff *nskb; + struct ipv6hdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + u16 mss = opts->mss; + + iph = ipv6_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (nskb == NULL) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip(nskb, &iph->daddr, &iph->saddr); + + skb_reset_transport_header(nskb); + nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); + nth->source = th->dest; + nth->dest = th->source; + nth->seq = htonl(__cookie_v6_init_sequence(iph, th, &mss)); + nth->ack_seq = htonl(ntohl(th->seq) + 1); + tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK; + if (opts->options & XT_SYNPROXY_OPT_ECN) + tcp_flag_word(nth) |= TCP_FLAG_ECE; + nth->doff = tcp_hdr_size / 4; + nth->window = 0; + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp(skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, + niph, nth, tcp_hdr_size); +} + +static void +synproxy_send_server_syn(const struct synproxy_net *snet, + const struct sk_buff *skb, const struct tcphdr *th, + const struct synproxy_options *opts, u32 recv_seq) +{ + struct sk_buff *nskb; + struct ipv6hdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + + iph = ipv6_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + 
synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (nskb == NULL) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip(nskb, &iph->saddr, &iph->daddr); + + skb_reset_transport_header(nskb); + nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); + nth->source = th->source; + nth->dest = th->dest; + nth->seq = htonl(recv_seq - 1); + /* ack_seq is used to relay our ISN to the synproxy hook to initialize + * sequence number translation once a connection tracking entry exists. + */ + nth->ack_seq = htonl(ntohl(th->ack_seq) - 1); + tcp_flag_word(nth) = TCP_FLAG_SYN; + if (opts->options & XT_SYNPROXY_OPT_ECN) + tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR; + nth->doff = tcp_hdr_size / 4; + nth->window = th->window; + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp(skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW, + niph, nth, tcp_hdr_size); +} + +static void +synproxy_send_server_ack(const struct synproxy_net *snet, + const struct ip_ct_tcp *state, + const struct sk_buff *skb, const struct tcphdr *th, + const struct synproxy_options *opts) +{ + struct sk_buff *nskb; + struct ipv6hdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + + iph = ipv6_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (nskb == NULL) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip(nskb, &iph->daddr, &iph->saddr); + + skb_reset_transport_header(nskb); + nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); + nth->source = th->dest; + nth->dest = th->source; + nth->seq = htonl(ntohl(th->ack_seq)); + nth->ack_seq = htonl(ntohl(th->seq) + 1); + tcp_flag_word(nth) = TCP_FLAG_ACK; + nth->doff = tcp_hdr_size / 4; + nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin); + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); +} + +static void +synproxy_send_client_ack(const struct synproxy_net *snet, + const struct sk_buff *skb, const struct tcphdr *th, + const struct synproxy_options *opts) +{ + struct sk_buff *nskb; + struct ipv6hdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + + iph = ipv6_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (nskb == NULL) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip(nskb, &iph->saddr, &iph->daddr); + + skb_reset_transport_header(nskb); + nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); + nth->source = th->source; + nth->dest = th->dest; + nth->seq = htonl(ntohl(th->seq) + 1); + nth->ack_seq = th->ack_seq; + tcp_flag_word(nth) = TCP_FLAG_ACK; + nth->doff = tcp_hdr_size / 4; + nth->window = ntohs(htons(th->window) >> opts->wscale); + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); +} + +static bool +synproxy_recv_client_ack(const struct synproxy_net *snet, + const struct sk_buff *skb, const struct tcphdr *th, + struct synproxy_options *opts, u32 recv_seq) +{ + int mss; + + mss = __cookie_v6_check(ipv6_hdr(skb), th, ntohl(th->ack_seq) - 1); + if (mss == 0) { + this_cpu_inc(snet->stats->cookie_invalid); + return false; + } + + this_cpu_inc(snet->stats->cookie_valid); 
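+ /* Only the ACK of a valid cookie reaches this point: the MSS decoded
+ * from the cookie is the sole state that survived the stateless
+ * SYN/ACK, so restore it into opts before replaying the deferred SYN
+ * to the server below.
+ */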
+ opts->mss = mss; + + if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP) + synproxy_check_timestamp_cookie(opts); + + synproxy_send_server_syn(snet, skb, th, opts, recv_seq); + return true; +} + +static unsigned int +synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_synproxy_info *info = par->targinfo; + struct synproxy_net *snet = synproxy_pernet(dev_net(par->in)); + struct synproxy_options opts = {}; + struct tcphdr *th, _th; + + if (nf_ip6_checksum(skb, par->hooknum, par->thoff, IPPROTO_TCP)) + return NF_DROP; + + th = skb_header_pointer(skb, par->thoff, sizeof(_th), &_th); + if (th == NULL) + return NF_DROP; + + synproxy_parse_options(skb, par->thoff, th, &opts); + + if (th->syn) { + /* Initial SYN from client */ + this_cpu_inc(snet->stats->syn_received); + + if (th->ece && th->cwr) + opts.options |= XT_SYNPROXY_OPT_ECN; + + opts.options &= info->options; + if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) + synproxy_init_timestamp_cookie(info, &opts); + else + opts.options &= ~(XT_SYNPROXY_OPT_WSCALE | + XT_SYNPROXY_OPT_SACK_PERM | + XT_SYNPROXY_OPT_ECN); + + synproxy_send_client_synack(skb, th, &opts); + } else if (th->ack && !(th->fin || th->rst)) + /* ACK from client */ + synproxy_recv_client_ack(snet, skb, th, &opts, ntohl(th->seq)); + + return NF_DROP; +} + +static unsigned int ipv6_synproxy_hook(unsigned int hooknum, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct synproxy_net *snet = synproxy_pernet(dev_net(in ? : out)); + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + struct nf_conn_synproxy *synproxy; + struct synproxy_options opts = {}; + const struct ip_ct_tcp *state; + struct tcphdr *th, _th; + __be16 frag_off; + u8 nexthdr; + int thoff; + + ct = nf_ct_get(skb, &ctinfo); + if (ct == NULL) + return NF_ACCEPT; + + synproxy = nfct_synproxy(ct); + if (synproxy == NULL) + return NF_ACCEPT; + + if (nf_is_loopback_packet(skb)) + return NF_ACCEPT; + + nexthdr = ipv6_hdr(skb)->nexthdr; + thoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, + &frag_off); + if (thoff < 0) + return NF_ACCEPT; + + th = skb_header_pointer(skb, thoff, sizeof(_th), &_th); + if (th == NULL) + return NF_DROP; + + state = &ct->proto.tcp; + switch (state->state) { + case TCP_CONNTRACK_CLOSE: + if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { + nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - + ntohl(th->seq) + 1); + break; + } + + if (!th->syn || th->ack || + CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) + break; + + /* Reopened connection - reset the sequence number and timestamp + * adjustments, they will get initialized once the connection is + * reestablished. + */ + nf_ct_seqadj_init(ct, ctinfo, 0); + synproxy->tsoff = 0; + this_cpu_inc(snet->stats->conn_reopened); + + /* fall through */ + case TCP_CONNTRACK_SYN_SENT: + synproxy_parse_options(skb, thoff, th, &opts); + + if (!th->syn && th->ack && + CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { + /* Keep-Alives are sent with SEG.SEQ = SND.NXT-1, + * therefore we need to add 1 to make the SYN sequence + * number match the one of first SYN. 
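+ * Concretely: a client whose SYN carried seq 1000 sends its handshake
+ * ACK with seq 1001, while a keep-alive probe sent from SYN_SENT
+ * carries seq 1000 again; adding 1 lets both take the same path and
+ * regenerate the same server-bound SYN.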
+ */ + if (synproxy_recv_client_ack(snet, skb, th, &opts, + ntohl(th->seq) + 1)) + this_cpu_inc(snet->stats->cookie_retrans); + + return NF_DROP; + } + + synproxy->isn = ntohl(th->ack_seq); + if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) + synproxy->its = opts.tsecr; + break; + case TCP_CONNTRACK_SYN_RECV: + if (!th->syn || !th->ack) + break; + + synproxy_parse_options(skb, thoff, th, &opts); + if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) + synproxy->tsoff = opts.tsval - synproxy->its; + + opts.options &= ~(XT_SYNPROXY_OPT_MSS | + XT_SYNPROXY_OPT_WSCALE | + XT_SYNPROXY_OPT_SACK_PERM); + + swap(opts.tsval, opts.tsecr); + synproxy_send_server_ack(snet, state, skb, th, &opts); + + nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq)); + + swap(opts.tsval, opts.tsecr); + synproxy_send_client_ack(snet, skb, th, &opts); + + consume_skb(skb); + return NF_STOLEN; + default: + break; + } + + synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy); + return NF_ACCEPT; +} + +static int synproxy_tg6_check(const struct xt_tgchk_param *par) +{ + const struct ip6t_entry *e = par->entryinfo; + + if (!(e->ipv6.flags & IP6T_F_PROTO) || + e->ipv6.proto != IPPROTO_TCP || + e->ipv6.invflags & XT_INV_PROTO) + return -EINVAL; + + return nf_ct_l3proto_try_module_get(par->family); +} + +static void synproxy_tg6_destroy(const struct xt_tgdtor_param *par) +{ + nf_ct_l3proto_module_put(par->family); +} + +static struct xt_target synproxy_tg6_reg __read_mostly = { + .name = "SYNPROXY", + .family = NFPROTO_IPV6, + .target = synproxy_tg6, + .targetsize = sizeof(struct xt_synproxy_info), + .checkentry = synproxy_tg6_check, + .destroy = synproxy_tg6_destroy, + .me = THIS_MODULE, +}; + +static struct nf_hook_ops ipv6_synproxy_ops[] __read_mostly = { + { + .hook = ipv6_synproxy_hook, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, + }, + { + .hook = ipv6_synproxy_hook, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, + }, +}; + +static int __init synproxy_tg6_init(void) +{ + int err; + + err = nf_register_hooks(ipv6_synproxy_ops, + ARRAY_SIZE(ipv6_synproxy_ops)); + if (err < 0) + goto err1; + + err = xt_register_target(&synproxy_tg6_reg); + if (err < 0) + goto err2; + + return 0; + +err2: + nf_unregister_hooks(ipv6_synproxy_ops, ARRAY_SIZE(ipv6_synproxy_ops)); +err1: + return err; +} + +static void __exit synproxy_tg6_exit(void) +{ + xt_unregister_target(&synproxy_tg6_reg); + nf_unregister_hooks(ipv6_synproxy_ops, ARRAY_SIZE(ipv6_synproxy_ops)); +} + +module_init(synproxy_tg6_init); +module_exit(synproxy_tg6_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index c9b6a6e..d6e4dd8 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -28,6 +28,7 @@ #include <net/netfilter/nf_conntrack_l3proto.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_zones.h> +#include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/ipv6/nf_conntrack_ipv6.h> #include <net/netfilter/nf_nat_helper.h> #include <net/netfilter/ipv6/nf_defrag_ipv6.h> @@ -158,11 +159,7 @@ static unsigned int ipv6_confirm(unsigned int hooknum, /* adjust seqs for loopback traffic only in outgoing direction */ if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && 
!nf_is_loopback_packet(skb)) { - typeof(nf_nat_seq_adjust_hook) seq_adjust; - - seq_adjust = rcu_dereference(nf_nat_seq_adjust_hook); - if (!seq_adjust || - !seq_adjust(skb, ct, ctinfo, protoff)) { + if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) { NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); return NF_DROP; } diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 790d9f4..1aeb473 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -490,6 +490,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, ipv6_hdr(head)->payload_len = htons(payload_len); ipv6_change_dsfield(ipv6_hdr(head), 0xff, ecn); IP6CB(head)->nhoff = nhoff; + IP6CB(head)->flags |= IP6SKB_FRAGMENTED; /* Yes, and fold redundant checksum back. 8) */ if (head->ip_summed == CHECKSUM_COMPLETE) @@ -524,6 +525,9 @@ static int ipv6_frag_rcv(struct sk_buff *skb) struct net *net = dev_net(skb_dst(skb)->dev); int evicted; + if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED) + goto fail_hdr; + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMREQDS); /* Jumbo payload inhibits frag. header */ @@ -544,6 +548,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb) ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMOKS); IP6CB(skb)->nhoff = (u8 *)fhdr - skb_network_header(skb); + IP6CB(skb)->flags |= IP6SKB_FRAGMENTED; return 1; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index e22c4db..55236a8 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1177,6 +1177,27 @@ void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark) } EXPORT_SYMBOL_GPL(ip6_redirect); +void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif, + u32 mark) +{ + const struct ipv6hdr *iph = ipv6_hdr(skb); + const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb); + struct dst_entry *dst; + struct flowi6 fl6; + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_oif = oif; + fl6.flowi6_mark = mark; + fl6.flowi6_flags = 0; + fl6.daddr = msg->dest; + fl6.saddr = iph->daddr; + + dst = ip6_route_output(net, NULL, &fl6); + if (!dst->error) + rt6_do_redirect(dst, NULL, skb); + dst_release(dst); +} + void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) { ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark); diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index d5dda20..bf63ac8 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -112,32 +112,38 @@ static __u32 check_tcp_syn_cookie(__u32 cookie, const struct in6_addr *saddr, & COOKIEMASK; } -__u32 cookie_v6_init_sequence(struct sock *sk, const struct sk_buff *skb, __u16 *mssp) +u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph, + const struct tcphdr *th, __u16 *mssp) { - const struct ipv6hdr *iph = ipv6_hdr(skb); - const struct tcphdr *th = tcp_hdr(skb); int mssind; const __u16 mss = *mssp; - tcp_synq_overflow(sk); - for (mssind = ARRAY_SIZE(msstab) - 1; mssind ; mssind--) if (mss >= msstab[mssind]) break; *mssp = msstab[mssind]; - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT); - return secure_tcp_syn_cookie(&iph->saddr, &iph->daddr, th->source, th->dest, ntohl(th->seq), jiffies / (HZ * 60), mssind); } +EXPORT_SYMBOL_GPL(__cookie_v6_init_sequence); -static inline int cookie_check(const struct sk_buff *skb, __u32 cookie) +__u32 cookie_v6_init_sequence(struct sock *sk, const struct sk_buff *skb, __u16 *mssp) { const struct ipv6hdr *iph = ipv6_hdr(skb); const struct tcphdr *th = tcp_hdr(skb); + + tcp_synq_overflow(sk); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT); + + return 
__cookie_v6_init_sequence(iph, th, mssp); +} + +int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th, + __u32 cookie) +{ __u32 seq = ntohl(th->seq) - 1; __u32 mssind = check_tcp_syn_cookie(cookie, &iph->saddr, &iph->daddr, th->source, th->dest, seq, @@ -145,6 +151,7 @@ static inline int cookie_check(const struct sk_buff *skb, __u32 cookie) return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0; } +EXPORT_SYMBOL_GPL(__cookie_v6_check); struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) { @@ -167,7 +174,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) goto out; if (tcp_synq_no_recent_overflow(sk) || - (mss = cookie_check(skb, cookie)) == 0) { + (mss = __cookie_v6_check(ipv6_hdr(skb), th, cookie)) == 0) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED); goto out; } diff --git a/net/key/af_key.c b/net/key/af_key.c index ab8bd2c..9d58537 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -45,7 +45,7 @@ struct netns_pfkey { static DEFINE_MUTEX(pfkey_mutex); #define DUMMY_MARK 0 -static struct xfrm_mark dummy_mark = {0, 0}; +static const struct xfrm_mark dummy_mark = {0, 0}; struct pfkey_sock { /* struct sock must be the first member of struct pfkey_sock */ struct sock sk; @@ -338,7 +338,7 @@ static int pfkey_error(const struct sadb_msg *orig, int err, struct sock *sk) return 0; } -static u8 sadb_ext_min_len[] = { +static const u8 sadb_ext_min_len[] = { [SADB_EXT_RESERVED] = (u8) 0, [SADB_EXT_SA] = (u8) sizeof(struct sadb_sa), [SADB_EXT_LIFETIME_CURRENT] = (u8) sizeof(struct sadb_lifetime), @@ -1196,10 +1196,6 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net, x->props.family = pfkey_sadb_addr2xfrm_addr((struct sadb_address *) ext_hdrs[SADB_EXT_ADDRESS_SRC-1], &x->props.saddr); - if (!x->props.family) { - err = -EAFNOSUPPORT; - goto out; - } pfkey_sadb_addr2xfrm_addr((struct sadb_address *) ext_hdrs[SADB_EXT_ADDRESS_DST-1], &x->id.daddr); @@ -2205,10 +2201,6 @@ static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, const struct sadb_ sa = ext_hdrs[SADB_EXT_ADDRESS_SRC-1]; xp->family = pfkey_sadb_addr2xfrm_addr(sa, &xp->selector.saddr); - if (!xp->family) { - err = -EINVAL; - goto out; - } xp->selector.family = xp->family; xp->selector.prefixlen_s = sa->sadb_address_prefixlen; xp->selector.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto); @@ -2737,7 +2729,7 @@ static int pfkey_spdflush(struct sock *sk, struct sk_buff *skb, const struct sad typedef int (*pfkey_handler)(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs); -static pfkey_handler pfkey_funcs[SADB_MAX + 1] = { +static const pfkey_handler pfkey_funcs[SADB_MAX + 1] = { [SADB_RESERVED] = pfkey_reserved, [SADB_GETSPI] = pfkey_getspi, [SADB_UPDATE] = pfkey_add, diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 56d22ca..62a171a 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -408,21 +408,10 @@ config NF_NAT_TFTP depends on NF_CONNTRACK && NF_NAT default NF_NAT && NF_CONNTRACK_TFTP -endif # NF_CONNTRACK - -# transparent proxy support -config NETFILTER_TPROXY - tristate "Transparent proxying support" - depends on IP_NF_MANGLE - depends on NETFILTER_ADVANCED - help - This option enables transparent proxying support, that is, - support for handling non-locally bound IPv4 TCP and UDP sockets. - For it to work you will have to configure certain iptables rules - and use policy routing. For more information on how to set it up - see Documentation/networking/tproxy.txt. 
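The NETFILTER_SYNPROXY symbol introduced next is deliberately promptless: users are not meant to enable it directly; the per-family SYNPROXY targets are expected to select it, and the Makefile hunk below builds nf_synproxy_core.o from it.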
+config NETFILTER_SYNPROXY + tristate - To compile it as a module, choose M here. If unsure, say N. +endif # NF_CONNTRACK config NETFILTER_XTABLES tristate "Netfilter Xtables support (required for ip_tables)" @@ -720,10 +709,10 @@ config NETFILTER_XT_TARGET_TEE this clone be rerouted to another nexthop. config NETFILTER_XT_TARGET_TPROXY - tristate '"TPROXY" target support' - depends on NETFILTER_TPROXY + tristate '"TPROXY" target transparent proxying support' depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED + depends on IP_NF_MANGLE select NF_DEFRAG_IPV4 select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES help @@ -731,6 +720,9 @@ config NETFILTER_XT_TARGET_TPROXY REDIRECT. It can only be used in the mangle table and is useful to redirect traffic to a transparent proxy. It does _not_ depend on Netfilter connection tracking and NAT, unlike REDIRECT. + For it to work you will have to configure certain iptables rules + and use policy routing. For more information on how to set it up + see Documentation/networking/tproxy.txt. To compile it as a module, choose M here. If unsure, say N. @@ -1180,7 +1172,6 @@ config NETFILTER_XT_MATCH_SCTP config NETFILTER_XT_MATCH_SOCKET tristate '"socket" match support' - depends on NETFILTER_TPROXY depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED depends on !NF_CONNTRACK || NF_CONNTRACK diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index a1abf87..c3a0a12 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -1,6 +1,6 @@ netfilter-objs := core.o nf_log.o nf_queue.o nf_sockopt.o -nf_conntrack-y := nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o nf_conntrack_proto.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o nf_conntrack_extend.o nf_conntrack_acct.o +nf_conntrack-y := nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o nf_conntrack_proto.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o nf_conntrack_extend.o nf_conntrack_acct.o nf_conntrack_seqadj.o nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMEOUT) += nf_conntrack_timeout.o nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o @@ -61,8 +61,8 @@ obj-$(CONFIG_NF_NAT_IRC) += nf_nat_irc.o obj-$(CONFIG_NF_NAT_SIP) += nf_nat_sip.o obj-$(CONFIG_NF_NAT_TFTP) += nf_nat_tftp.o -# transparent proxy support -obj-$(CONFIG_NETFILTER_TPROXY) += nf_tproxy_core.o +# SYNPROXY +obj-$(CONFIG_NETFILTER_SYNPROXY) += nf_synproxy_core.o # generic X tables obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 2217363..593b16e 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -234,12 +234,13 @@ EXPORT_SYMBOL(skb_make_writable); /* This does not belong here, but locally generated errors need it if connection tracking in use: without this, connection may not be in hash table, and hence manufactured ICMP or RST packets will not be associated with it. 
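   The signature change below is const-correctness only: nf_ct_attach()
   and the ip_ct_attach hook read the original packet's conntrack
   reference without ever modifying the skb, so REJECT-style callers can
   now pass a const pointer.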
*/ -void (*ip_ct_attach)(struct sk_buff *, struct sk_buff *) __rcu __read_mostly; +void (*ip_ct_attach)(struct sk_buff *, const struct sk_buff *) + __rcu __read_mostly; EXPORT_SYMBOL(ip_ct_attach); -void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) +void nf_ct_attach(struct sk_buff *new, const struct sk_buff *skb) { - void (*attach)(struct sk_buff *, struct sk_buff *); + void (*attach)(struct sk_buff *, const struct sk_buff *); if (skb->nfct) { rcu_read_lock(); diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index 3cd85b2..5199448 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -414,7 +414,7 @@ static void ip_vs_lblcr_flush(struct ip_vs_service *svc) spin_lock_bh(&svc->sched_lock); tbl->dead = 1; - for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) { + for (i = 0; i < IP_VS_LBLCR_TAB_SIZE; i++) { hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) { ip_vs_lblcr_free(en); } @@ -440,7 +440,7 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc) struct ip_vs_lblcr_entry *en; struct hlist_node *next; - for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) { + for (i = 0, j = tbl->rover; i < IP_VS_LBLCR_TAB_SIZE; i++) { j = (j + 1) & IP_VS_LBLCR_TAB_MASK; spin_lock(&svc->sched_lock); @@ -495,7 +495,7 @@ static void ip_vs_lblcr_check_expire(unsigned long data) if (goal > tbl->max_size/2) goal = tbl->max_size/2; - for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) { + for (i = 0, j = tbl->rover; i < IP_VS_LBLCR_TAB_SIZE; i++) { j = (j + 1) & IP_VS_LBLCR_TAB_MASK; spin_lock(&svc->sched_lock); @@ -536,7 +536,7 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) /* * Initialize the hash buckets */ - for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) { + for (i = 0; i < IP_VS_LBLCR_TAB_SIZE; i++) { INIT_HLIST_HEAD(&tbl->bucket[i]); } tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16; diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c index f16c027..3588fae 100644 --- a/net/netfilter/ipvs/ip_vs_sh.c +++ b/net/netfilter/ipvs/ip_vs_sh.c @@ -269,14 +269,20 @@ ip_vs_sh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph) switch (iph->protocol) { case IPPROTO_TCP: th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph); + if (unlikely(th == NULL)) + return 0; port = th->source; break; case IPPROTO_UDP: uh = skb_header_pointer(skb, iph->len, sizeof(_udph), &_udph); + if (unlikely(uh == NULL)) + return 0; port = uh->source; break; case IPPROTO_SCTP: sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph); + if (unlikely(sh == NULL)) + return 0; port = sh->source; break; default: diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 0283bae..5d892fe 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -39,6 +39,7 @@ #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_extend.h> #include <net/netfilter/nf_conntrack_acct.h> @@ -47,6 +48,7 @@ #include <net/netfilter/nf_conntrack_timestamp.h> #include <net/netfilter/nf_conntrack_timeout.h> #include <net/netfilter/nf_conntrack_labels.h> +#include <net/netfilter/nf_conntrack_synproxy.h> #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_nat_core.h> #include <net/netfilter/nf_nat_helper.h> @@ -238,7 +240,7 @@ destroy_conntrack(struct 
nf_conntrack *nfct) nf_conntrack_free(ct); } -void nf_ct_delete_from_lists(struct nf_conn *ct) +static void nf_ct_delete_from_lists(struct nf_conn *ct) { struct net *net = nf_ct_net(ct); @@ -253,7 +255,6 @@ void nf_ct_delete_from_lists(struct nf_conn *ct) &net->ct.dying); spin_unlock_bh(&nf_conntrack_lock); } -EXPORT_SYMBOL_GPL(nf_ct_delete_from_lists); static void death_by_event(unsigned long ul_conntrack) { @@ -275,7 +276,7 @@ static void death_by_event(unsigned long ul_conntrack) nf_ct_put(ct); } -void nf_ct_dying_timeout(struct nf_conn *ct) +static void nf_ct_dying_timeout(struct nf_conn *ct) { struct net *net = nf_ct_net(ct); struct nf_conntrack_ecache *ecache = nf_ct_ecache_find(ct); @@ -288,27 +289,33 @@ void nf_ct_dying_timeout(struct nf_conn *ct) (prandom_u32() % net->ct.sysctl_events_retry_timeout); add_timer(&ecache->timeout); } -EXPORT_SYMBOL_GPL(nf_ct_dying_timeout); -static void death_by_timeout(unsigned long ul_conntrack) +bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report) { - struct nf_conn *ct = (void *)ul_conntrack; struct nf_conn_tstamp *tstamp; tstamp = nf_conn_tstamp_find(ct); if (tstamp && tstamp->stop == 0) tstamp->stop = ktime_to_ns(ktime_get_real()); - if (!test_bit(IPS_DYING_BIT, &ct->status) && - unlikely(nf_conntrack_event(IPCT_DESTROY, ct) < 0)) { + if (!nf_ct_is_dying(ct) && + unlikely(nf_conntrack_event_report(IPCT_DESTROY, ct, + portid, report) < 0)) { /* destroy event was not delivered */ nf_ct_delete_from_lists(ct); nf_ct_dying_timeout(ct); - return; + return false; } set_bit(IPS_DYING_BIT, &ct->status); nf_ct_delete_from_lists(ct); nf_ct_put(ct); + return true; +} +EXPORT_SYMBOL_GPL(nf_ct_delete); + +static void death_by_timeout(unsigned long ul_conntrack) +{ + nf_ct_delete((struct nf_conn *)ul_conntrack, 0, 0); } /* @@ -643,10 +650,7 @@ static noinline int early_drop(struct net *net, unsigned int hash) return dropped; if (del_timer(&ct->timeout)) { - death_by_timeout((unsigned long)ct); - /* Check if we indeed killed this entry. Reliable event - delivery may have inserted it into the dying list. */ - if (test_bit(IPS_DYING_BIT, &ct->status)) { + if (nf_ct_delete(ct, 0, 0)) { dropped = 1; NF_CT_STAT_INC_ATOMIC(net, early_drop); } @@ -796,6 +800,11 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, if (IS_ERR(ct)) return (struct nf_conntrack_tuple_hash *)ct; + if (tmpl && nfct_synproxy(tmpl)) { + nfct_seqadj_ext_add(ct); + nfct_synproxy_ext_add(ct); + } + timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL; if (timeout_ext) timeouts = NF_CT_TIMEOUT_EXT_DATA(timeout_ext); @@ -1192,7 +1201,7 @@ EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size); #endif /* Used by ipt_REJECT and ip6t_REJECT. */ -static void nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb) +static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; @@ -1244,7 +1253,7 @@ found: void nf_ct_iterate_cleanup(struct net *net, int (*iter)(struct nf_conn *i, void *data), - void *data) + void *data, u32 portid, int report) { struct nf_conn *ct; unsigned int bucket = 0; @@ -1252,7 +1261,8 @@ void nf_ct_iterate_cleanup(struct net *net, while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) { /* Time to push up daises... */ if (del_timer(&ct->timeout)) - death_by_timeout((unsigned long)ct); + nf_ct_delete(ct, portid, report); + /* ... else the timer will get him soon. 
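   Losing the del_timer() race is harmless here: death_by_timeout() now
   funnels into the same nf_ct_delete() path, so the entry is torn down
   either way and this walk only drops its own reference.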
*/ nf_ct_put(ct); @@ -1260,30 +1270,6 @@ void nf_ct_iterate_cleanup(struct net *net, } EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup); -struct __nf_ct_flush_report { - u32 portid; - int report; -}; - -static int kill_report(struct nf_conn *i, void *data) -{ - struct __nf_ct_flush_report *fr = (struct __nf_ct_flush_report *)data; - struct nf_conn_tstamp *tstamp; - - tstamp = nf_conn_tstamp_find(i); - if (tstamp && tstamp->stop == 0) - tstamp->stop = ktime_to_ns(ktime_get_real()); - - /* If we fail to deliver the event, death_by_timeout() will retry */ - if (nf_conntrack_event_report(IPCT_DESTROY, i, - fr->portid, fr->report) < 0) - return 1; - - /* Avoid the delivery of the destroy event in death_by_timeout(). */ - set_bit(IPS_DYING_BIT, &i->status); - return 1; -} - static int kill_all(struct nf_conn *i, void *data) { return 1; @@ -1301,11 +1287,7 @@ EXPORT_SYMBOL_GPL(nf_ct_free_hashtable); void nf_conntrack_flush_report(struct net *net, u32 portid, int report) { - struct __nf_ct_flush_report fr = { - .portid = portid, - .report = report, - }; - nf_ct_iterate_cleanup(net, kill_report, &fr); + nf_ct_iterate_cleanup(net, kill_all, NULL, portid, report); } EXPORT_SYMBOL_GPL(nf_conntrack_flush_report); @@ -1351,6 +1333,7 @@ void nf_conntrack_cleanup_end(void) nf_ct_extend_unregister(&nf_ct_zone_extend); #endif nf_conntrack_proto_fini(); + nf_conntrack_seqadj_fini(); nf_conntrack_labels_fini(); nf_conntrack_helper_fini(); nf_conntrack_timeout_fini(); @@ -1386,7 +1369,7 @@ void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list) i_see_dead_people: busy = 0; list_for_each_entry(net, net_exit_list, exit_list) { - nf_ct_iterate_cleanup(net, kill_all, NULL); + nf_ct_iterate_cleanup(net, kill_all, NULL, 0, 0); nf_ct_release_dying_list(net); if (atomic_read(&net->ct.count) != 0) busy = 1; @@ -1556,6 +1539,10 @@ int nf_conntrack_init_start(void) if (ret < 0) goto err_labels; + ret = nf_conntrack_seqadj_init(); + if (ret < 0) + goto err_seqadj; + #ifdef CONFIG_NF_CONNTRACK_ZONES ret = nf_ct_extend_register(&nf_ct_zone_extend); if (ret < 0) @@ -1580,6 +1567,8 @@ err_proto: nf_ct_extend_unregister(&nf_ct_zone_extend); err_extend: #endif + nf_conntrack_seqadj_fini(); +err_seqadj: nf_conntrack_labels_fini(); err_labels: nf_conntrack_helper_fini(); @@ -1602,9 +1591,6 @@ void nf_conntrack_init_end(void) /* For use by REJECT target */ RCU_INIT_POINTER(ip_ct_attach, nf_conntrack_attach); RCU_INIT_POINTER(nf_ct_destroy, destroy_conntrack); - - /* Howto get NAT offsets */ - RCU_INIT_POINTER(nf_ct_nat_offset, NULL); } /* @@ -1691,8 +1677,3 @@ err_slabname: err_stat: return ret; } - -s16 (*nf_ct_nat_offset)(const struct nf_conn *ct, - enum ip_conntrack_dir dir, - u32 seq); -EXPORT_SYMBOL_GPL(nf_ct_nat_offset); diff --git a/net/netfilter/nf_conntrack_labels.c b/net/netfilter/nf_conntrack_labels.c index 355d2ef..bb53f12 100644 --- a/net/netfilter/nf_conntrack_labels.c +++ b/net/netfilter/nf_conntrack_labels.c @@ -8,12 +8,8 @@ * published by the Free Software Foundation. 
*/ -#include <linux/ctype.h> #include <linux/export.h> -#include <linux/jhash.h> -#include <linux/spinlock.h> #include <linux/types.h> -#include <linux/slab.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_labels.h> diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index edc410e..eea936b 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -37,6 +37,7 @@ #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/nf_conntrack_l3proto.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_tuple.h> @@ -381,9 +382,8 @@ nla_put_failure: return -1; } -#ifdef CONFIG_NF_NAT_NEEDED static int -dump_nat_seq_adj(struct sk_buff *skb, const struct nf_nat_seq *natseq, int type) +dump_ct_seq_adj(struct sk_buff *skb, const struct nf_ct_seqadj *seq, int type) { struct nlattr *nest_parms; @@ -391,12 +391,12 @@ dump_nat_seq_adj(struct sk_buff *skb, const struct nf_nat_seq *natseq, int type) if (!nest_parms) goto nla_put_failure; - if (nla_put_be32(skb, CTA_NAT_SEQ_CORRECTION_POS, - htonl(natseq->correction_pos)) || - nla_put_be32(skb, CTA_NAT_SEQ_OFFSET_BEFORE, - htonl(natseq->offset_before)) || - nla_put_be32(skb, CTA_NAT_SEQ_OFFSET_AFTER, - htonl(natseq->offset_after))) + if (nla_put_be32(skb, CTA_SEQADJ_CORRECTION_POS, + htonl(seq->correction_pos)) || + nla_put_be32(skb, CTA_SEQADJ_OFFSET_BEFORE, + htonl(seq->offset_before)) || + nla_put_be32(skb, CTA_SEQADJ_OFFSET_AFTER, + htonl(seq->offset_after))) goto nla_put_failure; nla_nest_end(skb, nest_parms); @@ -408,27 +408,24 @@ nla_put_failure: } static inline int -ctnetlink_dump_nat_seq_adj(struct sk_buff *skb, const struct nf_conn *ct) +ctnetlink_dump_ct_seq_adj(struct sk_buff *skb, const struct nf_conn *ct) { - struct nf_nat_seq *natseq; - struct nf_conn_nat *nat = nfct_nat(ct); + struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); + struct nf_ct_seqadj *seq; - if (!(ct->status & IPS_SEQ_ADJUST) || !nat) + if (!(ct->status & IPS_SEQ_ADJUST) || !seqadj) return 0; - natseq = &nat->seq[IP_CT_DIR_ORIGINAL]; - if (dump_nat_seq_adj(skb, natseq, CTA_NAT_SEQ_ADJ_ORIG) == -1) + seq = &seqadj->seq[IP_CT_DIR_ORIGINAL]; + if (dump_ct_seq_adj(skb, seq, CTA_SEQ_ADJ_ORIG) == -1) return -1; - natseq = &nat->seq[IP_CT_DIR_REPLY]; - if (dump_nat_seq_adj(skb, natseq, CTA_NAT_SEQ_ADJ_REPLY) == -1) + seq = &seqadj->seq[IP_CT_DIR_REPLY]; + if (dump_ct_seq_adj(skb, seq, CTA_SEQ_ADJ_REPLY) == -1) return -1; return 0; } -#else -#define ctnetlink_dump_nat_seq_adj(a, b) (0) -#endif static inline int ctnetlink_dump_id(struct sk_buff *skb, const struct nf_conn *ct) @@ -502,7 +499,7 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, ctnetlink_dump_id(skb, ct) < 0 || ctnetlink_dump_use(skb, ct) < 0 || ctnetlink_dump_master(skb, ct) < 0 || - ctnetlink_dump_nat_seq_adj(skb, ct) < 0) + ctnetlink_dump_ct_seq_adj(skb, ct) < 0) goto nla_put_failure; nlmsg_end(skb, nlh); @@ -707,8 +704,8 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) ctnetlink_dump_master(skb, ct) < 0) goto nla_put_failure; - if (events & (1 << IPCT_NATSEQADJ) && - ctnetlink_dump_nat_seq_adj(skb, ct) < 0) + if (events & (1 << IPCT_SEQADJ) && + ctnetlink_dump_ct_seq_adj(skb, ct) < 0) goto nla_put_failure; } @@ -1038,21 +1035,9 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct 
sk_buff *skb, } } - if (del_timer(&ct->timeout)) { - if (nf_conntrack_event_report(IPCT_DESTROY, ct, - NETLINK_CB(skb).portid, - nlmsg_report(nlh)) < 0) { - nf_ct_delete_from_lists(ct); - /* we failed to report the event, try later */ - nf_ct_dying_timeout(ct); - nf_ct_put(ct); - return 0; - } - /* death_by_timeout would report the event again */ - set_bit(IPS_DYING_BIT, &ct->status); - nf_ct_delete_from_lists(ct); - nf_ct_put(ct); - } + if (del_timer(&ct->timeout)) + nf_ct_delete(ct, NETLINK_CB(skb).portid, nlmsg_report(nlh)); + nf_ct_put(ct); return 0; @@ -1451,66 +1436,65 @@ ctnetlink_change_protoinfo(struct nf_conn *ct, const struct nlattr * const cda[] return err; } -#ifdef CONFIG_NF_NAT_NEEDED -static const struct nla_policy nat_seq_policy[CTA_NAT_SEQ_MAX+1] = { - [CTA_NAT_SEQ_CORRECTION_POS] = { .type = NLA_U32 }, - [CTA_NAT_SEQ_OFFSET_BEFORE] = { .type = NLA_U32 }, - [CTA_NAT_SEQ_OFFSET_AFTER] = { .type = NLA_U32 }, +static const struct nla_policy seqadj_policy[CTA_SEQADJ_MAX+1] = { + [CTA_SEQADJ_CORRECTION_POS] = { .type = NLA_U32 }, + [CTA_SEQADJ_OFFSET_BEFORE] = { .type = NLA_U32 }, + [CTA_SEQADJ_OFFSET_AFTER] = { .type = NLA_U32 }, }; static inline int -change_nat_seq_adj(struct nf_nat_seq *natseq, const struct nlattr * const attr) +change_seq_adj(struct nf_ct_seqadj *seq, const struct nlattr * const attr) { int err; - struct nlattr *cda[CTA_NAT_SEQ_MAX+1]; + struct nlattr *cda[CTA_SEQADJ_MAX+1]; - err = nla_parse_nested(cda, CTA_NAT_SEQ_MAX, attr, nat_seq_policy); + err = nla_parse_nested(cda, CTA_SEQADJ_MAX, attr, seqadj_policy); if (err < 0) return err; - if (!cda[CTA_NAT_SEQ_CORRECTION_POS]) + if (!cda[CTA_SEQADJ_CORRECTION_POS]) return -EINVAL; - natseq->correction_pos = - ntohl(nla_get_be32(cda[CTA_NAT_SEQ_CORRECTION_POS])); + seq->correction_pos = + ntohl(nla_get_be32(cda[CTA_SEQADJ_CORRECTION_POS])); - if (!cda[CTA_NAT_SEQ_OFFSET_BEFORE]) + if (!cda[CTA_SEQADJ_OFFSET_BEFORE]) return -EINVAL; - natseq->offset_before = - ntohl(nla_get_be32(cda[CTA_NAT_SEQ_OFFSET_BEFORE])); + seq->offset_before = + ntohl(nla_get_be32(cda[CTA_SEQADJ_OFFSET_BEFORE])); - if (!cda[CTA_NAT_SEQ_OFFSET_AFTER]) + if (!cda[CTA_SEQADJ_OFFSET_AFTER]) return -EINVAL; - natseq->offset_after = - ntohl(nla_get_be32(cda[CTA_NAT_SEQ_OFFSET_AFTER])); + seq->offset_after = + ntohl(nla_get_be32(cda[CTA_SEQADJ_OFFSET_AFTER])); return 0; } static int -ctnetlink_change_nat_seq_adj(struct nf_conn *ct, - const struct nlattr * const cda[]) +ctnetlink_change_seq_adj(struct nf_conn *ct, + const struct nlattr * const cda[]) { + struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); int ret = 0; - struct nf_conn_nat *nat = nfct_nat(ct); - if (!nat) + if (!seqadj) return 0; - if (cda[CTA_NAT_SEQ_ADJ_ORIG]) { - ret = change_nat_seq_adj(&nat->seq[IP_CT_DIR_ORIGINAL], - cda[CTA_NAT_SEQ_ADJ_ORIG]); + if (cda[CTA_SEQ_ADJ_ORIG]) { + ret = change_seq_adj(&seqadj->seq[IP_CT_DIR_ORIGINAL], + cda[CTA_SEQ_ADJ_ORIG]); if (ret < 0) return ret; ct->status |= IPS_SEQ_ADJUST; } - if (cda[CTA_NAT_SEQ_ADJ_REPLY]) { - ret = change_nat_seq_adj(&nat->seq[IP_CT_DIR_REPLY], - cda[CTA_NAT_SEQ_ADJ_REPLY]); + if (cda[CTA_SEQ_ADJ_REPLY]) { + ret = change_seq_adj(&seqadj->seq[IP_CT_DIR_REPLY], + cda[CTA_SEQ_ADJ_REPLY]); if (ret < 0) return ret; @@ -1519,7 +1503,6 @@ ctnetlink_change_nat_seq_adj(struct nf_conn *ct, return 0; } -#endif static int ctnetlink_attach_labels(struct nf_conn *ct, const struct nlattr * const cda[]) @@ -1585,13 +1568,12 @@ ctnetlink_change_conntrack(struct nf_conn *ct, ct->mark = ntohl(nla_get_be32(cda[CTA_MARK])); #endif -#ifdef 
CONFIG_NF_NAT_NEEDED - if (cda[CTA_NAT_SEQ_ADJ_ORIG] || cda[CTA_NAT_SEQ_ADJ_REPLY]) { - err = ctnetlink_change_nat_seq_adj(ct, cda); + if (cda[CTA_SEQ_ADJ_ORIG] || cda[CTA_SEQ_ADJ_REPLY]) { + err = ctnetlink_change_seq_adj(ct, cda); if (err < 0) return err; } -#endif + if (cda[CTA_LABELS]) { err = ctnetlink_attach_labels(ct, cda); if (err < 0) @@ -1696,13 +1678,11 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, goto err2; } -#ifdef CONFIG_NF_NAT_NEEDED - if (cda[CTA_NAT_SEQ_ADJ_ORIG] || cda[CTA_NAT_SEQ_ADJ_REPLY]) { - err = ctnetlink_change_nat_seq_adj(ct, cda); + if (cda[CTA_SEQ_ADJ_ORIG] || cda[CTA_SEQ_ADJ_REPLY]) { + err = ctnetlink_change_seq_adj(ct, cda); if (err < 0) goto err2; } -#endif memset(&ct->proto, 0, sizeof(ct->proto)); if (cda[CTA_PROTOINFO]) { @@ -1816,7 +1796,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, (1 << IPCT_ASSURED) | (1 << IPCT_HELPER) | (1 << IPCT_PROTOINFO) | - (1 << IPCT_NATSEQADJ) | + (1 << IPCT_SEQADJ) | (1 << IPCT_MARK) | events, ct, NETLINK_CB(skb).portid, nlmsg_report(nlh)); @@ -1839,7 +1819,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, (1 << IPCT_HELPER) | (1 << IPCT_LABEL) | (1 << IPCT_PROTOINFO) | - (1 << IPCT_NATSEQADJ) | + (1 << IPCT_SEQADJ) | (1 << IPCT_MARK), ct, NETLINK_CB(skb).portid, nlmsg_report(nlh)); @@ -1999,6 +1979,27 @@ out: return err == -EAGAIN ? -ENOBUFS : err; } +static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = { + [CTA_EXPECT_MASTER] = { .type = NLA_NESTED }, + [CTA_EXPECT_TUPLE] = { .type = NLA_NESTED }, + [CTA_EXPECT_MASK] = { .type = NLA_NESTED }, + [CTA_EXPECT_TIMEOUT] = { .type = NLA_U32 }, + [CTA_EXPECT_ID] = { .type = NLA_U32 }, + [CTA_EXPECT_HELP_NAME] = { .type = NLA_NUL_STRING, + .len = NF_CT_HELPER_NAME_LEN - 1 }, + [CTA_EXPECT_ZONE] = { .type = NLA_U16 }, + [CTA_EXPECT_FLAGS] = { .type = NLA_U32 }, + [CTA_EXPECT_CLASS] = { .type = NLA_U32 }, + [CTA_EXPECT_NAT] = { .type = NLA_NESTED }, + [CTA_EXPECT_FN] = { .type = NLA_NUL_STRING }, +}; + +static struct nf_conntrack_expect * +ctnetlink_alloc_expect(const struct nlattr *const cda[], struct nf_conn *ct, + struct nf_conntrack_helper *helper, + struct nf_conntrack_tuple *tuple, + struct nf_conntrack_tuple *mask); + #ifdef CONFIG_NETFILTER_NETLINK_QUEUE_CT static size_t ctnetlink_nfqueue_build_size(const struct nf_conn *ct) @@ -2073,7 +2074,7 @@ ctnetlink_nfqueue_build(struct sk_buff *skb, struct nf_conn *ct) goto nla_put_failure; if ((ct->status & IPS_SEQ_ADJUST) && - ctnetlink_dump_nat_seq_adj(skb, ct) < 0) + ctnetlink_dump_ct_seq_adj(skb, ct) < 0) goto nla_put_failure; #ifdef CONFIG_NF_CONNTRACK_MARK @@ -2139,10 +2140,70 @@ ctnetlink_nfqueue_parse(const struct nlattr *attr, struct nf_conn *ct) return ret; } +static int ctnetlink_nfqueue_exp_parse(const struct nlattr * const *cda, + const struct nf_conn *ct, + struct nf_conntrack_tuple *tuple, + struct nf_conntrack_tuple *mask) +{ + int err; + + err = ctnetlink_parse_tuple(cda, tuple, CTA_EXPECT_TUPLE, + nf_ct_l3num(ct)); + if (err < 0) + return err; + + return ctnetlink_parse_tuple(cda, mask, CTA_EXPECT_MASK, + nf_ct_l3num(ct)); +} + +static int +ctnetlink_nfqueue_attach_expect(const struct nlattr *attr, struct nf_conn *ct, + u32 portid, u32 report) +{ + struct nlattr *cda[CTA_EXPECT_MAX+1]; + struct nf_conntrack_tuple tuple, mask; + struct nf_conntrack_helper *helper = NULL; + struct nf_conntrack_expect *exp; + int err; + + err = nla_parse_nested(cda, CTA_EXPECT_MAX, attr, exp_nla_policy); + if (err < 0) + return err; + + err = 
ctnetlink_nfqueue_exp_parse((const struct nlattr * const *)cda, + ct, &tuple, &mask); + if (err < 0) + return err; + + if (cda[CTA_EXPECT_HELP_NAME]) { + const char *helpname = nla_data(cda[CTA_EXPECT_HELP_NAME]); + + helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct), + nf_ct_protonum(ct)); + if (helper == NULL) + return -EOPNOTSUPP; + } + + exp = ctnetlink_alloc_expect((const struct nlattr * const *)cda, ct, + helper, &tuple, &mask); + if (IS_ERR(exp)) + return PTR_ERR(exp); + + err = nf_ct_expect_related_report(exp, portid, report); + if (err < 0) { + nf_ct_expect_put(exp); + return err; + } + + return 0; +} + static struct nfq_ct_hook ctnetlink_nfqueue_hook = { .build_size = ctnetlink_nfqueue_build_size, .build = ctnetlink_nfqueue_build, .parse = ctnetlink_nfqueue_parse, + .attach_expect = ctnetlink_nfqueue_attach_expect, + .seq_adjust = nf_ct_tcp_seqadj_set, }; #endif /* CONFIG_NETFILTER_NETLINK_QUEUE_CT */ @@ -2510,21 +2571,6 @@ static int ctnetlink_dump_exp_ct(struct sock *ctnl, struct sk_buff *skb, return err; } -static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = { - [CTA_EXPECT_MASTER] = { .type = NLA_NESTED }, - [CTA_EXPECT_TUPLE] = { .type = NLA_NESTED }, - [CTA_EXPECT_MASK] = { .type = NLA_NESTED }, - [CTA_EXPECT_TIMEOUT] = { .type = NLA_U32 }, - [CTA_EXPECT_ID] = { .type = NLA_U32 }, - [CTA_EXPECT_HELP_NAME] = { .type = NLA_NUL_STRING, - .len = NF_CT_HELPER_NAME_LEN - 1 }, - [CTA_EXPECT_ZONE] = { .type = NLA_U16 }, - [CTA_EXPECT_FLAGS] = { .type = NLA_U32 }, - [CTA_EXPECT_CLASS] = { .type = NLA_U32 }, - [CTA_EXPECT_NAT] = { .type = NLA_NESTED }, - [CTA_EXPECT_FN] = { .type = NLA_NUL_STRING }, -}; - static int ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, @@ -2747,76 +2793,26 @@ ctnetlink_parse_expect_nat(const struct nlattr *attr, #endif } -static int -ctnetlink_create_expect(struct net *net, u16 zone, - const struct nlattr * const cda[], - u_int8_t u3, - u32 portid, int report) +static struct nf_conntrack_expect * +ctnetlink_alloc_expect(const struct nlattr * const cda[], struct nf_conn *ct, + struct nf_conntrack_helper *helper, + struct nf_conntrack_tuple *tuple, + struct nf_conntrack_tuple *mask) { - struct nf_conntrack_tuple tuple, mask, master_tuple; - struct nf_conntrack_tuple_hash *h = NULL; + u_int32_t class = 0; struct nf_conntrack_expect *exp; - struct nf_conn *ct; struct nf_conn_help *help; - struct nf_conntrack_helper *helper = NULL; - u_int32_t class = 0; - int err = 0; - - /* caller guarantees that those three CTA_EXPECT_* exist */ - err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3); - if (err < 0) - return err; - err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK, u3); - if (err < 0) - return err; - err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_EXPECT_MASTER, u3); - if (err < 0) - return err; - - /* Look for master conntrack of this expectation */ - h = nf_conntrack_find_get(net, zone, &master_tuple); - if (!h) - return -ENOENT; - ct = nf_ct_tuplehash_to_ctrack(h); - - /* Look for helper of this expectation */ - if (cda[CTA_EXPECT_HELP_NAME]) { - const char *helpname = nla_data(cda[CTA_EXPECT_HELP_NAME]); - - helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct), - nf_ct_protonum(ct)); - if (helper == NULL) { -#ifdef CONFIG_MODULES - if (request_module("nfct-helper-%s", helpname) < 0) { - err = -EOPNOTSUPP; - goto out; - } - - helper = __nf_conntrack_helper_find(helpname, - nf_ct_l3num(ct), - nf_ct_protonum(ct)); - if (helper) { - err = -EAGAIN; - goto out; - } 
-#endif - err = -EOPNOTSUPP; - goto out; - } - } + int err; if (cda[CTA_EXPECT_CLASS] && helper) { class = ntohl(nla_get_be32(cda[CTA_EXPECT_CLASS])); - if (class > helper->expect_class_max) { - err = -EINVAL; - goto out; - } + if (class > helper->expect_class_max) + return ERR_PTR(-EINVAL); } exp = nf_ct_expect_alloc(ct); - if (!exp) { - err = -ENOMEM; - goto out; - } + if (!exp) + return ERR_PTR(-ENOMEM); + help = nfct_help(ct); if (!help) { if (!cda[CTA_EXPECT_TIMEOUT]) { @@ -2854,21 +2850,89 @@ ctnetlink_create_expect(struct net *net, u16 zone, exp->class = class; exp->master = ct; exp->helper = helper; - memcpy(&exp->tuple, &tuple, sizeof(struct nf_conntrack_tuple)); - memcpy(&exp->mask.src.u3, &mask.src.u3, sizeof(exp->mask.src.u3)); - exp->mask.src.u.all = mask.src.u.all; + exp->tuple = *tuple; + exp->mask.src.u3 = mask->src.u3; + exp->mask.src.u.all = mask->src.u.all; if (cda[CTA_EXPECT_NAT]) { err = ctnetlink_parse_expect_nat(cda[CTA_EXPECT_NAT], - exp, u3); + exp, nf_ct_l3num(ct)); if (err < 0) goto err_out; } - err = nf_ct_expect_related_report(exp, portid, report); + return exp; err_out: nf_ct_expect_put(exp); -out: - nf_ct_put(nf_ct_tuplehash_to_ctrack(h)); + return ERR_PTR(err); +} + +static int +ctnetlink_create_expect(struct net *net, u16 zone, + const struct nlattr * const cda[], + u_int8_t u3, u32 portid, int report) +{ + struct nf_conntrack_tuple tuple, mask, master_tuple; + struct nf_conntrack_tuple_hash *h = NULL; + struct nf_conntrack_helper *helper = NULL; + struct nf_conntrack_expect *exp; + struct nf_conn *ct; + int err; + + /* caller guarantees that those three CTA_EXPECT_* exist */ + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3); + if (err < 0) + return err; + err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK, u3); + if (err < 0) + return err; + err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_EXPECT_MASTER, u3); + if (err < 0) + return err; + + /* Look for master conntrack of this expectation */ + h = nf_conntrack_find_get(net, zone, &master_tuple); + if (!h) + return -ENOENT; + ct = nf_ct_tuplehash_to_ctrack(h); + + if (cda[CTA_EXPECT_HELP_NAME]) { + const char *helpname = nla_data(cda[CTA_EXPECT_HELP_NAME]); + + helper = __nf_conntrack_helper_find(helpname, u3, + nf_ct_protonum(ct)); + if (helper == NULL) { +#ifdef CONFIG_MODULES + if (request_module("nfct-helper-%s", helpname) < 0) { + err = -EOPNOTSUPP; + goto err_ct; + } + helper = __nf_conntrack_helper_find(helpname, u3, + nf_ct_protonum(ct)); + if (helper) { + err = -EAGAIN; + goto err_ct; + } +#endif + err = -EOPNOTSUPP; + goto err_ct; + } + } + + exp = ctnetlink_alloc_expect(cda, ct, helper, &tuple, &mask); + if (IS_ERR(exp)) { + err = PTR_ERR(exp); + goto err_ct; + } + + err = nf_ct_expect_related_report(exp, portid, report); + if (err < 0) + goto err_exp; + + return 0; +err_exp: + nf_ct_expect_put(exp); +err_ct: + nf_ct_put(ct); return err; } diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 0ab9636..ce30041 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -281,7 +281,7 @@ void nf_ct_l3proto_pernet_unregister(struct net *net, nf_ct_l3proto_unregister_sysctl(net, proto); /* Remove all contrack entries for this protocol */ - nf_ct_iterate_cleanup(net, kill_l3proto, proto); + nf_ct_iterate_cleanup(net, kill_l3proto, proto, 0, 0); } EXPORT_SYMBOL_GPL(nf_ct_l3proto_pernet_unregister); @@ -476,7 +476,7 @@ void nf_ct_l4proto_pernet_unregister(struct net *net, nf_ct_l4proto_unregister_sysctl(net, pn, 
l4proto); /* Remove all contrack entries for this protocol */ - nf_ct_iterate_cleanup(net, kill_l4proto, l4proto); + nf_ct_iterate_cleanup(net, kill_l4proto, l4proto, 0, 0); } EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_unregister); diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 2f80107..44d1ea3 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -27,6 +27,8 @@ #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_ecache.h> +#include <net/netfilter/nf_conntrack_seqadj.h> +#include <net/netfilter/nf_conntrack_synproxy.h> #include <net/netfilter/nf_log.h> #include <net/netfilter/ipv4/nf_conntrack_ipv4.h> #include <net/netfilter/ipv6/nf_conntrack_ipv6.h> @@ -495,21 +497,6 @@ static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff, } } -#ifdef CONFIG_NF_NAT_NEEDED -static inline s16 nat_offset(const struct nf_conn *ct, - enum ip_conntrack_dir dir, - u32 seq) -{ - typeof(nf_ct_nat_offset) get_offset = rcu_dereference(nf_ct_nat_offset); - - return get_offset != NULL ? get_offset(ct, dir, seq) : 0; -} -#define NAT_OFFSET(ct, dir, seq) \ - (nat_offset(ct, dir, seq)) -#else -#define NAT_OFFSET(ct, dir, seq) 0 -#endif - static bool tcp_in_window(const struct nf_conn *ct, struct ip_ct_tcp *state, enum ip_conntrack_dir dir, @@ -525,7 +512,7 @@ static bool tcp_in_window(const struct nf_conn *ct, struct ip_ct_tcp_state *receiver = &state->seen[!dir]; const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple; __u32 seq, ack, sack, end, win, swin; - s16 receiver_offset; + s32 receiver_offset; bool res, in_recv_win; /* @@ -540,7 +527,7 @@ static bool tcp_in_window(const struct nf_conn *ct, tcp_sack(skb, dataoff, tcph, &sack); /* Take into account NAT sequence number mangling */ - receiver_offset = NAT_OFFSET(ct, !dir, ack - 1); + receiver_offset = nf_ct_seq_offset(ct, !dir, ack - 1); ack -= receiver_offset; sack -= receiver_offset; @@ -960,6 +947,21 @@ static int tcp_packet(struct nf_conn *ct, "state %s ", tcp_conntrack_names[old_state]); return NF_ACCEPT; case TCP_CONNTRACK_MAX: + /* Special case for SYN proxy: when the SYN to the server or + * the SYN/ACK from the server is lost, the client may transmit + * a keep-alive packet while in SYN_SENT state. This needs to + * be associated with the original conntrack entry in order to + * generate a new SYN with the correct sequence number. 
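+ * The condition below matches exactly that case: an ACK in the
+ * original direction while still in SYN_SENT, with a sequence number
+ * of td_end - 1 (the classic keep-alive signature), is accepted rather
+ * than flagged invalid so the SYNPROXY target can resynthesize the SYN.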
+ */ + if (nfct_synproxy(ct) && old_state == TCP_CONNTRACK_SYN_SENT && + index == TCP_ACK_SET && dir == IP_CT_DIR_ORIGINAL && + ct->proto.tcp.last_dir == IP_CT_DIR_ORIGINAL && + ct->proto.tcp.seen[dir].td_end - 1 == ntohl(th->seq)) { + pr_debug("nf_ct_tcp: SYN proxy client keep alive\n"); + spin_unlock_bh(&ct->lock); + return NF_ACCEPT; + } + /* Invalid packet */ pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n", dir, get_conntrack_index(th), old_state); diff --git a/net/netfilter/nf_conntrack_seqadj.c b/net/netfilter/nf_conntrack_seqadj.c new file mode 100644 index 0000000..5f9bfd0 --- /dev/null +++ b/net/netfilter/nf_conntrack_seqadj.c @@ -0,0 +1,238 @@ +#include <linux/types.h> +#include <linux/netfilter.h> +#include <net/tcp.h> + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_extend.h> +#include <net/netfilter/nf_conntrack_seqadj.h> + +int nf_ct_seqadj_init(struct nf_conn *ct, enum ip_conntrack_info ctinfo, + s32 off) +{ + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + struct nf_conn_seqadj *seqadj; + struct nf_ct_seqadj *this_way; + + if (off == 0) + return 0; + + set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); + + seqadj = nfct_seqadj(ct); + this_way = &seqadj->seq[dir]; + this_way->offset_before = off; + this_way->offset_after = off; + return 0; +} +EXPORT_SYMBOL_GPL(nf_ct_seqadj_init); + +int nf_ct_seqadj_set(struct nf_conn *ct, enum ip_conntrack_info ctinfo, + __be32 seq, s32 off) +{ + struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + struct nf_ct_seqadj *this_way; + + if (off == 0) + return 0; + + set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); + + spin_lock_bh(&ct->lock); + this_way = &seqadj->seq[dir]; + if (this_way->offset_before == this_way->offset_after || + before(this_way->correction_pos, seq)) { + this_way->correction_pos = seq; + this_way->offset_before = this_way->offset_after; + this_way->offset_after += off; + } + spin_unlock_bh(&ct->lock); + return 0; +} +EXPORT_SYMBOL_GPL(nf_ct_seqadj_set); + +void nf_ct_tcp_seqadj_set(struct sk_buff *skb, + struct nf_conn *ct, enum ip_conntrack_info ctinfo, + s32 off) +{ + const struct tcphdr *th; + + if (nf_ct_protonum(ct) != IPPROTO_TCP) + return; + + th = (struct tcphdr *)(skb_network_header(skb) + ip_hdrlen(skb)); + nf_ct_seqadj_set(ct, ctinfo, th->seq, off); +} +EXPORT_SYMBOL_GPL(nf_ct_tcp_seqadj_set); + +/* Adjust one found SACK option including checksum correction */ +static void nf_ct_sack_block_adjust(struct sk_buff *skb, + struct tcphdr *tcph, + unsigned int sackoff, + unsigned int sackend, + struct nf_ct_seqadj *seq) +{ + while (sackoff < sackend) { + struct tcp_sack_block_wire *sack; + __be32 new_start_seq, new_end_seq; + + sack = (void *)skb->data + sackoff; + if (after(ntohl(sack->start_seq) - seq->offset_before, + seq->correction_pos)) + new_start_seq = htonl(ntohl(sack->start_seq) - + seq->offset_after); + else + new_start_seq = htonl(ntohl(sack->start_seq) - + seq->offset_before); + + if (after(ntohl(sack->end_seq) - seq->offset_before, + seq->correction_pos)) + new_end_seq = htonl(ntohl(sack->end_seq) - + seq->offset_after); + else + new_end_seq = htonl(ntohl(sack->end_seq) - + seq->offset_before); + + pr_debug("sack_adjust: start_seq: %d->%d, end_seq: %d->%d\n", + ntohl(sack->start_seq), new_start_seq, + ntohl(sack->end_seq), new_end_seq); + + inet_proto_csum_replace4(&tcph->check, skb, + sack->start_seq, new_start_seq, 0); + inet_proto_csum_replace4(&tcph->check, skb, + sack->end_seq, new_end_seq, 0); + sack->start_seq = 
new_start_seq; + sack->end_seq = new_end_seq; + sackoff += sizeof(*sack); + } +} + +/* TCP SACK sequence number adjustment */ +static unsigned int nf_ct_sack_adjust(struct sk_buff *skb, + unsigned int protoff, + struct tcphdr *tcph, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo) +{ + unsigned int dir, optoff, optend; + struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); + + optoff = protoff + sizeof(struct tcphdr); + optend = protoff + tcph->doff * 4; + + if (!skb_make_writable(skb, optend)) + return 0; + + dir = CTINFO2DIR(ctinfo); + + while (optoff < optend) { + /* Usually: option, length. */ + unsigned char *op = skb->data + optoff; + + switch (op[0]) { + case TCPOPT_EOL: + return 1; + case TCPOPT_NOP: + optoff++; + continue; + default: + /* no partial options */ + if (optoff + 1 == optend || + optoff + op[1] > optend || + op[1] < 2) + return 0; + if (op[0] == TCPOPT_SACK && + op[1] >= 2+TCPOLEN_SACK_PERBLOCK && + ((op[1] - 2) % TCPOLEN_SACK_PERBLOCK) == 0) + nf_ct_sack_block_adjust(skb, tcph, optoff + 2, + optoff+op[1], + &seqadj->seq[!dir]); + optoff += op[1]; + } + } + return 1; +} + +/* TCP sequence number adjustment. Returns 1 on success, 0 on failure */ +int nf_ct_seq_adjust(struct sk_buff *skb, + struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff) +{ + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + struct tcphdr *tcph; + __be32 newseq, newack; + s32 seqoff, ackoff; + struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); + struct nf_ct_seqadj *this_way, *other_way; + int res; + + this_way = &seqadj->seq[dir]; + other_way = &seqadj->seq[!dir]; + + if (!skb_make_writable(skb, protoff + sizeof(*tcph))) + return 0; + + tcph = (void *)skb->data + protoff; + spin_lock_bh(&ct->lock); + if (after(ntohl(tcph->seq), this_way->correction_pos)) + seqoff = this_way->offset_after; + else + seqoff = this_way->offset_before; + + if (after(ntohl(tcph->ack_seq) - other_way->offset_before, + other_way->correction_pos)) + ackoff = other_way->offset_after; + else + ackoff = other_way->offset_before; + + newseq = htonl(ntohl(tcph->seq) + seqoff); + newack = htonl(ntohl(tcph->ack_seq) - ackoff); + + inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, 0); + inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack, 0); + + pr_debug("Adjusting sequence number from %u->%u, ack from %u->%u\n", + ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq), + ntohl(newack)); + + tcph->seq = newseq; + tcph->ack_seq = newack; + + res = nf_ct_sack_adjust(skb, protoff, tcph, ct, ctinfo); + spin_unlock_bh(&ct->lock); + + return res; +} +EXPORT_SYMBOL_GPL(nf_ct_seq_adjust); + +s32 nf_ct_seq_offset(const struct nf_conn *ct, + enum ip_conntrack_dir dir, + u32 seq) +{ + struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); + struct nf_ct_seqadj *this_way; + + if (!seqadj) + return 0; + + this_way = &seqadj->seq[dir]; + return after(seq, this_way->correction_pos) ? 
+ this_way->offset_after : this_way->offset_before; +} +EXPORT_SYMBOL_GPL(nf_ct_seq_offset); + +static struct nf_ct_ext_type nf_ct_seqadj_extend __read_mostly = { + .len = sizeof(struct nf_conn_seqadj), + .align = __alignof__(struct nf_conn_seqadj), + .id = NF_CT_EXT_SEQADJ, +}; + +int nf_conntrack_seqadj_init(void) +{ + return nf_ct_extend_register(&nf_ct_seqadj_extend); +} + +void nf_conntrack_seqadj_fini(void) +{ + nf_ct_extend_unregister(&nf_ct_seqadj_extend); +} diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 038eee5..6f0f4f7 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -25,6 +25,7 @@ #include <net/netfilter/nf_nat_core.h> #include <net/netfilter/nf_nat_helper.h> #include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/nf_conntrack_l3proto.h> #include <net/netfilter/nf_conntrack_zones.h> #include <linux/netfilter/nf_nat.h> @@ -402,6 +403,9 @@ nf_nat_setup_info(struct nf_conn *ct, ct->status |= IPS_SRC_NAT; else ct->status |= IPS_DST_NAT; + + if (nfct_help(ct)) + nfct_seqadj_ext_add(ct); } if (maniptype == NF_NAT_MANIP_SRC) { @@ -497,7 +501,7 @@ static void nf_nat_l4proto_clean(u8 l3proto, u8 l4proto) rtnl_lock(); for_each_net(net) - nf_ct_iterate_cleanup(net, nf_nat_proto_remove, &clean); + nf_ct_iterate_cleanup(net, nf_nat_proto_remove, &clean, 0, 0); rtnl_unlock(); } @@ -511,7 +515,7 @@ static void nf_nat_l3proto_clean(u8 l3proto) rtnl_lock(); for_each_net(net) - nf_ct_iterate_cleanup(net, nf_nat_proto_remove, &clean); + nf_ct_iterate_cleanup(net, nf_nat_proto_remove, &clean, 0, 0); rtnl_unlock(); } @@ -749,7 +753,7 @@ static void __net_exit nf_nat_net_exit(struct net *net) { struct nf_nat_proto_clean clean = {}; - nf_ct_iterate_cleanup(net, &nf_nat_proto_remove, &clean); + nf_ct_iterate_cleanup(net, &nf_nat_proto_remove, &clean, 0, 0); synchronize_rcu(); nf_ct_free_hashtable(net->ct.nat_bysource, net->ct.nat_htable_size); } @@ -764,10 +768,6 @@ static struct nf_ct_helper_expectfn follow_master_nat = { .expectfn = nf_nat_follow_master, }; -static struct nfq_ct_nat_hook nfq_ct_nat = { - .seq_adjust = nf_nat_tcp_seq_adjust, -}; - static int __init nf_nat_init(void) { int ret; @@ -787,14 +787,9 @@ static int __init nf_nat_init(void) /* Initialize fake conntrack so that NAT will skip it */ nf_ct_untracked_status_or(IPS_NAT_DONE_MASK); - BUG_ON(nf_nat_seq_adjust_hook != NULL); - RCU_INIT_POINTER(nf_nat_seq_adjust_hook, nf_nat_seq_adjust); BUG_ON(nfnetlink_parse_nat_setup_hook != NULL); RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook, nfnetlink_parse_nat_setup); - BUG_ON(nf_ct_nat_offset != NULL); - RCU_INIT_POINTER(nf_ct_nat_offset, nf_nat_get_offset); - RCU_INIT_POINTER(nfq_ct_nat_hook, &nfq_ct_nat); #ifdef CONFIG_XFRM BUG_ON(nf_nat_decode_session_hook != NULL); RCU_INIT_POINTER(nf_nat_decode_session_hook, __nf_nat_decode_session); @@ -813,10 +808,7 @@ static void __exit nf_nat_cleanup(void) unregister_pernet_subsys(&nf_nat_net_ops); nf_ct_extend_unregister(&nat_extend); nf_ct_helper_expectfn_unregister(&follow_master_nat); - RCU_INIT_POINTER(nf_nat_seq_adjust_hook, NULL); RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook, NULL); - RCU_INIT_POINTER(nf_ct_nat_offset, NULL); - RCU_INIT_POINTER(nfq_ct_nat_hook, NULL); #ifdef CONFIG_XFRM RCU_INIT_POINTER(nf_nat_decode_session_hook, NULL); #endif diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c index 85e20a9..2840abb 100644 --- a/net/netfilter/nf_nat_helper.c +++ b/net/netfilter/nf_nat_helper.c 
@@ -20,74 +20,13 @@ #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_expect.h> +#include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_nat_l3proto.h> #include <net/netfilter/nf_nat_l4proto.h> #include <net/netfilter/nf_nat_core.h> #include <net/netfilter/nf_nat_helper.h> -#define DUMP_OFFSET(x) \ - pr_debug("offset_before=%d, offset_after=%d, correction_pos=%u\n", \ - x->offset_before, x->offset_after, x->correction_pos); - -static DEFINE_SPINLOCK(nf_nat_seqofs_lock); - -/* Setup TCP sequence correction given this change at this sequence */ -static inline void -adjust_tcp_sequence(u32 seq, - int sizediff, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo) -{ - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - struct nf_conn_nat *nat = nfct_nat(ct); - struct nf_nat_seq *this_way = &nat->seq[dir]; - - pr_debug("adjust_tcp_sequence: seq = %u, sizediff = %d\n", - seq, sizediff); - - pr_debug("adjust_tcp_sequence: Seq_offset before: "); - DUMP_OFFSET(this_way); - - spin_lock_bh(&nf_nat_seqofs_lock); - - /* SYN adjust. If it's uninitialized, or this is after last - * correction, record it: we don't handle more than one - * adjustment in the window, but do deal with common case of a - * retransmit */ - if (this_way->offset_before == this_way->offset_after || - before(this_way->correction_pos, seq)) { - this_way->correction_pos = seq; - this_way->offset_before = this_way->offset_after; - this_way->offset_after += sizediff; - } - spin_unlock_bh(&nf_nat_seqofs_lock); - - pr_debug("adjust_tcp_sequence: Seq_offset after: "); - DUMP_OFFSET(this_way); -} - -/* Get the offset value, for conntrack */ -s16 nf_nat_get_offset(const struct nf_conn *ct, - enum ip_conntrack_dir dir, - u32 seq) -{ - struct nf_conn_nat *nat = nfct_nat(ct); - struct nf_nat_seq *this_way; - s16 offset; - - if (!nat) - return 0; - - this_way = &nat->seq[dir]; - spin_lock_bh(&nf_nat_seqofs_lock); - offset = after(seq, this_way->correction_pos) - ? this_way->offset_after : this_way->offset_before; - spin_unlock_bh(&nf_nat_seqofs_lock); - - return offset; -} - /* Frobs data inside this packet, which is linear. */ static void mangle_contents(struct sk_buff *skb, unsigned int dataoff, @@ -142,30 +81,6 @@ static int enlarge_skb(struct sk_buff *skb, unsigned int extra) return 1; } -void nf_nat_set_seq_adjust(struct nf_conn *ct, enum ip_conntrack_info ctinfo, - __be32 seq, s16 off) -{ - if (!off) - return; - set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); - adjust_tcp_sequence(ntohl(seq), off, ct, ctinfo); - nf_conntrack_event_cache(IPCT_NATSEQADJ, ct); -} -EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust); - -void nf_nat_tcp_seq_adjust(struct sk_buff *skb, struct nf_conn *ct, - u32 ctinfo, int off) -{ - const struct tcphdr *th; - - if (nf_ct_protonum(ct) != IPPROTO_TCP) - return; - - th = (struct tcphdr *)(skb_network_header(skb)+ ip_hdrlen(skb)); - nf_nat_set_seq_adjust(ct, ctinfo, th->seq, off); -} -EXPORT_SYMBOL_GPL(nf_nat_tcp_seq_adjust); - /* Generic function for mangling variable-length address changes inside * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX * command in FTP). 
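The per-direction state behind both the removed code here and its replacement in nf_conntrack_seqadj.c is just three fields: a split point (correction_pos), the delta valid up to it (offset_before), and the delta valid past it (offset_after). A minimal sketch of the read side, with a hypothetical helper name but the patch's field names (after() is the kernel's wrap-safe 32-bit sequence comparison):

	/* Sketch: pick the delta to apply to one TCP sequence number. */
	static s32 seqadj_offset(const struct nf_ct_seqadj *s, u32 seq)
	{
		return after(seq, s->correction_pos) ?
		       s->offset_after : s->offset_before;
	}

This mirrors nf_ct_seq_offset() above; only one adjustment per window is tracked, and nf_ct_seqadj_set() rolls offset_after into offset_before when a new mangling lands past the previous split point, which also covers the common retransmission case.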
@@ -210,8 +125,8 @@ int __nf_nat_mangle_tcp_packet(struct sk_buff *skb, datalen, oldlen); if (adjust && rep_len != match_len) - nf_nat_set_seq_adjust(ct, ctinfo, tcph->seq, - (int)rep_len - (int)match_len); + nf_ct_seqadj_set(ct, ctinfo, tcph->seq, + (int)rep_len - (int)match_len); return 1; } @@ -271,145 +186,6 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb, } EXPORT_SYMBOL(nf_nat_mangle_udp_packet); -/* Adjust one found SACK option including checksum correction */ -static void -sack_adjust(struct sk_buff *skb, - struct tcphdr *tcph, - unsigned int sackoff, - unsigned int sackend, - struct nf_nat_seq *natseq) -{ - while (sackoff < sackend) { - struct tcp_sack_block_wire *sack; - __be32 new_start_seq, new_end_seq; - - sack = (void *)skb->data + sackoff; - if (after(ntohl(sack->start_seq) - natseq->offset_before, - natseq->correction_pos)) - new_start_seq = htonl(ntohl(sack->start_seq) - - natseq->offset_after); - else - new_start_seq = htonl(ntohl(sack->start_seq) - - natseq->offset_before); - - if (after(ntohl(sack->end_seq) - natseq->offset_before, - natseq->correction_pos)) - new_end_seq = htonl(ntohl(sack->end_seq) - - natseq->offset_after); - else - new_end_seq = htonl(ntohl(sack->end_seq) - - natseq->offset_before); - - pr_debug("sack_adjust: start_seq: %d->%d, end_seq: %d->%d\n", - ntohl(sack->start_seq), new_start_seq, - ntohl(sack->end_seq), new_end_seq); - - inet_proto_csum_replace4(&tcph->check, skb, - sack->start_seq, new_start_seq, 0); - inet_proto_csum_replace4(&tcph->check, skb, - sack->end_seq, new_end_seq, 0); - sack->start_seq = new_start_seq; - sack->end_seq = new_end_seq; - sackoff += sizeof(*sack); - } -} - -/* TCP SACK sequence number adjustment */ -static inline unsigned int -nf_nat_sack_adjust(struct sk_buff *skb, - unsigned int protoff, - struct tcphdr *tcph, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo) -{ - unsigned int dir, optoff, optend; - struct nf_conn_nat *nat = nfct_nat(ct); - - optoff = protoff + sizeof(struct tcphdr); - optend = protoff + tcph->doff * 4; - - if (!skb_make_writable(skb, optend)) - return 0; - - dir = CTINFO2DIR(ctinfo); - - while (optoff < optend) { - /* Usually: option, length. */ - unsigned char *op = skb->data + optoff; - - switch (op[0]) { - case TCPOPT_EOL: - return 1; - case TCPOPT_NOP: - optoff++; - continue; - default: - /* no partial options */ - if (optoff + 1 == optend || - optoff + op[1] > optend || - op[1] < 2) - return 0; - if (op[0] == TCPOPT_SACK && - op[1] >= 2+TCPOLEN_SACK_PERBLOCK && - ((op[1] - 2) % TCPOLEN_SACK_PERBLOCK) == 0) - sack_adjust(skb, tcph, optoff+2, - optoff+op[1], &nat->seq[!dir]); - optoff += op[1]; - } - } - return 1; -} - -/* TCP sequence number adjustment. 
Returns 1 on success, 0 on failure */ -int -nf_nat_seq_adjust(struct sk_buff *skb, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - unsigned int protoff) -{ - struct tcphdr *tcph; - int dir; - __be32 newseq, newack; - s16 seqoff, ackoff; - struct nf_conn_nat *nat = nfct_nat(ct); - struct nf_nat_seq *this_way, *other_way; - - dir = CTINFO2DIR(ctinfo); - - this_way = &nat->seq[dir]; - other_way = &nat->seq[!dir]; - - if (!skb_make_writable(skb, protoff + sizeof(*tcph))) - return 0; - - tcph = (void *)skb->data + protoff; - if (after(ntohl(tcph->seq), this_way->correction_pos)) - seqoff = this_way->offset_after; - else - seqoff = this_way->offset_before; - - if (after(ntohl(tcph->ack_seq) - other_way->offset_before, - other_way->correction_pos)) - ackoff = other_way->offset_after; - else - ackoff = other_way->offset_before; - - newseq = htonl(ntohl(tcph->seq) + seqoff); - newack = htonl(ntohl(tcph->ack_seq) - ackoff); - - inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, 0); - inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack, 0); - - pr_debug("Adjusting sequence number from %u->%u, ack from %u->%u\n", - ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq), - ntohl(newack)); - - tcph->seq = newseq; - tcph->ack_seq = newack; - - return nf_nat_sack_adjust(skb, protoff, tcph, ct, ctinfo); -} - /* Setup NAT on this expected conntrack so it follows master. */ /* If we fail to get a free NAT slot, we'll get dropped on confirm */ void nf_nat_follow_master(struct nf_conn *ct, diff --git a/net/netfilter/nf_nat_sip.c b/net/netfilter/nf_nat_sip.c index dac11f7..f979040 100644 --- a/net/netfilter/nf_nat_sip.c +++ b/net/netfilter/nf_nat_sip.c @@ -20,6 +20,7 @@ #include <net/netfilter/nf_nat_helper.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_expect.h> +#include <net/netfilter/nf_conntrack_seqadj.h> #include <linux/netfilter/nf_conntrack_sip.h> MODULE_LICENSE("GPL"); @@ -308,7 +309,7 @@ static void nf_nat_sip_seq_adjust(struct sk_buff *skb, unsigned int protoff, return; th = (struct tcphdr *)(skb->data + protoff); - nf_nat_set_seq_adjust(ct, ctinfo, th->seq, off); + nf_ct_seqadj_set(ct, ctinfo, th->seq, off); } /* Handles expected signalling connections and media streams */ diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c new file mode 100644 index 0000000..d23dc79 --- /dev/null +++ b/net/netfilter/nf_synproxy_core.c @@ -0,0 +1,432 @@ +/* + * Copyright (c) 2013 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <asm/unaligned.h> +#include <net/tcp.h> +#include <net/netns/generic.h> + +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_tcpudp.h> +#include <linux/netfilter/xt_SYNPROXY.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_extend.h> +#include <net/netfilter/nf_conntrack_seqadj.h> +#include <net/netfilter/nf_conntrack_synproxy.h> + +int synproxy_net_id; +EXPORT_SYMBOL_GPL(synproxy_net_id); + +void +synproxy_parse_options(const struct sk_buff *skb, unsigned int doff, + const struct tcphdr *th, struct synproxy_options *opts) +{ + int length = (th->doff * 4) - sizeof(*th); + u8 buf[40], *ptr; + + ptr = skb_header_pointer(skb, doff + sizeof(*th), length, buf); + BUG_ON(ptr == NULL); + + opts->options = 0; + while (length > 0) { + int opcode = *ptr++; + int opsize; + + switch (opcode) { + case TCPOPT_EOL: + return; + case TCPOPT_NOP: + length--; + continue; + default: + opsize = *ptr++; + if (opsize < 2) + return; + if (opsize > length) + return; + + switch (opcode) { + case TCPOPT_MSS: + if (opsize == TCPOLEN_MSS) { + opts->mss = get_unaligned_be16(ptr); + opts->options |= XT_SYNPROXY_OPT_MSS; + } + break; + case TCPOPT_WINDOW: + if (opsize == TCPOLEN_WINDOW) { + opts->wscale = *ptr; + if (opts->wscale > 14) + opts->wscale = 14; + opts->options |= XT_SYNPROXY_OPT_WSCALE; + } + break; + case TCPOPT_TIMESTAMP: + if (opsize == TCPOLEN_TIMESTAMP) { + opts->tsval = get_unaligned_be32(ptr); + opts->tsecr = get_unaligned_be32(ptr + 4); + opts->options |= XT_SYNPROXY_OPT_TIMESTAMP; + } + break; + case TCPOPT_SACK_PERM: + if (opsize == TCPOLEN_SACK_PERM) + opts->options |= XT_SYNPROXY_OPT_SACK_PERM; + break; + } + + ptr += opsize - 2; + length -= opsize; + } + } +} +EXPORT_SYMBOL_GPL(synproxy_parse_options); + +unsigned int synproxy_options_size(const struct synproxy_options *opts) +{ + unsigned int size = 0; + + if (opts->options & XT_SYNPROXY_OPT_MSS) + size += TCPOLEN_MSS_ALIGNED; + if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP) + size += TCPOLEN_TSTAMP_ALIGNED; + else if (opts->options & XT_SYNPROXY_OPT_SACK_PERM) + size += TCPOLEN_SACKPERM_ALIGNED; + if (opts->options & XT_SYNPROXY_OPT_WSCALE) + size += TCPOLEN_WSCALE_ALIGNED; + + return size; +} +EXPORT_SYMBOL_GPL(synproxy_options_size); + +void +synproxy_build_options(struct tcphdr *th, const struct synproxy_options *opts) +{ + __be32 *ptr = (__be32 *)(th + 1); + u8 options = opts->options; + + if (options & XT_SYNPROXY_OPT_MSS) + *ptr++ = htonl((TCPOPT_MSS << 24) | + (TCPOLEN_MSS << 16) | + opts->mss); + + if (options & XT_SYNPROXY_OPT_TIMESTAMP) { + if (options & XT_SYNPROXY_OPT_SACK_PERM) + *ptr++ = htonl((TCPOPT_SACK_PERM << 24) | + (TCPOLEN_SACK_PERM << 16) | + (TCPOPT_TIMESTAMP << 8) | + TCPOLEN_TIMESTAMP); + else + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_TIMESTAMP << 8) | + TCPOLEN_TIMESTAMP); + + *ptr++ = htonl(opts->tsval); + *ptr++ = htonl(opts->tsecr); + } else if (options & XT_SYNPROXY_OPT_SACK_PERM) + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_SACK_PERM << 8) | + TCPOLEN_SACK_PERM); + + if (options & XT_SYNPROXY_OPT_WSCALE) + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_WINDOW << 16) | + (TCPOLEN_WINDOW << 8) | + opts->wscale); +} +EXPORT_SYMBOL_GPL(synproxy_build_options); + +void synproxy_init_timestamp_cookie(const struct xt_synproxy_info *info, + struct synproxy_options *opts) +{ + opts->tsecr = opts->tsval; + 
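/* NB: annotation, not part of the original patch -- the low six bits
+	 * of tsval form the cookie: bits 0-3 carry the peer's window scale
+	 * (0xf when absent), bit 4 SACK-permitted, bit 5 ECN.  Masking with
+	 * ~0x3f reserves those bits while the timestamp stays monotonic;
+	 * synproxy_check_timestamp_cookie() decodes the same layout from
+	 * the echoed tsecr. */
	+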
opts->tsval = tcp_time_stamp & ~0x3f; + + if (opts->options & XT_SYNPROXY_OPT_WSCALE) + opts->tsval |= info->wscale; + else + opts->tsval |= 0xf; + + if (opts->options & XT_SYNPROXY_OPT_SACK_PERM) + opts->tsval |= 1 << 4; + + if (opts->options & XT_SYNPROXY_OPT_ECN) + opts->tsval |= 1 << 5; +} +EXPORT_SYMBOL_GPL(synproxy_init_timestamp_cookie); + +void synproxy_check_timestamp_cookie(struct synproxy_options *opts) +{ + opts->wscale = opts->tsecr & 0xf; + if (opts->wscale != 0xf) + opts->options |= XT_SYNPROXY_OPT_WSCALE; + + opts->options |= opts->tsecr & (1 << 4) ? XT_SYNPROXY_OPT_SACK_PERM : 0; + + opts->options |= opts->tsecr & (1 << 5) ? XT_SYNPROXY_OPT_ECN : 0; +} +EXPORT_SYMBOL_GPL(synproxy_check_timestamp_cookie); + +unsigned int synproxy_tstamp_adjust(struct sk_buff *skb, + unsigned int protoff, + struct tcphdr *th, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + const struct nf_conn_synproxy *synproxy) +{ + unsigned int optoff, optend; + u32 *ptr, old; + + if (synproxy->tsoff == 0) + return 1; + + optoff = protoff + sizeof(struct tcphdr); + optend = protoff + th->doff * 4; + + if (!skb_make_writable(skb, optend)) + return 0; + + while (optoff < optend) { + unsigned char *op = skb->data + optoff; + + switch (op[0]) { + case TCPOPT_EOL: + return 1; + case TCPOPT_NOP: + optoff++; + continue; + default: + if (optoff + 1 == optend || + optoff + op[1] > optend || + op[1] < 2) + return 0; + if (op[0] == TCPOPT_TIMESTAMP && + op[1] == TCPOLEN_TIMESTAMP) { + if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) { + ptr = (u32 *)&op[2]; + old = *ptr; + *ptr = htonl(ntohl(*ptr) - + synproxy->tsoff); + } else { + ptr = (u32 *)&op[6]; + old = *ptr; + *ptr = htonl(ntohl(*ptr) + + synproxy->tsoff); + } + inet_proto_csum_replace4(&th->check, skb, + old, *ptr, 0); + return 1; + } + optoff += op[1]; + } + } + return 1; +} +EXPORT_SYMBOL_GPL(synproxy_tstamp_adjust); + +static struct nf_ct_ext_type nf_ct_synproxy_extend __read_mostly = { + .len = sizeof(struct nf_conn_synproxy), + .align = __alignof__(struct nf_conn_synproxy), + .id = NF_CT_EXT_SYNPROXY, +}; + +#ifdef CONFIG_PROC_FS +static void *synproxy_cpu_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct synproxy_net *snet = synproxy_pernet(seq_file_net(seq)); + int cpu; + + if (*pos == 0) + return SEQ_START_TOKEN; + + for (cpu = *pos - 1; cpu < nr_cpu_ids; cpu++) { + if (!cpu_possible(cpu)) + continue; + *pos = cpu + 1; + return per_cpu_ptr(snet->stats, cpu); + } + + return NULL; +} + +static void *synproxy_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct synproxy_net *snet = synproxy_pernet(seq_file_net(seq)); + int cpu; + + for (cpu = *pos; cpu < nr_cpu_ids; cpu++) { + if (!cpu_possible(cpu)) + continue; + *pos = cpu + 1; + return per_cpu_ptr(snet->stats, cpu); + } + + return NULL; +} + +static void synproxy_cpu_seq_stop(struct seq_file *seq, void *v) +{ + return; +} + +static int synproxy_cpu_seq_show(struct seq_file *seq, void *v) +{ + struct synproxy_stats *stats = v; + + if (v == SEQ_START_TOKEN) { + seq_printf(seq, "entries\t\tsyn_received\t" + "cookie_invalid\tcookie_valid\t" + "cookie_retrans\tconn_reopened\n"); + return 0; + } + + seq_printf(seq, "%08x\t%08x\t%08x\t%08x\t%08x\t%08x\n", 0, + stats->syn_received, + stats->cookie_invalid, + stats->cookie_valid, + stats->cookie_retrans, + stats->conn_reopened); + + return 0; +} + +static const struct seq_operations synproxy_cpu_seq_ops = { + .start = synproxy_cpu_seq_start, + .next = synproxy_cpu_seq_next, + .stop = synproxy_cpu_seq_stop, + .show = 
synproxy_cpu_seq_show, +}; + +static int synproxy_cpu_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &synproxy_cpu_seq_ops, + sizeof(struct seq_net_private)); +} + +static const struct file_operations synproxy_cpu_seq_fops = { + .owner = THIS_MODULE, + .open = synproxy_cpu_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +static int __net_init synproxy_proc_init(struct net *net) +{ + if (!proc_create("synproxy", S_IRUGO, net->proc_net_stat, + &synproxy_cpu_seq_fops)) + return -ENOMEM; + return 0; +} + +static void __net_exit synproxy_proc_exit(struct net *net) +{ + remove_proc_entry("synproxy", net->proc_net_stat); +} +#else +static int __net_init synproxy_proc_init(struct net *net) +{ + return 0; +} + +static void __net_exit synproxy_proc_exit(struct net *net) +{ + return; +} +#endif /* CONFIG_PROC_FS */ + +static int __net_init synproxy_net_init(struct net *net) +{ + struct synproxy_net *snet = synproxy_pernet(net); + struct nf_conntrack_tuple t; + struct nf_conn *ct; + int err = -ENOMEM; + + memset(&t, 0, sizeof(t)); + ct = nf_conntrack_alloc(net, 0, &t, &t, GFP_KERNEL); + if (IS_ERR(ct)) { + err = PTR_ERR(ct); + goto err1; + } + + __set_bit(IPS_TEMPLATE_BIT, &ct->status); + __set_bit(IPS_CONFIRMED_BIT, &ct->status); + if (!nfct_seqadj_ext_add(ct)) + goto err2; + if (!nfct_synproxy_ext_add(ct)) + goto err2; + + snet->tmpl = ct; + + snet->stats = alloc_percpu(struct synproxy_stats); + if (snet->stats == NULL) + goto err2; + + err = synproxy_proc_init(net); + if (err < 0) + goto err3; + + return 0; + +err3: + free_percpu(snet->stats); +err2: + nf_conntrack_free(ct); +err1: + return err; +} + +static void __net_exit synproxy_net_exit(struct net *net) +{ + struct synproxy_net *snet = synproxy_pernet(net); + + nf_conntrack_free(snet->tmpl); + synproxy_proc_exit(net); + free_percpu(snet->stats); +} + +static struct pernet_operations synproxy_net_ops = { + .init = synproxy_net_init, + .exit = synproxy_net_exit, + .id = &synproxy_net_id, + .size = sizeof(struct synproxy_net), +}; + +static int __init synproxy_core_init(void) +{ + int err; + + err = nf_ct_extend_register(&nf_ct_synproxy_extend); + if (err < 0) + goto err1; + + err = register_pernet_subsys(&synproxy_net_ops); + if (err < 0) + goto err2; + + return 0; + +err2: + nf_ct_extend_unregister(&nf_ct_synproxy_extend); +err1: + return err; +} + +static void __exit synproxy_core_exit(void) +{ + unregister_pernet_subsys(&synproxy_net_ops); + nf_ct_extend_unregister(&nf_ct_synproxy_extend); +} + +module_init(synproxy_core_init); +module_exit(synproxy_core_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); diff --git a/net/netfilter/nf_tproxy_core.c b/net/netfilter/nf_tproxy_core.c deleted file mode 100644 index 474d621..0000000 --- a/net/netfilter/nf_tproxy_core.c +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Transparent proxy support for Linux/iptables - * - * Copyright (c) 2006-2007 BalaBit IT Ltd. - * Author: Balazs Scheidler, Krisztian Kovacs - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- * - */ - -#include <linux/module.h> - -#include <linux/net.h> -#include <linux/if.h> -#include <linux/netdevice.h> -#include <net/udp.h> -#include <net/netfilter/nf_tproxy_core.h> - - -static void -nf_tproxy_destructor(struct sk_buff *skb) -{ - struct sock *sk = skb->sk; - - skb->sk = NULL; - skb->destructor = NULL; - - if (sk) - sock_put(sk); -} - -/* consumes sk */ -void -nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk) -{ - /* assigning tw sockets complicates things; most - * skb->sk->X checks would have to test sk->sk_state first */ - if (sk->sk_state == TCP_TIME_WAIT) { - inet_twsk_put(inet_twsk(sk)); - return; - } - - skb_orphan(skb); - skb->sk = sk; - skb->destructor = nf_tproxy_destructor; -} -EXPORT_SYMBOL_GPL(nf_tproxy_assign_sock); - -static int __init nf_tproxy_init(void) -{ - pr_info("NF_TPROXY: Transparent proxy support initialized, version 4.1.0\n"); - pr_info("NF_TPROXY: Copyright (c) 2006-2007 BalaBit IT Ltd.\n"); - return 0; -} - -module_init(nf_tproxy_init); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Krisztian Kovacs"); -MODULE_DESCRIPTION("Transparent proxy support core routines"); diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c index 8a703c3..95a98c8 100644 --- a/net/netfilter/nfnetlink_queue_core.c +++ b/net/netfilter/nfnetlink_queue_core.c @@ -862,6 +862,7 @@ static const struct nla_policy nfqa_verdict_policy[NFQA_MAX+1] = { [NFQA_MARK] = { .type = NLA_U32 }, [NFQA_PAYLOAD] = { .type = NLA_UNSPEC }, [NFQA_CT] = { .type = NLA_UNSPEC }, + [NFQA_EXP] = { .type = NLA_UNSPEC }, }; static const struct nla_policy nfqa_verdict_batch_policy[NFQA_MAX+1] = { @@ -990,9 +991,14 @@ nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb, if (entry == NULL) return -ENOENT; - rcu_read_lock(); - if (nfqa[NFQA_CT] && (queue->flags & NFQA_CFG_F_CONNTRACK)) + if (nfqa[NFQA_CT]) { ct = nfqnl_ct_parse(entry->skb, nfqa[NFQA_CT], &ctinfo); + if (ct && nfqa[NFQA_EXP]) { + nfqnl_attach_expect(ct, nfqa[NFQA_EXP], + NETLINK_CB(skb).portid, + nlmsg_report(nlh)); + } + } if (nfqa[NFQA_PAYLOAD]) { u16 payload_len = nla_len(nfqa[NFQA_PAYLOAD]); @@ -1005,7 +1011,6 @@ nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb, if (ct) nfqnl_ct_seq_adjust(skb, ct, ctinfo, diff); } - rcu_read_unlock(); if (nfqa[NFQA_MARK]) entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK])); diff --git a/net/netfilter/nfnetlink_queue_ct.c b/net/netfilter/nfnetlink_queue_ct.c index ab61d66..96cac50 100644 --- a/net/netfilter/nfnetlink_queue_ct.c +++ b/net/netfilter/nfnetlink_queue_ct.c @@ -87,12 +87,27 @@ nla_put_failure: void nfqnl_ct_seq_adjust(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, int diff) { - struct nfq_ct_nat_hook *nfq_nat_ct; + struct nfq_ct_hook *nfq_ct; - nfq_nat_ct = rcu_dereference(nfq_ct_nat_hook); - if (nfq_nat_ct == NULL) + nfq_ct = rcu_dereference(nfq_ct_hook); + if (nfq_ct == NULL) return; if ((ct->status & IPS_NAT_MASK) && diff) - nfq_nat_ct->seq_adjust(skb, ct, ctinfo, diff); + nfq_ct->seq_adjust(skb, ct, ctinfo, diff); +} + +int nfqnl_attach_expect(struct nf_conn *ct, const struct nlattr *attr, + u32 portid, u32 report) +{ + struct nfq_ct_hook *nfq_ct; + + if (nf_ct_is_untracked(ct)) + return 0; + + nfq_ct = rcu_dereference(nfq_ct_hook); + if (nfq_ct == NULL) + return -EOPNOTSUPP; + + return nfq_ct->attach_expect(attr, ct, portid, report); } diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index d7f1953..5d8a3a3 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -15,7 
+15,9 @@ #include <linux/ip.h> #include <net/checksum.h> #include <net/udp.h> +#include <net/tcp.h> #include <net/inet_sock.h> +#include <net/inet_hashtables.h> #include <linux/inetdevice.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv4/ip_tables.h> @@ -26,13 +28,18 @@ #define XT_TPROXY_HAVE_IPV6 1 #include <net/if_inet6.h> #include <net/addrconf.h> +#include <net/inet6_hashtables.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <net/netfilter/ipv6/nf_defrag_ipv6.h> #endif -#include <net/netfilter/nf_tproxy_core.h> #include <linux/netfilter/xt_TPROXY.h> +enum nf_tproxy_lookup_t { + NFT_LOOKUP_LISTENER, + NFT_LOOKUP_ESTABLISHED, +}; + static bool tproxy_sk_is_transparent(struct sock *sk) { if (sk->sk_state != TCP_TIME_WAIT) { @@ -68,6 +75,157 @@ tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr) return laddr ? laddr : daddr; } +/* + * This is used when the user wants to intercept a connection matching + * an explicit iptables rule. In this case the sockets are matched in + * preference order: + * + * - match: if there's a fully established connection matching the + * _packet_ tuple, it is returned, assuming the redirection + * already took place and we process a packet belonging to an + * established connection + * + * - match: if there's a listening socket matching the redirection + * (e.g. on-port & on-ip of the connection), it is returned, + * regardless of whether it was bound to 0.0.0.0 or an explicit + * address. The reasoning is that if there's an explicit rule, it + * does not really matter if the listener is bound to an interface + * or to 0. The user already stated that they want redirection + * (since they added the rule). + * + * Please note that there's an overlap between what a TPROXY target + * and a socket match will match. Normally, if you have both rules, the + * "socket" match will be the first one, so effectively all packets + * belonging to established connections go through that one. 
+ */ +static inline struct sock * +nf_tproxy_get_sock_v4(struct net *net, const u8 protocol, + const __be32 saddr, const __be32 daddr, + const __be16 sport, const __be16 dport, + const struct net_device *in, + const enum nf_tproxy_lookup_t lookup_type) +{ + struct sock *sk; + + switch (protocol) { + case IPPROTO_TCP: + switch (lookup_type) { + case NFT_LOOKUP_LISTENER: + sk = inet_lookup_listener(net, &tcp_hashinfo, + saddr, sport, + daddr, dport, + in->ifindex); + + /* NOTE: we return listeners even if bound to + * 0.0.0.0, those are filtered out in + * xt_socket, since xt_TPROXY needs 0 bound + * listeners too + */ + break; + case NFT_LOOKUP_ESTABLISHED: + sk = inet_lookup_established(net, &tcp_hashinfo, + saddr, sport, daddr, dport, + in->ifindex); + break; + default: + BUG(); + } + break; + case IPPROTO_UDP: + sk = udp4_lib_lookup(net, saddr, sport, daddr, dport, + in->ifindex); + if (sk) { + int connected = (sk->sk_state == TCP_ESTABLISHED); + int wildcard = (inet_sk(sk)->inet_rcv_saddr == 0); + + /* NOTE: we return listeners even if bound to + * 0.0.0.0, those are filtered out in + * xt_socket, since xt_TPROXY needs 0 bound + * listeners too + */ + if ((lookup_type == NFT_LOOKUP_ESTABLISHED && (!connected || wildcard)) || + (lookup_type == NFT_LOOKUP_LISTENER && connected)) { + sock_put(sk); + sk = NULL; + } + } + break; + default: + WARN_ON(1); + sk = NULL; + } + + pr_debug("tproxy socket lookup: proto %u %08x:%u -> %08x:%u, lookup type: %d, sock %p\n", + protocol, ntohl(saddr), ntohs(sport), ntohl(daddr), ntohs(dport), lookup_type, sk); + + return sk; +} + +#ifdef XT_TPROXY_HAVE_IPV6 +static inline struct sock * +nf_tproxy_get_sock_v6(struct net *net, const u8 protocol, + const struct in6_addr *saddr, const struct in6_addr *daddr, + const __be16 sport, const __be16 dport, + const struct net_device *in, + const enum nf_tproxy_lookup_t lookup_type) +{ + struct sock *sk; + + switch (protocol) { + case IPPROTO_TCP: + switch (lookup_type) { + case NFT_LOOKUP_LISTENER: + sk = inet6_lookup_listener(net, &tcp_hashinfo, + saddr, sport, + daddr, ntohs(dport), + in->ifindex); + + /* NOTE: we return listeners even if bound to + * 0.0.0.0, those are filtered out in + * xt_socket, since xt_TPROXY needs 0 bound + * listeners too + */ + break; + case NFT_LOOKUP_ESTABLISHED: + sk = __inet6_lookup_established(net, &tcp_hashinfo, + saddr, sport, daddr, ntohs(dport), + in->ifindex); + break; + default: + BUG(); + } + break; + case IPPROTO_UDP: + sk = udp6_lib_lookup(net, saddr, sport, daddr, dport, + in->ifindex); + if (sk) { + int connected = (sk->sk_state == TCP_ESTABLISHED); + int wildcard = ipv6_addr_any(&inet6_sk(sk)->rcv_saddr); + + /* NOTE: we return listeners even if bound to + * 0.0.0.0, those are filtered out in + * xt_socket, since xt_TPROXY needs 0 bound + * listeners too + */ + if ((lookup_type == NFT_LOOKUP_ESTABLISHED && (!connected || wildcard)) || + (lookup_type == NFT_LOOKUP_LISTENER && connected)) { + sock_put(sk); + sk = NULL; + } + } + break; + default: + WARN_ON(1); + sk = NULL; + } + + pr_debug("tproxy socket lookup: proto %u %pI6:%u -> %pI6:%u, lookup type: %d, sock %p\n", + protocol, saddr, ntohs(sport), daddr, ntohs(dport), lookup_type, sk); + + return sk; +} +#endif + /** * tproxy_handle_time_wait4 - handle IPv4 TCP TIME_WAIT reopen redirections * @skb: The skb being processed. 
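TCP is looked up directly in the listener or established hash tables above, but UDP has no such split, so both helpers filter the single udp{4,6}_lib_lookup() result by socket state afterwards. A condensed sketch of that filter, with a hypothetical helper name, assuming the caller has already computed the wildcard test (inet_rcv_saddr == 0 for IPv4, ipv6_addr_any() on rcv_saddr for IPv6):

	/* Sketch: keep or drop a UDP socket for the requested lookup type. */
	static bool tproxy_udp_sk_matches(const struct sock *sk, bool wildcard,
					  enum nf_tproxy_lookup_t type)
	{
		bool connected = sk->sk_state == TCP_ESTABLISHED;

		if (type == NFT_LOOKUP_ESTABLISHED)
			return connected && !wildcard; /* full 4-tuple match */
		return !connected; /* unconnected, listener-style socket */
	}

When this returns false the caller must sock_put() the socket and report no match, exactly as the inline code in both helpers does.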
@@ -117,6 +275,15 @@ tproxy_handle_time_wait4(struct sk_buff *skb, __be32 laddr, __be16 lport, return sk; } +/* assign a socket to the skb -- consumes sk */ +static void +nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk) +{ + skb_orphan(skb); + skb->sk = sk; + skb->destructor = sock_edemux; +} + static unsigned int tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport, u_int32_t mark_mask, u_int32_t mark_value) diff --git a/net/netfilter/xt_addrtype.c b/net/netfilter/xt_addrtype.c index 68ff29f..fab6eea 100644 --- a/net/netfilter/xt_addrtype.c +++ b/net/netfilter/xt_addrtype.c @@ -202,7 +202,7 @@ static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par) return -EINVAL; } if ((info->source | info->dest) >= XT_ADDRTYPE_PROHIBIT) { - pr_err("ipv6 PROHIBT (THROW, NAT ..) matching not supported\n"); + pr_err("ipv6 PROHIBIT (THROW, NAT ..) matching not supported\n"); return -EINVAL; } if ((info->source | info->dest) & XT_ADDRTYPE_BROADCAST) { diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 20b1591..06df2b9 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -19,12 +19,12 @@ #include <net/icmp.h> #include <net/sock.h> #include <net/inet_sock.h> -#include <net/netfilter/nf_tproxy_core.h> #include <net/netfilter/ipv4/nf_defrag_ipv4.h> #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) #define XT_SOCKET_HAVE_IPV6 1 #include <linux/netfilter_ipv6/ip6_tables.h> +#include <net/inet6_hashtables.h> #include <net/netfilter/ipv6/nf_defrag_ipv6.h> #endif @@ -101,6 +101,43 @@ extract_icmp4_fields(const struct sk_buff *skb, return 0; } +/* "socket" match based redirection (no specific rule) + * =================================================== + * + * There are connections with dynamic endpoints (e.g. FTP data + * connection) that the user is unable to add explicit rules + * for. These are taken care of by a generic "socket" rule. It is + * assumed that the proxy application is trusted to open such + * connections without an explicit iptables rule (except of course the + * generic 'socket' rule). In this case the following sockets are + * matched in preference order: + * + * - match: if there's a fully established connection matching the + * _packet_ tuple + * + * - match: if there's a non-zero bound listener (possibly with a + * non-local address). We don't accept zero-bound listeners, since + * then local services could intercept traffic going through the + * box. 
+ */ +static struct sock * +xt_socket_get_sock_v4(struct net *net, const u8 protocol, + const __be32 saddr, const __be32 daddr, + const __be16 sport, const __be16 dport, + const struct net_device *in) +{ + switch (protocol) { + case IPPROTO_TCP: + return __inet_lookup(net, &tcp_hashinfo, + saddr, sport, daddr, dport, + in->ifindex); + case IPPROTO_UDP: + return udp4_lib_lookup(net, saddr, sport, daddr, dport, + in->ifindex); + } + return NULL; +} + static bool socket_match(const struct sk_buff *skb, struct xt_action_param *par, const struct xt_socket_mtinfo1 *info) @@ -156,9 +193,9 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par, #endif if (!sk) - sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), protocol, + sk = xt_socket_get_sock_v4(dev_net(skb->dev), protocol, saddr, daddr, sport, dport, - par->in, NFT_LOOKUP_ANY); + par->in); if (sk) { bool wildcard; bool transparent = true; @@ -265,6 +302,25 @@ extract_icmp6_fields(const struct sk_buff *skb, return 0; } +static struct sock * +xt_socket_get_sock_v6(struct net *net, const u8 protocol, + const struct in6_addr *saddr, const struct in6_addr *daddr, + const __be16 sport, const __be16 dport, + const struct net_device *in) +{ + switch (protocol) { + case IPPROTO_TCP: + return inet6_lookup(net, &tcp_hashinfo, + saddr, sport, daddr, dport, + in->ifindex); + case IPPROTO_UDP: + return udp6_lib_lookup(net, saddr, sport, daddr, dport, + in->ifindex); + } + + return NULL; +} + static bool socket_mt6_v1_v2(const struct sk_buff *skb, struct xt_action_param *par) { @@ -302,9 +358,9 @@ socket_mt6_v1_v2(const struct sk_buff *skb, struct xt_action_param *par) } if (!sk) - sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto, + sk = xt_socket_get_sock_v6(dev_net(skb->dev), tproto, saddr, daddr, sport, dport, - par->in, NFT_LOOKUP_ANY); + par->in); if (sk) { bool wildcard; bool transparent = true; diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index f85f8a2..512718a 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -789,10 +789,6 @@ static int ctrl_dumpfamily(struct sk_buff *skb, struct netlink_callback *cb) struct net *net = sock_net(skb->sk); int chains_to_skip = cb->args[0]; int fams_to_skip = cb->args[1]; - bool need_locking = chains_to_skip || fams_to_skip; - - if (need_locking) - genl_lock(); for (i = chains_to_skip; i < GENL_FAM_TAB_SIZE; i++) { n = 0; @@ -814,9 +810,6 @@ errout: cb->args[0] = i; cb->args[1] = n; - if (need_locking) - genl_unlock(); - return skb->len; } diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index bed30e6..6ecf491 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -4,6 +4,7 @@ config OPENVSWITCH tristate "Open vSwitch" + select LIBCRC32C ---help--- Open vSwitch is a multilayer Ethernet switch targeted at virtualized environments. In addition to supporting a variety of features diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile index 82e4ee5..ea36e99 100644 --- a/net/openvswitch/Makefile +++ b/net/openvswitch/Makefile @@ -10,10 +10,13 @@ openvswitch-y := \ dp_notify.o \ flow.o \ vport.o \ - vport-gre.o \ vport-internal_dev.o \ vport-netdev.o ifneq ($(CONFIG_OPENVSWITCH_VXLAN),) openvswitch-y += vport-vxlan.o endif + +ifneq ($(CONFIG_OPENVSWITCH_GRE),) +openvswitch-y += vport-gre.o +endif diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index ab101f7..65cfaa8 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2012 Nicira, Inc. 
+ * Copyright (c) 2007-2013 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -22,6 +22,7 @@ #include <linux/in.h> #include <linux/ip.h> #include <linux/openvswitch.h> +#include <linux/sctp.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/in6.h> @@ -31,6 +32,7 @@ #include <net/ipv6.h> #include <net/checksum.h> #include <net/dsfield.h> +#include <net/sctp/checksum.h> #include "datapath.h" #include "vport.h" @@ -352,6 +354,39 @@ static int set_tcp(struct sk_buff *skb, const struct ovs_key_tcp *tcp_port_key) return 0; } +static int set_sctp(struct sk_buff *skb, + const struct ovs_key_sctp *sctp_port_key) +{ + struct sctphdr *sh; + int err; + unsigned int sctphoff = skb_transport_offset(skb); + + err = make_writable(skb, sctphoff + sizeof(struct sctphdr)); + if (unlikely(err)) + return err; + + sh = sctp_hdr(skb); + if (sctp_port_key->sctp_src != sh->source || + sctp_port_key->sctp_dst != sh->dest) { + __le32 old_correct_csum, new_csum, old_csum; + + old_csum = sh->checksum; + old_correct_csum = sctp_compute_cksum(skb, sctphoff); + + sh->source = sctp_port_key->sctp_src; + sh->dest = sctp_port_key->sctp_dst; + + new_csum = sctp_compute_cksum(skb, sctphoff); + + /* Carry any checksum errors through. */ + sh->checksum = old_csum ^ old_correct_csum ^ new_csum; + + skb->rxhash = 0; + } + + return 0; +} + static int do_output(struct datapath *dp, struct sk_buff *skb, int out_port) { struct vport *vport; @@ -376,8 +411,10 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb, const struct nlattr *a; int rem; + BUG_ON(!OVS_CB(skb)->pkt_key); + upcall.cmd = OVS_PACKET_CMD_ACTION; - upcall.key = &OVS_CB(skb)->flow->key; + upcall.key = OVS_CB(skb)->pkt_key; upcall.userdata = NULL; upcall.portid = 0; @@ -459,6 +496,10 @@ static int execute_set_action(struct sk_buff *skb, case OVS_KEY_ATTR_UDP: err = set_udp(skb, nla_data(nested_attr)); break; + + case OVS_KEY_ATTR_SCTP: + err = set_sctp(skb, nla_data(nested_attr)); + break; } return err; diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index f2ed760..2aa13bd 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2012 Nicira, Inc. + * Copyright (c) 2007-2013 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -165,7 +165,7 @@ static void destroy_dp_rcu(struct rcu_head *rcu) { struct datapath *dp = container_of(rcu, struct datapath, rcu); - ovs_flow_tbl_destroy((__force struct flow_table *)dp->table); + ovs_flow_tbl_destroy((__force struct flow_table *)dp->table, false); free_percpu(dp->stats_percpu); release_net(ovs_dp_get_net(dp)); kfree(dp->ports); @@ -226,19 +226,18 @@ void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb) struct sw_flow_key key; u64 *stats_counter; int error; - int key_len; stats = this_cpu_ptr(dp->stats_percpu); /* Extract flow from 'skb' into 'key'. */ - error = ovs_flow_extract(skb, p->port_no, &key, &key_len); + error = ovs_flow_extract(skb, p->port_no, &key); if (unlikely(error)) { kfree_skb(skb); return; } /* Look up flow. 
*/ - flow = ovs_flow_tbl_lookup(rcu_dereference(dp->table), &key, key_len); + flow = ovs_flow_lookup(rcu_dereference(dp->table), &key); if (unlikely(!flow)) { struct dp_upcall_info upcall; @@ -253,6 +252,7 @@ void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb) } OVS_CB(skb)->flow = flow; + OVS_CB(skb)->pkt_key = &key; stats_counter = &stats->n_hit; ovs_flow_used(OVS_CB(skb)->flow, skb); @@ -435,7 +435,7 @@ static int queue_userspace_packet(struct net *net, int dp_ifindex, upcall->dp_ifindex = dp_ifindex; nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_KEY); - ovs_flow_to_nlattrs(upcall_info->key, user_skb); + ovs_flow_to_nlattrs(upcall_info->key, upcall_info->key, user_skb); nla_nest_end(user_skb, nla); if (upcall_info->userdata) @@ -468,7 +468,7 @@ static int flush_flows(struct datapath *dp) rcu_assign_pointer(dp->table, new_table); - ovs_flow_tbl_deferred_destroy(old_table); + ovs_flow_tbl_destroy(old_table, true); return 0; } @@ -611,10 +611,12 @@ static int validate_tp_port(const struct sw_flow_key *flow_key) static int validate_and_copy_set_tun(const struct nlattr *attr, struct sw_flow_actions **sfa) { - struct ovs_key_ipv4_tunnel tun_key; + struct sw_flow_match match; + struct sw_flow_key key; int err, start; - err = ovs_ipv4_tun_from_nlattr(nla_data(attr), &tun_key); + ovs_match_init(&match, &key, NULL); + err = ovs_ipv4_tun_from_nlattr(nla_data(attr), &match, false); if (err) return err; @@ -622,7 +624,8 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, if (start < 0) return start; - err = add_action(sfa, OVS_KEY_ATTR_IPV4_TUNNEL, &tun_key, sizeof(tun_key)); + err = add_action(sfa, OVS_KEY_ATTR_IPV4_TUNNEL, &match.key->tun_key, + sizeof(match.key->tun_key)); add_nested_action_end(*sfa, start); return err; @@ -709,6 +712,12 @@ static int validate_set(const struct nlattr *a, return validate_tp_port(flow_key); + case OVS_KEY_ATTR_SCTP: + if (flow_key->ip.proto != IPPROTO_SCTP) + return -EINVAL; + + return validate_tp_port(flow_key); + default: return -EINVAL; } @@ -857,7 +866,6 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) struct ethhdr *eth; int len; int err; - int key_len; err = -EINVAL; if (!a[OVS_PACKET_ATTR_PACKET] || !a[OVS_PACKET_ATTR_KEY] || @@ -890,11 +898,11 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) if (IS_ERR(flow)) goto err_kfree_skb; - err = ovs_flow_extract(packet, -1, &flow->key, &key_len); + err = ovs_flow_extract(packet, -1, &flow->key); if (err) goto err_flow_free; - err = ovs_flow_metadata_from_nlattrs(flow, key_len, a[OVS_PACKET_ATTR_KEY]); + err = ovs_flow_metadata_from_nlattrs(flow, a[OVS_PACKET_ATTR_KEY]); if (err) goto err_flow_free; acts = ovs_flow_actions_alloc(nla_len(a[OVS_PACKET_ATTR_ACTIONS])); @@ -908,6 +916,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) goto err_flow_free; OVS_CB(packet)->flow = flow; + OVS_CB(packet)->pkt_key = &flow->key; packet->priority = flow->key.phy.priority; packet->mark = flow->key.phy.skb_mark; @@ -922,13 +931,13 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) local_bh_enable(); rcu_read_unlock(); - ovs_flow_free(flow); + ovs_flow_free(flow, false); return err; err_unlock: rcu_read_unlock(); err_flow_free: - ovs_flow_free(flow); + ovs_flow_free(flow, false); err_kfree_skb: kfree_skb(packet); err: @@ -951,9 +960,10 @@ static struct genl_ops dp_packet_genl_ops[] = { static void get_dp_stats(struct datapath *dp, struct ovs_dp_stats *stats) { + 
struct flow_table *table; int i; - struct flow_table *table = ovsl_dereference(dp->table); + table = rcu_dereference_check(dp->table, lockdep_ovsl_is_held()); stats->n_flows = ovs_flow_tbl_count(table); stats->n_hit = stats->n_missed = stats->n_lost = 0; @@ -1044,7 +1054,8 @@ static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) if (!start) return -EMSGSIZE; - err = ovs_ipv4_tun_to_nlattr(skb, nla_data(ovs_key)); + err = ovs_ipv4_tun_to_nlattr(skb, nla_data(ovs_key), + nla_data(ovs_key)); if (err) return err; nla_nest_end(skb, start); @@ -1092,6 +1103,7 @@ static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts) { return NLMSG_ALIGN(sizeof(struct ovs_header)) + nla_total_size(key_attr_size()) /* OVS_FLOW_ATTR_KEY */ + + nla_total_size(key_attr_size()) /* OVS_FLOW_ATTR_MASK */ + nla_total_size(sizeof(struct ovs_flow_stats)) /* OVS_FLOW_ATTR_STATS */ + nla_total_size(1) /* OVS_FLOW_ATTR_TCP_FLAGS */ + nla_total_size(8) /* OVS_FLOW_ATTR_USED */ @@ -1104,7 +1116,6 @@ static int ovs_flow_cmd_fill_info(struct sw_flow *flow, struct datapath *dp, u32 seq, u32 flags, u8 cmd) { const int skb_orig_len = skb->len; - const struct sw_flow_actions *sf_acts; struct nlattr *start; struct ovs_flow_stats stats; struct ovs_header *ovs_header; @@ -1113,20 +1124,31 @@ static int ovs_flow_cmd_fill_info(struct sw_flow *flow, struct datapath *dp, u8 tcp_flags; int err; - sf_acts = ovsl_dereference(flow->sf_acts); - ovs_header = genlmsg_put(skb, portid, seq, &dp_flow_genl_family, flags, cmd); if (!ovs_header) return -EMSGSIZE; ovs_header->dp_ifindex = get_dpifindex(dp); + /* Fill flow key. */ nla = nla_nest_start(skb, OVS_FLOW_ATTR_KEY); if (!nla) goto nla_put_failure; - err = ovs_flow_to_nlattrs(&flow->key, skb); + + err = ovs_flow_to_nlattrs(&flow->unmasked_key, + &flow->unmasked_key, skb); + if (err) + goto error; + nla_nest_end(skb, nla); + + nla = nla_nest_start(skb, OVS_FLOW_ATTR_MASK); + if (!nla) + goto nla_put_failure; + + err = ovs_flow_to_nlattrs(&flow->key, &flow->mask->key, skb); if (err) goto error; + nla_nest_end(skb, nla); spin_lock_bh(&flow->lock); @@ -1161,6 +1183,11 @@ static int ovs_flow_cmd_fill_info(struct sw_flow *flow, struct datapath *dp, */ start = nla_nest_start(skb, OVS_FLOW_ATTR_ACTIONS); if (start) { + const struct sw_flow_actions *sf_acts; + + sf_acts = rcu_dereference_check(flow->sf_acts, + lockdep_ovsl_is_held()); + err = actions_to_attr(sf_acts->actions, sf_acts->actions_len, skb); if (!err) nla_nest_end(skb, start); @@ -1211,20 +1238,24 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; struct ovs_header *ovs_header = info->userhdr; - struct sw_flow_key key; - struct sw_flow *flow; + struct sw_flow_key key, masked_key; + struct sw_flow *flow = NULL; + struct sw_flow_mask mask; struct sk_buff *reply; struct datapath *dp; struct flow_table *table; struct sw_flow_actions *acts = NULL; + struct sw_flow_match match; int error; - int key_len; /* Extract key. 
*/ error = -EINVAL; if (!a[OVS_FLOW_ATTR_KEY]) goto error; + + ovs_match_init(&match, &key, &mask); + error = ovs_match_from_nlattrs(&match, + a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK]); if (error) goto error; @@ -1235,9 +1266,13 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) if (IS_ERR(acts)) goto error; - error = validate_and_copy_actions(a[OVS_FLOW_ATTR_ACTIONS], &key, 0, &acts); - if (error) + ovs_flow_key_mask(&masked_key, &key, &mask); + error = validate_and_copy_actions(a[OVS_FLOW_ATTR_ACTIONS], + &masked_key, 0, &acts); + if (error) { + OVS_NLERR("Flow actions may not be safe on all matching packets.\n"); goto err_kfree; + } } else if (info->genlhdr->cmd == OVS_FLOW_CMD_NEW) { error = -EINVAL; goto error; @@ -1250,8 +1285,11 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) goto err_unlock_ovs; table = ovsl_dereference(dp->table); - flow = ovs_flow_tbl_lookup(table, &key, key_len); + + /* Check if this is a duplicate flow */ + flow = ovs_flow_lookup(table, &key); if (!flow) { + struct sw_flow_mask *mask_p; /* Bail out if we're not allowed to create a new flow. */ error = -ENOENT; if (info->genlhdr->cmd == OVS_FLOW_CMD_SET) @@ -1264,7 +1302,7 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) new_table = ovs_flow_tbl_expand(table); if (!IS_ERR(new_table)) { rcu_assign_pointer(dp->table, new_table); - ovs_flow_tbl_deferred_destroy(table); + ovs_flow_tbl_destroy(table, true); table = ovsl_dereference(dp->table); } } @@ -1277,14 +1315,30 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) } clear_stats(flow); + flow->key = masked_key; + flow->unmasked_key = key; + + /* Make sure mask is unique in the system */ + mask_p = ovs_sw_flow_mask_find(table, &mask); + if (!mask_p) { + /* Allocate a new mask if none exists. */ + mask_p = ovs_sw_flow_mask_alloc(); + if (!mask_p) + goto err_flow_free; + mask_p->key = mask.key; + mask_p->range = mask.range; + ovs_sw_flow_mask_insert(table, mask_p); + } + + ovs_sw_flow_mask_add_ref(mask_p); + flow->mask = mask_p; rcu_assign_pointer(flow->sf_acts, acts); /* Put flow in bucket. */ - ovs_flow_tbl_insert(table, flow, &key, key_len); + ovs_flow_insert(table, flow); reply = ovs_flow_cmd_build_info(flow, dp, info->snd_portid, - info->snd_seq, - OVS_FLOW_CMD_NEW); + info->snd_seq, OVS_FLOW_CMD_NEW); } else { /* We found a matching flow. */ struct sw_flow_actions *old_acts; @@ -1300,6 +1354,13 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) info->nlhdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_EXCL)) goto err_unlock_ovs; + /* The unmasked key has to be the same for flow updates. */ + error = -EINVAL; + if (!ovs_flow_cmp_unmasked_key(flow, &key, match.range.end)) { + OVS_NLERR("Flow modification message rejected, unmasked key does not match.\n"); + goto err_unlock_ovs; + } + /* Update actions. 
*/ old_acts = ovsl_dereference(flow->sf_acts); rcu_assign_pointer(flow->sf_acts, acts); @@ -1324,6 +1385,8 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) ovs_dp_flow_multicast_group.id, PTR_ERR(reply)); return 0; +err_flow_free: + ovs_flow_free(flow, false); err_unlock_ovs: ovs_unlock(); err_kfree: @@ -1341,12 +1404,16 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) struct sw_flow *flow; struct datapath *dp; struct flow_table *table; + struct sw_flow_match match; int err; - int key_len; - if (!a[OVS_FLOW_ATTR_KEY]) + if (!a[OVS_FLOW_ATTR_KEY]) { + OVS_NLERR("Flow get message rejected, Key attribute missing.\n"); return -EINVAL; - err = ovs_flow_from_nlattrs(&key, &key_len, a[OVS_FLOW_ATTR_KEY]); + } + + ovs_match_init(&match, &key, NULL); + err = ovs_match_from_nlattrs(&match, a[OVS_FLOW_ATTR_KEY], NULL); if (err) return err; @@ -1358,7 +1425,7 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) } table = ovsl_dereference(dp->table); - flow = ovs_flow_tbl_lookup(table, &key, key_len); + flow = ovs_flow_lookup_unmasked_key(table, &match); if (!flow) { err = -ENOENT; goto unlock; @@ -1387,8 +1454,8 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) struct sw_flow *flow; struct datapath *dp; struct flow_table *table; + struct sw_flow_match match; int err; - int key_len; ovs_lock(); dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); @@ -1401,12 +1468,14 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) err = flush_flows(dp); goto unlock; } - err = ovs_flow_from_nlattrs(&key, &key_len, a[OVS_FLOW_ATTR_KEY]); + + ovs_match_init(&match, &key, NULL); + err = ovs_match_from_nlattrs(&match, a[OVS_FLOW_ATTR_KEY], NULL); if (err) goto unlock; table = ovsl_dereference(dp->table); - flow = ovs_flow_tbl_lookup(table, &key, key_len); + flow = ovs_flow_lookup_unmasked_key(table, &match); if (!flow) { err = -ENOENT; goto unlock; @@ -1418,13 +1487,13 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) goto unlock; } - ovs_flow_tbl_remove(table, flow); + ovs_flow_remove(table, flow); err = ovs_flow_cmd_fill_info(flow, dp, reply, info->snd_portid, info->snd_seq, 0, OVS_FLOW_CMD_DEL); BUG_ON(err < 0); - ovs_flow_deferred_free(flow); + ovs_flow_free(flow, true); ovs_unlock(); ovs_notify(reply, info, &ovs_dp_flow_multicast_group); @@ -1440,22 +1509,21 @@ static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) struct datapath *dp; struct flow_table *table; - ovs_lock(); + rcu_read_lock(); dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); if (!dp) { - ovs_unlock(); + rcu_read_unlock(); return -ENODEV; } - table = ovsl_dereference(dp->table); - + table = rcu_dereference(dp->table); for (;;) { struct sw_flow *flow; u32 bucket, obj; bucket = cb->args[0]; obj = cb->args[1]; - flow = ovs_flow_tbl_next(table, &bucket, &obj); + flow = ovs_flow_dump_next(table, &bucket, &obj); if (!flow) break; @@ -1468,7 +1536,7 @@ static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) cb->args[0] = bucket; cb->args[1] = obj; } - ovs_unlock(); + rcu_read_unlock(); return skb->len; } @@ -1664,7 +1732,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) goto err_destroy_local_port; ovs_net = net_generic(ovs_dp_get_net(dp), ovs_net_id); - list_add_tail(&dp->list_node, &ovs_net->dps); + list_add_tail_rcu(&dp->list_node, &ovs_net->dps); ovs_unlock(); @@ -1678,7 +1746,7 @@ err_destroy_ports_array: 
err_destroy_percpu: free_percpu(dp->stats_percpu); err_destroy_table: - ovs_flow_tbl_destroy(ovsl_dereference(dp->table)); + ovs_flow_tbl_destroy(ovsl_dereference(dp->table), false); err_free_dp: release_net(ovs_dp_get_net(dp)); kfree(dp); @@ -1702,7 +1770,7 @@ static void __dp_destroy(struct datapath *dp) ovs_dp_detach_port(vport); } - list_del(&dp->list_node); + list_del_rcu(&dp->list_node); /* OVSP_LOCAL is datapath internal port. We need to make sure that * all port in datapath are destroyed first before freeing datapath. @@ -1807,8 +1875,8 @@ static int ovs_dp_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) int skip = cb->args[0]; int i = 0; - ovs_lock(); - list_for_each_entry(dp, &ovs_net->dps, list_node) { + rcu_read_lock(); + list_for_each_entry_rcu(dp, &ovs_net->dps, list_node) { if (i >= skip && ovs_dp_cmd_fill_info(dp, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, @@ -1816,7 +1884,7 @@ static int ovs_dp_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) break; i++; } - ovs_unlock(); + rcu_read_unlock(); cb->args[0] = i; @@ -2285,7 +2353,7 @@ static void rehash_flow_table(struct work_struct *work) new_table = ovs_flow_tbl_rehash(old_table); if (!IS_ERR(new_table)) { rcu_assign_pointer(dp->table, new_table); - ovs_flow_tbl_deferred_destroy(old_table); + ovs_flow_tbl_destroy(old_table, true); } } } diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index a914864..4d109c1 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -88,11 +88,13 @@ struct datapath { /** * struct ovs_skb_cb - OVS data in skb CB * @flow: The flow associated with this packet. May be %NULL if no flow. + * @pkt_key: The flow information extracted from the packet. Must be nonnull. * @tun_key: Key for the tunnel that encapsulated this packet. NULL if the * packet is not being tunneled. */ struct ovs_skb_cb { struct sw_flow *flow; + struct sw_flow_key *pkt_key; struct ovs_key_ipv4_tunnel *tun_key; }; #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb) @@ -183,4 +185,8 @@ struct sk_buff *ovs_vport_cmd_build_info(struct vport *, u32 pid, u32 seq, int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb); void ovs_dp_notify_wq(struct work_struct *work); + +#define OVS_NLERR(fmt, ...) \ + pr_info_once("netlink: " fmt, ##__VA_ARGS__) + #endif /* datapath.h */ diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 1aa84dc..ad1aeeb 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2011 Nicira, Inc. + * Copyright (c) 2007-2013 Nicira, Inc. 
* * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -34,6 +34,7 @@ #include <linux/if_arp.h> #include <linux/ip.h> #include <linux/ipv6.h> +#include <linux/sctp.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/icmp.h> @@ -46,6 +47,202 @@ static struct kmem_cache *flow_cache; +static void ovs_sw_flow_mask_set(struct sw_flow_mask *mask, + struct sw_flow_key_range *range, u8 val); + +static void update_range__(struct sw_flow_match *match, + size_t offset, size_t size, bool is_mask) +{ + struct sw_flow_key_range *range = NULL; + size_t start = rounddown(offset, sizeof(long)); + size_t end = roundup(offset + size, sizeof(long)); + + if (!is_mask) + range = &match->range; + else if (match->mask) + range = &match->mask->range; + + if (!range) + return; + + if (range->start == range->end) { + range->start = start; + range->end = end; + return; + } + + if (range->start > start) + range->start = start; + + if (range->end < end) + range->end = end; +} + +#define SW_FLOW_KEY_PUT(match, field, value, is_mask) \ + do { \ + update_range__(match, offsetof(struct sw_flow_key, field), \ + sizeof((match)->key->field), is_mask); \ + if (is_mask) { \ + if ((match)->mask) \ + (match)->mask->key.field = value; \ + } else { \ + (match)->key->field = value; \ + } \ + } while (0) + +#define SW_FLOW_KEY_MEMCPY(match, field, value_p, len, is_mask) \ + do { \ + update_range__(match, offsetof(struct sw_flow_key, field), \ + len, is_mask); \ + if (is_mask) { \ + if ((match)->mask) \ + memcpy(&(match)->mask->key.field, value_p, len);\ + } else { \ + memcpy(&(match)->key->field, value_p, len); \ + } \ + } while (0) + +static u16 range_n_bytes(const struct sw_flow_key_range *range) +{ + return range->end - range->start; +} + +void ovs_match_init(struct sw_flow_match *match, + struct sw_flow_key *key, + struct sw_flow_mask *mask) +{ + memset(match, 0, sizeof(*match)); + match->key = key; + match->mask = mask; + + memset(key, 0, sizeof(*key)); + + if (mask) { + memset(&mask->key, 0, sizeof(mask->key)); + mask->range.start = mask->range.end = 0; + } +} + +static bool ovs_match_validate(const struct sw_flow_match *match, + u64 key_attrs, u64 mask_attrs) +{ + u64 key_expected = 1 << OVS_KEY_ATTR_ETHERNET; + u64 mask_allowed = key_attrs; /* At most allow all key attributes */ + + /* The following mask attributes are allowed only if they + * pass the validation tests. */ + mask_allowed &= ~((1 << OVS_KEY_ATTR_IPV4) + | (1 << OVS_KEY_ATTR_IPV6) + | (1 << OVS_KEY_ATTR_TCP) + | (1 << OVS_KEY_ATTR_UDP) + | (1 << OVS_KEY_ATTR_SCTP) + | (1 << OVS_KEY_ATTR_ICMP) + | (1 << OVS_KEY_ATTR_ICMPV6) + | (1 << OVS_KEY_ATTR_ARP) + | (1 << OVS_KEY_ATTR_ND)); + + /* Always allowed mask fields. */ + mask_allowed |= ((1 << OVS_KEY_ATTR_TUNNEL) + | (1 << OVS_KEY_ATTR_IN_PORT) + | (1 << OVS_KEY_ATTR_ETHERTYPE)); + + /* Check key attributes. 
*/ + if (match->key->eth.type == htons(ETH_P_ARP) + || match->key->eth.type == htons(ETH_P_RARP)) { + key_expected |= 1 << OVS_KEY_ATTR_ARP; + if (match->mask && (match->mask->key.eth.type == htons(0xffff))) + mask_allowed |= 1 << OVS_KEY_ATTR_ARP; + } + + if (match->key->eth.type == htons(ETH_P_IP)) { + key_expected |= 1 << OVS_KEY_ATTR_IPV4; + if (match->mask && (match->mask->key.eth.type == htons(0xffff))) + mask_allowed |= 1 << OVS_KEY_ATTR_IPV4; + + if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) { + if (match->key->ip.proto == IPPROTO_UDP) { + key_expected |= 1 << OVS_KEY_ATTR_UDP; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_UDP; + } + + if (match->key->ip.proto == IPPROTO_SCTP) { + key_expected |= 1 << OVS_KEY_ATTR_SCTP; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_SCTP; + } + + if (match->key->ip.proto == IPPROTO_TCP) { + key_expected |= 1 << OVS_KEY_ATTR_TCP; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_TCP; + } + + if (match->key->ip.proto == IPPROTO_ICMP) { + key_expected |= 1 << OVS_KEY_ATTR_ICMP; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_ICMP; + } + } + } + + if (match->key->eth.type == htons(ETH_P_IPV6)) { + key_expected |= 1 << OVS_KEY_ATTR_IPV6; + if (match->mask && (match->mask->key.eth.type == htons(0xffff))) + mask_allowed |= 1 << OVS_KEY_ATTR_IPV6; + + if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) { + if (match->key->ip.proto == IPPROTO_UDP) { + key_expected |= 1 << OVS_KEY_ATTR_UDP; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_UDP; + } + + if (match->key->ip.proto == IPPROTO_SCTP) { + key_expected |= 1 << OVS_KEY_ATTR_SCTP; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_SCTP; + } + + if (match->key->ip.proto == IPPROTO_TCP) { + key_expected |= 1 << OVS_KEY_ATTR_TCP; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_TCP; + } + + if (match->key->ip.proto == IPPROTO_ICMPV6) { + key_expected |= 1 << OVS_KEY_ATTR_ICMPV6; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_ICMPV6; + + if (match->key->ipv6.tp.src == + htons(NDISC_NEIGHBOUR_SOLICITATION) || + match->key->ipv6.tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) { + key_expected |= 1 << OVS_KEY_ATTR_ND; + if (match->mask && (match->mask->key.ipv6.tp.src == htons(0xffff))) + mask_allowed |= 1 << OVS_KEY_ATTR_ND; + } + } + } + } + + if ((key_attrs & key_expected) != key_expected) { + /* Key attributes check failed. */ + OVS_NLERR("Missing expected key attributes (key_attrs=%llx, expected=%llx).\n", + key_attrs, key_expected); + return false; + } + + if ((mask_attrs & mask_allowed) != mask_attrs) { + /* Mask attributes check failed. 
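+ * That is, the supplied mask sets bits for an attribute, e.g. + * OVS_KEY_ATTR_TCP, that did not qualify as maskable in the checks + * above.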
*/ + OVS_NLERR("Contain more than allowed mask fields (mask_attrs=%llx, mask_allowed=%llx).\n", + mask_attrs, mask_allowed); + return false; + } + + return true; +} + static int check_header(struct sk_buff *skb, int len) { if (unlikely(skb->len < len)) @@ -102,6 +299,12 @@ static bool udphdr_ok(struct sk_buff *skb) sizeof(struct udphdr)); } +static bool sctphdr_ok(struct sk_buff *skb) +{ + return pskb_may_pull(skb, skb_transport_offset(skb) + + sizeof(struct sctphdr)); +} + static bool icmphdr_ok(struct sk_buff *skb) { return pskb_may_pull(skb, skb_transport_offset(skb) + @@ -121,12 +324,7 @@ u64 ovs_flow_used_time(unsigned long flow_jiffies) return cur_ms - idle_ms; } -#define SW_FLOW_KEY_OFFSET(field) \ - (offsetof(struct sw_flow_key, field) + \ - FIELD_SIZEOF(struct sw_flow_key, field)) - -static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key, - int *key_lenp) +static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key) { unsigned int nh_ofs = skb_network_offset(skb); unsigned int nh_len; @@ -136,8 +334,6 @@ static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key, __be16 frag_off; int err; - *key_lenp = SW_FLOW_KEY_OFFSET(ipv6.label); - err = check_header(skb, nh_ofs + sizeof(*nh)); if (unlikely(err)) return err; @@ -176,6 +372,22 @@ static bool icmp6hdr_ok(struct sk_buff *skb) sizeof(struct icmp6hdr)); } +void ovs_flow_key_mask(struct sw_flow_key *dst, const struct sw_flow_key *src, + const struct sw_flow_mask *mask) +{ + const long *m = (long *)((u8 *)&mask->key + mask->range.start); + const long *s = (long *)((u8 *)src + mask->range.start); + long *d = (long *)((u8 *)dst + mask->range.start); + int i; + + /* The memory outside of the 'mask->range' are not set since + * further operations on 'dst' only uses contents within + * 'mask->range'. 
+ */ + for (i = 0; i < range_n_bytes(&mask->range); i += sizeof(long)) + *d++ = *s++ & *m++; +} + #define TCP_FLAGS_OFFSET 13 #define TCP_FLAG_MASK 0x3f @@ -224,6 +436,7 @@ struct sw_flow *ovs_flow_alloc(void) spin_lock_init(&flow->lock); flow->sf_acts = NULL; + flow->mask = NULL; return flow; } @@ -263,7 +476,7 @@ static void free_buckets(struct flex_array *buckets) flex_array_free(buckets); } -struct flow_table *ovs_flow_tbl_alloc(int new_size) +static struct flow_table *__flow_tbl_alloc(int new_size) { struct flow_table *table = kmalloc(sizeof(*table), GFP_KERNEL); @@ -281,17 +494,15 @@ struct flow_table *ovs_flow_tbl_alloc(int new_size) table->node_ver = 0; table->keep_flows = false; get_random_bytes(&table->hash_seed, sizeof(u32)); + table->mask_list = NULL; return table; } -void ovs_flow_tbl_destroy(struct flow_table *table) +static void __flow_tbl_destroy(struct flow_table *table) { int i; - if (!table) - return; - if (table->keep_flows) goto skip_flows; @@ -302,32 +513,56 @@ void ovs_flow_tbl_destroy(struct flow_table *table) int ver = table->node_ver; hlist_for_each_entry_safe(flow, n, head, hash_node[ver]) { - hlist_del_rcu(&flow->hash_node[ver]); - ovs_flow_free(flow); + hlist_del(&flow->hash_node[ver]); + ovs_flow_free(flow, false); } } + BUG_ON(!list_empty(table->mask_list)); + kfree(table->mask_list); + skip_flows: free_buckets(table->buckets); kfree(table); } +struct flow_table *ovs_flow_tbl_alloc(int new_size) +{ + struct flow_table *table = __flow_tbl_alloc(new_size); + + if (!table) + return NULL; + + table->mask_list = kmalloc(sizeof(struct list_head), GFP_KERNEL); + if (!table->mask_list) { + table->keep_flows = true; + __flow_tbl_destroy(table); + return NULL; + } + INIT_LIST_HEAD(table->mask_list); + + return table; +} + static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu) { struct flow_table *table = container_of(rcu, struct flow_table, rcu); - ovs_flow_tbl_destroy(table); + __flow_tbl_destroy(table); } -void ovs_flow_tbl_deferred_destroy(struct flow_table *table) +void ovs_flow_tbl_destroy(struct flow_table *table, bool deferred) { if (!table) return; - call_rcu(&table->rcu, flow_tbl_destroy_rcu_cb); + if (deferred) + call_rcu(&table->rcu, flow_tbl_destroy_rcu_cb); + else + __flow_tbl_destroy(table); } -struct sw_flow *ovs_flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *last) +struct sw_flow *ovs_flow_dump_next(struct flow_table *table, u32 *bucket, u32 *last) { struct sw_flow *flow; struct hlist_head *head; @@ -353,11 +588,13 @@ struct sw_flow *ovs_flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *la return NULL; } -static void __flow_tbl_insert(struct flow_table *table, struct sw_flow *flow) +static void __tbl_insert(struct flow_table *table, struct sw_flow *flow) { struct hlist_head *head; + head = find_bucket(table, flow->hash); hlist_add_head_rcu(&flow->hash_node[table->node_ver], head); + table->count++; } @@ -377,8 +614,10 @@ static void flow_table_copy_flows(struct flow_table *old, struct flow_table *new head = flex_array_get(old->buckets, i); hlist_for_each_entry(flow, head, hash_node[old_ver]) - __flow_tbl_insert(new, flow); + __tbl_insert(new, flow); } + + new->mask_list = old->mask_list; old->keep_flows = true; } @@ -386,7 +625,7 @@ static struct flow_table *__flow_tbl_rehash(struct flow_table *table, int n_buck { struct flow_table *new_table; - new_table = ovs_flow_tbl_alloc(n_buckets); + new_table = __flow_tbl_alloc(n_buckets); if (!new_table) return ERR_PTR(-ENOMEM); @@ -405,28 +644,30 @@ struct flow_table 
*ovs_flow_tbl_expand(struct flow_table *table) return __flow_tbl_rehash(table, table->n_buckets * 2); } -void ovs_flow_free(struct sw_flow *flow) +static void __flow_free(struct sw_flow *flow) { - if (unlikely(!flow)) - return; - kfree((struct sf_flow_acts __force *)flow->sf_acts); kmem_cache_free(flow_cache, flow); } -/* RCU callback used by ovs_flow_deferred_free. */ static void rcu_free_flow_callback(struct rcu_head *rcu) { struct sw_flow *flow = container_of(rcu, struct sw_flow, rcu); - ovs_flow_free(flow); + __flow_free(flow); } -/* Schedules 'flow' to be freed after the next RCU grace period. - * The caller must hold rcu_read_lock for this to be sensible. */ -void ovs_flow_deferred_free(struct sw_flow *flow) +void ovs_flow_free(struct sw_flow *flow, bool deferred) { - call_rcu(&flow->rcu, rcu_free_flow_callback); + if (!flow) + return; + + ovs_sw_flow_mask_del_ref(flow->mask, deferred); + + if (deferred) + call_rcu(&flow->rcu, rcu_free_flow_callback); + else + __flow_free(flow); } /* Schedules 'sf_acts' to be freed after the next RCU grace period. @@ -497,18 +738,15 @@ static __be16 parse_ethertype(struct sk_buff *skb) } static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key, - int *key_lenp, int nh_len) + int nh_len) { struct icmp6hdr *icmp = icmp6_hdr(skb); - int error = 0; - int key_len; /* The ICMPv6 type and code fields use the 16-bit transport port * fields, so we need to store them in 16-bit network byte order. */ key->ipv6.tp.src = htons(icmp->icmp6_type); key->ipv6.tp.dst = htons(icmp->icmp6_code); - key_len = SW_FLOW_KEY_OFFSET(ipv6.tp); if (icmp->icmp6_code == 0 && (icmp->icmp6_type == NDISC_NEIGHBOUR_SOLICITATION || @@ -517,21 +755,17 @@ static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key, struct nd_msg *nd; int offset; - key_len = SW_FLOW_KEY_OFFSET(ipv6.nd); - /* In order to process neighbor discovery options, we need the * entire packet. */ if (unlikely(icmp_len < sizeof(*nd))) - goto out; - if (unlikely(skb_linearize(skb))) { - error = -ENOMEM; - goto out; - } + return 0; + + if (unlikely(skb_linearize(skb))) + return -ENOMEM; nd = (struct nd_msg *)skb_transport_header(skb); key->ipv6.nd.target = nd->target; - key_len = SW_FLOW_KEY_OFFSET(ipv6.nd); icmp_len -= sizeof(*nd); offset = 0; @@ -541,7 +775,7 @@ static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key, int opt_len = nd_opt->nd_opt_len * 8; if (unlikely(!opt_len || opt_len > icmp_len)) - goto invalid; + return 0; /* Store the link layer address if the appropriate * option is provided. It is considered an error if @@ -566,16 +800,14 @@ static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key, } } - goto out; + return 0; invalid: memset(&key->ipv6.nd.target, 0, sizeof(key->ipv6.nd.target)); memset(key->ipv6.nd.sll, 0, sizeof(key->ipv6.nd.sll)); memset(key->ipv6.nd.tll, 0, sizeof(key->ipv6.nd.tll)); -out: - *key_lenp = key_len; - return error; + return 0; } /** @@ -584,7 +816,6 @@ out: * Ethernet header * @in_port: port number on which @skb was received. * @key: output flow key - * @key_lenp: length of output flow key * * The caller must ensure that skb->len >= ETH_HLEN. * @@ -602,11 +833,9 @@ out: * of a correct length, otherwise the same as skb->network_header. * For other key->eth.type values it is left untouched. 
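+ * + * A typical receive-path caller looks roughly like this (sketch only; + * 'p' is the receiving vport and 'dp' its datapath): + * + *	struct sw_flow_key key; + *	struct sw_flow *flow; + *	int error = ovs_flow_extract(skb, p->port_no, &key); + *	if (!error) + *		flow = ovs_flow_lookup(rcu_dereference(dp->table), &key);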
*/ -int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key, - int *key_lenp) +int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key) { - int error = 0; - int key_len = SW_FLOW_KEY_OFFSET(eth); + int error; struct ethhdr *eth; memset(key, 0, sizeof(*key)); @@ -649,15 +878,13 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key, struct iphdr *nh; __be16 offset; - key_len = SW_FLOW_KEY_OFFSET(ipv4.addr); - error = check_iphdr(skb); if (unlikely(error)) { if (error == -EINVAL) { skb->transport_header = skb->network_header; error = 0; } - goto out; + return error; } nh = ip_hdr(skb); @@ -671,7 +898,7 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key, offset = nh->frag_off & htons(IP_OFFSET); if (offset) { key->ip.frag = OVS_FRAG_TYPE_LATER; - goto out; + return 0; } if (nh->frag_off & htons(IP_MF) || skb_shinfo(skb)->gso_type & SKB_GSO_UDP) @@ -679,21 +906,24 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key, /* Transport layer. */ if (key->ip.proto == IPPROTO_TCP) { - key_len = SW_FLOW_KEY_OFFSET(ipv4.tp); if (tcphdr_ok(skb)) { struct tcphdr *tcp = tcp_hdr(skb); key->ipv4.tp.src = tcp->source; key->ipv4.tp.dst = tcp->dest; } } else if (key->ip.proto == IPPROTO_UDP) { - key_len = SW_FLOW_KEY_OFFSET(ipv4.tp); if (udphdr_ok(skb)) { struct udphdr *udp = udp_hdr(skb); key->ipv4.tp.src = udp->source; key->ipv4.tp.dst = udp->dest; } + } else if (key->ip.proto == IPPROTO_SCTP) { + if (sctphdr_ok(skb)) { + struct sctphdr *sctp = sctp_hdr(skb); + key->ipv4.tp.src = sctp->source; + key->ipv4.tp.dst = sctp->dest; + } } else if (key->ip.proto == IPPROTO_ICMP) { - key_len = SW_FLOW_KEY_OFFSET(ipv4.tp); if (icmphdr_ok(skb)) { struct icmphdr *icmp = icmp_hdr(skb); /* The ICMP type and code fields use the 16-bit @@ -722,102 +952,175 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key, memcpy(&key->ipv4.addr.dst, arp->ar_tip, sizeof(key->ipv4.addr.dst)); memcpy(key->ipv4.arp.sha, arp->ar_sha, ETH_ALEN); memcpy(key->ipv4.arp.tha, arp->ar_tha, ETH_ALEN); - key_len = SW_FLOW_KEY_OFFSET(ipv4.arp); } } else if (key->eth.type == htons(ETH_P_IPV6)) { int nh_len; /* IPv6 Header + Extensions */ - nh_len = parse_ipv6hdr(skb, key, &key_len); + nh_len = parse_ipv6hdr(skb, key); if (unlikely(nh_len < 0)) { - if (nh_len == -EINVAL) + if (nh_len == -EINVAL) { skb->transport_header = skb->network_header; - else + error = 0; + } else { error = nh_len; - goto out; + } + return error; } if (key->ip.frag == OVS_FRAG_TYPE_LATER) - goto out; + return 0; if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP) key->ip.frag = OVS_FRAG_TYPE_FIRST; /* Transport layer. 
*/ if (key->ip.proto == NEXTHDR_TCP) { - key_len = SW_FLOW_KEY_OFFSET(ipv6.tp); if (tcphdr_ok(skb)) { struct tcphdr *tcp = tcp_hdr(skb); key->ipv6.tp.src = tcp->source; key->ipv6.tp.dst = tcp->dest; } } else if (key->ip.proto == NEXTHDR_UDP) { - key_len = SW_FLOW_KEY_OFFSET(ipv6.tp); if (udphdr_ok(skb)) { struct udphdr *udp = udp_hdr(skb); key->ipv6.tp.src = udp->source; key->ipv6.tp.dst = udp->dest; } + } else if (key->ip.proto == NEXTHDR_SCTP) { + if (sctphdr_ok(skb)) { + struct sctphdr *sctp = sctp_hdr(skb); + key->ipv6.tp.src = sctp->source; + key->ipv6.tp.dst = sctp->dest; + } } else if (key->ip.proto == NEXTHDR_ICMP) { - key_len = SW_FLOW_KEY_OFFSET(ipv6.tp); if (icmp6hdr_ok(skb)) { - error = parse_icmpv6(skb, key, &key_len, nh_len); - if (error < 0) - goto out; + error = parse_icmpv6(skb, key, nh_len); + if (error) + return error; } } } -out: - *key_lenp = key_len; - return error; + return 0; } -static u32 ovs_flow_hash(const struct sw_flow_key *key, int key_start, int key_len) +static u32 ovs_flow_hash(const struct sw_flow_key *key, int key_start, + int key_end) { - return jhash2((u32 *)((u8 *)key + key_start), - DIV_ROUND_UP(key_len - key_start, sizeof(u32)), 0); + u32 *hash_key = (u32 *)((u8 *)key + key_start); + int hash_u32s = (key_end - key_start) >> 2; + + /* Make sure the number of hash bytes is a multiple of sizeof(u32). */ + BUILD_BUG_ON(sizeof(long) % sizeof(u32)); + + return jhash2(hash_key, hash_u32s, 0); } -static int flow_key_start(struct sw_flow_key *key) +static int flow_key_start(const struct sw_flow_key *key) { if (key->tun_key.ipv4_dst) return 0; else - return offsetof(struct sw_flow_key, phy); + return rounddown(offsetof(struct sw_flow_key, phy), + sizeof(long)); +} + +static bool __cmp_key(const struct sw_flow_key *key1, + const struct sw_flow_key *key2, int key_start, int key_end) +{ + const long *cp1 = (long *)((u8 *)key1 + key_start); + const long *cp2 = (long *)((u8 *)key2 + key_start); + long diffs = 0; + int i; + + for (i = key_start; i < key_end; i += sizeof(long)) + diffs |= *cp1++ ^ *cp2++; + + return diffs == 0; } -struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *table, - struct sw_flow_key *key, int key_len) +static bool __flow_cmp_masked_key(const struct sw_flow *flow, + const struct sw_flow_key *key, int key_start, int key_end) +{ + return __cmp_key(&flow->key, key, key_start, key_end); +} + +static bool __flow_cmp_unmasked_key(const struct sw_flow *flow, + const struct sw_flow_key *key, int key_start, int key_end) +{ + return __cmp_key(&flow->unmasked_key, key, key_start, key_end); +} + +bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow, + const struct sw_flow_key *key, int key_end) +{ + int key_start; + key_start = flow_key_start(key); + + return __flow_cmp_unmasked_key(flow, key, key_start, key_end); + +} + +struct sw_flow *ovs_flow_lookup_unmasked_key(struct flow_table *table, + struct sw_flow_match *match) +{ + struct sw_flow_key *unmasked = match->key; + int key_end = match->range.end; + struct sw_flow *flow; + + flow = ovs_flow_lookup(table, unmasked); + if (flow && (!ovs_flow_cmp_unmasked_key(flow, unmasked, key_end))) + flow = NULL; + + return flow; +} + +static struct sw_flow *ovs_masked_flow_lookup(struct flow_table *table, + const struct sw_flow_key *unmasked, + struct sw_flow_mask *mask) { struct sw_flow *flow; struct hlist_head *head; - u8 *_key; - int key_start; + int key_start = mask->range.start; + int key_end = mask->range.end; u32 hash; + struct sw_flow_key masked_key; - key_start = flow_key_start(key); - hash = 
ovs_flow_hash(key, key_start, key_len); - - _key = (u8 *) key + key_start; + ovs_flow_key_mask(&masked_key, unmasked, mask); + hash = ovs_flow_hash(&masked_key, key_start, key_end); head = find_bucket(table, hash); hlist_for_each_entry_rcu(flow, head, hash_node[table->node_ver]) { - - if (flow->hash == hash && - !memcmp((u8 *)&flow->key + key_start, _key, key_len - key_start)) { + if (flow->mask == mask && + __flow_cmp_masked_key(flow, &masked_key, + key_start, key_end)) return flow; - } } return NULL; } -void ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow, - struct sw_flow_key *key, int key_len) +struct sw_flow *ovs_flow_lookup(struct flow_table *tbl, + const struct sw_flow_key *key) +{ + struct sw_flow *flow = NULL; + struct sw_flow_mask *mask; + + list_for_each_entry_rcu(mask, tbl->mask_list, list) { + flow = ovs_masked_flow_lookup(tbl, key, mask); + if (flow) /* Found */ + break; + } + + return flow; +} + + +void ovs_flow_insert(struct flow_table *table, struct sw_flow *flow) { - flow->hash = ovs_flow_hash(key, flow_key_start(key), key_len); - memcpy(&flow->key, key, sizeof(flow->key)); - __flow_tbl_insert(table, flow); + flow->hash = ovs_flow_hash(&flow->key, flow->mask->range.start, + flow->mask->range.end); + __tbl_insert(table, flow); } -void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow) +void ovs_flow_remove(struct flow_table *table, struct sw_flow *flow) { BUG_ON(table->count == 0); hlist_del_rcu(&flow->hash_node[table->node_ver]); @@ -837,6 +1140,7 @@ const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { [OVS_KEY_ATTR_IPV6] = sizeof(struct ovs_key_ipv6), [OVS_KEY_ATTR_TCP] = sizeof(struct ovs_key_tcp), [OVS_KEY_ATTR_UDP] = sizeof(struct ovs_key_udp), + [OVS_KEY_ATTR_SCTP] = sizeof(struct ovs_key_sctp), [OVS_KEY_ATTR_ICMP] = sizeof(struct ovs_key_icmp), [OVS_KEY_ATTR_ICMPV6] = sizeof(struct ovs_key_icmpv6), [OVS_KEY_ATTR_ARP] = sizeof(struct ovs_key_arp), @@ -844,149 +1148,84 @@ const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { [OVS_KEY_ATTR_TUNNEL] = -1, }; -static int ipv4_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_len, - const struct nlattr *a[], u32 *attrs) +static bool is_all_zero(const u8 *fp, size_t size) { - const struct ovs_key_icmp *icmp_key; - const struct ovs_key_tcp *tcp_key; - const struct ovs_key_udp *udp_key; - - switch (swkey->ip.proto) { - case IPPROTO_TCP: - if (!(*attrs & (1 << OVS_KEY_ATTR_TCP))) - return -EINVAL; - *attrs &= ~(1 << OVS_KEY_ATTR_TCP); - - *key_len = SW_FLOW_KEY_OFFSET(ipv4.tp); - tcp_key = nla_data(a[OVS_KEY_ATTR_TCP]); - swkey->ipv4.tp.src = tcp_key->tcp_src; - swkey->ipv4.tp.dst = tcp_key->tcp_dst; - break; - - case IPPROTO_UDP: - if (!(*attrs & (1 << OVS_KEY_ATTR_UDP))) - return -EINVAL; - *attrs &= ~(1 << OVS_KEY_ATTR_UDP); - - *key_len = SW_FLOW_KEY_OFFSET(ipv4.tp); - udp_key = nla_data(a[OVS_KEY_ATTR_UDP]); - swkey->ipv4.tp.src = udp_key->udp_src; - swkey->ipv4.tp.dst = udp_key->udp_dst; - break; - - case IPPROTO_ICMP: - if (!(*attrs & (1 << OVS_KEY_ATTR_ICMP))) - return -EINVAL; - *attrs &= ~(1 << OVS_KEY_ATTR_ICMP); - - *key_len = SW_FLOW_KEY_OFFSET(ipv4.tp); - icmp_key = nla_data(a[OVS_KEY_ATTR_ICMP]); - swkey->ipv4.tp.src = htons(icmp_key->icmp_type); - swkey->ipv4.tp.dst = htons(icmp_key->icmp_code); - break; - } - - return 0; -} - -static int ipv6_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_len, - const struct nlattr *a[], u32 *attrs) -{ - const struct ovs_key_icmpv6 *icmpv6_key; - const struct ovs_key_tcp *tcp_key; - const struct ovs_key_udp *udp_key; - - switch 
(swkey->ip.proto) { - case IPPROTO_TCP: - if (!(*attrs & (1 << OVS_KEY_ATTR_TCP))) - return -EINVAL; - *attrs &= ~(1 << OVS_KEY_ATTR_TCP); - - *key_len = SW_FLOW_KEY_OFFSET(ipv6.tp); - tcp_key = nla_data(a[OVS_KEY_ATTR_TCP]); - swkey->ipv6.tp.src = tcp_key->tcp_src; - swkey->ipv6.tp.dst = tcp_key->tcp_dst; - break; - - case IPPROTO_UDP: - if (!(*attrs & (1 << OVS_KEY_ATTR_UDP))) - return -EINVAL; - *attrs &= ~(1 << OVS_KEY_ATTR_UDP); - - *key_len = SW_FLOW_KEY_OFFSET(ipv6.tp); - udp_key = nla_data(a[OVS_KEY_ATTR_UDP]); - swkey->ipv6.tp.src = udp_key->udp_src; - swkey->ipv6.tp.dst = udp_key->udp_dst; - break; - - case IPPROTO_ICMPV6: - if (!(*attrs & (1 << OVS_KEY_ATTR_ICMPV6))) - return -EINVAL; - *attrs &= ~(1 << OVS_KEY_ATTR_ICMPV6); - - *key_len = SW_FLOW_KEY_OFFSET(ipv6.tp); - icmpv6_key = nla_data(a[OVS_KEY_ATTR_ICMPV6]); - swkey->ipv6.tp.src = htons(icmpv6_key->icmpv6_type); - swkey->ipv6.tp.dst = htons(icmpv6_key->icmpv6_code); + int i - if (swkey->ipv6.tp.src == htons(NDISC_NEIGHBOUR_SOLICITATION) || - swkey->ipv6.tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) { - const struct ovs_key_nd *nd_key; + if (!fp) + return false; - if (!(*attrs & (1 << OVS_KEY_ATTR_ND))) - return -EINVAL; - *attrs &= ~(1 << OVS_KEY_ATTR_ND); - - *key_len = SW_FLOW_KEY_OFFSET(ipv6.nd); - nd_key = nla_data(a[OVS_KEY_ATTR_ND]); - memcpy(&swkey->ipv6.nd.target, nd_key->nd_target, - sizeof(swkey->ipv6.nd.target)); - memcpy(swkey->ipv6.nd.sll, nd_key->nd_sll, ETH_ALEN); - memcpy(swkey->ipv6.nd.tll, nd_key->nd_tll, ETH_ALEN); - } - break; - } + for (i = 0; i < size; i++) + if (fp[i]) + return false; - return 0; + return true; } -static int parse_flow_nlattrs(const struct nlattr *attr, - const struct nlattr *a[], u32 *attrsp) +static int __parse_flow_nlattrs(const struct nlattr *attr, + const struct nlattr *a[], + u64 *attrsp, bool nz) { const struct nlattr *nla; u32 attrs; int rem; - attrs = 0; + attrs = *attrsp; nla_for_each_nested(nla, attr, rem) { u16 type = nla_type(nla); int expected_len; - if (type > OVS_KEY_ATTR_MAX || attrs & (1 << type)) + if (type > OVS_KEY_ATTR_MAX) { + OVS_NLERR("Unknown key attribute (type=%d, max=%d).\n", + type, OVS_KEY_ATTR_MAX); + return -EINVAL; + } + + if (attrs & (1 << type)) { + OVS_NLERR("Duplicate key attribute (type %d).\n", type); return -EINVAL; + } expected_len = ovs_key_lens[type]; - if (nla_len(nla) != expected_len && expected_len != -1) + if (nla_len(nla) != expected_len && expected_len != -1) { + OVS_NLERR("Key attribute has unexpected length (type=%d" + ", length=%d, expected=%d).\n", type, + nla_len(nla), expected_len); return -EINVAL; + } - attrs |= 1 << type; - a[type] = nla; + if (!nz || !is_all_zero(nla_data(nla), expected_len)) { + attrs |= 1 << type; + a[type] = nla; + } } - if (rem) + if (rem) { + OVS_NLERR("Message has %d unknown bytes.\n", rem); return -EINVAL; + } *attrsp = attrs; return 0; } +static int parse_flow_mask_nlattrs(const struct nlattr *attr, + const struct nlattr *a[], u64 *attrsp) +{ + return __parse_flow_nlattrs(attr, a, attrsp, true); +} + +static int parse_flow_nlattrs(const struct nlattr *attr, + const struct nlattr *a[], u64 *attrsp) +{ + return __parse_flow_nlattrs(attr, a, attrsp, false); +} + int ovs_ipv4_tun_from_nlattr(const struct nlattr *attr, - struct ovs_key_ipv4_tunnel *tun_key) + struct sw_flow_match *match, bool is_mask) { struct nlattr *a; int rem; bool ttl = false; - - memset(tun_key, 0, sizeof(*tun_key)); + __be16 tun_flags = 0; nla_for_each_nested(a, attr, rem) { int type = nla_type(a); @@ -1000,53 +1239,78 @@ int 
ovs_ipv4_tun_from_nlattr(const struct nlattr *attr, [OVS_TUNNEL_KEY_ATTR_CSUM] = 0, }; - if (type > OVS_TUNNEL_KEY_ATTR_MAX || - ovs_tunnel_key_lens[type] != nla_len(a)) + if (type > OVS_TUNNEL_KEY_ATTR_MAX) { + OVS_NLERR("Unknown IPv4 tunnel attribute (type=%d, max=%d).\n", + type, OVS_TUNNEL_KEY_ATTR_MAX); return -EINVAL; + } + + if (ovs_tunnel_key_lens[type] != nla_len(a)) { + OVS_NLERR("IPv4 tunnel attribute type has unexpected " + " length (type=%d, length=%d, expected=%d).\n", + type, nla_len(a), ovs_tunnel_key_lens[type]); + return -EINVAL; + } switch (type) { case OVS_TUNNEL_KEY_ATTR_ID: - tun_key->tun_id = nla_get_be64(a); - tun_key->tun_flags |= TUNNEL_KEY; + SW_FLOW_KEY_PUT(match, tun_key.tun_id, + nla_get_be64(a), is_mask); + tun_flags |= TUNNEL_KEY; break; case OVS_TUNNEL_KEY_ATTR_IPV4_SRC: - tun_key->ipv4_src = nla_get_be32(a); + SW_FLOW_KEY_PUT(match, tun_key.ipv4_src, + nla_get_be32(a), is_mask); break; case OVS_TUNNEL_KEY_ATTR_IPV4_DST: - tun_key->ipv4_dst = nla_get_be32(a); + SW_FLOW_KEY_PUT(match, tun_key.ipv4_dst, + nla_get_be32(a), is_mask); break; case OVS_TUNNEL_KEY_ATTR_TOS: - tun_key->ipv4_tos = nla_get_u8(a); + SW_FLOW_KEY_PUT(match, tun_key.ipv4_tos, + nla_get_u8(a), is_mask); break; case OVS_TUNNEL_KEY_ATTR_TTL: - tun_key->ipv4_ttl = nla_get_u8(a); + SW_FLOW_KEY_PUT(match, tun_key.ipv4_ttl, + nla_get_u8(a), is_mask); ttl = true; break; case OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT: - tun_key->tun_flags |= TUNNEL_DONT_FRAGMENT; + tun_flags |= TUNNEL_DONT_FRAGMENT; break; case OVS_TUNNEL_KEY_ATTR_CSUM: - tun_key->tun_flags |= TUNNEL_CSUM; + tun_flags |= TUNNEL_CSUM; break; default: return -EINVAL; - } } - if (rem > 0) - return -EINVAL; - if (!tun_key->ipv4_dst) - return -EINVAL; + SW_FLOW_KEY_PUT(match, tun_key.tun_flags, tun_flags, is_mask); - if (!ttl) + if (rem > 0) { + OVS_NLERR("IPv4 tunnel attribute has %d unknown bytes.\n", rem); return -EINVAL; + } + + if (!is_mask) { + if (!match->key->tun_key.ipv4_dst) { + OVS_NLERR("IPv4 tunnel destination address is zero.\n"); + return -EINVAL; + } + + if (!ttl) { + OVS_NLERR("IPv4 tunnel TTL not specified.\n"); + return -EINVAL; + } + } return 0; } int ovs_ipv4_tun_to_nlattr(struct sk_buff *skb, - const struct ovs_key_ipv4_tunnel *tun_key) + const struct ovs_key_ipv4_tunnel *tun_key, + const struct ovs_key_ipv4_tunnel *output) { struct nlattr *nla; @@ -1054,23 +1318,24 @@ int ovs_ipv4_tun_to_nlattr(struct sk_buff *skb, if (!nla) return -EMSGSIZE; - if (tun_key->tun_flags & TUNNEL_KEY && - nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, tun_key->tun_id)) + if (output->tun_flags & TUNNEL_KEY && + nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id)) return -EMSGSIZE; - if (tun_key->ipv4_src && - nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, tun_key->ipv4_src)) + if (output->ipv4_src && + nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, output->ipv4_src)) return -EMSGSIZE; - if (nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, tun_key->ipv4_dst)) + if (output->ipv4_dst && + nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, output->ipv4_dst)) return -EMSGSIZE; - if (tun_key->ipv4_tos && - nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, tun_key->ipv4_tos)) + if (output->ipv4_tos && + nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->ipv4_tos)) return -EMSGSIZE; - if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, tun_key->ipv4_ttl)) + if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, output->ipv4_ttl)) return -EMSGSIZE; - if ((tun_key->tun_flags & TUNNEL_DONT_FRAGMENT) && + if ((output->tun_flags & TUNNEL_DONT_FRAGMENT) && nla_put_flag(skb, 
OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT)) return -EMSGSIZE; - if ((tun_key->tun_flags & TUNNEL_CSUM) && + if ((output->tun_flags & TUNNEL_CSUM) && nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_CSUM)) return -EMSGSIZE; @@ -1078,176 +1343,390 @@ int ovs_ipv4_tun_to_nlattr(struct sk_buff *skb, return 0; } -/** - * ovs_flow_from_nlattrs - parses Netlink attributes into a flow key. - * @swkey: receives the extracted flow key. - * @key_lenp: number of bytes used in @swkey. - * @attr: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute - * sequence. - */ -int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp, - const struct nlattr *attr) +static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs, + const struct nlattr **a, bool is_mask) { - const struct nlattr *a[OVS_KEY_ATTR_MAX + 1]; - const struct ovs_key_ethernet *eth_key; - int key_len; - u32 attrs; - int err; + if (*attrs & (1 << OVS_KEY_ATTR_PRIORITY)) { + SW_FLOW_KEY_PUT(match, phy.priority, + nla_get_u32(a[OVS_KEY_ATTR_PRIORITY]), is_mask); + *attrs &= ~(1 << OVS_KEY_ATTR_PRIORITY); + } - memset(swkey, 0, sizeof(struct sw_flow_key)); - key_len = SW_FLOW_KEY_OFFSET(eth); + if (*attrs & (1 << OVS_KEY_ATTR_IN_PORT)) { + u32 in_port = nla_get_u32(a[OVS_KEY_ATTR_IN_PORT]); - err = parse_flow_nlattrs(attr, a, &attrs); - if (err) - return err; + if (is_mask) + in_port = 0xffffffff; /* Always exact match in_port. */ + else if (in_port >= DP_MAX_PORTS) + return -EINVAL; - /* Metadata attributes. */ - if (attrs & (1 << OVS_KEY_ATTR_PRIORITY)) { - swkey->phy.priority = nla_get_u32(a[OVS_KEY_ATTR_PRIORITY]); - attrs &= ~(1 << OVS_KEY_ATTR_PRIORITY); + SW_FLOW_KEY_PUT(match, phy.in_port, in_port, is_mask); + *attrs &= ~(1 << OVS_KEY_ATTR_IN_PORT); + } else if (!is_mask) { + SW_FLOW_KEY_PUT(match, phy.in_port, DP_MAX_PORTS, is_mask); } - if (attrs & (1 << OVS_KEY_ATTR_IN_PORT)) { - u32 in_port = nla_get_u32(a[OVS_KEY_ATTR_IN_PORT]); - if (in_port >= DP_MAX_PORTS) - return -EINVAL; - swkey->phy.in_port = in_port; - attrs &= ~(1 << OVS_KEY_ATTR_IN_PORT); - } else { - swkey->phy.in_port = DP_MAX_PORTS; + + if (*attrs & (1 << OVS_KEY_ATTR_SKB_MARK)) { + uint32_t mark = nla_get_u32(a[OVS_KEY_ATTR_SKB_MARK]); + + SW_FLOW_KEY_PUT(match, phy.skb_mark, mark, is_mask); + *attrs &= ~(1 << OVS_KEY_ATTR_SKB_MARK); } - if (attrs & (1 << OVS_KEY_ATTR_SKB_MARK)) { - swkey->phy.skb_mark = nla_get_u32(a[OVS_KEY_ATTR_SKB_MARK]); - attrs &= ~(1 << OVS_KEY_ATTR_SKB_MARK); + if (*attrs & (1 << OVS_KEY_ATTR_TUNNEL)) { + if (ovs_ipv4_tun_from_nlattr(a[OVS_KEY_ATTR_TUNNEL], match, + is_mask)) + return -EINVAL; + *attrs &= ~(1 << OVS_KEY_ATTR_TUNNEL); } + return 0; +} - if (attrs & (1 << OVS_KEY_ATTR_TUNNEL)) { - err = ovs_ipv4_tun_from_nlattr(a[OVS_KEY_ATTR_TUNNEL], &swkey->tun_key); - if (err) - return err; +static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, + const struct nlattr **a, bool is_mask) +{ + int err; + u64 orig_attrs = attrs; - attrs &= ~(1 << OVS_KEY_ATTR_TUNNEL); - } + err = metadata_from_nlattrs(match, &attrs, a, is_mask); + if (err) + return err; - /* Data attributes. 
*/ - if (!(attrs & (1 << OVS_KEY_ATTR_ETHERNET))) - return -EINVAL; - attrs &= ~(1 << OVS_KEY_ATTR_ETHERNET); + if (attrs & (1 << OVS_KEY_ATTR_ETHERNET)) { + const struct ovs_key_ethernet *eth_key; - eth_key = nla_data(a[OVS_KEY_ATTR_ETHERNET]); - memcpy(swkey->eth.src, eth_key->eth_src, ETH_ALEN); - memcpy(swkey->eth.dst, eth_key->eth_dst, ETH_ALEN); + eth_key = nla_data(a[OVS_KEY_ATTR_ETHERNET]); + SW_FLOW_KEY_MEMCPY(match, eth.src, + eth_key->eth_src, ETH_ALEN, is_mask); + SW_FLOW_KEY_MEMCPY(match, eth.dst, + eth_key->eth_dst, ETH_ALEN, is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_ETHERNET); + } - if (attrs & (1u << OVS_KEY_ATTR_ETHERTYPE) && - nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]) == htons(ETH_P_8021Q)) { - const struct nlattr *encap; + if (attrs & (1 << OVS_KEY_ATTR_VLAN)) { __be16 tci; - if (attrs != ((1 << OVS_KEY_ATTR_VLAN) | - (1 << OVS_KEY_ATTR_ETHERTYPE) | - (1 << OVS_KEY_ATTR_ENCAP))) - return -EINVAL; - - encap = a[OVS_KEY_ATTR_ENCAP]; tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); - if (tci & htons(VLAN_TAG_PRESENT)) { - swkey->eth.tci = tci; - - err = parse_flow_nlattrs(encap, a, &attrs); - if (err) - return err; - } else if (!tci) { - /* Corner case for truncated 802.1Q header. */ - if (nla_len(encap)) - return -EINVAL; + if (!(tci & htons(VLAN_TAG_PRESENT))) { + if (is_mask) + OVS_NLERR("VLAN TCI mask does not have exact match for VLAN_TAG_PRESENT bit.\n"); + else + OVS_NLERR("VLAN TCI does not have VLAN_TAG_PRESENT bit set.\n"); - swkey->eth.type = htons(ETH_P_8021Q); - *key_lenp = key_len; - return 0; - } else { return -EINVAL; } - } + + SW_FLOW_KEY_PUT(match, eth.tci, tci, is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_VLAN); + } else if (!is_mask) + SW_FLOW_KEY_PUT(match, eth.tci, htons(0xffff), true); if (attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) { - swkey->eth.type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]); - if (ntohs(swkey->eth.type) < ETH_P_802_3_MIN) + __be16 eth_type; + + eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]); + if (is_mask) { + /* Always exact match EtherType. 
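+ * Flow parsing depends on the EtherType, so a wildcarded EtherType + * would let a single flow match packets whose keys were built by + * different parsers; any mask supplied here is therefore forced to + * 0xffff.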
*/ + eth_type = htons(0xffff); + } else if (ntohs(eth_type) < ETH_P_802_3_MIN) { + OVS_NLERR("EtherType is less than minimum (type=%x, min=%x).\n", + ntohs(eth_type), ETH_P_802_3_MIN); return -EINVAL; + } + + SW_FLOW_KEY_PUT(match, eth.type, eth_type, is_mask); attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); - } else { - swkey->eth.type = htons(ETH_P_802_2); + } else if (!is_mask) { + SW_FLOW_KEY_PUT(match, eth.type, htons(ETH_P_802_2), is_mask); } - if (swkey->eth.type == htons(ETH_P_IP)) { + if (attrs & (1 << OVS_KEY_ATTR_IPV4)) { const struct ovs_key_ipv4 *ipv4_key; - if (!(attrs & (1 << OVS_KEY_ATTR_IPV4))) - return -EINVAL; - attrs &= ~(1 << OVS_KEY_ATTR_IPV4); - - key_len = SW_FLOW_KEY_OFFSET(ipv4.addr); ipv4_key = nla_data(a[OVS_KEY_ATTR_IPV4]); - if (ipv4_key->ipv4_frag > OVS_FRAG_TYPE_MAX) + if (!is_mask && ipv4_key->ipv4_frag > OVS_FRAG_TYPE_MAX) { + OVS_NLERR("Unknown IPv4 fragment type (value=%d, max=%d).\n", + ipv4_key->ipv4_frag, OVS_FRAG_TYPE_MAX); return -EINVAL; - swkey->ip.proto = ipv4_key->ipv4_proto; - swkey->ip.tos = ipv4_key->ipv4_tos; - swkey->ip.ttl = ipv4_key->ipv4_ttl; - swkey->ip.frag = ipv4_key->ipv4_frag; - swkey->ipv4.addr.src = ipv4_key->ipv4_src; - swkey->ipv4.addr.dst = ipv4_key->ipv4_dst; - - if (swkey->ip.frag != OVS_FRAG_TYPE_LATER) { - err = ipv4_flow_from_nlattrs(swkey, &key_len, a, &attrs); - if (err) - return err; } - } else if (swkey->eth.type == htons(ETH_P_IPV6)) { - const struct ovs_key_ipv6 *ipv6_key; + SW_FLOW_KEY_PUT(match, ip.proto, + ipv4_key->ipv4_proto, is_mask); + SW_FLOW_KEY_PUT(match, ip.tos, + ipv4_key->ipv4_tos, is_mask); + SW_FLOW_KEY_PUT(match, ip.ttl, + ipv4_key->ipv4_ttl, is_mask); + SW_FLOW_KEY_PUT(match, ip.frag, + ipv4_key->ipv4_frag, is_mask); + SW_FLOW_KEY_PUT(match, ipv4.addr.src, + ipv4_key->ipv4_src, is_mask); + SW_FLOW_KEY_PUT(match, ipv4.addr.dst, + ipv4_key->ipv4_dst, is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_IPV4); + } - if (!(attrs & (1 << OVS_KEY_ATTR_IPV6))) - return -EINVAL; - attrs &= ~(1 << OVS_KEY_ATTR_IPV6); + if (attrs & (1 << OVS_KEY_ATTR_IPV6)) { + const struct ovs_key_ipv6 *ipv6_key; - key_len = SW_FLOW_KEY_OFFSET(ipv6.label); ipv6_key = nla_data(a[OVS_KEY_ATTR_IPV6]); - if (ipv6_key->ipv6_frag > OVS_FRAG_TYPE_MAX) + if (!is_mask && ipv6_key->ipv6_frag > OVS_FRAG_TYPE_MAX) { + OVS_NLERR("Unknown IPv6 fragment type (value=%d, max=%d).\n", + ipv6_key->ipv6_frag, OVS_FRAG_TYPE_MAX); return -EINVAL; - swkey->ipv6.label = ipv6_key->ipv6_label; - swkey->ip.proto = ipv6_key->ipv6_proto; - swkey->ip.tos = ipv6_key->ipv6_tclass; - swkey->ip.ttl = ipv6_key->ipv6_hlimit; - swkey->ip.frag = ipv6_key->ipv6_frag; - memcpy(&swkey->ipv6.addr.src, ipv6_key->ipv6_src, - sizeof(swkey->ipv6.addr.src)); - memcpy(&swkey->ipv6.addr.dst, ipv6_key->ipv6_dst, - sizeof(swkey->ipv6.addr.dst)); - - if (swkey->ip.frag != OVS_FRAG_TYPE_LATER) { - err = ipv6_flow_from_nlattrs(swkey, &key_len, a, &attrs); - if (err) - return err; } - } else if (swkey->eth.type == htons(ETH_P_ARP) || - swkey->eth.type == htons(ETH_P_RARP)) { + SW_FLOW_KEY_PUT(match, ipv6.label, + ipv6_key->ipv6_label, is_mask); + SW_FLOW_KEY_PUT(match, ip.proto, + ipv6_key->ipv6_proto, is_mask); + SW_FLOW_KEY_PUT(match, ip.tos, + ipv6_key->ipv6_tclass, is_mask); + SW_FLOW_KEY_PUT(match, ip.ttl, + ipv6_key->ipv6_hlimit, is_mask); + SW_FLOW_KEY_PUT(match, ip.frag, + ipv6_key->ipv6_frag, is_mask); + SW_FLOW_KEY_MEMCPY(match, ipv6.addr.src, + ipv6_key->ipv6_src, + sizeof(match->key->ipv6.addr.src), + is_mask); + SW_FLOW_KEY_MEMCPY(match, ipv6.addr.dst, + ipv6_key->ipv6_dst, + 
sizeof(match->key->ipv6.addr.dst), + is_mask); + + attrs &= ~(1 << OVS_KEY_ATTR_IPV6); + } + + if (attrs & (1 << OVS_KEY_ATTR_ARP)) { const struct ovs_key_arp *arp_key; - if (!(attrs & (1 << OVS_KEY_ATTR_ARP))) + arp_key = nla_data(a[OVS_KEY_ATTR_ARP]); + if (!is_mask && (arp_key->arp_op & htons(0xff00))) { + OVS_NLERR("Unknown ARP opcode (opcode=%d).\n", + arp_key->arp_op); return -EINVAL; + } + + SW_FLOW_KEY_PUT(match, ipv4.addr.src, + arp_key->arp_sip, is_mask); + SW_FLOW_KEY_PUT(match, ipv4.addr.dst, + arp_key->arp_tip, is_mask); + SW_FLOW_KEY_PUT(match, ip.proto, + ntohs(arp_key->arp_op), is_mask); + SW_FLOW_KEY_MEMCPY(match, ipv4.arp.sha, + arp_key->arp_sha, ETH_ALEN, is_mask); + SW_FLOW_KEY_MEMCPY(match, ipv4.arp.tha, + arp_key->arp_tha, ETH_ALEN, is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_ARP); + } - key_len = SW_FLOW_KEY_OFFSET(ipv4.arp); - arp_key = nla_data(a[OVS_KEY_ATTR_ARP]); - swkey->ipv4.addr.src = arp_key->arp_sip; - swkey->ipv4.addr.dst = arp_key->arp_tip; - if (arp_key->arp_op & htons(0xff00)) + if (attrs & (1 << OVS_KEY_ATTR_TCP)) { + const struct ovs_key_tcp *tcp_key; + + tcp_key = nla_data(a[OVS_KEY_ATTR_TCP]); + if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) { + SW_FLOW_KEY_PUT(match, ipv4.tp.src, + tcp_key->tcp_src, is_mask); + SW_FLOW_KEY_PUT(match, ipv4.tp.dst, + tcp_key->tcp_dst, is_mask); + } else { + SW_FLOW_KEY_PUT(match, ipv6.tp.src, + tcp_key->tcp_src, is_mask); + SW_FLOW_KEY_PUT(match, ipv6.tp.dst, + tcp_key->tcp_dst, is_mask); + } + attrs &= ~(1 << OVS_KEY_ATTR_TCP); + } + + if (attrs & (1 << OVS_KEY_ATTR_UDP)) { + const struct ovs_key_udp *udp_key; + + udp_key = nla_data(a[OVS_KEY_ATTR_UDP]); + if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) { + SW_FLOW_KEY_PUT(match, ipv4.tp.src, + udp_key->udp_src, is_mask); + SW_FLOW_KEY_PUT(match, ipv4.tp.dst, + udp_key->udp_dst, is_mask); + } else { + SW_FLOW_KEY_PUT(match, ipv6.tp.src, + udp_key->udp_src, is_mask); + SW_FLOW_KEY_PUT(match, ipv6.tp.dst, + udp_key->udp_dst, is_mask); + } + attrs &= ~(1 << OVS_KEY_ATTR_UDP); + } + + if (attrs & (1 << OVS_KEY_ATTR_SCTP)) { + const struct ovs_key_sctp *sctp_key; + + sctp_key = nla_data(a[OVS_KEY_ATTR_SCTP]); + if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) { + SW_FLOW_KEY_PUT(match, ipv4.tp.src, + sctp_key->sctp_src, is_mask); + SW_FLOW_KEY_PUT(match, ipv4.tp.dst, + sctp_key->sctp_dst, is_mask); + } else { + SW_FLOW_KEY_PUT(match, ipv6.tp.src, + sctp_key->sctp_src, is_mask); + SW_FLOW_KEY_PUT(match, ipv6.tp.dst, + sctp_key->sctp_dst, is_mask); + } + attrs &= ~(1 << OVS_KEY_ATTR_SCTP); + } + + if (attrs & (1 << OVS_KEY_ATTR_ICMP)) { + const struct ovs_key_icmp *icmp_key; + + icmp_key = nla_data(a[OVS_KEY_ATTR_ICMP]); + SW_FLOW_KEY_PUT(match, ipv4.tp.src, + htons(icmp_key->icmp_type), is_mask); + SW_FLOW_KEY_PUT(match, ipv4.tp.dst, + htons(icmp_key->icmp_code), is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_ICMP); + } + + if (attrs & (1 << OVS_KEY_ATTR_ICMPV6)) { + const struct ovs_key_icmpv6 *icmpv6_key; + + icmpv6_key = nla_data(a[OVS_KEY_ATTR_ICMPV6]); + SW_FLOW_KEY_PUT(match, ipv6.tp.src, + htons(icmpv6_key->icmpv6_type), is_mask); + SW_FLOW_KEY_PUT(match, ipv6.tp.dst, + htons(icmpv6_key->icmpv6_code), is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_ICMPV6); + } + + if (attrs & (1 << OVS_KEY_ATTR_ND)) { + const struct ovs_key_nd *nd_key; + + nd_key = nla_data(a[OVS_KEY_ATTR_ND]); + SW_FLOW_KEY_MEMCPY(match, ipv6.nd.target, + nd_key->nd_target, + sizeof(match->key->ipv6.nd.target), + is_mask); + SW_FLOW_KEY_MEMCPY(match, ipv6.nd.sll, + nd_key->nd_sll, ETH_ALEN, is_mask); + 
SW_FLOW_KEY_MEMCPY(match, ipv6.nd.tll, + nd_key->nd_tll, ETH_ALEN, is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_ND); + } + + if (attrs != 0) + return -EINVAL; + + return 0; +} + +/** + * ovs_match_from_nlattrs - parses Netlink attributes into a flow key and + * mask. In case the 'mask' is NULL, the flow is treated as an exact match + * flow. Otherwise, it is treated as a wildcarded flow, except when the mask + * does not include any don't-care bits. + * @match: receives the extracted flow match information. + * @key: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute + * sequence. The fields should be those of the packet that triggered the + * creation of this flow. + * @mask: Optional. Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink + * attribute sequence that specifies the mask field of the wildcarded flow. + */ +int ovs_match_from_nlattrs(struct sw_flow_match *match, + const struct nlattr *key, + const struct nlattr *mask) +{ + const struct nlattr *a[OVS_KEY_ATTR_MAX + 1]; + const struct nlattr *encap; + u64 key_attrs = 0; + u64 mask_attrs = 0; + bool encap_valid = false; + int err; + + err = parse_flow_nlattrs(key, a, &key_attrs); + if (err) + return err; + + if ((key_attrs & (1 << OVS_KEY_ATTR_ETHERNET)) && + (key_attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) && + (nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]) == htons(ETH_P_8021Q))) { + __be16 tci; + + if (!((key_attrs & (1 << OVS_KEY_ATTR_VLAN)) && + (key_attrs & (1 << OVS_KEY_ATTR_ENCAP)))) { + OVS_NLERR("Invalid VLAN frame.\n"); return -EINVAL; - swkey->ip.proto = ntohs(arp_key->arp_op); - memcpy(swkey->ipv4.arp.sha, arp_key->arp_sha, ETH_ALEN); - memcpy(swkey->ipv4.arp.tha, arp_key->arp_tha, ETH_ALEN); + } + + key_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); + tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); + encap = a[OVS_KEY_ATTR_ENCAP]; + key_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP); + encap_valid = true; + + if (tci & htons(VLAN_TAG_PRESENT)) { + err = parse_flow_nlattrs(encap, a, &key_attrs); + if (err) + return err; + } else if (!tci) { + /* Corner case for truncated 802.1Q header. */ + if (nla_len(encap)) { + OVS_NLERR("Truncated 802.1Q header has non-zero encap attribute.\n"); + return -EINVAL; + } + } else { + OVS_NLERR("Encap attribute is set for a non-VLAN frame.\n"); + return -EINVAL; + } + } + + err = ovs_key_from_nlattrs(match, key_attrs, a, false); + if (err) + return err; + + if (mask) { + err = parse_flow_mask_nlattrs(mask, a, &mask_attrs); + if (err) + return err; + + if (mask_attrs & 1ULL << OVS_KEY_ATTR_ENCAP) { + __be16 eth_type = 0; + __be16 tci = 0; + + if (!encap_valid) { + OVS_NLERR("Encap mask attribute is set for non-VLAN frame.\n"); + return -EINVAL; + } + + mask_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP); + if (a[OVS_KEY_ATTR_ETHERTYPE]) + eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]); + + if (eth_type == htons(0xffff)) { + mask_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); + encap = a[OVS_KEY_ATTR_ENCAP]; + err = parse_flow_mask_nlattrs(encap, a, &mask_attrs); + } else { + OVS_NLERR("VLAN frames must have an exact match on the TPID (mask=%x).\n", + ntohs(eth_type)); + return -EINVAL; + } + + if (a[OVS_KEY_ATTR_VLAN]) + tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); + + if (!(tci & htons(VLAN_TAG_PRESENT))) { + OVS_NLERR("VLAN tag present bit must have an exact match (tci_mask=%x).\n", ntohs(tci)); + return -EINVAL; + } + } + + err = ovs_key_from_nlattrs(match, mask_attrs, a, true); + if (err) + return err; + } else { + /* Populate exact match flow's key mask. 
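+ * Every byte inside the key's range is set to 0xff below, so a flow + * installed without a mask attribute behaves as a megaflow that + * wildcards nothing.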
*/ + if (match->mask) + ovs_sw_flow_mask_set(match->mask, &match->range, 0xff); } - if (attrs) + if (!ovs_match_validate(match, key_attrs, mask_attrs)) return -EINVAL; - *key_lenp = key_len; return 0; } @@ -1255,7 +1734,6 @@ int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp, /** * ovs_flow_metadata_from_nlattrs - parses Netlink attributes into a flow key. * @flow: Receives extracted in_port, priority, tun_key and skb_mark. - * @key_len: Length of key in @flow. Used for calculating flow hash. * @attr: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute * sequence. * @@ -1264,102 +1742,100 @@ int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp, * get the metadata, that is, the parts of the flow key that cannot be * extracted from the packet itself. */ -int ovs_flow_metadata_from_nlattrs(struct sw_flow *flow, int key_len, - const struct nlattr *attr) + +int ovs_flow_metadata_from_nlattrs(struct sw_flow *flow, + const struct nlattr *attr) { struct ovs_key_ipv4_tunnel *tun_key = &flow->key.tun_key; - const struct nlattr *nla; - int rem; + const struct nlattr *a[OVS_KEY_ATTR_MAX + 1]; + u64 attrs = 0; + int err; + struct sw_flow_match match; flow->key.phy.in_port = DP_MAX_PORTS; flow->key.phy.priority = 0; flow->key.phy.skb_mark = 0; memset(tun_key, 0, sizeof(flow->key.tun_key)); - nla_for_each_nested(nla, attr, rem) { - int type = nla_type(nla); - - if (type <= OVS_KEY_ATTR_MAX && ovs_key_lens[type] > 0) { - int err; - - if (nla_len(nla) != ovs_key_lens[type]) - return -EINVAL; - - switch (type) { - case OVS_KEY_ATTR_PRIORITY: - flow->key.phy.priority = nla_get_u32(nla); - break; - - case OVS_KEY_ATTR_TUNNEL: - err = ovs_ipv4_tun_from_nlattr(nla, tun_key); - if (err) - return err; - break; - - case OVS_KEY_ATTR_IN_PORT: - if (nla_get_u32(nla) >= DP_MAX_PORTS) - return -EINVAL; - flow->key.phy.in_port = nla_get_u32(nla); - break; - - case OVS_KEY_ATTR_SKB_MARK: - flow->key.phy.skb_mark = nla_get_u32(nla); - break; - } - } - } - if (rem) + err = parse_flow_nlattrs(attr, a, &attrs); + if (err) return -EINVAL; - flow->hash = ovs_flow_hash(&flow->key, - flow_key_start(&flow->key), key_len); + memset(&match, 0, sizeof(match)); + match.key = &flow->key; + + err = metadata_from_nlattrs(&match, &attrs, a, false); + if (err) + return err; return 0; } -int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb) +int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, + const struct sw_flow_key *output, struct sk_buff *skb) { struct ovs_key_ethernet *eth_key; struct nlattr *nla, *encap; + bool is_mask = (swkey != output); - if (swkey->phy.priority && - nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, swkey->phy.priority)) + if (nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, output->phy.priority)) goto nla_put_failure; - if (swkey->tun_key.ipv4_dst && - ovs_ipv4_tun_to_nlattr(skb, &swkey->tun_key)) + if ((swkey->tun_key.ipv4_dst || is_mask) && + ovs_ipv4_tun_to_nlattr(skb, &swkey->tun_key, &output->tun_key)) goto nla_put_failure; - if (swkey->phy.in_port != DP_MAX_PORTS && - nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, swkey->phy.in_port)) - goto nla_put_failure; + if (swkey->phy.in_port == DP_MAX_PORTS) { + if (is_mask && (output->phy.in_port == 0xffff)) + if (nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, 0xffffffff)) + goto nla_put_failure; + } else { + u16 upper_u16; + upper_u16 = !is_mask ? 
0 : 0xffff; - if (swkey->phy.skb_mark && - nla_put_u32(skb, OVS_KEY_ATTR_SKB_MARK, swkey->phy.skb_mark)) + if (nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, + (upper_u16 << 16) | output->phy.in_port)) + goto nla_put_failure; + } + + if (nla_put_u32(skb, OVS_KEY_ATTR_SKB_MARK, output->phy.skb_mark)) goto nla_put_failure; nla = nla_reserve(skb, OVS_KEY_ATTR_ETHERNET, sizeof(*eth_key)); if (!nla) goto nla_put_failure; + eth_key = nla_data(nla); - memcpy(eth_key->eth_src, swkey->eth.src, ETH_ALEN); - memcpy(eth_key->eth_dst, swkey->eth.dst, ETH_ALEN); + memcpy(eth_key->eth_src, output->eth.src, ETH_ALEN); + memcpy(eth_key->eth_dst, output->eth.dst, ETH_ALEN); if (swkey->eth.tci || swkey->eth.type == htons(ETH_P_8021Q)) { - if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, htons(ETH_P_8021Q)) || - nla_put_be16(skb, OVS_KEY_ATTR_VLAN, swkey->eth.tci)) + __be16 eth_type; + eth_type = !is_mask ? htons(ETH_P_8021Q) : htons(0xffff); + if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, eth_type) || + nla_put_be16(skb, OVS_KEY_ATTR_VLAN, output->eth.tci)) goto nla_put_failure; encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP); if (!swkey->eth.tci) goto unencap; - } else { + } else encap = NULL; - } - if (swkey->eth.type == htons(ETH_P_802_2)) + if (swkey->eth.type == htons(ETH_P_802_2)) { + /* + * Ethertype 802.2 is represented in the netlink with omitted + * OVS_KEY_ATTR_ETHERTYPE in the flow key attribute, and + * 0xffff in the mask attribute. Ethertype can also + * be wildcarded. + */ + if (is_mask && output->eth.type) + if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, + output->eth.type)) + goto nla_put_failure; goto unencap; + } - if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, swkey->eth.type)) + if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, output->eth.type)) goto nla_put_failure; if (swkey->eth.type == htons(ETH_P_IP)) { @@ -1369,12 +1845,12 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb) if (!nla) goto nla_put_failure; ipv4_key = nla_data(nla); - ipv4_key->ipv4_src = swkey->ipv4.addr.src; - ipv4_key->ipv4_dst = swkey->ipv4.addr.dst; - ipv4_key->ipv4_proto = swkey->ip.proto; - ipv4_key->ipv4_tos = swkey->ip.tos; - ipv4_key->ipv4_ttl = swkey->ip.ttl; - ipv4_key->ipv4_frag = swkey->ip.frag; + ipv4_key->ipv4_src = output->ipv4.addr.src; + ipv4_key->ipv4_dst = output->ipv4.addr.dst; + ipv4_key->ipv4_proto = output->ip.proto; + ipv4_key->ipv4_tos = output->ip.tos; + ipv4_key->ipv4_ttl = output->ip.ttl; + ipv4_key->ipv4_frag = output->ip.frag; } else if (swkey->eth.type == htons(ETH_P_IPV6)) { struct ovs_key_ipv6 *ipv6_key; @@ -1382,15 +1858,15 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb) if (!nla) goto nla_put_failure; ipv6_key = nla_data(nla); - memcpy(ipv6_key->ipv6_src, &swkey->ipv6.addr.src, + memcpy(ipv6_key->ipv6_src, &output->ipv6.addr.src, sizeof(ipv6_key->ipv6_src)); - memcpy(ipv6_key->ipv6_dst, &swkey->ipv6.addr.dst, + memcpy(ipv6_key->ipv6_dst, &output->ipv6.addr.dst, sizeof(ipv6_key->ipv6_dst)); - ipv6_key->ipv6_label = swkey->ipv6.label; - ipv6_key->ipv6_proto = swkey->ip.proto; - ipv6_key->ipv6_tclass = swkey->ip.tos; - ipv6_key->ipv6_hlimit = swkey->ip.ttl; - ipv6_key->ipv6_frag = swkey->ip.frag; + ipv6_key->ipv6_label = output->ipv6.label; + ipv6_key->ipv6_proto = output->ip.proto; + ipv6_key->ipv6_tclass = output->ip.tos; + ipv6_key->ipv6_hlimit = output->ip.ttl; + ipv6_key->ipv6_frag = output->ip.frag; } else if (swkey->eth.type == htons(ETH_P_ARP) || swkey->eth.type == htons(ETH_P_RARP)) { struct ovs_key_arp *arp_key; @@ -1400,11 
+1876,11 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb) goto nla_put_failure; arp_key = nla_data(nla); memset(arp_key, 0, sizeof(struct ovs_key_arp)); - arp_key->arp_sip = swkey->ipv4.addr.src; - arp_key->arp_tip = swkey->ipv4.addr.dst; - arp_key->arp_op = htons(swkey->ip.proto); - memcpy(arp_key->arp_sha, swkey->ipv4.arp.sha, ETH_ALEN); - memcpy(arp_key->arp_tha, swkey->ipv4.arp.tha, ETH_ALEN); + arp_key->arp_sip = output->ipv4.addr.src; + arp_key->arp_tip = output->ipv4.addr.dst; + arp_key->arp_op = htons(output->ip.proto); + memcpy(arp_key->arp_sha, output->ipv4.arp.sha, ETH_ALEN); + memcpy(arp_key->arp_tha, output->ipv4.arp.tha, ETH_ALEN); } if ((swkey->eth.type == htons(ETH_P_IP) || @@ -1419,11 +1895,11 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb) goto nla_put_failure; tcp_key = nla_data(nla); if (swkey->eth.type == htons(ETH_P_IP)) { - tcp_key->tcp_src = swkey->ipv4.tp.src; - tcp_key->tcp_dst = swkey->ipv4.tp.dst; + tcp_key->tcp_src = output->ipv4.tp.src; + tcp_key->tcp_dst = output->ipv4.tp.dst; } else if (swkey->eth.type == htons(ETH_P_IPV6)) { - tcp_key->tcp_src = swkey->ipv6.tp.src; - tcp_key->tcp_dst = swkey->ipv6.tp.dst; + tcp_key->tcp_src = output->ipv6.tp.src; + tcp_key->tcp_dst = output->ipv6.tp.dst; } } else if (swkey->ip.proto == IPPROTO_UDP) { struct ovs_key_udp *udp_key; @@ -1433,11 +1909,25 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb) goto nla_put_failure; udp_key = nla_data(nla); if (swkey->eth.type == htons(ETH_P_IP)) { - udp_key->udp_src = swkey->ipv4.tp.src; - udp_key->udp_dst = swkey->ipv4.tp.dst; + udp_key->udp_src = output->ipv4.tp.src; + udp_key->udp_dst = output->ipv4.tp.dst; + } else if (swkey->eth.type == htons(ETH_P_IPV6)) { + udp_key->udp_src = output->ipv6.tp.src; + udp_key->udp_dst = output->ipv6.tp.dst; + } + } else if (swkey->ip.proto == IPPROTO_SCTP) { + struct ovs_key_sctp *sctp_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_SCTP, sizeof(*sctp_key)); + if (!nla) + goto nla_put_failure; + sctp_key = nla_data(nla); + if (swkey->eth.type == htons(ETH_P_IP)) { + sctp_key->sctp_src = output->ipv4.tp.src; + sctp_key->sctp_dst = output->ipv4.tp.dst; } else if (swkey->eth.type == htons(ETH_P_IPV6)) { - udp_key->udp_src = swkey->ipv6.tp.src; - udp_key->udp_dst = swkey->ipv6.tp.dst; + sctp_key->sctp_src = output->ipv6.tp.src; + sctp_key->sctp_dst = output->ipv6.tp.dst; } } else if (swkey->eth.type == htons(ETH_P_IP) && swkey->ip.proto == IPPROTO_ICMP) { @@ -1447,8 +1937,8 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb) if (!nla) goto nla_put_failure; icmp_key = nla_data(nla); - icmp_key->icmp_type = ntohs(swkey->ipv4.tp.src); - icmp_key->icmp_code = ntohs(swkey->ipv4.tp.dst); + icmp_key->icmp_type = ntohs(output->ipv4.tp.src); + icmp_key->icmp_code = ntohs(output->ipv4.tp.dst); } else if (swkey->eth.type == htons(ETH_P_IPV6) && swkey->ip.proto == IPPROTO_ICMPV6) { struct ovs_key_icmpv6 *icmpv6_key; @@ -1458,8 +1948,8 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb) if (!nla) goto nla_put_failure; icmpv6_key = nla_data(nla); - icmpv6_key->icmpv6_type = ntohs(swkey->ipv6.tp.src); - icmpv6_key->icmpv6_code = ntohs(swkey->ipv6.tp.dst); + icmpv6_key->icmpv6_type = ntohs(output->ipv6.tp.src); + icmpv6_key->icmpv6_code = ntohs(output->ipv6.tp.dst); if (icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_SOLICITATION || icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_ADVERTISEMENT) { @@ -1469,10 +1959,10 @@ int 
ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb) if (!nla) goto nla_put_failure; nd_key = nla_data(nla); - memcpy(nd_key->nd_target, &swkey->ipv6.nd.target, + memcpy(nd_key->nd_target, &output->ipv6.nd.target, sizeof(nd_key->nd_target)); - memcpy(nd_key->nd_sll, swkey->ipv6.nd.sll, ETH_ALEN); - memcpy(nd_key->nd_tll, swkey->ipv6.nd.tll, ETH_ALEN); + memcpy(nd_key->nd_sll, output->ipv6.nd.sll, ETH_ALEN); + memcpy(nd_key->nd_tll, output->ipv6.nd.tll, ETH_ALEN); } } } @@ -1491,6 +1981,8 @@ nla_put_failure: * Returns zero if successful or a negative error code. */ int ovs_flow_init(void) { + BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long)); + flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow), 0, 0, NULL); if (flow_cache == NULL) @@ -1504,3 +1996,84 @@ void ovs_flow_exit(void) { kmem_cache_destroy(flow_cache); } + +struct sw_flow_mask *ovs_sw_flow_mask_alloc(void) +{ + struct sw_flow_mask *mask; + + mask = kmalloc(sizeof(*mask), GFP_KERNEL); + if (mask) + mask->ref_count = 0; + + return mask; +} + +void ovs_sw_flow_mask_add_ref(struct sw_flow_mask *mask) +{ + mask->ref_count++; +} + +void ovs_sw_flow_mask_del_ref(struct sw_flow_mask *mask, bool deferred) +{ + if (!mask) + return; + + BUG_ON(!mask->ref_count); + mask->ref_count--; + + if (!mask->ref_count) { + list_del_rcu(&mask->list); + if (deferred) + kfree_rcu(mask, rcu); + else + kfree(mask); + } +} + +static bool ovs_sw_flow_mask_equal(const struct sw_flow_mask *a, + const struct sw_flow_mask *b) +{ + u8 *a_ = (u8 *)&a->key + a->range.start; + u8 *b_ = (u8 *)&b->key + b->range.start; + + return (a->range.end == b->range.end) + && (a->range.start == b->range.start) + && (memcmp(a_, b_, range_n_bytes(&a->range)) == 0); +} + +struct sw_flow_mask *ovs_sw_flow_mask_find(const struct flow_table *tbl, + const struct sw_flow_mask *mask) +{ + struct list_head *ml; + + list_for_each(ml, tbl->mask_list) { + struct sw_flow_mask *m; + m = container_of(ml, struct sw_flow_mask, list); + if (ovs_sw_flow_mask_equal(mask, m)) + return m; + } + + return NULL; +} + +/** + * add a new mask into the mask list. + * The caller needs to make sure that 'mask' is not the same + * as any masks that are already on the list. + */ +void ovs_sw_flow_mask_insert(struct flow_table *tbl, struct sw_flow_mask *mask) +{ + list_add_rcu(&mask->list, tbl->mask_list); +} + +/** + * Set 'range' fields in the mask to the value of 'val'. + */ +static void ovs_sw_flow_mask_set(struct sw_flow_mask *mask, + struct sw_flow_key_range *range, u8 val) +{ + u8 *m = (u8 *)&mask->key + range->start; + + mask->range = *range; + memset(m, val, range_n_bytes(range)); +} diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 66ef722..b65f885 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2011 Nicira, Inc. + * Copyright (c) 2007-2013 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -33,6 +33,8 @@ #include <net/inet_ecn.h> struct sk_buff; +struct sw_flow_mask; +struct flow_table; struct sw_flow_actions { struct rcu_head rcu; @@ -97,8 +99,8 @@ struct sw_flow_key { } addr; union { struct { - __be16 src; /* TCP/UDP source port. */ - __be16 dst; /* TCP/UDP destination port. */ + __be16 src; /* TCP/UDP/SCTP source port. */ + __be16 dst; /* TCP/UDP/SCTP destination port. */ } tp; struct { u8 sha[ETH_ALEN]; /* ARP source hardware address. 
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index 66ef722..b65f885 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007-2011 Nicira, Inc.
+ * Copyright (c) 2007-2013 Nicira, Inc.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of version 2 of the GNU General Public
@@ -33,6 +33,8 @@
 #include <net/inet_ecn.h>
 
 struct sk_buff;
+struct sw_flow_mask;
+struct flow_table;
 
 struct sw_flow_actions {
     struct rcu_head rcu;
@@ -97,8 +99,8 @@ struct sw_flow_key {
             } addr;
             union {
                 struct {
-                    __be16 src;    /* TCP/UDP source port. */
-                    __be16 dst;    /* TCP/UDP destination port. */
+                    __be16 src;    /* TCP/UDP/SCTP source port. */
+                    __be16 dst;    /* TCP/UDP/SCTP destination port. */
                 } tp;
                 struct {
                     u8 sha[ETH_ALEN];    /* ARP source hardware address. */
@@ -113,8 +115,8 @@ struct sw_flow_key {
             } addr;
             __be32 label;    /* IPv6 flow label. */
             struct {
-                __be16 src;    /* TCP/UDP source port. */
-                __be16 dst;    /* TCP/UDP destination port. */
+                __be16 src;    /* TCP/UDP/SCTP source port. */
+                __be16 dst;    /* TCP/UDP/SCTP destination port. */
             } tp;
             struct {
                 struct in6_addr target;    /* ND target address. */
@@ -123,7 +125,7 @@ struct sw_flow_key {
             } nd;
         } ipv6;
     };
-};
+} __aligned(__alignof__(long));
 
 struct sw_flow {
     struct rcu_head rcu;
@@ -131,6 +133,8 @@ struct sw_flow {
     u32 hash;
 
     struct sw_flow_key key;
+    struct sw_flow_key unmasked_key;
+    struct sw_flow_mask *mask;
     struct sw_flow_actions __rcu *sf_acts;
 
     spinlock_t lock;    /* Lock for values below. */
@@ -140,6 +144,20 @@ struct sw_flow {
     u8 tcp_flags;    /* Union of seen TCP flags. */
 };
 
+struct sw_flow_key_range {
+    size_t start;
+    size_t end;
+};
+
+struct sw_flow_match {
+    struct sw_flow_key *key;
+    struct sw_flow_key_range range;
+    struct sw_flow_mask *mask;
+};
+
+void ovs_match_init(struct sw_flow_match *match,
+                    struct sw_flow_key *key, struct sw_flow_mask *mask);
+
 struct arp_eth_header {
     __be16 ar_hrd;    /* format of hardware address */
     __be16 ar_pro;    /* format of protocol address */
@@ -159,21 +177,21 @@ void ovs_flow_exit(void);
 
 struct sw_flow *ovs_flow_alloc(void);
 void ovs_flow_deferred_free(struct sw_flow *);
-void ovs_flow_free(struct sw_flow *flow);
+void ovs_flow_free(struct sw_flow *, bool deferred);
 
 struct sw_flow_actions *ovs_flow_actions_alloc(int actions_len);
 void ovs_flow_deferred_free_acts(struct sw_flow_actions *);
 
-int ovs_flow_extract(struct sk_buff *, u16 in_port, struct sw_flow_key *,
-                     int *key_lenp);
+int ovs_flow_extract(struct sk_buff *, u16 in_port, struct sw_flow_key *);
 void ovs_flow_used(struct sw_flow *, struct sk_buff *);
 u64 ovs_flow_used_time(unsigned long flow_jiffies);
-
-int ovs_flow_to_nlattrs(const struct sw_flow_key *, struct sk_buff *);
-int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp,
+int ovs_flow_to_nlattrs(const struct sw_flow_key *,
+                        const struct sw_flow_key *, struct sk_buff *);
+int ovs_match_from_nlattrs(struct sw_flow_match *match,
+                           const struct nlattr *,
                            const struct nlattr *);
-int ovs_flow_metadata_from_nlattrs(struct sw_flow *flow, int key_len,
-                                   const struct nlattr *attr);
+int ovs_flow_metadata_from_nlattrs(struct sw_flow *flow,
+                                   const struct nlattr *attr);
 
 #define MAX_ACTIONS_BUFSIZE    (32 * 1024)
 #define TBL_MIN_BUCKETS        1024
@@ -182,6 +200,7 @@ struct flow_table {
     struct flex_array *buckets;
     unsigned int count, n_buckets;
     struct rcu_head rcu;
+    struct list_head *mask_list;
     int node_ver;
     u32 hash_seed;
     bool keep_flows;
@@ -197,22 +216,44 @@ static inline int ovs_flow_tbl_need_to_expand(struct flow_table *table)
     return (table->count > table->n_buckets);
 }
 
-struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *table,
-                                    struct sw_flow_key *key, int len);
-void ovs_flow_tbl_destroy(struct flow_table *table);
-void ovs_flow_tbl_deferred_destroy(struct flow_table *table);
+struct sw_flow *ovs_flow_lookup(struct flow_table *,
+                                const struct sw_flow_key *);
+struct sw_flow *ovs_flow_lookup_unmasked_key(struct flow_table *table,
+                                             struct sw_flow_match *match);
+
+void ovs_flow_tbl_destroy(struct flow_table *table, bool deferred);
 struct flow_table *ovs_flow_tbl_alloc(int new_size);
 struct flow_table *ovs_flow_tbl_expand(struct flow_table *table);
 struct flow_table *ovs_flow_tbl_rehash(struct flow_table *table);
-void ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow,
-                         struct sw_flow_key *key, int key_len);
-void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow);
-struct sw_flow *ovs_flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *idx);
+void ovs_flow_insert(struct flow_table *table, struct sw_flow *flow);
+void ovs_flow_remove(struct flow_table *table, struct sw_flow *flow);
+
+struct sw_flow *ovs_flow_dump_next(struct flow_table *table, u32 *bucket, u32 *idx);
 extern const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1];
 int ovs_ipv4_tun_from_nlattr(const struct nlattr *attr,
-                             struct ovs_key_ipv4_tunnel *tun_key);
+                             struct sw_flow_match *match, bool is_mask);
 int ovs_ipv4_tun_to_nlattr(struct sk_buff *skb,
-                           const struct ovs_key_ipv4_tunnel *tun_key);
+                           const struct ovs_key_ipv4_tunnel *tun_key,
+                           const struct ovs_key_ipv4_tunnel *output);
+
+bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow,
+                               const struct sw_flow_key *key, int key_end);
+
+struct sw_flow_mask {
+    int ref_count;
+    struct rcu_head rcu;
+    struct list_head list;
+    struct sw_flow_key_range range;
+    struct sw_flow_key key;
+};
+struct sw_flow_mask *ovs_sw_flow_mask_alloc(void);
+void ovs_sw_flow_mask_add_ref(struct sw_flow_mask *);
+void ovs_sw_flow_mask_del_ref(struct sw_flow_mask *, bool deferred);
+void ovs_sw_flow_mask_insert(struct flow_table *, struct sw_flow_mask *);
+struct sw_flow_mask *ovs_sw_flow_mask_find(const struct flow_table *,
+                                           const struct sw_flow_mask *);
+void ovs_flow_key_mask(struct sw_flow_key *dst, const struct sw_flow_key *src,
+                       const struct sw_flow_mask *mask);
 #endif /* flow.h */
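Taken together, the new flow.h declarations describe how masks are meant to be shared: a flow first looks for an existing identical mask with ovs_sw_flow_mask_find(), inserts a fresh one with ovs_sw_flow_mask_insert() otherwise, and bumps ref_count via ovs_sw_flow_mask_add_ref() either way. A compact userspace model of that find-or-share pattern (a singly linked list stands in for the kernel's RCU list, and all names here are stand-ins, not kernel code):

    #include <stdio.h>
    #include <stdlib.h>

    /* Stand-in for struct sw_flow_mask and the per-table mask list. */
    struct demo_mask {
        int ref_count;
        unsigned long bits;        /* toy stand-in for the masked key range */
        struct demo_mask *next;
    };

    /* Like ovs_sw_flow_mask_find(): return an existing equal mask, if any. */
    static struct demo_mask *mask_find(struct demo_mask *head, unsigned long bits)
    {
        for (; head; head = head->next)
            if (head->bits == bits)
                return head;
        return NULL;
    }

    /* Find-or-create; every flow sharing a mask just bumps ref_count. */
    static struct demo_mask *mask_get(struct demo_mask **head, unsigned long bits)
    {
        struct demo_mask *m = mask_find(*head, bits);

        if (!m) {
            m = calloc(1, sizeof(*m));
            if (!m)
                return NULL;
            m->bits = bits;
            m->next = *head;
            *head = m;            /* like ovs_sw_flow_mask_insert() */
        }
        m->ref_count++;           /* like ovs_sw_flow_mask_add_ref() */
        return m;
    }

    int main(void)
    {
        struct demo_mask *list = NULL;
        struct demo_mask *a = mask_get(&list, 0xffffff00UL);
        struct demo_mask *b = mask_get(&list, 0xffffff00UL);

        printf("shared=%d refs=%d\n", a == b, a->ref_count); /* shared=1 refs=2 */
        return 0;
    }

Sharing keeps the per-table mask list short, which matters because a masked lookup probes the table once per mask on the list.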
diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c
index 493e977..21d5073 100644
--- a/net/openvswitch/vport-gre.c
+++ b/net/openvswitch/vport-gre.c
@@ -16,7 +16,6 @@
  *  02110-1301, USA
  */
 
-#ifdef CONFIG_OPENVSWITCH_GRE
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/if.h>
@@ -271,5 +270,3 @@ const struct vport_ops ovs_gre_vport_ops = {
     .get_name    = gre_get_name,
     .send        = gre_tnl_send,
 };
-
-#endif /* OPENVSWITCH_GRE */
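The vport-netdev.c hunks below thread a second resource, the master/upper-device link, through netdev_create(), which is why the error path grows a new unwind label. The goto-based cleanup idiom they extend, as a self-contained sketch (the stub functions are illustrative only, not kernel APIs):

    #include <stdio.h>

    static int link_upper(void)    { return 0; }  /* stand-in: link to master dev */
    static void unlink_upper(void) { puts("unlink upper"); }
    static int reg_handler(void)   { return -1; } /* stand-in: rx handler, failing */

    static int create(void)
    {
        int err;

        err = link_upper();
        if (err)
            goto error;
        err = reg_handler();
        if (err)
            goto error_unlink;    /* unwind in reverse acquisition order */

        return 0;

    error_unlink:
        unlink_upper();
    error:
        return err;
    }

    int main(void)
    {
        printf("create() = %d\n", create());
        return 0;
    }

Note that netdev_destroy() below gains the matching netdev_upper_dev_unlink() call, so teardown mirrors the same ordering.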
diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c
index 5982f3f..09d93c1 100644
--- a/net/openvswitch/vport-netdev.c
+++ b/net/openvswitch/vport-netdev.c
@@ -25,6 +25,7 @@
 #include <linux/llc.h>
 #include <linux/rtnetlink.h>
 #include <linux/skbuff.h>
+#include <linux/openvswitch.h>
 
 #include <net/llc.h>
@@ -74,6 +75,15 @@ static rx_handler_result_t netdev_frame_hook(struct sk_buff **pskb)
     return RX_HANDLER_CONSUMED;
 }
 
+static struct net_device *get_dpdev(struct datapath *dp)
+{
+    struct vport *local;
+
+    local = ovs_vport_ovsl(dp, OVSP_LOCAL);
+    BUG_ON(!local);
+    return netdev_vport_priv(local)->dev;
+}
+
 static struct vport *netdev_create(const struct vport_parms *parms)
 {
     struct vport *vport;
@@ -103,10 +113,15 @@ static struct vport *netdev_create(const struct vport_parms *parms)
     }
 
     rtnl_lock();
+    err = netdev_master_upper_dev_link(netdev_vport->dev,
+                                       get_dpdev(vport->dp));
+    if (err)
+        goto error_unlock;
+
     err = netdev_rx_handler_register(netdev_vport->dev, netdev_frame_hook,
                                      vport);
     if (err)
-        goto error_unlock;
+        goto error_master_upper_dev_unlink;
 
     dev_set_promiscuity(netdev_vport->dev, 1);
     netdev_vport->dev->priv_flags |= IFF_OVS_DATAPATH;
@@ -114,6 +129,8 @@ static struct vport *netdev_create(const struct vport_parms *parms)
 
     return vport;
 
+error_master_upper_dev_unlink:
+    netdev_upper_dev_unlink(netdev_vport->dev, get_dpdev(vport->dp));
 error_unlock:
     rtnl_unlock();
 error_put:
@@ -140,6 +157,7 @@ static void netdev_destroy(struct vport *vport)
 
     rtnl_lock();
     netdev_vport->dev->priv_flags &= ~IFF_OVS_DATAPATH;
     netdev_rx_handler_unregister(netdev_vport->dev);
+    netdev_upper_dev_unlink(netdev_vport->dev, get_dpdev(vport->dp));
     dev_set_promiscuity(netdev_vport->dev, -1);
 
     rtnl_unlock();
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index d69e0c0..6f65dbe 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -203,7 +203,7 @@ out:
  * ovs_vport_set_options - modify existing vport device (for kernel callers)
  *
  * @vport: vport to modify.
- * @port: New configuration.
+ * @options: New configuration.
  *
  * Modifies an existing device with the specified configuration (which is
  * dependent on device type). ovs_mutex must be held.
@@ -328,6 +328,7 @@ int ovs_vport_get_options(const struct vport *vport, struct sk_buff *skb)
  *
  * @vport: vport that received the packet
  * @skb: skb that was received
+ * @tun_key: tunnel (if any) that carried packet
  *
  * Must be called with rcu_read_lock. The packet cannot be shared and
  * skb->data should point to the Ethernet header.
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 6c53dd9..1fdf9ab 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -3215,9 +3215,11 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
 
         if (po->tp_version == TPACKET_V3) {
             lv = sizeof(struct tpacket_stats_v3);
+            st.stats3.tp_packets += st.stats3.tp_drops;
             data = &st.stats3;
         } else {
             lv = sizeof(struct tpacket_stats);
+            st.stats1.tp_packets += st.stats1.tp_drops;
             data = &st.stats1;
         }
diff --git a/net/sctp/probe.c b/net/sctp/probe.c
index e62c225..cd72ae5 100644
--- a/net/sctp/probe.c
+++ b/net/sctp/probe.c
@@ -155,13 +155,8 @@ static sctp_disposition_t jsctp_sf_eat_sack(struct net *net,
         if (sp == asoc->peer.primary_path)
             printl("*");
 
-        if (sp->ipaddr.sa.sa_family == AF_INET)
-            printl("%pI4 ", &sp->ipaddr.v4.sin_addr);
-        else
-            printl("%pI6 ", &sp->ipaddr.v6.sin6_addr);
-
-        printl("%2u %8u %8u %8u %8u %8u ",
-               sp->state, sp->cwnd, sp->ssthresh,
+        printl("%pISc %2u %8u %8u %8u %8u %8u ",
+               &sp->ipaddr, sp->state, sp->cwnd, sp->ssthresh,
                sp->flight_size, sp->partial_bytes_acked,
                sp->pathmtu);
     }
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index adf1e98..170c0ab 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -2664,8 +2664,8 @@ static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info)
 
     hdr = nl80211hdr_put(msg, info->snd_portid, info->snd_seq, 0,
                          NL80211_CMD_NEW_KEY);
-    if (IS_ERR(hdr))
-        return PTR_ERR(hdr);
+    if (!hdr)
+        return -ENOBUFS;
 
     cookie.msg = msg;
     cookie.idx = key_idx;
@@ -6670,6 +6670,9 @@ static int nl80211_testmode_dump(struct sk_buff *skb,
                                          NL80211_CMD_TESTMODE);
         struct nlattr *tmdata;
 
+        if (!hdr)
+            break;
+
         if (nla_put_u32(skb, NL80211_ATTR_WIPHY, phy_idx)) {
             genlmsg_cancel(skb, hdr);
             break;
@@ -7114,9 +7117,8 @@ static int nl80211_remain_on_channel(struct sk_buff *skb,
 
     hdr = nl80211hdr_put(msg, info->snd_portid, info->snd_seq, 0,
                          NL80211_CMD_REMAIN_ON_CHANNEL);
-
-    if (IS_ERR(hdr)) {
-        err = PTR_ERR(hdr);
+    if (!hdr) {
+        err = -ENOBUFS;
         goto free_msg;
     }
 
@@ -7414,9 +7416,8 @@ static int nl80211_tx_mgmt(struct sk_buff *skb, struct genl_info *info)
 
         hdr = nl80211hdr_put(msg, info->snd_portid, info->snd_seq, 0,
                              NL80211_CMD_FRAME);
-
-        if (IS_ERR(hdr)) {
-            err = PTR_ERR(hdr);
+        if (!hdr) {
+            err = -ENOBUFS;
             goto free_msg;
         }
     }
@@ -8551,9 +8552,8 @@ static int nl80211_probe_client(struct sk_buff *skb,
 
     hdr = nl80211hdr_put(msg, info->snd_portid, info->snd_seq, 0,
                          NL80211_CMD_PROBE_CLIENT);
-
-    if (IS_ERR(hdr)) {
-        err = PTR_ERR(hdr);
+    if (!hdr) {
+        err = -ENOBUFS;
         goto free_msg;
     }
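The nl80211 fixes above all correct the same mistake: nl80211hdr_put() reports failure by returning NULL, not an ERR_PTR(), so the old IS_ERR() check could never fire and a failed allocation would be dereferenced later. A userspace rendering of why that check is dead code (is_err() reimplemented here only for the demo; the kernel's version lives in include/linux/err.h):

    #include <stdio.h>

    #define MAX_ERRNO 4095

    /* Same predicate the kernel uses: only the top 4095 addresses count as
     * encoded error values. NULL is not among them. */
    static int is_err(const void *ptr)
    {
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
    }

    int main(void)
    {
        void *hdr = NULL;    /* what nl80211hdr_put() returns on failure */

        printf("IS_ERR(NULL) = %d\n", is_err(hdr));    /* 0: failure missed */
        printf("!hdr         = %d\n", !hdr);           /* 1: failure caught */
        return 0;
    }

The rule of thumb: match the check to the function's failure convention; a NULL-returning allocator needs a NULL test, and IS_ERR() is only for functions documented to return ERR_PTR() values.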
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index 81c8a10..20e86a9 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -976,21 +976,19 @@ int cfg80211_disconnect(struct cfg80211_registered_device *rdev,
                         struct net_device *dev, u16 reason, bool wextev)
 {
     struct wireless_dev *wdev = dev->ieee80211_ptr;
-    int err;
+    int err = 0;
 
     ASSERT_WDEV_LOCK(wdev);
 
     kfree(wdev->connect_keys);
     wdev->connect_keys = NULL;
 
-    if (wdev->conn) {
+    if (wdev->conn)
         err = cfg80211_sme_disconnect(wdev, reason);
-    } else if (!rdev->ops->disconnect) {
+    else if (!rdev->ops->disconnect)
         cfg80211_mlme_down(rdev, dev);
-        err = 0;
-    } else {
+    else if (wdev->current_bss)
         err = rdev_disconnect(rdev, dev, reason);
-    }
 
     return err;
 }
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index d8da6b8..ad8cc7b 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -308,7 +308,7 @@ void xfrm_policy_destroy(struct xfrm_policy *policy)
 {
     BUG_ON(!policy->walk.dead);
 
-    if (del_timer(&policy->timer))
+    if (del_timer(&policy->timer) || del_timer(&policy->polq.hold_timer))
         BUG();
 
     security_xfrm_policy_free(policy->security);
@@ -2132,8 +2132,6 @@ restart:
              * have the xfrm_state's. We need to wait for KM to
              * negotiate new SA's or bail out with error.*/
             if (net->xfrm.sysctl_larval_drop) {
-                /* EREMOTE tells the caller to generate
-                 * a one-shot blackhole route. */
                 dst_release(dst);
                 xfrm_pols_put(pols, drop_pols);
                 XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 78f66fa..4f8ace8 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -499,7 +499,8 @@ struct xfrm_state *xfrm_state_alloc(struct net *net)
         INIT_HLIST_NODE(&x->bydst);
         INIT_HLIST_NODE(&x->bysrc);
         INIT_HLIST_NODE(&x->byspi);
-        tasklet_hrtimer_init(&x->mtimer, xfrm_timer_handler, CLOCK_REALTIME, HRTIMER_MODE_ABS);
+        tasklet_hrtimer_init(&x->mtimer, xfrm_timer_handler,
+                             CLOCK_BOOTTIME, HRTIMER_MODE_ABS);
         setup_timer(&x->rtimer, xfrm_replay_timer_handler,
                     (unsigned long)x);
         x->curlft.add_time = get_seconds();
@@ -990,11 +991,13 @@ void xfrm_state_insert(struct xfrm_state *x)
 EXPORT_SYMBOL(xfrm_state_insert);
 
 /* xfrm_state_lock is held */
-static struct xfrm_state *__find_acq_core(struct net *net, struct xfrm_mark *m,
+static struct xfrm_state *__find_acq_core(struct net *net,
+                                          const struct xfrm_mark *m,
                                           unsigned short family, u8 mode,
                                           u32 reqid, u8 proto,
                                           const xfrm_address_t *daddr,
-                                          const xfrm_address_t *saddr, int create)
+                                          const xfrm_address_t *saddr,
+                                          int create)
 {
     unsigned int h = xfrm_dst_hash(net, daddr, saddr, reqid, family);
     struct xfrm_state *x;
@@ -1399,9 +1402,9 @@ xfrm_state_lookup_byaddr(struct net *net, u32 mark,
 EXPORT_SYMBOL(xfrm_state_lookup_byaddr);
 
 struct xfrm_state *
-xfrm_find_acq(struct net *net, struct xfrm_mark *mark, u8 mode, u32 reqid, u8 proto,
-              const xfrm_address_t *daddr, const xfrm_address_t *saddr,
-              int create, unsigned short family)
+xfrm_find_acq(struct net *net, const struct xfrm_mark *mark, u8 mode, u32 reqid,
+              u8 proto, const xfrm_address_t *daddr,
+              const xfrm_address_t *saddr, int create, unsigned short family)
 {
     struct xfrm_state *x;
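The xfrm_state.c hunk above moves the state expiry hrtimer from CLOCK_REALTIME to CLOCK_BOOTTIME, so absolute-time expirations no longer jump when the wall clock is stepped, while time spent suspended still counts against SA lifetimes. A small userspace comparison of the two clocks (Linux-specific; CLOCK_BOOTTIME has been available to clock_gettime() since Linux 2.6.39, and older glibc may need -lrt at link time):

    #include <stdio.h>
    #include <time.h>

    static double read_clock(clockid_t id)
    {
        struct timespec ts;

        clock_gettime(id, &ts);
        return ts.tv_sec + ts.tv_nsec / 1e9;
    }

    int main(void)
    {
        /* REALTIME follows the wall clock, including settimeofday()/NTP steps;
         * BOOTTIME counts monotonically from boot, including suspend. */
        printf("CLOCK_REALTIME: %.3f\n", read_clock(CLOCK_REALTIME));
        printf("CLOCK_BOOTTIME: %.3f\n", read_clock(CLOCK_BOOTTIME));
        return 0;
    }

Setting the system date backwards while the first clock is armed would postpone a CLOCK_REALTIME absolute timer indefinitely; a CLOCK_BOOTTIME timer is unaffected.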