1279 files changed, 23822 insertions, 23696 deletions
diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX
index 708dc4c..2754fe8 100644
--- a/Documentation/00-INDEX
+++ b/Documentation/00-INDEX
@@ -64,8 +64,6 @@ auxdisplay/
 	- misc. LCD driver documentation (cfag12864b, ks0108).
 backlight/
 	- directory with info on controlling backlights in flat panel displays
-bcache.txt
-	- Block-layer cache on fast SSDs to improve slow (raid) I/O performance.
 block/
 	- info on the Block I/O (BIO) layer.
 blockdev/
@@ -78,18 +76,10 @@ bus-devices/
 	- directory with info on TI GPMC (General Purpose Memory Controller)
 bus-virt-phys-mapping.txt
 	- how to access I/O mapped memory from within device drivers.
-cachetlb.txt
-	- describes the cache/TLB flushing interfaces Linux uses.
 cdrom/
 	- directory with information on the CD-ROM drivers that Linux has.
 cgroup-v1/
 	- cgroups v1 features, including cpusets and memory controller.
-cgroup-v2.txt
-	- cgroups v2 features, including cpusets and memory controller.
-circular-buffers.txt
-	- how to make use of the existing circular buffer infrastructure
-clk.txt
-	- info on the common clock framework
 cma/
 	- Continuous Memory Area (CMA) debugfs interface.
 conf.py
diff --git a/Documentation/ABI/stable/sysfs-devices-node b/Documentation/ABI/stable/sysfs-devices-node
index 5b2d0f0..3e90e1f 100644
--- a/Documentation/ABI/stable/sysfs-devices-node
+++ b/Documentation/ABI/stable/sysfs-devices-node
@@ -90,4 +90,4 @@ Date:		December 2009
 Contact:	Lee Schermerhorn <lee.schermerhorn@hp.com>
 Description:
 		The node's huge page size control/query attributes.
-		See Documentation/vm/hugetlbpage.txt
-\ No newline at end of file
+		See Documentation/admin-guide/mm/hugetlbpage.rst
+\ No newline at end of file
diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-hugepages b/Documentation/ABI/testing/sysfs-kernel-mm-hugepages
index e21c005..fdaa216 100644
--- a/Documentation/ABI/testing/sysfs-kernel-mm-hugepages
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-hugepages
@@ -12,4 +12,4 @@ Description:
 			free_hugepages
 			surplus_hugepages
 			resv_hugepages
-		See Documentation/vm/hugetlbpage.txt for details.
+		See Documentation/admin-guide/mm/hugetlbpage.rst for details.
diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-ksm b/Documentation/ABI/testing/sysfs-kernel-mm-ksm
index 73e653e..dfc1324 100644
--- a/Documentation/ABI/testing/sysfs-kernel-mm-ksm
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-ksm
@@ -40,7 +40,7 @@ Description:	Kernel Samepage Merging daemon sysfs interface
 		sleep_millisecs: how many milliseconds ksm should sleep between
 		scans.
 
-		See Documentation/vm/ksm.txt for more information.
+		See Documentation/vm/ksm.rst for more information.
 
 What:		/sys/kernel/mm/ksm/merge_across_nodes
 Date:		January 2013
diff --git a/Documentation/ABI/testing/sysfs-kernel-slab b/Documentation/ABI/testing/sysfs-kernel-slab
index 2cc0a72..29601d9 100644
--- a/Documentation/ABI/testing/sysfs-kernel-slab
+++ b/Documentation/ABI/testing/sysfs-kernel-slab
@@ -37,7 +37,7 @@ Description:
 		The alloc_calls file is read-only and lists the kernel code
 		locations from which allocations for this cache were performed.
 		The alloc_calls file only contains information if debugging is
-		enabled for that cache (see Documentation/vm/slub.txt).
+		enabled for that cache (see Documentation/vm/slub.rst).
 
 What:		/sys/kernel/slab/cache/alloc_fastpath
 Date:		February 2008
@@ -219,7 +219,7 @@ Contact:	Pekka Enberg <penberg@cs.helsinki.fi>,
 Description:
 		The free_calls file is read-only and lists the locations of
 		object frees if slab debugging is enabled (see
-		Documentation/vm/slub.txt).
+		Documentation/vm/slub.rst).
 
 What:		/sys/kernel/slab/cache/free_fastpath
 Date:		February 2008
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt
index a27fbfb..65eb856 100644
--- a/Documentation/RCU/whatisRCU.txt
+++ b/Documentation/RCU/whatisRCU.txt
@@ -1,3 +1,5 @@
+What is RCU?  --  "Read, Copy, Update"
+
 Please note that the "What is RCU?" LWN series is an excellent place
 to start learning about RCU:
 
diff --git a/Documentation/bcache.txt b/Documentation/admin-guide/bcache.rst
index c0ce64d..c0ce64d 100644
--- a/Documentation/bcache.txt
+++ b/Documentation/admin-guide/bcache.rst
diff --git a/Documentation/cgroup-v2.txt b/Documentation/admin-guide/cgroup-v2.rst
index 74cdeae..74cdeae 100644
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/admin-guide/cgroup-v2.rst
diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst
index 5bb9161..48d70af 100644
--- a/Documentation/admin-guide/index.rst
+++ b/Documentation/admin-guide/index.rst
@@ -48,6 +48,7 @@ configure specific aspects of kernel behavior to your liking.
    :maxdepth: 1
 
    initrd
+   cgroup-v2
    serial-console
    braille-console
    parport
@@ -60,9 +61,11 @@ configure specific aspects of kernel behavior to your liking.
    mono
    java
    ras
+   bcache
    pm/index
    thunderbolt
    LSM/index
+   mm/index
 
 .. only::  subproject and html
 
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index f2040d4..9d699c8 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -106,11 +106,11 @@
 			use by PCI
 			Format: <irq>,<irq>...
 
-	acpi_mask_gpe=  [HW,ACPI]
+	acpi_mask_gpe=	[HW,ACPI]
 			Due to the existence of _Lxx/_Exx, some GPEs triggered
 			by unsupported hardware/firmware features can result in
-                        GPE floodings that cannot be automatically disabled by
-                        the GPE dispatcher.
+			GPE floodings that cannot be automatically disabled by
+			the GPE dispatcher.
 			This facility can be used to prevent such uncontrolled
 			GPE floodings.
 			Format: <int>
@@ -472,10 +472,10 @@
 			for platform specific values (SB1, Loongson3 and
 			others).
 
-	ccw_timeout_log [S390]
+	ccw_timeout_log	[S390]
 			See Documentation/s390/CommonIO for details.
 
-	cgroup_disable= [KNL] Disable a particular controller
+	cgroup_disable=	[KNL] Disable a particular controller
 			Format: {name of the controller(s) to disable}
 			The effects of cgroup_disable=foo are:
 			- foo isn't auto-mounted if you mount all cgroups in
@@ -518,7 +518,7 @@
 			those clocks in any way. This parameter is useful for
 			debug and development, but should not be needed on a
 			platform with proper driver support.  For more
-			information, see Documentation/clk.txt.
+			information, see Documentation/driver-api/clk.rst.
 
 	clock=		[BUGS=X86-32, HW] gettimeofday clocksource override.
 			[Deprecated]
@@ -587,11 +587,6 @@
 			Sets the size of memory pool for coherent, atomic dma
 			allocations, by default set to 256K.
 
-	code_bytes	[X86] How many bytes of object code to print
-			in an oops report.
-			Range: 0 - 8192
-			Default: 64
-
 	com20020=	[HW,NET] ARCnet - COM20020 chipset
 			Format:
 			<io>[,<irq>[,<nodeID>[,<backplane>[,<ckp>[,<timeout>]]]]]
@@ -641,8 +636,8 @@
 		hvc<n>	Use the hypervisor console device <n>. This is for
 			both Xen and PowerPC hypervisors.
 
-                If the device connected to the port is not a TTY but a braille
-                device, prepend "brl," before the device type, for instance
+		If the device connected to the port is not a TTY but a braille
+		device, prepend "brl," before the device type, for instance
 			console=brl,ttyS0
 		For now, only VisioBraille is supported.
 
@@ -662,7 +657,7 @@
 
 	consoleblank=	[KNL] The console blank (screen saver) timeout in
 			seconds. A value of 0 disables the blank timer.
-                       Defaults to 0.
+			Defaults to 0.
 
 	coredump_filter=
 			[KNL] Change the default value for
@@ -730,7 +725,7 @@
 			or memory reserved is below 4G.
 
 	cryptomgr.notests
-                        [KNL] Disable crypto self-tests
+			[KNL] Disable crypto self-tests
 
 	cs89x0_dma=	[HW,NET]
 			Format: <dma>
@@ -746,7 +741,7 @@
 			Format: <port#>,<type>
 			See also Documentation/input/devices/joystick-parport.rst
 
-	ddebug_query=   [KNL,DYNAMIC_DEBUG] Enable debug messages at early boot
+	ddebug_query=	[KNL,DYNAMIC_DEBUG] Enable debug messages at early boot
 			time. See
 			Documentation/admin-guide/dynamic-debug-howto.rst for
 			details.  Deprecated, see dyndbg.
@@ -833,7 +828,7 @@
 			causing system reset or hang due to sending
 			INIT from AP to BSP.
 
-	disable_ddw     [PPC/PSERIES]
+	disable_ddw	[PPC/PSERIES]
 			Disable Dynamic DMA Window support. Use this if
 			to workaround buggy firmware.
 
@@ -1188,7 +1183,7 @@
 			parameter will force ia64_sal_cache_flush to call
 			ia64_pal_cache_flush instead of SAL_CACHE_FLUSH.
 
-	forcepae [X86-32]
+	forcepae	[X86-32]
 			Forcefully enable Physical Address Extension (PAE).
 			Many Pentium M systems disable PAE but may have a
 			functionally usable PAE implementation.
@@ -1247,7 +1242,7 @@
 
 	gamma=		[HW,DRM]
 
-	gart_fix_e820=  [X86_64] disable the fix e820 for K8 GART
+	gart_fix_e820=	[X86_64] disable the fix e820 for K8 GART
 			Format: off | on
 			default: on
 
@@ -1341,23 +1336,32 @@
 			x86-64 are 2M (when the CPU supports "pse") and 1G
 			(when the CPU supports the "pdpe1gb" cpuinfo flag).
 
-	hvc_iucv=	[S390] Number of z/VM IUCV hypervisor console (HVC)
-			       terminal devices. Valid values: 0..8
-	hvc_iucv_allow=	[S390] Comma-separated list of z/VM user IDs.
-			       If specified, z/VM IUCV HVC accepts connections
-			       from listed z/VM user IDs only.
+	hung_task_panic=
+			[KNL] Should the hung task detector generate panics.
+			Format: <integer>
 
+			A nonzero value instructs the kernel to panic when a
+			hung task is detected. The default value is controlled
+			by the CONFIG_BOOTPARAM_HUNG_TASK_PANIC build-time
+			option. The value selected by this boot parameter can
+			be changed later by the kernel.hung_task_panic sysctl.
+
+	hvc_iucv=	[S390]	Number of z/VM IUCV hypervisor console (HVC)
+				terminal devices. Valid values: 0..8
+	hvc_iucv_allow=	[S390]	Comma-separated list of z/VM user IDs.
+				If specified, z/VM IUCV HVC accepts connections
+				from listed z/VM user IDs only.
 	keep_bootcon	[KNL]
 			Do not unregister boot console at start. This is only
 			useful for debugging when something happens in the window
 			between unregistering the boot console and initializing
 			the real console.
 
-	i2c_bus=	[HW] Override the default board specific I2C bus speed
-			     or register an additional I2C bus that is not
-			     registered from board initialization code.
-			     Format:
-			     <bus_id>,<clkrate>
+	i2c_bus=	[HW]	Override the default board specific I2C bus speed
+				or register an additional I2C bus that is not
+				registered from board initialization code.
+				Format:
+				<bus_id>,<clkrate>
 
 	i8042.debug	[HW] Toggle i8042 debug mode
 	i8042.unmask_kbd_data
@@ -1386,7 +1390,7 @@
 			Default: only on s2r transitions on x86; most other
 			architectures force reset to be always executed
 	i8042.unlock	[HW] Unlock (ignore) the keylock
-	i8042.kbdreset  [HW] Reset device connected to KBD port
+	i8042.kbdreset	[HW] Reset device connected to KBD port
 
 	i810=		[HW,DRM]
 
@@ -1548,13 +1552,13 @@
 			programs exec'd, files mmap'd for exec, and all files
 			opened for read by uid=0.
 
-	ima_template=   [IMA]
+	ima_template=	[IMA]
 			Select one of defined IMA measurements template formats.
 			Formats: { "ima" | "ima-ng" | "ima-sig" }
 			Default: "ima-ng"
 
 	ima_template_fmt=
-	                [IMA] Define a custom template format.
+			[IMA] Define a custom template format.
 			Format: { "field1|...|fieldN" }
 
 	ima.ahash_minsize= [IMA] Minimum file size for asynchronous hash usage
@@ -1597,7 +1601,7 @@
 	inport.irq=	[HW] Inport (ATI XL and Microsoft) busmouse driver
 			Format: <irq>
 
-	int_pln_enable  [x86] Enable power limit notification interrupt
+	int_pln_enable	[x86] Enable power limit notification interrupt
 
 	integrity_audit=[IMA]
 			Format: { "0" | "1" }
@@ -1650,39 +1654,39 @@
 			0	disables intel_idle and fall back on acpi_idle.
 			1 to 9	specify maximum depth of C-state.
 
-	intel_pstate=  [X86]
-		       disable
-		         Do not enable intel_pstate as the default
-		         scaling driver for the supported processors
-		       passive
-			 Use intel_pstate as a scaling driver, but configure it
-			 to work with generic cpufreq governors (instead of
-			 enabling its internal governor).  This mode cannot be
-			 used along with the hardware-managed P-states (HWP)
-			 feature.
-		       force
-			 Enable intel_pstate on systems that prohibit it by default
-			 in favor of acpi-cpufreq. Forcing the intel_pstate driver
-			 instead of acpi-cpufreq may disable platform features, such
-			 as thermal controls and power capping, that rely on ACPI
-			 P-States information being indicated to OSPM and therefore
-			 should be used with caution. This option does not work with
-			 processors that aren't supported by the intel_pstate driver
-			 or on platforms that use pcc-cpufreq instead of acpi-cpufreq.
-		       no_hwp
-		         Do not enable hardware P state control (HWP)
-			 if available.
-		hwp_only
-			Only load intel_pstate on systems which support
-			hardware P state control (HWP) if available.
-		support_acpi_ppc
-			Enforce ACPI _PPC performance limits. If the Fixed ACPI
-			Description Table, specifies preferred power management
-			profile as "Enterprise Server" or "Performance Server",
-			then this feature is turned on by default.
-		per_cpu_perf_limits
-			Allow per-logical-CPU P-State performance control limits using
-			cpufreq sysfs interface
+	intel_pstate=	[X86]
+			disable
+			  Do not enable intel_pstate as the default
+			  scaling driver for the supported processors
+			passive
+			  Use intel_pstate as a scaling driver, but configure it
+			  to work with generic cpufreq governors (instead of
+			  enabling its internal governor).  This mode cannot be
+			  used along with the hardware-managed P-states (HWP)
+			  feature.
+			force
+			  Enable intel_pstate on systems that prohibit it by default
+			  in favor of acpi-cpufreq. Forcing the intel_pstate driver
+			  instead of acpi-cpufreq may disable platform features, such
+			  as thermal controls and power capping, that rely on ACPI
+			  P-States information being indicated to OSPM and therefore
+			  should be used with caution. This option does not work with
+			  processors that aren't supported by the intel_pstate driver
+			  or on platforms that use pcc-cpufreq instead of acpi-cpufreq.
+			no_hwp
+			  Do not enable hardware P state control (HWP)
+			  if available.
+			hwp_only
+			  Only load intel_pstate on systems which support
+			  hardware P state control (HWP) if available.
+			support_acpi_ppc
+			  Enforce ACPI _PPC performance limits. If the Fixed ACPI
+			  Description Table, specifies preferred power management
+			  profile as "Enterprise Server" or "Performance Server",
+			  then this feature is turned on by default.
+			per_cpu_perf_limits
+			  Allow per-logical-CPU P-State performance control limits using
+			  cpufreq sysfs interface
 
 	intremap=	[X86-64, Intel-IOMMU]
 			on	enable Interrupt Remapping (default)
@@ -1705,7 +1709,6 @@
 		nopanic
 		merge
 		nomerge
-		forcesac
 		soft
 		pt		[x86, IA-64]
 		nobypass	[PPC/POWERNV]
@@ -2027,7 +2030,7 @@
 			* [no]ncqtrim: Turn off queued DSM TRIM.
 
 			* nohrst, nosrst, norst: suppress hard, soft
-                          and both resets.
+			  and both resets.
 
 			* rstonce: only attempt one reset during
 			  hot-unplug link recovery
@@ -2215,7 +2218,7 @@
 			[KNL,SH] Allow user to override the default size for
 			per-device physically contiguous DMA buffers.
 
-        memhp_default_state=online/offline
+	memhp_default_state=online/offline
 			[KNL] Set the initial state for the memory hotplug
 			onlining policy. If not specified, the default value is
 			set according to the
@@ -2600,6 +2603,9 @@
 			emulation library even if a 387 maths coprocessor
 			is present.
 
+	no5lvl		[X86-64] Disable 5-level paging mode. Forces
+			kernel to use 4-level paging instead.
+
 	no_console_suspend
 			[HW] Never suspend the console
 			Disable suspending of consoles during suspend and
@@ -2765,7 +2771,7 @@
 			[X86,PV_OPS] Disable paravirtualized VMware scheduler
 			clock and use the default one.
 
-	no-steal-acc    [X86,KVM] Disable paravirtualized steal time accounting.
+	no-steal-acc	[X86,KVM] Disable paravirtualized steal time accounting.
 			steal time is computed, but won't influence scheduler
 			behaviour
 
@@ -2826,7 +2832,7 @@
 	notsc		[BUGS=X86-32] Disable Time Stamp Counter
 
 	nowatchdog	[KNL] Disable both lockup detectors, i.e.
-                        soft-lockup and NMI watchdog (hard-lockup).
+			soft-lockup and NMI watchdog (hard-lockup).
 
 	nowb		[ARM]
 
@@ -2846,7 +2852,7 @@
 			If the dependencies are under your control, you can
 			turn on cpu0_hotplug.
 
-	nps_mtm_hs_ctr= [KNL,ARC]
+	nps_mtm_hs_ctr=	[KNL,ARC]
 			This parameter sets the maximum duration, in
 			cycles, each HW thread of the CTOP can run
 			without interruptions, before HW switches it.
@@ -2987,7 +2993,7 @@
 
 	pci=option[,option...]	[PCI] various PCI subsystem options:
 		earlydump	[X86] dump PCI config space before the kernel
-			        changes anything
+				changes anything
 		off		[X86] don't probe for the PCI bus
 		bios		[X86-32] force use of PCI BIOS, don't access
 				the hardware directly. Use this if your machine
@@ -3075,7 +3081,7 @@
 				is enabled by default.  If you need to use this,
 				please report a bug.
 		nocrs		[X86] Ignore PCI host bridge windows from ACPI.
-			        If you need to use this, please report a bug.
+				If you need to use this, please report a bug.
 		routeirq	Do IRQ routing for all PCI devices.
 				This is normally done in pci_enable_device(),
 				so this option is a temporary workaround
@@ -3918,7 +3924,7 @@
 			cache (risks via metadata attacks are mostly
 			unchanged). Debug options disable merging on their
 			own.
-			For more information see Documentation/vm/slub.txt.
+			For more information see Documentation/vm/slub.rst.
 
 	slab_max_order=	[MM, SLAB]
 			Determines the maximum allowed order for slabs.
@@ -3932,7 +3938,7 @@
 			slub_debug can create guard zones around objects and
 			may poison objects when not in use. Also tracks the
 			last alloc / free. For more information see
-			Documentation/vm/slub.txt.
+			Documentation/vm/slub.rst.
 
 	slub_memcg_sysfs=	[MM, SLUB]
 			Determines whether to enable sysfs directories for
@@ -3946,7 +3952,7 @@
 			Determines the maximum allowed order for slabs.
 			A high setting may cause OOMs due to memory
 			fragmentation. For more information see
-			Documentation/vm/slub.txt.
+			Documentation/vm/slub.rst.
 
 	slub_min_objects=	[MM, SLUB]
 			The minimum number of objects per slab. SLUB will
@@ -3955,12 +3961,12 @@
 			the number of objects indicated. The higher the number
 			of objects the smaller the overhead of tracking slabs
 			and the less frequently locks need to be acquired.
-			For more information see Documentation/vm/slub.txt.
+			For more information see Documentation/vm/slub.rst.
 
 	slub_min_order=	[MM, SLUB]
 			Determines the minimum page order for slabs. Must be
 			lower than slub_max_order.
-			For more information see Documentation/vm/slub.txt.
+			For more information see Documentation/vm/slub.rst.
 
 	slub_nomerge	[MM, SLUB]
 			Same with slab_nomerge. This is supported for legacy.
@@ -4358,7 +4364,8 @@
 			Format: [always|madvise|never]
 			Can be used to control the default behavior of the system
 			with respect to transparent hugepages.
-			See Documentation/vm/transhuge.txt for more details.
+			See Documentation/admin-guide/mm/transhuge.rst
+			for more details.
 
 	tsc=		Disable clocksource stability checks for TSC.
 			Format: <string>
@@ -4436,7 +4443,7 @@
 
 	usbcore.initial_descriptor_timeout=
 			[USB] Specifies timeout for the initial 64-byte
-                        USB_REQ_GET_DESCRIPTOR request in milliseconds
+			USB_REQ_GET_DESCRIPTOR request in milliseconds
 			(default 5000 = 5.0 seconds).
 
 	usbcore.nousb	[USB] Disable the USB subsystem
diff --git a/Documentation/admin-guide/mm/concepts.rst b/Documentation/admin-guide/mm/concepts.rst
new file mode 100644
index 0000000..291699c
--- /dev/null
+++ b/Documentation/admin-guide/mm/concepts.rst
@@ -0,0 +1,222 @@
+.. _mm_concepts:
+
+=================
+Concepts overview
+=================
+
+The memory management in Linux is complex system that evolved over the
+years and included more and more functionality to support variety of
+systems from MMU-less microcontrollers to supercomputers. The memory
+management for systems without MMU is called ``nommu`` and it
+definitely deserves a dedicated document, which hopefully will be
+eventually written. Yet, although some of the concepts are the same,
+here we assume that MMU is available and CPU can translate a virtual
+address to a physical address.
+
+.. contents:: :local:
+
+Virtual Memory Primer
+=====================
+
+The physical memory in a computer system is a limited resource and
+even for systems that support memory hotplug there is a hard limit on
+the amount of memory that can be installed. The physical memory is not
+necessary contiguous, it might be accessible as a set of distinct
+address ranges. Besides, different CPU architectures, and even
+different implementations of the same architecture have different view
+how these address ranges defined.
+
+All this makes dealing directly with physical memory quite complex and
+to avoid this complexity a concept of virtual memory was developed.
+
+The virtual memory abstracts the details of physical memory from the
+application software, allows to keep only needed information in the
+physical memory (demand paging) and provides a mechanism for the
+protection and controlled sharing of data between processes.
+
+With virtual memory, each and every memory access uses a virtual
+address. When the CPU decodes the an instruction that reads (or
+writes) from (or to) the system memory, it translates the `virtual`
+address encoded in that instruction to a `physical` address that the
+memory controller can understand.
+
+The physical system memory is divided into page frames, or pages. The
+size of each page is architecture specific. Some architectures allow
+selection of the page size from several supported values; this
+selection is performed at the kernel build time by setting an
+appropriate kernel configuration option.
+
+Each physical memory page can be mapped as one or more virtual
+pages. These mappings are described by page tables that allow
+translation from virtual address used by programs to real address in
+the physical memory. The page tables organized hierarchically.
+
+The tables at the lowest level of the hierarchy contain physical
+addresses of actual pages used by the software. The tables at higher
+levels contain physical addresses of the pages belonging to the lower
+levels. The pointer to the top level page table resides in a
+register. When the CPU performs the address translation, it uses this
+register to access the top level page table. The high bits of the
+virtual address are used to index an entry in the top level page
+table. That entry is then used to access the next level in the
+hierarchy with the next bits of the virtual address as the index to
+that level page table. The lowest bits in the virtual address define
+the offset inside the actual page.
+
+Huge Pages
+==========
+
+The address translation requires several memory accesses and memory
+accesses are slow relatively to CPU speed. To avoid spending precious
+processor cycles on the address translation, CPUs maintain a cache of
+such translations called Translation Lookaside Buffer (or
+TLB). Usually TLB is pretty scarce resource and applications with
+large memory working set will experience performance hit because of
+TLB misses.
+
+Many modern CPU architectures allow mapping of the memory pages
+directly by the higher levels in the page table. For instance, on x86,
+it is possible to map 2M and even 1G pages using entries in the second
+and the third level page tables. In Linux such pages are called
+`huge`. Usage of huge pages significantly reduces pressure on TLB,
+improves TLB hit-rate and thus improves overall system performance.
+
+There are two mechanisms in Linux that enable mapping of the physical
+memory with the huge pages. The first one is `HugeTLB filesystem`, or
+hugetlbfs. It is a pseudo filesystem that uses RAM as its backing
+store. For the files created in this filesystem the data resides in
+the memory and mapped using huge pages. The hugetlbfs is described at
+:ref:`Documentation/admin-guide/mm/hugetlbpage.rst <hugetlbpage>`.
+
+Another, more recent, mechanism that enables use of the huge pages is
+called `Transparent HugePages`, or THP. Unlike the hugetlbfs that
+requires users and/or system administrators to configure what parts of
+the system memory should and can be mapped by the huge pages, THP
+manages such mappings transparently to the user and hence the
+name. See
+:ref:`Documentation/admin-guide/mm/transhuge.rst <admin_guide_transhuge>`
+for more details about THP.
+
+Zones
+=====
+
+Often hardware poses restrictions on how different physical memory
+ranges can be accessed. In some cases, devices cannot perform DMA to
+all the addressable memory. In other cases, the size of the physical
+memory exceeds the maximal addressable size of virtual memory and
+special actions are required to access portions of the memory. Linux
+groups memory pages into `zones` according to their possible
+usage. For example, ZONE_DMA will contain memory that can be used by
+devices for DMA, ZONE_HIGHMEM will contain memory that is not
+permanently mapped into kernel's address space and ZONE_NORMAL will
+contain normally addressed pages.
+
+The actual layout of the memory zones is hardware dependent as not all
+architectures define all zones, and requirements for DMA are different
+for different platforms.
+
+Nodes
+=====
+
+Many multi-processor machines are NUMA - Non-Uniform Memory Access -
+systems. In such systems the memory is arranged into banks that have
+different access latency depending on the "distance" from the
+processor. Each bank is referred as `node` and for each node Linux
+constructs an independent memory management subsystem. A node has it's
+own set of zones, lists of free and used pages and various statistics
+counters. You can find more details about NUMA in
+:ref:`Documentation/vm/numa.rst <numa>` and in
+:ref:`Documentation/admin-guide/mm/numa_memory_policy.rst <numa_memory_policy>`.
+
+Page cache
+==========
+
+The physical memory is volatile and the common case for getting data
+into the memory is to read it from files. Whenever a file is read, the
+data is put into the `page cache` to avoid expensive disk access on
+the subsequent reads. Similarly, when one writes to a file, the data
+is placed in the page cache and eventually gets into the backing
+storage device. The written pages are marked as `dirty` and when Linux
+decides to reuse them for other purposes, it makes sure to synchronize
+the file contents on the device with the updated data.
+
+Anonymous Memory
+================
+
+The `anonymous memory` or `anonymous mappings` represent memory that
+is not backed by a filesystem. Such mappings are implicitly created
+for program's stack and heap or by explicit calls to mmap(2) system
+call. Usually, the anonymous mappings only define virtual memory areas
+that the program is allowed to access. The read accesses will result
+in creation of a page table entry that references a special physical
+page filled with zeroes. When the program performs a write, regular
+physical page will be allocated to hold the written data. The page
+will be marked dirty and if the kernel will decide to repurpose it,
+the dirty page will be swapped out.
+
+Reclaim
+=======
+
+Throughout the system lifetime, a physical page can be used for storing
+different types of data. It can be kernel internal data structures,
+DMA'able buffers for device drivers use, data read from a filesystem,
+memory allocated by user space processes etc.
+
+Depending on the page usage it is treated differently by the Linux
+memory management. The pages that can be freed at any time, either
+because they cache the data available elsewhere, for instance, on a
+hard disk, or because they can be swapped out, again, to the hard
+disk, are called `reclaimable`. The most notable categories of the
+reclaimable pages are page cache and anonymous memory.
+
+In most cases, the pages holding internal kernel data and used as DMA
+buffers cannot be repurposed, and they remain pinned until freed by
+their user. Such pages are called `unreclaimable`. However, in certain
+circumstances, even pages occupied with kernel data structures can be
+reclaimed. For instance, in-memory caches of filesystem metadata can
+be re-read from the storage device and therefore it is possible to
+discard them from the main memory when system is under memory
+pressure.
+
+The process of freeing the reclaimable physical memory pages and
+repurposing them is called (surprise!) `reclaim`. Linux can reclaim
+pages either asynchronously or synchronously, depending on the state
+of the system. When system is not loaded, most of the memory is free
+and allocation request will be satisfied immediately from the free
+pages supply. As the load increases, the amount of the free pages goes
+down and when it reaches a certain threshold (high watermark), an
+allocation request will awaken the ``kswapd`` daemon. It will
+asynchronously scan memory pages and either just free them if the data
+they contain is available elsewhere, or evict to the backing storage
+device (remember those dirty pages?). As memory usage increases even
+more and reaches another threshold - min watermark - an allocation
+will trigger the `direct reclaim`. In this case allocation is stalled
+until enough memory pages are reclaimed to satisfy the request.
+
+Compaction
+==========
+
+As the system runs, tasks allocate and free the memory and it becomes
+fragmented. Although with virtual memory it is possible to present
+scattered physical pages as virtually contiguous range, sometimes it is
+necessary to allocate large physically contiguous memory areas. Such
+need may arise, for instance, when a device driver requires large
+buffer for DMA, or when THP allocates a huge page. Memory `compaction`
+addresses the fragmentation issue. This mechanism moves occupied pages
+from the lower part of a memory zone to free pages in the upper part
+of the zone. When a compaction scan is finished free pages are grouped
+together at the beginning of the zone and allocations of large
+physically contiguous areas become possible.
+
+Like reclaim, the compaction may happen asynchronously in ``kcompactd``
+daemon or synchronously as a result of memory allocation request.
+
+OOM killer
+==========
+
+It may happen, that on a loaded machine memory will be exhausted. When
+the kernel detects that the system runs out of memory (OOM) it invokes
+`OOM killer`. Its mission is simple: all it has to do is to select a
+task to sacrifice for the sake of the overall system health. The
+selected task is killed in a hope that after it exits enough memory
+will be freed to continue normal operation.
diff --git a/Documentation/vm/hugetlbpage.txt b/Documentation/admin-guide/mm/hugetlbpage.rst
index faf077d..1cc0bc7 100644
--- a/Documentation/vm/hugetlbpage.txt
+++ b/Documentation/admin-guide/mm/hugetlbpage.rst
@@ -1,3 +1,11 @@
+.. _hugetlbpage:
+
+=============
+HugeTLB Pages
+=============
+
+Overview
+========
 
 The intent of this file is to give a brief summary of hugetlbpage support in
 the Linux kernel.  This support is built on top of multiple page size support
@@ -18,53 +26,59 @@ First the Linux kernel needs to be built with the CONFIG_HUGETLBFS
 automatically when CONFIG_HUGETLBFS is selected) configuration
 options.
 
-The /proc/meminfo file provides information about the total number of
+The ``/proc/meminfo`` file provides information about the total number of
 persistent hugetlb pages in the kernel's huge page pool.  It also displays
 default huge page size and information about the number of free, reserved
 and surplus huge pages in the pool of huge pages of default size.
 The huge page size is needed for generating the proper alignment and
 size of the arguments to system calls that map huge page regions.
 
-The output of "cat /proc/meminfo" will include lines like:
+The output of ``cat /proc/meminfo`` will include lines like::
 
-.....
-HugePages_Total: uuu
-HugePages_Free:  vvv
-HugePages_Rsvd:  www
-HugePages_Surp:  xxx
-Hugepagesize:    yyy kB
-Hugetlb:         zzz kB
+	HugePages_Total: uuu
+	HugePages_Free:  vvv
+	HugePages_Rsvd:  www
+	HugePages_Surp:  xxx
+	Hugepagesize:    yyy kB
+	Hugetlb:         zzz kB
 
 where:
-HugePages_Total is the size of the pool of huge pages.
-HugePages_Free  is the number of huge pages in the pool that are not yet
-                allocated.
-HugePages_Rsvd  is short for "reserved," and is the number of huge pages for
-                which a commitment to allocate from the pool has been made,
-                but no allocation has yet been made.  Reserved huge pages
-                guarantee that an application will be able to allocate a
-                huge page from the pool of huge pages at fault time.
-HugePages_Surp  is short for "surplus," and is the number of huge pages in
-                the pool above the value in /proc/sys/vm/nr_hugepages. The
-                maximum number of surplus huge pages is controlled by
-                /proc/sys/vm/nr_overcommit_hugepages.
-Hugepagesize    is the default hugepage size (in Kb).
-Hugetlb         is the total amount of memory (in kB), consumed by huge
-                pages of all sizes.
-                If huge pages of different sizes are in use, this number
-                will exceed HugePages_Total * Hugepagesize. To get more
-                detailed information, please, refer to
-                /sys/kernel/mm/hugepages (described below).
-
-
-/proc/filesystems should also show a filesystem of type "hugetlbfs" configured
-in the kernel.
-
-/proc/sys/vm/nr_hugepages indicates the current number of "persistent" huge
+
+HugePages_Total
+	is the size of the pool of huge pages.
+HugePages_Free
+	is the number of huge pages in the pool that are not yet
+        allocated.
+HugePages_Rsvd
+	is short for "reserved," and is the number of huge pages for
+        which a commitment to allocate from the pool has been made,
+        but no allocation has yet been made.  Reserved huge pages
+        guarantee that an application will be able to allocate a
+        huge page from the pool of huge pages at fault time.
+HugePages_Surp
+	is short for "surplus," and is the number of huge pages in
+        the pool above the value in ``/proc/sys/vm/nr_hugepages``. The
+        maximum number of surplus huge pages is controlled by
+        ``/proc/sys/vm/nr_overcommit_hugepages``.
+Hugepagesize
+	is the default hugepage size (in Kb).
+Hugetlb
+        is the total amount of memory (in kB), consumed by huge
+        pages of all sizes.
+        If huge pages of different sizes are in use, this number
+        will exceed HugePages_Total \* Hugepagesize. To get more
+        detailed information, please, refer to
+        ``/sys/kernel/mm/hugepages`` (described below).
+
+
+``/proc/filesystems`` should also show a filesystem of type "hugetlbfs"
+configured in the kernel.
+
+``/proc/sys/vm/nr_hugepages`` indicates the current number of "persistent" huge
 pages in the kernel's huge page pool.  "Persistent" huge pages will be
 returned to the huge page pool when freed by a task.  A user with root
 privileges can dynamically allocate more or free some persistent huge pages
-by increasing or decreasing the value of 'nr_hugepages'.
+by increasing or decreasing the value of ``nr_hugepages``.
 
 Pages that are used as huge pages are reserved inside the kernel and cannot
 be used for other purposes.  Huge pages cannot be swapped out under
@@ -73,7 +87,7 @@ memory pressure.
 Once a number of huge pages have been pre-allocated to the kernel huge page
 pool, a user with appropriate privilege can use either the mmap system call
 or shared memory system calls to use the huge pages.  See the discussion of
-Using Huge Pages, below.
+:ref:`Using Huge Pages <using_huge_pages>`, below.
 
 The administrator can allocate persistent huge pages on the kernel boot
 command line by specifying the "hugepages=N" parameter, where 'N' = the
@@ -86,10 +100,10 @@ with a huge page size selection parameter "hugepagesz=<size>".  <size> must
 be specified in bytes with optional scale suffix [kKmMgG].  The default huge
 page size may be selected with the "default_hugepagesz=<size>" boot parameter.
 
-When multiple huge page sizes are supported, /proc/sys/vm/nr_hugepages
+When multiple huge page sizes are supported, ``/proc/sys/vm/nr_hugepages``
 indicates the current number of pre-allocated huge pages of the default size.
 Thus, one can use the following command to dynamically allocate/deallocate
-default sized persistent huge pages:
+default sized persistent huge pages::
 
 	echo 20 > /proc/sys/vm/nr_hugepages
 
@@ -98,11 +112,12 @@ huge page pool to 20, allocating or freeing huge pages, as required.
 
 On a NUMA platform, the kernel will attempt to distribute the huge page pool
 over all the set of allowed nodes specified by the NUMA memory policy of the
-task that modifies nr_hugepages.  The default for the allowed nodes--when the
+task that modifies ``nr_hugepages``. The default for the allowed nodes--when the
 task has default memory policy--is all on-line nodes with memory.  Allowed
 nodes with insufficient available, contiguous memory for a huge page will be
-silently skipped when allocating persistent huge pages.  See the discussion
-below of the interaction of task memory policy, cpusets and per node attributes
+silently skipped when allocating persistent huge pages.  See the
+:ref:`discussion below <mem_policy_and_hp_alloc>`
+of the interaction of task memory policy, cpusets and per node attributes
 with the allocation and freeing of persistent huge pages.
 
 The success or failure of huge page allocation depends on the amount of
@@ -117,51 +132,52 @@ init files.  This will enable the kernel to allocate huge pages early in
 the boot process when the possibility of getting physical contiguous pages
 is still very high.  Administrators can verify the number of huge pages
 actually allocated by checking the sysctl or meminfo.  To check the per node
-distribution of huge pages in a NUMA system, use:
+distribution of huge pages in a NUMA system, use::
 
 	cat /sys/devices/system/node/node*/meminfo | fgrep Huge
 
-/proc/sys/vm/nr_overcommit_hugepages specifies how large the pool of
-huge pages can grow, if more huge pages than /proc/sys/vm/nr_hugepages are
+``/proc/sys/vm/nr_overcommit_hugepages`` specifies how large the pool of
+huge pages can grow, if more huge pages than ``/proc/sys/vm/nr_hugepages`` are
 requested by applications.  Writing any non-zero value into this file
 indicates that the hugetlb subsystem is allowed to try to obtain that
 number of "surplus" huge pages from the kernel's normal page pool, when the
 persistent huge page pool is exhausted. As these surplus huge pages become
 unused, they are freed back to the kernel's normal page pool.
 
-When increasing the huge page pool size via nr_hugepages, any existing surplus
-pages will first be promoted to persistent huge pages.  Then, additional
+When increasing the huge page pool size via ``nr_hugepages``, any existing
+surplus pages will first be promoted to persistent huge pages.  Then, additional
 huge pages will be allocated, if necessary and if possible, to fulfill
 the new persistent huge page pool size.
 
 The administrator may shrink the pool of persistent huge pages for
-the default huge page size by setting the nr_hugepages sysctl to a
+the default huge page size by setting the ``nr_hugepages`` sysctl to a
 smaller value.  The kernel will attempt to balance the freeing of huge pages
-across all nodes in the memory policy of the task modifying nr_hugepages.
+across all nodes in the memory policy of the task modifying ``nr_hugepages``.
 Any free huge pages on the selected nodes will be freed back to the kernel's
 normal page pool.
 
-Caveat: Shrinking the persistent huge page pool via nr_hugepages such that
+Caveat: Shrinking the persistent huge page pool via ``nr_hugepages`` such that
 it becomes less than the number of huge pages in use will convert the balance
 of the in-use huge pages to surplus huge pages.  This will occur even if
-the number of surplus pages it would exceed the overcommit value.  As long as
-this condition holds--that is, until nr_hugepages+nr_overcommit_hugepages is
+the number of surplus pages would exceed the overcommit value.  As long as
+this condition holds--that is, until ``nr_hugepages+nr_overcommit_hugepages`` is
 increased sufficiently, or the surplus huge pages go out of use and are freed--
 no more surplus huge pages will be allowed to be allocated.
 
 With support for multiple huge page pools at run-time available, much of
-the huge page userspace interface in /proc/sys/vm has been duplicated in sysfs.
-The /proc interfaces discussed above have been retained for backwards
-compatibility. The root huge page control directory in sysfs is:
+the huge page userspace interface in ``/proc/sys/vm`` has been duplicated in
+sysfs.
+The ``/proc`` interfaces discussed above have been retained for backwards
+compatibility. The root huge page control directory in sysfs is::
 
 	/sys/kernel/mm/hugepages
 
 For each huge page size supported by the running kernel, a subdirectory
-will exist, of the form:
+will exist, of the form::
 
 	hugepages-${size}kB
 
-Inside each of these directories, the same set of files will exist:
+Inside each of these directories, the same set of files will exist::
 
 	nr_hugepages
 	nr_hugepages_mempolicy
@@ -172,37 +188,39 @@ Inside each of these directories, the same set of files will exist:
 
 which function as described above for the default huge page-sized case.
 
+.. _mem_policy_and_hp_alloc:
 
 Interaction of Task Memory Policy with Huge Page Allocation/Freeing
 ===================================================================
 
-Whether huge pages are allocated and freed via the /proc interface or
-the /sysfs interface using the nr_hugepages_mempolicy attribute, the NUMA
-nodes from which huge pages are allocated or freed are controlled by the
-NUMA memory policy of the task that modifies the nr_hugepages_mempolicy
-sysctl or attribute.  When the nr_hugepages attribute is used, mempolicy
+Whether huge pages are allocated and freed via the ``/proc`` interface or
+the ``/sysfs`` interface using the ``nr_hugepages_mempolicy`` attribute, the
+NUMA nodes from which huge pages are allocated or freed are controlled by the
+NUMA memory policy of the task that modifies the ``nr_hugepages_mempolicy``
+sysctl or attribute.  When the ``nr_hugepages`` attribute is used, mempolicy
 is ignored.
 
 The recommended method to allocate or free huge pages to/from the kernel
-huge page pool, using the nr_hugepages example above, is:
+huge page pool, using the ``nr_hugepages`` example above, is::
 
     numactl --interleave <node-list> echo 20 \
 				>/proc/sys/vm/nr_hugepages_mempolicy
 
-or, more succinctly:
+or, more succinctly::
 
     numactl -m <node-list> echo 20 >/proc/sys/vm/nr_hugepages_mempolicy
 
-This will allocate or free abs(20 - nr_hugepages) to or from the nodes
+This will allocate or free ``abs(20 - nr_hugepages)`` to or from the nodes
 specified in <node-list>, depending on whether number of persistent huge pages
 is initially less than or greater than 20, respectively.  No huge pages will be
 allocated nor freed on any node not included in the specified <node-list>.
 
-When adjusting the persistent hugepage count via nr_hugepages_mempolicy, any
+When adjusting the persistent hugepage count via ``nr_hugepages_mempolicy``, any
 memory policy mode--bind, preferred, local or interleave--may be used.  The
 resulting effect on persistent huge page allocation is as follows:
 
-1) Regardless of mempolicy mode [see Documentation/vm/numa_memory_policy.txt],
+#. Regardless of mempolicy mode [see
+   :ref:`Documentation/admin-guide/mm/numa_memory_policy.rst <numa_memory_policy>`],
    persistent huge pages will be distributed across the node or nodes
    specified in the mempolicy as if "interleave" had been specified.
    However, if a node in the policy does not contain sufficient contiguous
@@ -212,7 +230,7 @@ resulting effect on persistent huge page allocation is as follows:
    possibly, allocation of persistent huge pages on nodes not allowed by
    the task's memory policy.
 
-2) One or more nodes may be specified with the bind or interleave policy.
+#. One or more nodes may be specified with the bind or interleave policy.
    If more than one node is specified with the preferred policy, only the
    lowest numeric id will be used.  Local policy will select the node where
    the task is running at the time the nodes_allowed mask is constructed.
@@ -222,20 +240,20 @@ resulting effect on persistent huge page allocation is as follows:
    indeterminate.  Thus, local policy is not very useful for this purpose.
    Any of the other mempolicy modes may be used to specify a single node.
 
-3) The nodes allowed mask will be derived from any non-default task mempolicy,
+#. The nodes allowed mask will be derived from any non-default task mempolicy,
    whether this policy was set explicitly by the task itself or one of its
    ancestors, such as numactl.  This means that if the task is invoked from a
    shell with non-default policy, that policy will be used.  One can specify a
    node list of "all" with numactl --interleave or --membind [-m] to achieve
    interleaving over all nodes in the system or cpuset.
 
-4) Any task mempolicy specified--e.g., using numactl--will be constrained by
+#. Any task mempolicy specified--e.g., using numactl--will be constrained by
    the resource limits of any cpuset in which the task runs.  Thus, there will
    be no way for a task with non-default policy running in a cpuset with a
    subset of the system nodes to allocate huge pages outside the cpuset
    without first moving to a cpuset that contains all of the desired nodes.
 
-5) Boot-time huge page allocation attempts to distribute the requested number
+#. Boot-time huge page allocation attempts to distribute the requested number
    of huge pages over all on-lines nodes with memory.
 
 Per Node Hugepages Attributes
@@ -243,22 +261,22 @@ Per Node Hugepages Attributes
 
 A subset of the contents of the root huge page control directory in sysfs,
 described above, will be replicated under each the system device of each
-NUMA node with memory in:
+NUMA node with memory in::
 
 	/sys/devices/system/node/node[0-9]*/hugepages/
 
 Under this directory, the subdirectory for each supported huge page size
-contains the following attribute files:
+contains the following attribute files::
 
 	nr_hugepages
 	free_hugepages
 	surplus_hugepages
 
-The free_' and surplus_' attribute files are read-only.  They return the number
+The free\_' and surplus\_' attribute files are read-only.  They return the number
 of free and surplus [overcommitted] huge pages, respectively, on the parent
 node.
 
-The nr_hugepages attribute returns the total number of huge pages on the
+The ``nr_hugepages`` attribute returns the total number of huge pages on the
 specified node.  When this attribute is written, the number of persistent huge
 pages on the parent node will be adjusted to the specified value, if sufficient
 resources exist, regardless of the task's mempolicy or cpuset constraints.
@@ -267,43 +285,58 @@ Note that the number of overcommit and reserve pages remain global quantities,
 as we don't know until fault time, when the faulting task's mempolicy is
 applied, from which node the huge page allocation will be attempted.
 
+.. _using_huge_pages:
 
 Using Huge Pages
 ================
 
 If the user applications are going to request huge pages using mmap system
 call, then it is required that system administrator mount a file system of
-type hugetlbfs:
+type hugetlbfs::
 
   mount -t hugetlbfs \
 	-o uid=<value>,gid=<value>,mode=<value>,pagesize=<value>,size=<value>,\
 	min_size=<value>,nr_inodes=<value> none /mnt/huge
 
 This command mounts a (pseudo) filesystem of type hugetlbfs on the directory
-/mnt/huge.  Any files created on /mnt/huge uses huge pages.  The uid and gid
-options sets the owner and group of the root of the file system.  By default
-the uid and gid of the current process are taken.  The mode option sets the
-mode of root of file system to value & 01777.  This value is given in octal.
-By default the value 0755 is picked. If the platform supports multiple huge
-page sizes, the pagesize option can be used to specify the huge page size and
-associated pool.  pagesize is specified in bytes.  If pagesize is not specified
-the platform's default huge page size and associated pool will be used. The
-size option sets the maximum value of memory (huge pages) allowed for that
-filesystem (/mnt/huge).  The size option can be specified in bytes, or as a
-percentage of the specified huge page pool (nr_hugepages).  The size is
-rounded down to HPAGE_SIZE boundary.  The min_size option sets the minimum
-value of memory (huge pages) allowed for the filesystem.  min_size can be
-specified in the same way as size, either bytes or a percentage of the
-huge page pool.  At mount time, the number of huge pages specified by
-min_size are reserved for use by the filesystem.  If there are not enough
-free huge pages available, the mount will fail.  As huge pages are allocated
-to the filesystem and freed, the reserve count is adjusted so that the sum
-of allocated and reserved huge pages is always at least min_size.  The option
-nr_inodes sets the maximum number of inodes that /mnt/huge can use.  If the
-size, min_size or nr_inodes option is not provided on command line then
-no limits are set.  For pagesize, size, min_size and nr_inodes options, you
-can use [G|g]/[M|m]/[K|k] to represent giga/mega/kilo. For example, size=2K
-has the same meaning as size=2048.
+``/mnt/huge``.  Any file created on ``/mnt/huge`` uses huge pages.
+
+The ``uid`` and ``gid`` options sets the owner and group of the root of the
+file system.  By default the ``uid`` and ``gid`` of the current process
+are taken.
+
+The ``mode`` option sets the mode of root of file system to value & 01777.
+This value is given in octal. By default the value 0755 is picked.
+
+If the platform supports multiple huge page sizes, the ``pagesize`` option can
+be used to specify the huge page size and associated pool. ``pagesize``
+is specified in bytes. If ``pagesize`` is not specified the platform's
+default huge page size and associated pool will be used.
+
+The ``size`` option sets the maximum value of memory (huge pages) allowed
+for that filesystem (``/mnt/huge``). The ``size`` option can be specified
+in bytes, or as a percentage of the specified huge page pool (``nr_hugepages``).
+The size is rounded down to HPAGE_SIZE boundary.
+
+The ``min_size`` option sets the minimum value of memory (huge pages) allowed
+for the filesystem. ``min_size`` can be specified in the same way as ``size``,
+either bytes or a percentage of the huge page pool.
+At mount time, the number of huge pages specified by ``min_size`` are reserved
+for use by the filesystem.
+If there are not enough free huge pages available, the mount will fail.
+As huge pages are allocated to the filesystem and freed, the reserve count
+is adjusted so that the sum of allocated and reserved huge pages is always
+at least ``min_size``.
+
+The option ``nr_inodes`` sets the maximum number of inodes that ``/mnt/huge``
+can use.
+
+If the ``size``, ``min_size`` or ``nr_inodes`` option is not provided on
+command line then no limits are set.
+
+For ``pagesize``, ``size``, ``min_size`` and ``nr_inodes`` options, you can
+use [G|g]/[M|m]/[K|k] to represent giga/mega/kilo.
+For example, size=2K has the same meaning as size=2048.
 
 While read system calls are supported on files that reside on hugetlb
 file systems, write system calls are not.
@@ -313,12 +346,12 @@ used to change the file attributes on hugetlbfs.
 
 Also, it is important to note that no such mount command is required if
 applications are going to use only shmat/shmget system calls or mmap with
-MAP_HUGETLB.  For an example of how to use mmap with MAP_HUGETLB see map_hugetlb
-below.
+MAP_HUGETLB.  For an example of how to use mmap with MAP_HUGETLB see
+:ref:`map_hugetlb <map_hugetlb>` below.
 
-Users who wish to use hugetlb memory via shared memory segment should be a
-member of a supplementary group and system admin needs to configure that gid
-into /proc/sys/vm/hugetlb_shm_group.  It is possible for same or different
+Users who wish to use hugetlb memory via shared memory segment should be
+members of a supplementary group and system admin needs to configure that gid
+into ``/proc/sys/vm/hugetlb_shm_group``.  It is possible for same or different
 applications to use any combination of mmaps and shm* calls, though the mount of
 filesystem will be required for using mmap calls without MAP_HUGETLB.
 
@@ -332,20 +365,18 @@ a hugetlb page and the length is smaller than the hugepage size.
 Examples
 ========
 
-1) map_hugetlb: see tools/testing/selftests/vm/map_hugetlb.c
+.. _map_hugetlb:
 
-2) hugepage-shm:  see tools/testing/selftests/vm/hugepage-shm.c
+``map_hugetlb``
+	see tools/testing/selftests/vm/map_hugetlb.c
 
-3) hugepage-mmap:  see tools/testing/selftests/vm/hugepage-mmap.c
+``hugepage-shm``
+	see tools/testing/selftests/vm/hugepage-shm.c
 
-4) The libhugetlbfs (https://github.com/libhugetlbfs/libhugetlbfs) library
-   provides a wide range of userspace tools to help with huge page usability,
-   environment setup, and control.
+``hugepage-mmap``
+	see tools/testing/selftests/vm/hugepage-mmap.c
 
-Kernel development regression testing
-=====================================
+The `libhugetlbfs`_  library provides a wide range of userspace tools
+to help with huge page usability, environment setup, and control.
 
-The most complete set of hugetlb tests are in the libhugetlbfs repository.
-If you modify any hugetlb related code, use the libhugetlbfs test suite
-to check for regressions.  In addition, if you add any new hugetlb
-functionality, please add appropriate tests to libhugetlbfs.
+.. _libhugetlbfs: https://github.com/libhugetlbfs/libhugetlbfs
diff --git a/Documentation/vm/idle_page_tracking.txt b/Documentation/admin-guide/mm/idle_page_tracking.rst
index 85dcc3b..6f7b7ca 100644
--- a/Documentation/vm/idle_page_tracking.txt
+++ b/Documentation/admin-guide/mm/idle_page_tracking.rst
@@ -1,4 +1,11 @@
-MOTIVATION
+.. _idle_page_tracking:
+
+==================
+Idle Page Tracking
+==================
+
+Motivation
+==========
 
 The idle page tracking feature allows to track which memory pages are being
 accessed by a workload and which are idle. This information can be useful for
@@ -8,10 +15,14 @@ or deciding where to place the workload within a compute cluster.
 
 It is enabled by CONFIG_IDLE_PAGE_TRACKING=y.
 
-USER API
+.. _user_api:
 
-The idle page tracking API is located at /sys/kernel/mm/page_idle. Currently,
-it consists of the only read-write file, /sys/kernel/mm/page_idle/bitmap.
+User API
+========
+
+The idle page tracking API is located at ``/sys/kernel/mm/page_idle``.
+Currently, it consists of the only read-write file,
+``/sys/kernel/mm/page_idle/bitmap``.
 
 The file implements a bitmap where each bit corresponds to a memory page. The
 bitmap is represented by an array of 8-byte integers, and the page at PFN #i is
@@ -19,8 +30,9 @@ mapped to bit #i%64 of array element #i/64, byte order is native. When a bit is
 set, the corresponding page is idle.
 
 A page is considered idle if it has not been accessed since it was marked idle
-(for more details on what "accessed" actually means see the IMPLEMENTATION
-DETAILS section). To mark a page idle one has to set the bit corresponding to
+(for more details on what "accessed" actually means see the :ref:`Implementation
+Details <impl_details>` section).
+To mark a page idle one has to set the bit corresponding to
 the page by writing to the file. A value written to the file is OR-ed with the
 current bitmap value.
 
@@ -30,9 +42,9 @@ page types (e.g. SLAB pages) an attempt to mark a page idle is silently ignored,
 and hence such pages are never reported idle.
 
 For huge pages the idle flag is set only on the head page, so one has to read
-/proc/kpageflags in order to correctly count idle huge pages.
+``/proc/kpageflags`` in order to correctly count idle huge pages.
 
-Reading from or writing to /sys/kernel/mm/page_idle/bitmap will return
+Reading from or writing to ``/sys/kernel/mm/page_idle/bitmap`` will return
 -EINVAL if you are not starting the read/write on an 8-byte boundary, or
 if the size of the read/write is not a multiple of 8 bytes. Writing to
 this file beyond max PFN will return -ENXIO.
@@ -41,21 +53,26 @@ That said, in order to estimate the amount of pages that are not used by a
 workload one should:
 
  1. Mark all the workload's pages as idle by setting corresponding bits in
-    /sys/kernel/mm/page_idle/bitmap. The pages can be found by reading
-    /proc/pid/pagemap if the workload is represented by a process, or by
-    filtering out alien pages using /proc/kpagecgroup in case the workload is
-    placed in a memory cgroup.
+    ``/sys/kernel/mm/page_idle/bitmap``. The pages can be found by reading
+    ``/proc/pid/pagemap`` if the workload is represented by a process, or by
+    filtering out alien pages using ``/proc/kpagecgroup`` in case the workload
+    is placed in a memory cgroup.
 
  2. Wait until the workload accesses its working set.
 
- 3. Read /sys/kernel/mm/page_idle/bitmap and count the number of bits set. If
-    one wants to ignore certain types of pages, e.g. mlocked pages since they
-    are not reclaimable, he or she can filter them out using /proc/kpageflags.
+ 3. Read ``/sys/kernel/mm/page_idle/bitmap`` and count the number of bits set.
+    If one wants to ignore certain types of pages, e.g. mlocked pages since they
+    are not reclaimable, he or she can filter them out using
+    ``/proc/kpageflags``.
+
+See :ref:`Documentation/admin-guide/mm/pagemap.rst <pagemap>` for more
+information about ``/proc/pid/pagemap``, ``/proc/kpageflags``, and
+``/proc/kpagecgroup``.
 
-See Documentation/vm/pagemap.txt for more information about /proc/pid/pagemap,
-/proc/kpageflags, and /proc/kpagecgroup.
+.. _impl_details:
 
-IMPLEMENTATION DETAILS
+Implementation Details
+======================
 
 The kernel internally keeps track of accesses to user memory pages in order to
 reclaim unreferenced pages first on memory shortage conditions. A page is
@@ -77,7 +94,8 @@ When a dirty page is written to swap or disk as a result of memory reclaim or
 exceeding the dirty memory limit, it is not marked referenced.
 
 The idle memory tracking feature adds a new page flag, the Idle flag. This flag
-is set manually, by writing to /sys/kernel/mm/page_idle/bitmap (see the USER API
+is set manually, by writing to ``/sys/kernel/mm/page_idle/bitmap`` (see the
+:ref:`User API <user_api>`
 section), and cleared automatically whenever a page is referenced as defined
 above.
 
diff --git a/Documentation/admin-guide/mm/index.rst b/Documentation/admin-guide/mm/index.rst
new file mode 100644
index 0000000..ceead68
--- /dev/null
+++ b/Documentation/admin-guide/mm/index.rst
@@ -0,0 +1,36 @@
+=================
+Memory Management
+=================
+
+Linux memory management subsystem is responsible, as the name implies,
+for managing the memory in the system. This includes implemnetation of
+virtual memory and demand paging, memory allocation both for kernel
+internal structures and user space programms, mapping of files into
+processes address space and many other cool things.
+
+Linux memory management is a complex system with many configurable
+settings. Most of these settings are available via ``/proc``
+filesystem and can be quired and adjusted using ``sysctl``. These APIs
+are described in Documentation/sysctl/vm.txt and in `man 5 proc`_.
+
+.. _man 5 proc: http://man7.org/linux/man-pages/man5/proc.5.html
+
+Linux memory management has its own jargon and if you are not yet
+familiar with it, consider reading
+:ref:`Documentation/admin-guide/mm/concepts.rst <mm_concepts>`.
+
+Here we document in detail how to interact with various mechanisms in
+the Linux memory management.
+
+.. toctree::
+   :maxdepth: 1
+
+   concepts
+   hugetlbpage
+   idle_page_tracking
+   ksm
+   numa_memory_policy
+   pagemap
+   soft-dirty
+   transhuge
+   userfaultfd
diff --git a/Documentation/admin-guide/mm/ksm.rst b/Documentation/admin-guide/mm/ksm.rst
new file mode 100644
index 0000000..9303786
--- /dev/null
+++ b/Documentation/admin-guide/mm/ksm.rst
@@ -0,0 +1,189 @@
+.. _admin_guide_ksm:
+
+=======================
+Kernel Samepage Merging
+=======================
+
+Overview
+========
+
+KSM is a memory-saving de-duplication feature, enabled by CONFIG_KSM=y,
+added to the Linux kernel in 2.6.32.  See ``mm/ksm.c`` for its implementation,
+and http://lwn.net/Articles/306704/ and http://lwn.net/Articles/330589/
+
+KSM was originally developed for use with KVM (where it was known as
+Kernel Shared Memory), to fit more virtual machines into physical memory,
+by sharing the data common between them.  But it can be useful to any
+application which generates many instances of the same data.
+
+The KSM daemon ksmd periodically scans those areas of user memory
+which have been registered with it, looking for pages of identical
+content which can be replaced by a single write-protected page (which
+is automatically copied if a process later wants to update its
+content). The amount of pages that KSM daemon scans in a single pass
+and the time between the passes are configured using :ref:`sysfs
+intraface <ksm_sysfs>`
+
+KSM only merges anonymous (private) pages, never pagecache (file) pages.
+KSM's merged pages were originally locked into kernel memory, but can now
+be swapped out just like other user pages (but sharing is broken when they
+are swapped back in: ksmd must rediscover their identity and merge again).
+
+Controlling KSM with madvise
+============================
+
+KSM only operates on those areas of address space which an application
+has advised to be likely candidates for merging, by using the madvise(2)
+system call::
+
+	int madvise(addr, length, MADV_MERGEABLE)
+
+The app may call
+
+::
+
+	int madvise(addr, length, MADV_UNMERGEABLE)
+
+to cancel that advice and restore unshared pages: whereupon KSM
+unmerges whatever it merged in that range.  Note: this unmerging call
+may suddenly require more memory than is available - possibly failing
+with EAGAIN, but more probably arousing the Out-Of-Memory killer.
+
+If KSM is not configured into the running kernel, madvise MADV_MERGEABLE
+and MADV_UNMERGEABLE simply fail with EINVAL.  If the running kernel was
+built with CONFIG_KSM=y, those calls will normally succeed: even if the
+the KSM daemon is not currently running, MADV_MERGEABLE still registers
+the range for whenever the KSM daemon is started; even if the range
+cannot contain any pages which KSM could actually merge; even if
+MADV_UNMERGEABLE is applied to a range which was never MADV_MERGEABLE.
+
+If a region of memory must be split into at least one new MADV_MERGEABLE
+or MADV_UNMERGEABLE region, the madvise may return ENOMEM if the process
+will exceed ``vm.max_map_count`` (see Documentation/sysctl/vm.txt).
+
+Like other madvise calls, they are intended for use on mapped areas of
+the user address space: they will report ENOMEM if the specified range
+includes unmapped gaps (though working on the intervening mapped areas),
+and might fail with EAGAIN if not enough memory for internal structures.
+
+Applications should be considerate in their use of MADV_MERGEABLE,
+restricting its use to areas likely to benefit.  KSM's scans may use a lot
+of processing power: some installations will disable KSM for that reason.
+
+.. _ksm_sysfs:
+
+KSM daemon sysfs interface
+==========================
+
+The KSM daemon is controlled by sysfs files in ``/sys/kernel/mm/ksm/``,
+readable by all but writable only by root:
+
+pages_to_scan
+        how many pages to scan before ksmd goes to sleep
+        e.g. ``echo 100 > /sys/kernel/mm/ksm/pages_to_scan``.
+
+        Default: 100 (chosen for demonstration purposes)
+
+sleep_millisecs
+        how many milliseconds ksmd should sleep before next scan
+        e.g. ``echo 20 > /sys/kernel/mm/ksm/sleep_millisecs``
+
+        Default: 20 (chosen for demonstration purposes)
+
+merge_across_nodes
+        specifies if pages from different NUMA nodes can be merged.
+        When set to 0, ksm merges only pages which physically reside
+        in the memory area of same NUMA node. That brings lower
+        latency to access of shared pages. Systems with more nodes, at
+        significant NUMA distances, are likely to benefit from the
+        lower latency of setting 0. Smaller systems, which need to
+        minimize memory usage, are likely to benefit from the greater
+        sharing of setting 1 (default). You may wish to compare how
+        your system performs under each setting, before deciding on
+        which to use. ``merge_across_nodes`` setting can be changed only
+        when there are no ksm shared pages in the system: set run 2 to
+        unmerge pages first, then to 1 after changing
+        ``merge_across_nodes``, to remerge according to the new setting.
+
+        Default: 1 (merging across nodes as in earlier releases)
+
+run
+        * set to 0 to stop ksmd from running but keep merged pages,
+        * set to 1 to run ksmd e.g. ``echo 1 > /sys/kernel/mm/ksm/run``,
+        * set to 2 to stop ksmd and unmerge all pages currently merged, but
+	  leave mergeable areas registered for next run.
+
+        Default: 0 (must be changed to 1 to activate KSM, except if
+        CONFIG_SYSFS is disabled)
+
+use_zero_pages
+        specifies whether empty pages (i.e. allocated pages that only
+        contain zeroes) should be treated specially.  When set to 1,
+        empty pages are merged with the kernel zero page(s) instead of
+        with each other as it would happen normally. This can improve
+        the performance on architectures with coloured zero pages,
+        depending on the workload. Care should be taken when enabling
+        this setting, as it can potentially degrade the performance of
+        KSM for some workloads, for example if the checksums of pages
+        candidate for merging match the checksum of an empty
+        page. This setting can be changed at any time, it is only
+        effective for pages merged after the change.
+
+        Default: 0 (normal KSM behaviour as in earlier releases)
+
+max_page_sharing
+        Maximum sharing allowed for each KSM page. This enforces a
+        deduplication limit to avoid high latency for virtual memory
+        operations that involve traversal of the virtual mappings that
+        share the KSM page. The minimum value is 2 as a newly created
+        KSM page will have at least two sharers. The higher this value
+        the faster KSM will merge the memory and the higher the
+        deduplication factor will be, but the slower the worst case
+        virtual mappings traversal could be for any given KSM
+        page. Slowing down this traversal means there will be higher
+        latency for certain virtual memory operations happening during
+        swapping, compaction, NUMA balancing and page migration, in
+        turn decreasing responsiveness for the caller of those virtual
+        memory operations. The scheduler latency of other tasks not
+        involved with the VM operations doing the virtual mappings
+        traversal is not affected by this parameter as these
+        traversals are always schedule friendly themselves.
+
+stable_node_chains_prune_millisecs
+        specifies how frequently KSM checks the metadata of the pages
+        that hit the deduplication limit for stale information.
+        Smaller milllisecs values will free up the KSM metadata with
+        lower latency, but they will make ksmd use more CPU during the
+        scan. It's a noop if not a single KSM page hit the
+        ``max_page_sharing`` yet.
+
+The effectiveness of KSM and MADV_MERGEABLE is shown in ``/sys/kernel/mm/ksm/``:
+
+pages_shared
+        how many shared pages are being used
+pages_sharing
+        how many more sites are sharing them i.e. how much saved
+pages_unshared
+        how many pages unique but repeatedly checked for merging
+pages_volatile
+        how many pages changing too fast to be placed in a tree
+full_scans
+        how many times all mergeable areas have been scanned
+stable_node_chains
+        the number of KSM pages that hit the ``max_page_sharing`` limit
+stable_node_dups
+        number of duplicated KSM pages
+
+A high ratio of ``pages_sharing`` to ``pages_shared`` indicates good
+sharing, but a high ratio of ``pages_unshared`` to ``pages_sharing``
+indicates wasted effort.  ``pages_volatile`` embraces several
+different kinds of activity, but a high proportion there would also
+indicate poor use of madvise MADV_MERGEABLE.
+
+The maximum possible ``pages_sharing/pages_shared`` ratio is limited by the
+``max_page_sharing`` tunable. To increase the ratio ``max_page_sharing`` must
+be increased accordingly.
+
+--
+Izik Eidus,
+Hugh Dickins, 17 Nov 2009
diff --git a/Documentation/admin-guide/mm/numa_memory_policy.rst b/Documentation/admin-guide/mm/numa_memory_policy.rst
new file mode 100644
index 0000000..d78c5b3
--- /dev/null
+++ b/Documentation/admin-guide/mm/numa_memory_policy.rst
@@ -0,0 +1,495 @@
+.. _numa_memory_policy:
+
+==================
+NUMA Memory Policy
+==================
+
+What is NUMA Memory Policy?
+============================
+
+In the Linux kernel, "memory policy" determines from which node the kernel will
+allocate memory in a NUMA system or in an emulated NUMA system.  Linux has
+supported platforms with Non-Uniform Memory Access architectures since 2.4.?.
+The current memory policy support was added to Linux 2.6 around May 2004.  This
+document attempts to describe the concepts and APIs of the 2.6 memory policy
+support.
+
+Memory policies should not be confused with cpusets
+(``Documentation/cgroup-v1/cpusets.txt``)
+which is an administrative mechanism for restricting the nodes from which
+memory may be allocated by a set of processes. Memory policies are a
+programming interface that a NUMA-aware application can take advantage of.  When
+both cpusets and policies are applied to a task, the restrictions of the cpuset
+takes priority.  See :ref:`Memory Policies and cpusets <mem_pol_and_cpusets>`
+below for more details.
+
+Memory Policy Concepts
+======================
+
+Scope of Memory Policies
+------------------------
+
+The Linux kernel supports _scopes_ of memory policy, described here from
+most general to most specific:
+
+System Default Policy
+	this policy is "hard coded" into the kernel.  It is the policy
+	that governs all page allocations that aren't controlled by
+	one of the more specific policy scopes discussed below.  When
+	the system is "up and running", the system default policy will
+	use "local allocation" described below.  However, during boot
+	up, the system default policy will be set to interleave
+	allocations across all nodes with "sufficient" memory, so as
+	not to overload the initial boot node with boot-time
+	allocations.
+
+Task/Process Policy
+	this is an optional, per-task policy.  When defined for a
+	specific task, this policy controls all page allocations made
+	by or on behalf of the task that aren't controlled by a more
+	specific scope. If a task does not define a task policy, then
+	all page allocations that would have been controlled by the
+	task policy "fall back" to the System Default Policy.
+
+	The task policy applies to the entire address space of a task. Thus,
+	it is inheritable, and indeed is inherited, across both fork()
+	[clone() w/o the CLONE_VM flag] and exec*().  This allows a parent task
+	to establish the task policy for a child task exec()'d from an
+	executable image that has no awareness of memory policy.  See the
+	:ref:`Memory Policy APIs <memory_policy_apis>` section,
+	below, for an overview of the system call
+	that a task may use to set/change its task/process policy.
+
+	In a multi-threaded task, task policies apply only to the thread
+	[Linux kernel task] that installs the policy and any threads
+	subsequently created by that thread.  Any sibling threads existing
+	at the time a new task policy is installed retain their current
+	policy.
+
+	A task policy applies only to pages allocated after the policy is
+	installed.  Any pages already faulted in by the task when the task
+	changes its task policy remain where they were allocated based on
+	the policy at the time they were allocated.
+
+.. _vma_policy:
+
+VMA Policy
+	A "VMA" or "Virtual Memory Area" refers to a range of a task's
+	virtual address space.  A task may define a specific policy for a range
+	of its virtual address space.   See the
+	:ref:`Memory Policy APIs <memory_policy_apis>` section,
+	below, for an overview of the mbind() system call used to set a VMA
+	policy.
+
+	A VMA policy will govern the allocation of pages that back
+	this region of the address space.  Any regions of the task's
+	address space that don't have an explicit VMA policy will fall
+	back to the task policy, which may itself fall back to the
+	System Default Policy.
+
+	VMA policies have a few complicating details:
+
+	* VMA policy applies ONLY to anonymous pages.  These include
+	  pages allocated for anonymous segments, such as the task
+	  stack and heap, and any regions of the address space
+	  mmap()ed with the MAP_ANONYMOUS flag.  If a VMA policy is
+	  applied to a file mapping, it will be ignored if the mapping
+	  used the MAP_SHARED flag.  If the file mapping used the
+	  MAP_PRIVATE flag, the VMA policy will only be applied when
+	  an anonymous page is allocated on an attempt to write to the
+	  mapping-- i.e., at Copy-On-Write.
+
+	* VMA policies are shared between all tasks that share a
+	  virtual address space--a.k.a. threads--independent of when
+	  the policy is installed; and they are inherited across
+	  fork().  However, because VMA policies refer to a specific
+	  region of a task's address space, and because the address
+	  space is discarded and recreated on exec*(), VMA policies
+	  are NOT inheritable across exec().  Thus, only NUMA-aware
+	  applications may use VMA policies.
+
+	* A task may install a new VMA policy on a sub-range of a
+	  previously mmap()ed region.  When this happens, Linux splits
+	  the existing virtual memory area into 2 or 3 VMAs, each with
+	  it's own policy.
+
+	* By default, VMA policy applies only to pages allocated after
+	  the policy is installed.  Any pages already faulted into the
+	  VMA range remain where they were allocated based on the
+	  policy at the time they were allocated.  However, since
+	  2.6.16, Linux supports page migration via the mbind() system
+	  call, so that page contents can be moved to match a newly
+	  installed policy.
+
+Shared Policy
+	Conceptually, shared policies apply to "memory objects" mapped
+	shared into one or more tasks' distinct address spaces.  An
+	application installs shared policies the same way as VMA
+	policies--using the mbind() system call specifying a range of
+	virtual addresses that map the shared object.  However, unlike
+	VMA policies, which can be considered to be an attribute of a
+	range of a task's address space, shared policies apply
+	directly to the shared object.  Thus, all tasks that attach to
+	the object share the policy, and all pages allocated for the
+	shared object, by any task, will obey the shared policy.
+
+	As of 2.6.22, only shared memory segments, created by shmget() or
+	mmap(MAP_ANONYMOUS|MAP_SHARED), support shared policy.  When shared
+	policy support was added to Linux, the associated data structures were
+	added to hugetlbfs shmem segments.  At the time, hugetlbfs did not
+	support allocation at fault time--a.k.a lazy allocation--so hugetlbfs
+	shmem segments were never "hooked up" to the shared policy support.
+	Although hugetlbfs segments now support lazy allocation, their support
+	for shared policy has not been completed.
+
+	As mentioned above in :ref:`VMA policies <vma_policy>` section,
+	allocations of page cache pages for regular files mmap()ed
+	with MAP_SHARED ignore any VMA policy installed on the virtual
+	address range backed by the shared file mapping.  Rather,
+	shared page cache pages, including pages backing private
+	mappings that have not yet been written by the task, follow
+	task policy, if any, else System Default Policy.
+
+	The shared policy infrastructure supports different policies on subset
+	ranges of the shared object.  However, Linux still splits the VMA of
+	the task that installs the policy for each range of distinct policy.
+	Thus, different tasks that attach to a shared memory segment can have
+	different VMA configurations mapping that one shared object.  This
+	can be seen by examining the /proc/<pid>/numa_maps of tasks sharing
+	a shared memory region, when one task has installed shared policy on
+	one or more ranges of the region.
+
+Components of Memory Policies
+-----------------------------
+
+A NUMA memory policy consists of a "mode", optional mode flags, and
+an optional set of nodes.  The mode determines the behavior of the
+policy, the optional mode flags determine the behavior of the mode,
+and the optional set of nodes can be viewed as the arguments to the
+policy behavior.
+
+Internally, memory policies are implemented by a reference counted
+structure, struct mempolicy.  Details of this structure will be
+discussed in context, below, as required to explain the behavior.
+
+NUMA memory policy supports the following 4 behavioral modes:
+
+Default Mode--MPOL_DEFAULT
+	This mode is only used in the memory policy APIs.  Internally,
+	MPOL_DEFAULT is converted to the NULL memory policy in all
+	policy scopes.  Any existing non-default policy will simply be
+	removed when MPOL_DEFAULT is specified.  As a result,
+	MPOL_DEFAULT means "fall back to the next most specific policy
+	scope."
+
+	For example, a NULL or default task policy will fall back to the
+	system default policy.  A NULL or default vma policy will fall
+	back to the task policy.
+
+	When specified in one of the memory policy APIs, the Default mode
+	does not use the optional set of nodes.
+
+	It is an error for the set of nodes specified for this policy to
+	be non-empty.
+
+MPOL_BIND
+	This mode specifies that memory must come from the set of
+	nodes specified by the policy.  Memory will be allocated from
+	the node in the set with sufficient free memory that is
+	closest to the node where the allocation takes place.
+
+MPOL_PREFERRED
+	This mode specifies that the allocation should be attempted
+	from the single node specified in the policy.  If that
+	allocation fails, the kernel will search other nodes, in order
+	of increasing distance from the preferred node based on
+	information provided by the platform firmware.
+
+	Internally, the Preferred policy uses a single node--the
+	preferred_node member of struct mempolicy.  When the internal
+	mode flag MPOL_F_LOCAL is set, the preferred_node is ignored
+	and the policy is interpreted as local allocation.  "Local"
+	allocation policy can be viewed as a Preferred policy that
+	starts at the node containing the cpu where the allocation
+	takes place.
+
+	It is possible for the user to specify that local allocation
+	is always preferred by passing an empty nodemask with this
+	mode.  If an empty nodemask is passed, the policy cannot use
+	the MPOL_F_STATIC_NODES or MPOL_F_RELATIVE_NODES flags
+	described below.
+
+MPOL_INTERLEAVED
+	This mode specifies that page allocations be interleaved, on a
+	page granularity, across the nodes specified in the policy.
+	This mode also behaves slightly differently, based on the
+	context where it is used:
+
+	For allocation of anonymous pages and shared memory pages,
+	Interleave mode indexes the set of nodes specified by the
+	policy using the page offset of the faulting address into the
+	segment [VMA] containing the address modulo the number of
+	nodes specified by the policy.  It then attempts to allocate a
+	page, starting at the selected node, as if the node had been
+	specified by a Preferred policy or had been selected by a
+	local allocation.  That is, allocation will follow the per
+	node zonelist.
+
+	For allocation of page cache pages, Interleave mode indexes
+	the set of nodes specified by the policy using a node counter
+	maintained per task.  This counter wraps around to the lowest
+	specified node after it reaches the highest specified node.
+	This will tend to spread the pages out over the nodes
+	specified by the policy based on the order in which they are
+	allocated, rather than based on any page offset into an
+	address range or file.  During system boot up, the temporary
+	interleaved system default policy works in this mode.
+
+NUMA memory policy supports the following optional mode flags:
+
+MPOL_F_STATIC_NODES
+	This flag specifies that the nodemask passed by
+	the user should not be remapped if the task or VMA's set of allowed
+	nodes changes after the memory policy has been defined.
+
+	Without this flag, any time a mempolicy is rebound because of a
+	change in the set of allowed nodes, the node (Preferred) or
+	nodemask (Bind, Interleave) is remapped to the new set of
+	allowed nodes.  This may result in nodes being used that were
+	previously undesired.
+
+	With this flag, if the user-specified nodes overlap with the
+	nodes allowed by the task's cpuset, then the memory policy is
+	applied to their intersection.  If the two sets of nodes do not
+	overlap, the Default policy is used.
+
+	For example, consider a task that is attached to a cpuset with
+	mems 1-3 that sets an Interleave policy over the same set.  If
+	the cpuset's mems change to 3-5, the Interleave will now occur
+	over nodes 3, 4, and 5.  With this flag, however, since only node
+	3 is allowed from the user's nodemask, the "interleave" only
+	occurs over that node.  If no nodes from the user's nodemask are
+	now allowed, the Default behavior is used.
+
+	MPOL_F_STATIC_NODES cannot be combined with the
+	MPOL_F_RELATIVE_NODES flag.  It also cannot be used for
+	MPOL_PREFERRED policies that were created with an empty nodemask
+	(local allocation).
+
+MPOL_F_RELATIVE_NODES
+	This flag specifies that the nodemask passed
+	by the user will be mapped relative to the set of the task or VMA's
+	set of allowed nodes.  The kernel stores the user-passed nodemask,
+	and if the allowed nodes changes, then that original nodemask will
+	be remapped relative to the new set of allowed nodes.
+
+	Without this flag (and without MPOL_F_STATIC_NODES), anytime a
+	mempolicy is rebound because of a change in the set of allowed
+	nodes, the node (Preferred) or nodemask (Bind, Interleave) is
+	remapped to the new set of allowed nodes.  That remap may not
+	preserve the relative nature of the user's passed nodemask to its
+	set of allowed nodes upon successive rebinds: a nodemask of
+	1,3,5 may be remapped to 7-9 and then to 1-3 if the set of
+	allowed nodes is restored to its original state.
+
+	With this flag, the remap is done so that the node numbers from
+	the user's passed nodemask are relative to the set of allowed
+	nodes.  In other words, if nodes 0, 2, and 4 are set in the user's
+	nodemask, the policy will be effected over the first (and in the
+	Bind or Interleave case, the third and fifth) nodes in the set of
+	allowed nodes.  The nodemask passed by the user represents nodes
+	relative to task or VMA's set of allowed nodes.
+
+	If the user's nodemask includes nodes that are outside the range
+	of the new set of allowed nodes (for example, node 5 is set in
+	the user's nodemask when the set of allowed nodes is only 0-3),
+	then the remap wraps around to the beginning of the nodemask and,
+	if not already set, sets the node in the mempolicy nodemask.
+
+	For example, consider a task that is attached to a cpuset with
+	mems 2-5 that sets an Interleave policy over the same set with
+	MPOL_F_RELATIVE_NODES.  If the cpuset's mems change to 3-7, the
+	interleave now occurs over nodes 3,5-7.  If the cpuset's mems
+	then change to 0,2-3,5, then the interleave occurs over nodes
+	0,2-3,5.
+
+	Thanks to the consistent remapping, applications preparing
+	nodemasks to specify memory policies using this flag should
+	disregard their current, actual cpuset imposed memory placement
+	and prepare the nodemask as if they were always located on
+	memory nodes 0 to N-1, where N is the number of memory nodes the
+	policy is intended to manage.  Let the kernel then remap to the
+	set of memory nodes allowed by the task's cpuset, as that may
+	change over time.
+
+	MPOL_F_RELATIVE_NODES cannot be combined with the
+	MPOL_F_STATIC_NODES flag.  It also cannot be used for
+	MPOL_PREFERRED policies that were created with an empty nodemask
+	(local allocation).
+
+Memory Policy Reference Counting
+================================
+
+To resolve use/free races, struct mempolicy contains an atomic reference
+count field.  Internal interfaces, mpol_get()/mpol_put() increment and
+decrement this reference count, respectively.  mpol_put() will only free
+the structure back to the mempolicy kmem cache when the reference count
+goes to zero.
+
+When a new memory policy is allocated, its reference count is initialized
+to '1', representing the reference held by the task that is installing the
+new policy.  When a pointer to a memory policy structure is stored in another
+structure, another reference is added, as the task's reference will be dropped
+on completion of the policy installation.
+
+During run-time "usage" of the policy, we attempt to minimize atomic operations
+on the reference count, as this can lead to cache lines bouncing between cpus
+and NUMA nodes.  "Usage" here means one of the following:
+
+1) querying of the policy, either by the task itself [using the get_mempolicy()
+   API discussed below] or by another task using the /proc/<pid>/numa_maps
+   interface.
+
+2) examination of the policy to determine the policy mode and associated node
+   or node lists, if any, for page allocation.  This is considered a "hot
+   path".  Note that for MPOL_BIND, the "usage" extends across the entire
+   allocation process, which may sleep during page reclaimation, because the
+   BIND policy nodemask is used, by reference, to filter ineligible nodes.
+
+We can avoid taking an extra reference during the usages listed above as
+follows:
+
+1) we never need to get/free the system default policy as this is never
+   changed nor freed, once the system is up and running.
+
+2) for querying the policy, we do not need to take an extra reference on the
+   target task's task policy nor vma policies because we always acquire the
+   task's mm's mmap_sem for read during the query.  The set_mempolicy() and
+   mbind() APIs [see below] always acquire the mmap_sem for write when
+   installing or replacing task or vma policies.  Thus, there is no possibility
+   of a task or thread freeing a policy while another task or thread is
+   querying it.
+
+3) Page allocation usage of task or vma policy occurs in the fault path where
+   we hold them mmap_sem for read.  Again, because replacing the task or vma
+   policy requires that the mmap_sem be held for write, the policy can't be
+   freed out from under us while we're using it for page allocation.
+
+4) Shared policies require special consideration.  One task can replace a
+   shared memory policy while another task, with a distinct mmap_sem, is
+   querying or allocating a page based on the policy.  To resolve this
+   potential race, the shared policy infrastructure adds an extra reference
+   to the shared policy during lookup while holding a spin lock on the shared
+   policy management structure.  This requires that we drop this extra
+   reference when we're finished "using" the policy.  We must drop the
+   extra reference on shared policies in the same query/allocation paths
+   used for non-shared policies.  For this reason, shared policies are marked
+   as such, and the extra reference is dropped "conditionally"--i.e., only
+   for shared policies.
+
+   Because of this extra reference counting, and because we must lookup
+   shared policies in a tree structure under spinlock, shared policies are
+   more expensive to use in the page allocation path.  This is especially
+   true for shared policies on shared memory regions shared by tasks running
+   on different NUMA nodes.  This extra overhead can be avoided by always
+   falling back to task or system default policy for shared memory regions,
+   or by prefaulting the entire shared memory region into memory and locking
+   it down.  However, this might not be appropriate for all applications.
+
+.. _memory_policy_apis:
+
+Memory Policy APIs
+==================
+
+Linux supports 3 system calls for controlling memory policy.  These APIS
+always affect only the calling task, the calling task's address space, or
+some shared object mapped into the calling task's address space.
+
+.. note::
+   the headers that define these APIs and the parameter data types for
+   user space applications reside in a package that is not part of the
+   Linux kernel.  The kernel system call interfaces, with the 'sys\_'
+   prefix, are defined in <linux/syscalls.h>; the mode and flag
+   definitions are defined in <linux/mempolicy.h>.
+
+Set [Task] Memory Policy::
+
+	long set_mempolicy(int mode, const unsigned long *nmask,
+					unsigned long maxnode);
+
+Set's the calling task's "task/process memory policy" to mode
+specified by the 'mode' argument and the set of nodes defined by
+'nmask'.  'nmask' points to a bit mask of node ids containing at least
+'maxnode' ids.  Optional mode flags may be passed by combining the
+'mode' argument with the flag (for example: MPOL_INTERLEAVE |
+MPOL_F_STATIC_NODES).
+
+See the set_mempolicy(2) man page for more details
+
+
+Get [Task] Memory Policy or Related Information::
+
+	long get_mempolicy(int *mode,
+			   const unsigned long *nmask, unsigned long maxnode,
+			   void *addr, int flags);
+
+Queries the "task/process memory policy" of the calling task, or the
+policy or location of a specified virtual address, depending on the
+'flags' argument.
+
+See the get_mempolicy(2) man page for more details
+
+
+Install VMA/Shared Policy for a Range of Task's Address Space::
+
+	long mbind(void *start, unsigned long len, int mode,
+		   const unsigned long *nmask, unsigned long maxnode,
+		   unsigned flags);
+
+mbind() installs the policy specified by (mode, nmask, maxnodes) as a
+VMA policy for the range of the calling task's address space specified
+by the 'start' and 'len' arguments.  Additional actions may be
+requested via the 'flags' argument.
+
+See the mbind(2) man page for more details.
+
+Memory Policy Command Line Interface
+====================================
+
+Although not strictly part of the Linux implementation of memory policy,
+a command line tool, numactl(8), exists that allows one to:
+
++ set the task policy for a specified program via set_mempolicy(2), fork(2) and
+  exec(2)
+
++ set the shared policy for a shared memory segment via mbind(2)
+
+The numactl(8) tool is packaged with the run-time version of the library
+containing the memory policy system call wrappers.  Some distributions
+package the headers and compile-time libraries in a separate development
+package.
+
+.. _mem_pol_and_cpusets:
+
+Memory Policies and cpusets
+===========================
+
+Memory policies work within cpusets as described above.  For memory policies
+that require a node or set of nodes, the nodes are restricted to the set of
+nodes whose memories are allowed by the cpuset constraints.  If the nodemask
+specified for the policy contains nodes that are not allowed by the cpuset and
+MPOL_F_RELATIVE_NODES is not used, the intersection of the set of nodes
+specified for the policy and the set of nodes with memory is used.  If the
+result is the empty set, the policy is considered invalid and cannot be
+installed.  If MPOL_F_RELATIVE_NODES is used, the policy's nodes are mapped
+onto and folded into the task's set of allowed nodes as previously described.
+
+The interaction of memory policies and cpusets can be problematic when tasks
+in two cpusets share access to a memory region, such as shared memory segments
+created by shmget() of mmap() with the MAP_ANONYMOUS and MAP_SHARED flags, and
+any of the tasks install shared policy on the region, only nodes whose
+memories are allowed in both cpusets may be used in the policies.  Obtaining
+this information requires "stepping outside" the memory policy APIs to use the
+cpuset information and requires that one know in what cpusets other task might
+be attaching to the shared region.  Furthermore, if the cpusets' allowed
+memory sets are disjoint, "local" allocation is the only valid policy.
diff --git a/Documentation/vm/pagemap.txt b/Documentation/admin-guide/mm/pagemap.rst
index eafcefa..577af85 100644
--- a/Documentation/vm/pagemap.txt
+++ b/Documentation/admin-guide/mm/pagemap.rst
@@ -1,21 +1,25 @@
-pagemap, from the userspace perspective
----------------------------------------
+.. _pagemap:
+
+=============================
+Examining Process Page Tables
+=============================
 
 pagemap is a new (as of 2.6.25) set of interfaces in the kernel that allow
 userspace programs to examine the page tables and related information by
-reading files in /proc.
+reading files in ``/proc``.
 
 There are four components to pagemap:
 
- * /proc/pid/pagemap.  This file lets a userspace process find out which
+ * ``/proc/pid/pagemap``.  This file lets a userspace process find out which
    physical frame each virtual page is mapped to.  It contains one 64-bit
    value for each virtual page, containing the following data (from
-   fs/proc/task_mmu.c, above pagemap_read):
+   ``fs/proc/task_mmu.c``, above pagemap_read):
 
     * Bits 0-54  page frame number (PFN) if present
     * Bits 0-4   swap type if swapped
     * Bits 5-54  swap offset if swapped
-    * Bit  55    pte is soft-dirty (see Documentation/vm/soft-dirty.txt)
+    * Bit  55    pte is soft-dirty (see
+      :ref:`Documentation/admin-guide/mm/soft-dirty.rst <soft_dirty>`)
     * Bit  56    page exclusively mapped (since 4.2)
     * Bits 57-60 zero
     * Bit  61    page is file-page or shared-anon (since 3.5)
@@ -33,28 +37,28 @@ There are four components to pagemap:
    precisely which pages are mapped (or in swap) and comparing mapped
    pages between processes.
 
-   Efficient users of this interface will use /proc/pid/maps to
+   Efficient users of this interface will use ``/proc/pid/maps`` to
    determine which areas of memory are actually mapped and llseek to
    skip over unmapped regions.
 
- * /proc/kpagecount.  This file contains a 64-bit count of the number of
+ * ``/proc/kpagecount``.  This file contains a 64-bit count of the number of
    times each page is mapped, indexed by PFN.
 
- * /proc/kpageflags.  This file contains a 64-bit set of flags for each
+ * ``/proc/kpageflags``.  This file contains a 64-bit set of flags for each
    page, indexed by PFN.
 
-   The flags are (from fs/proc/page.c, above kpageflags_read):
-
-     0. LOCKED
-     1. ERROR
-     2. REFERENCED
-     3. UPTODATE
-     4. DIRTY
-     5. LRU
-     6. ACTIVE
-     7. SLAB
-     8. WRITEBACK
-     9. RECLAIM
+   The flags are (from ``fs/proc/page.c``, above kpageflags_read):
+
+    0. LOCKED
+    1. ERROR
+    2. REFERENCED
+    3. UPTODATE
+    4. DIRTY
+    5. LRU
+    6. ACTIVE
+    7. SLAB
+    8. WRITEBACK
+    9. RECLAIM
     10. BUDDY
     11. MMAP
     12. ANON
@@ -72,98 +76,111 @@ There are four components to pagemap:
     24. ZERO_PAGE
     25. IDLE
 
- * /proc/kpagecgroup.  This file contains a 64-bit inode number of the
+ * ``/proc/kpagecgroup``.  This file contains a 64-bit inode number of the
    memory cgroup each page is charged to, indexed by PFN. Only available when
    CONFIG_MEMCG is set.
 
-Short descriptions to the page flags:
-
- 0. LOCKED
-    page is being locked for exclusive access, eg. by undergoing read/write IO
-
- 7. SLAB
-    page is managed by the SLAB/SLOB/SLUB/SLQB kernel memory allocator
-    When compound page is used, SLUB/SLQB will only set this flag on the head
-    page; SLOB will not flag it at all.
+Short descriptions to the page flags
+====================================
 
-10. BUDDY
+0 - LOCKED
+   page is being locked for exclusive access, e.g. by undergoing read/write IO
+7 - SLAB
+   page is managed by the SLAB/SLOB/SLUB/SLQB kernel memory allocator
+   When compound page is used, SLUB/SLQB will only set this flag on the head
+   page; SLOB will not flag it at all.
+10 - BUDDY
     a free memory block managed by the buddy system allocator
     The buddy system organizes free memory in blocks of various orders.
     An order N block has 2^N physically contiguous pages, with the BUDDY flag
     set for and _only_ for the first page.
-
-15. COMPOUND_HEAD
-16. COMPOUND_TAIL
+15 - COMPOUND_HEAD
     A compound page with order N consists of 2^N physically contiguous pages.
     A compound page with order 2 takes the form of "HTTT", where H donates its
     head page and T donates its tail page(s).  The major consumers of compound
-    pages are hugeTLB pages (Documentation/vm/hugetlbpage.txt), the SLUB etc.
-    memory allocators and various device drivers. However in this interface,
-    only huge/giga pages are made visible to end users.
-17. HUGE
+    pages are hugeTLB pages
+    (:ref:`Documentation/admin-guide/mm/hugetlbpage.rst <hugetlbpage>`),
+    the SLUB etc.  memory allocators and various device drivers.
+    However in this interface, only huge/giga pages are made visible
+    to end users.
+16 - COMPOUND_TAIL
+    A compound page tail (see description above).
+17 - HUGE
     this is an integral part of a HugeTLB page
-
-19. HWPOISON
+19 - HWPOISON
     hardware detected memory corruption on this page: don't touch the data!
-
-20. NOPAGE
+20 - NOPAGE
     no page frame exists at the requested address
-
-21. KSM
+21 - KSM
     identical memory pages dynamically shared between one or more processes
-
-22. THP
+22 - THP
     contiguous pages which construct transparent hugepages
-
-23. BALLOON
+23 - BALLOON
     balloon compaction page
-
-24. ZERO_PAGE
+24 - ZERO_PAGE
     zero page for pfn_zero or huge_zero page
-
-25. IDLE
+25 - IDLE
     page has not been accessed since it was marked idle (see
-    Documentation/vm/idle_page_tracking.txt). Note that this flag may be
-    stale in case the page was accessed via a PTE. To make sure the flag
-    is up-to-date one has to read /sys/kernel/mm/page_idle/bitmap first.
-
-    [IO related page flags]
- 1. ERROR     IO error occurred
- 3. UPTODATE  page has up-to-date data
-              ie. for file backed page: (in-memory data revision >= on-disk one)
- 4. DIRTY     page has been written to, hence contains new data
-              ie. for file backed page: (in-memory data revision >  on-disk one)
- 8. WRITEBACK page is being synced to disk
-
-    [LRU related page flags]
- 5. LRU         page is in one of the LRU lists
- 6. ACTIVE      page is in the active LRU list
-18. UNEVICTABLE page is in the unevictable (non-)LRU list
-                It is somehow pinned and not a candidate for LRU page reclaims,
-		eg. ramfs pages, shmctl(SHM_LOCK) and mlock() memory segments
- 2. REFERENCED  page has been referenced since last LRU list enqueue/requeue
- 9. RECLAIM     page will be reclaimed soon after its pageout IO completed
-11. MMAP        a memory mapped page
-12. ANON        a memory mapped page that is not part of a file
-13. SWAPCACHE   page is mapped to swap space, ie. has an associated swap entry
-14. SWAPBACKED  page is backed by swap/RAM
+    :ref:`Documentation/admin-guide/mm/idle_page_tracking.rst <idle_page_tracking>`).
+    Note that this flag may be stale in case the page was accessed via
+    a PTE. To make sure the flag is up-to-date one has to read
+    ``/sys/kernel/mm/page_idle/bitmap`` first.
+
+IO related page flags
+---------------------
+
+1 - ERROR
+   IO error occurred
+3 - UPTODATE
+   page has up-to-date data
+   ie. for file backed page: (in-memory data revision >= on-disk one)
+4 - DIRTY
+   page has been written to, hence contains new data
+   i.e. for file backed page: (in-memory data revision >  on-disk one)
+8 - WRITEBACK
+   page is being synced to disk
+
+LRU related page flags
+----------------------
+
+5 - LRU
+   page is in one of the LRU lists
+6 - ACTIVE
+   page is in the active LRU list
+18 - UNEVICTABLE
+   page is in the unevictable (non-)LRU list It is somehow pinned and
+   not a candidate for LRU page reclaims, e.g. ramfs pages,
+   shmctl(SHM_LOCK) and mlock() memory segments
+2 - REFERENCED
+   page has been referenced since last LRU list enqueue/requeue
+9 - RECLAIM
+   page will be reclaimed soon after its pageout IO completed
+11 - MMAP
+   a memory mapped page
+12 - ANON
+   a memory mapped page that is not part of a file
+13 - SWAPCACHE
+   page is mapped to swap space, i.e. has an associated swap entry
+14 - SWAPBACKED
+   page is backed by swap/RAM
 
 The page-types tool in the tools/vm directory can be used to query the
 above flags.
 
-Using pagemap to do something useful:
+Using pagemap to do something useful
+====================================
 
 The general procedure for using pagemap to find out about a process' memory
 usage goes like this:
 
- 1. Read /proc/pid/maps to determine which parts of the memory space are
+ 1. Read ``/proc/pid/maps`` to determine which parts of the memory space are
     mapped to what.
  2. Select the maps you are interested in -- all of them, or a particular
     library, or the stack or the heap, etc.
- 3. Open /proc/pid/pagemap and seek to the pages you would like to examine.
+ 3. Open ``/proc/pid/pagemap`` and seek to the pages you would like to examine.
  4. Read a u64 for each page from pagemap.
- 5. Open /proc/kpagecount and/or /proc/kpageflags.  For each PFN you just
-    read, seek to that entry in the file, and read the data you want.
+ 5. Open ``/proc/kpagecount`` and/or ``/proc/kpageflags``.  For each PFN you
+    just read, seek to that entry in the file, and read the data you want.
 
 For example, to find the "unique set size" (USS), which is the amount of
 memory that a process is using that is not shared with any other process,
@@ -171,7 +188,8 @@ you can go through every map in the process, find the PFNs, look those up
 in kpagecount, and tally up the number of pages that are only referenced
 once.
 
-Other notes:
+Other notes
+===========
 
 Reading from any of the files will return -EINVAL if you are not starting
 the read on an 8-byte boundary (e.g., if you sought an odd number of bytes
diff --git a/Documentation/vm/soft-dirty.txt b/Documentation/admin-guide/mm/soft-dirty.rst
index 55684d1..cb0cfd6 100644
--- a/Documentation/vm/soft-dirty.txt
+++ b/Documentation/admin-guide/mm/soft-dirty.rst
@@ -1,34 +1,38 @@
-                            SOFT-DIRTY PTEs
+.. _soft_dirty:
 
-  The soft-dirty is a bit on a PTE which helps to track which pages a task
+===============
+Soft-Dirty PTEs
+===============
+
+The soft-dirty is a bit on a PTE which helps to track which pages a task
 writes to. In order to do this tracking one should
 
   1. Clear soft-dirty bits from the task's PTEs.
 
-     This is done by writing "4" into the /proc/PID/clear_refs file of the
+     This is done by writing "4" into the ``/proc/PID/clear_refs`` file of the
      task in question.
 
   2. Wait some time.
 
   3. Read soft-dirty bits from the PTEs.
 
-     This is done by reading from the /proc/PID/pagemap. The bit 55 of the
+     This is done by reading from the ``/proc/PID/pagemap``. The bit 55 of the
      64-bit qword is the soft-dirty one. If set, the respective PTE was
      written to since step 1.
 
 
-  Internally, to do this tracking, the writable bit is cleared from PTEs
+Internally, to do this tracking, the writable bit is cleared from PTEs
 when the soft-dirty bit is cleared. So, after this, when the task tries to
 modify a page at some virtual address the #PF occurs and the kernel sets
 the soft-dirty bit on the respective PTE.
 
-  Note, that although all the task's address space is marked as r/o after the
+Note, that although all the task's address space is marked as r/o after the
 soft-dirty bits clear, the #PF-s that occur after that are processed fast.
 This is so, since the pages are still mapped to physical memory, and thus all
 the kernel does is finds this fact out and puts both writable and soft-dirty
 bits on the PTE.
 
-  While in most cases tracking memory changes by #PF-s is more than enough
+While in most cases tracking memory changes by #PF-s is more than enough
 there is still a scenario when we can lose soft dirty bits -- a task
 unmaps a previously mapped memory region and then maps a new one at exactly
 the same place. When unmap is called, the kernel internally clears PTE values
@@ -36,7 +40,7 @@ including soft dirty bits. To notify user space application about such
 memory region renewal the kernel always marks new memory regions (and
 expanded regions) as soft dirty.
 
-  This feature is actively used by the checkpoint-restore project. You
+This feature is actively used by the checkpoint-restore project. You
 can find more details about it on http://criu.org
 
 
diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
new file mode 100644
index 0000000..7ab93a8
--- /dev/null
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -0,0 +1,418 @@
+.. _admin_guide_transhuge:
+
+============================
+Transparent Hugepage Support
+============================
+
+Objective
+=========
+
+Performance critical computing applications dealing with large memory
+working sets are already running on top of libhugetlbfs and in turn
+hugetlbfs. Transparent HugePage Support (THP) is an alternative mean of
+using huge pages for the backing of virtual memory with huge pages
+that supports the automatic promotion and demotion of page sizes and
+without the shortcomings of hugetlbfs.
+
+Currently THP only works for anonymous memory mappings and tmpfs/shmem.
+But in the future it can expand to other filesystems.
+
+.. note::
+   in the examples below we presume that the basic page size is 4K and
+   the huge page size is 2M, although the actual numbers may vary
+   depending on the CPU architecture.
+
+The reason applications are running faster is because of two
+factors. The first factor is almost completely irrelevant and it's not
+of significant interest because it'll also have the downside of
+requiring larger clear-page copy-page in page faults which is a
+potentially negative effect. The first factor consists in taking a
+single page fault for each 2M virtual region touched by userland (so
+reducing the enter/exit kernel frequency by a 512 times factor). This
+only matters the first time the memory is accessed for the lifetime of
+a memory mapping. The second long lasting and much more important
+factor will affect all subsequent accesses to the memory for the whole
+runtime of the application. The second factor consist of two
+components:
+
+1) the TLB miss will run faster (especially with virtualization using
+   nested pagetables but almost always also on bare metal without
+   virtualization)
+
+2) a single TLB entry will be mapping a much larger amount of virtual
+   memory in turn reducing the number of TLB misses. With
+   virtualization and nested pagetables the TLB can be mapped of
+   larger size only if both KVM and the Linux guest are using
+   hugepages but a significant speedup already happens if only one of
+   the two is using hugepages just because of the fact the TLB miss is
+   going to run faster.
+
+THP can be enabled system wide or restricted to certain tasks or even
+memory ranges inside task's address space. Unless THP is completely
+disabled, there is ``khugepaged`` daemon that scans memory and
+collapses sequences of basic pages into huge pages.
+
+The THP behaviour is controlled via :ref:`sysfs <thp_sysfs>`
+interface and using madivse(2) and prctl(2) system calls.
+
+Transparent Hugepage Support maximizes the usefulness of free memory
+if compared to the reservation approach of hugetlbfs by allowing all
+unused memory to be used as cache or other movable (or even unmovable
+entities). It doesn't require reservation to prevent hugepage
+allocation failures to be noticeable from userland. It allows paging
+and all other advanced VM features to be available on the
+hugepages. It requires no modifications for applications to take
+advantage of it.
+
+Applications however can be further optimized to take advantage of
+this feature, like for example they've been optimized before to avoid
+a flood of mmap system calls for every malloc(4k). Optimizing userland
+is by far not mandatory and khugepaged already can take care of long
+lived page allocations even for hugepage unaware applications that
+deals with large amounts of memory.
+
+In certain cases when hugepages are enabled system wide, application
+may end up allocating more memory resources. An application may mmap a
+large region but only touch 1 byte of it, in that case a 2M page might
+be allocated instead of a 4k page for no good. This is why it's
+possible to disable hugepages system-wide and to only have them inside
+MADV_HUGEPAGE madvise regions.
+
+Embedded systems should enable hugepages only inside madvise regions
+to eliminate any risk of wasting any precious byte of memory and to
+only run faster.
+
+Applications that gets a lot of benefit from hugepages and that don't
+risk to lose memory by using hugepages, should use
+madvise(MADV_HUGEPAGE) on their critical mmapped regions.
+
+.. _thp_sysfs:
+
+sysfs
+=====
+
+Global THP controls
+-------------------
+
+Transparent Hugepage Support for anonymous memory can be entirely disabled
+(mostly for debugging purposes) or only enabled inside MADV_HUGEPAGE
+regions (to avoid the risk of consuming more memory resources) or enabled
+system wide. This can be achieved with one of::
+
+	echo always >/sys/kernel/mm/transparent_hugepage/enabled
+	echo madvise >/sys/kernel/mm/transparent_hugepage/enabled
+	echo never >/sys/kernel/mm/transparent_hugepage/enabled
+
+It's also possible to limit defrag efforts in the VM to generate
+anonymous hugepages in case they're not immediately free to madvise
+regions or to never try to defrag memory and simply fallback to regular
+pages unless hugepages are immediately available. Clearly if we spend CPU
+time to defrag memory, we would expect to gain even more by the fact we
+use hugepages later instead of regular pages. This isn't always
+guaranteed, but it may be more likely in case the allocation is for a
+MADV_HUGEPAGE region.
+
+::
+
+	echo always >/sys/kernel/mm/transparent_hugepage/defrag
+	echo defer >/sys/kernel/mm/transparent_hugepage/defrag
+	echo defer+madvise >/sys/kernel/mm/transparent_hugepage/defrag
+	echo madvise >/sys/kernel/mm/transparent_hugepage/defrag
+	echo never >/sys/kernel/mm/transparent_hugepage/defrag
+
+always
+	means that an application requesting THP will stall on
+	allocation failure and directly reclaim pages and compact
+	memory in an effort to allocate a THP immediately. This may be
+	desirable for virtual machines that benefit heavily from THP
+	use and are willing to delay the VM start to utilise them.
+
+defer
+	means that an application will wake kswapd in the background
+	to reclaim pages and wake kcompactd to compact memory so that
+	THP is available in the near future. It's the responsibility
+	of khugepaged to then install the THP pages later.
+
+defer+madvise
+	will enter direct reclaim and compaction like ``always``, but
+	only for regions that have used madvise(MADV_HUGEPAGE); all
+	other regions will wake kswapd in the background to reclaim
+	pages and wake kcompactd to compact memory so that THP is
+	available in the near future.
+
+madvise
+	will enter direct reclaim like ``always`` but only for regions
+	that are have used madvise(MADV_HUGEPAGE). This is the default
+	behaviour.
+
+never
+	should be self-explanatory.
+
+By default kernel tries to use huge zero page on read page fault to
+anonymous mapping. It's possible to disable huge zero page by writing 0
+or enable it back by writing 1::
+
+	echo 0 >/sys/kernel/mm/transparent_hugepage/use_zero_page
+	echo 1 >/sys/kernel/mm/transparent_hugepage/use_zero_page
+
+Some userspace (such as a test program, or an optimized memory allocation
+library) may want to know the size (in bytes) of a transparent hugepage::
+
+	cat /sys/kernel/mm/transparent_hugepage/hpage_pmd_size
+
+khugepaged will be automatically started when
+transparent_hugepage/enabled is set to "always" or "madvise, and it'll
+be automatically shutdown if it's set to "never".
+
+Khugepaged controls
+-------------------
+
+khugepaged runs usually at low frequency so while one may not want to
+invoke defrag algorithms synchronously during the page faults, it
+should be worth invoking defrag at least in khugepaged. However it's
+also possible to disable defrag in khugepaged by writing 0 or enable
+defrag in khugepaged by writing 1::
+
+	echo 0 >/sys/kernel/mm/transparent_hugepage/khugepaged/defrag
+	echo 1 >/sys/kernel/mm/transparent_hugepage/khugepaged/defrag
+
+You can also control how many pages khugepaged should scan at each
+pass::
+
+	/sys/kernel/mm/transparent_hugepage/khugepaged/pages_to_scan
+
+and how many milliseconds to wait in khugepaged between each pass (you
+can set this to 0 to run khugepaged at 100% utilization of one core)::
+
+	/sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs
+
+and how many milliseconds to wait in khugepaged if there's an hugepage
+allocation failure to throttle the next allocation attempt::
+
+	/sys/kernel/mm/transparent_hugepage/khugepaged/alloc_sleep_millisecs
+
+The khugepaged progress can be seen in the number of pages collapsed::
+
+	/sys/kernel/mm/transparent_hugepage/khugepaged/pages_collapsed
+
+for each pass::
+
+	/sys/kernel/mm/transparent_hugepage/khugepaged/full_scans
+
+``max_ptes_none`` specifies how many extra small pages (that are
+not already mapped) can be allocated when collapsing a group
+of small pages into one large page::
+
+	/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none
+
+A higher value leads to use additional memory for programs.
+A lower value leads to gain less thp performance. Value of
+max_ptes_none can waste cpu time very little, you can
+ignore it.
+
+``max_ptes_swap`` specifies how many pages can be brought in from
+swap when collapsing a group of pages into a transparent huge page::
+
+	/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_swap
+
+A higher value can cause excessive swap IO and waste
+memory. A lower value can prevent THPs from being
+collapsed, resulting fewer pages being collapsed into
+THPs, and lower memory access performance.
+
+Boot parameter
+==============
+
+You can change the sysfs boot time defaults of Transparent Hugepage
+Support by passing the parameter ``transparent_hugepage=always`` or
+``transparent_hugepage=madvise`` or ``transparent_hugepage=never``
+to the kernel command line.
+
+Hugepages in tmpfs/shmem
+========================
+
+You can control hugepage allocation policy in tmpfs with mount option
+``huge=``. It can have following values:
+
+always
+    Attempt to allocate huge pages every time we need a new page;
+
+never
+    Do not allocate huge pages;
+
+within_size
+    Only allocate huge page if it will be fully within i_size.
+    Also respect fadvise()/madvise() hints;
+
+advise
+    Only allocate huge pages if requested with fadvise()/madvise();
+
+The default policy is ``never``.
+
+``mount -o remount,huge= /mountpoint`` works fine after mount: remounting
+``huge=never`` will not attempt to break up huge pages at all, just stop more
+from being allocated.
+
+There's also sysfs knob to control hugepage allocation policy for internal
+shmem mount: /sys/kernel/mm/transparent_hugepage/shmem_enabled. The mount
+is used for SysV SHM, memfds, shared anonymous mmaps (of /dev/zero or
+MAP_ANONYMOUS), GPU drivers' DRM objects, Ashmem.
+
+In addition to policies listed above, shmem_enabled allows two further
+values:
+
+deny
+    For use in emergencies, to force the huge option off from
+    all mounts;
+force
+    Force the huge option on for all - very useful for testing;
+
+Need of application restart
+===========================
+
+The transparent_hugepage/enabled values and tmpfs mount option only affect
+future behavior. So to make them effective you need to restart any
+application that could have been using hugepages. This also applies to the
+regions registered in khugepaged.
+
+Monitoring usage
+================
+
+The number of anonymous transparent huge pages currently used by the
+system is available by reading the AnonHugePages field in ``/proc/meminfo``.
+To identify what applications are using anonymous transparent huge pages,
+it is necessary to read ``/proc/PID/smaps`` and count the AnonHugePages fields
+for each mapping.
+
+The number of file transparent huge pages mapped to userspace is available
+by reading ShmemPmdMapped and ShmemHugePages fields in ``/proc/meminfo``.
+To identify what applications are mapping file transparent huge pages, it
+is necessary to read ``/proc/PID/smaps`` and count the FileHugeMapped fields
+for each mapping.
+
+Note that reading the smaps file is expensive and reading it
+frequently will incur overhead.
+
+There are a number of counters in ``/proc/vmstat`` that may be used to
+monitor how successfully the system is providing huge pages for use.
+
+thp_fault_alloc
+	is incremented every time a huge page is successfully
+	allocated to handle a page fault. This applies to both the
+	first time a page is faulted and for COW faults.
+
+thp_collapse_alloc
+	is incremented by khugepaged when it has found
+	a range of pages to collapse into one huge page and has
+	successfully allocated a new huge page to store the data.
+
+thp_fault_fallback
+	is incremented if a page fault fails to allocate
+	a huge page and instead falls back to using small pages.
+
+thp_collapse_alloc_failed
+	is incremented if khugepaged found a range
+	of pages that should be collapsed into one huge page but failed
+	the allocation.
+
+thp_file_alloc
+	is incremented every time a file huge page is successfully
+	allocated.
+
+thp_file_mapped
+	is incremented every time a file huge page is mapped into
+	user address space.
+
+thp_split_page
+	is incremented every time a huge page is split into base
+	pages. This can happen for a variety of reasons but a common
+	reason is that a huge page is old and is being reclaimed.
+	This action implies splitting all PMD the page mapped with.
+
+thp_split_page_failed
+	is incremented if kernel fails to split huge
+	page. This can happen if the page was pinned by somebody.
+
+thp_deferred_split_page
+	is incremented when a huge page is put onto split
+	queue. This happens when a huge page is partially unmapped and
+	splitting it would free up some memory. Pages on split queue are
+	going to be split under memory pressure.
+
+thp_split_pmd
+	is incremented every time a PMD split into table of PTEs.
+	This can happen, for instance, when application calls mprotect() or
+	munmap() on part of huge page. It doesn't split huge page, only
+	page table entry.
+
+thp_zero_page_alloc
+	is incremented every time a huge zero page is
+	successfully allocated. It includes allocations which where
+	dropped due race with other allocation. Note, it doesn't count
+	every map of the huge zero page, only its allocation.
+
+thp_zero_page_alloc_failed
+	is incremented if kernel fails to allocate
+	huge zero page and falls back to using small pages.
+
+thp_swpout
+	is incremented every time a huge page is swapout in one
+	piece without splitting.
+
+thp_swpout_fallback
+	is incremented if a huge page has to be split before swapout.
+	Usually because failed to allocate some continuous swap space
+	for the huge page.
+
+As the system ages, allocating huge pages may be expensive as the
+system uses memory compaction to copy data around memory to free a
+huge page for use. There are some counters in ``/proc/vmstat`` to help
+monitor this overhead.
+
+compact_stall
+	is incremented every time a process stalls to run
+	memory compaction so that a huge page is free for use.
+
+compact_success
+	is incremented if the system compacted memory and
+	freed a huge page for use.
+
+compact_fail
+	is incremented if the system tries to compact memory
+	but failed.
+
+compact_pages_moved
+	is incremented each time a page is moved. If
+	this value is increasing rapidly, it implies that the system
+	is copying a lot of data to satisfy the huge page allocation.
+	It is possible that the cost of copying exceeds any savings
+	from reduced TLB misses.
+
+compact_pagemigrate_failed
+	is incremented when the underlying mechanism
+	for moving a page failed.
+
+compact_blocks_moved
+	is incremented each time memory compaction examines
+	a huge page aligned range of pages.
+
+It is possible to establish how long the stalls were using the function
+tracer to record how long was spent in __alloc_pages_nodemask and
+using the mm_page_alloc tracepoint to identify which allocations were
+for huge pages.
+
+Optimizing the applications
+===========================
+
+To be guaranteed that the kernel will map a 2M page immediately in any
+memory region, the mmap region has to be hugepage naturally
+aligned. posix_memalign() can provide that guarantee.
+
+Hugetlbfs
+=========
+
+You can use hugetlbfs on a kernel that has transparent hugepage
+support enabled just fine as always. No difference can be noted in
+hugetlbfs other than there will be less overall fragmentation. All
+usual features belonging to hugetlbfs are preserved and
+unaffected. libhugetlbfs will also work fine as usual.
diff --git a/Documentation/vm/userfaultfd.txt b/Documentation/admin-guide/mm/userfaultfd.rst
index bb2f945..5048cf6 100644
--- a/Documentation/vm/userfaultfd.txt
+++ b/Documentation/admin-guide/mm/userfaultfd.rst
@@ -1,6 +1,11 @@
-= Userfaultfd =
+.. _userfaultfd:
 
-== Objective ==
+===========
+Userfaultfd
+===========
+
+Objective
+=========
 
 Userfaults allow the implementation of on-demand paging from userland
 and more generally they allow userland to take control of various
@@ -9,7 +14,8 @@ memory page faults, something otherwise only the kernel code could do.
 For example userfaults allows a proper and more optimal implementation
 of the PROT_NONE+SIGSEGV trick.
 
-== Design ==
+Design
+======
 
 Userfaults are delivered and resolved through the userfaultfd syscall.
 
@@ -41,7 +47,8 @@ different processes without them being aware about what is going on
 themselves on the same region the manager is already tracking, which
 is a corner case that would currently return -EBUSY).
 
-== API ==
+API
+===
 
 When first opened the userfaultfd must be enabled invoking the
 UFFDIO_API ioctl specifying a uffdio_api.api value set to UFFD_API (or
@@ -101,7 +108,8 @@ UFFDIO_COPY. They're atomic as in guaranteeing that nothing can see an
 half copied page since it'll keep userfaulting until the copy has
 finished.
 
-== QEMU/KVM ==
+QEMU/KVM
+========
 
 QEMU/KVM is using the userfaultfd syscall to implement postcopy live
 migration. Postcopy live migration is one form of memory
@@ -163,7 +171,8 @@ sending the same page twice (in case the userfault is read by the
 postcopy thread just before UFFDIO_COPY|ZEROPAGE runs in the migration
 thread).
 
-== Non-cooperative userfaultfd ==
+Non-cooperative userfaultfd
+===========================
 
 When the userfaultfd is monitored by an external manager, the manager
 must be able to track changes in the process virtual memory
@@ -172,27 +181,30 @@ the same read(2) protocol as for the page fault notifications. The
 manager has to explicitly enable these events by setting appropriate
 bits in uffdio_api.features passed to UFFDIO_API ioctl:
 
-UFFD_FEATURE_EVENT_FORK - enable userfaultfd hooks for fork(). When
-this feature is enabled, the userfaultfd context of the parent process
-is duplicated into the newly created process. The manager receives
-UFFD_EVENT_FORK with file descriptor of the new userfaultfd context in
-the uffd_msg.fork.
-
-UFFD_FEATURE_EVENT_REMAP - enable notifications about mremap()
-calls. When the non-cooperative process moves a virtual memory area to
-a different location, the manager will receive UFFD_EVENT_REMAP. The
-uffd_msg.remap will contain the old and new addresses of the area and
-its original length.
-
-UFFD_FEATURE_EVENT_REMOVE - enable notifications about
-madvise(MADV_REMOVE) and madvise(MADV_DONTNEED) calls. The event
-UFFD_EVENT_REMOVE will be generated upon these calls to madvise. The
-uffd_msg.remove will contain start and end addresses of the removed
-area.
-
-UFFD_FEATURE_EVENT_UNMAP - enable notifications about memory
-unmapping. The manager will get UFFD_EVENT_UNMAP with uffd_msg.remove
-containing start and end addresses of the unmapped area.
+UFFD_FEATURE_EVENT_FORK
+	enable userfaultfd hooks for fork(). When this feature is
+	enabled, the userfaultfd context of the parent process is
+	duplicated into the newly created process. The manager
+	receives UFFD_EVENT_FORK with file descriptor of the new
+	userfaultfd context in the uffd_msg.fork.
+
+UFFD_FEATURE_EVENT_REMAP
+	enable notifications about mremap() calls. When the
+	non-cooperative process moves a virtual memory area to a
+	different location, the manager will receive
+	UFFD_EVENT_REMAP. The uffd_msg.remap will contain the old and
+	new addresses of the area and its original length.
+
+UFFD_FEATURE_EVENT_REMOVE
+	enable notifications about madvise(MADV_REMOVE) and
+	madvise(MADV_DONTNEED) calls. The event UFFD_EVENT_REMOVE will
+	be generated upon these calls to madvise. The uffd_msg.remove
+	will contain start and end addresses of the removed area.
+
+UFFD_FEATURE_EVENT_UNMAP
+	enable notifications about memory unmapping. The manager will
+	get UFFD_EVENT_UNMAP with uffd_msg.remove containing start and
+	end addresses of the unmapped area.
 
 Although the UFFD_FEATURE_EVENT_REMOVE and UFFD_FEATURE_EVENT_UNMAP
 are pretty similar, they quite differ in the action expected from the
diff --git a/Documentation/admin-guide/ramoops.rst b/Documentation/admin-guide/ramoops.rst
index 4efd7ce..6dbcc54 100644
--- a/Documentation/admin-guide/ramoops.rst
+++ b/Documentation/admin-guide/ramoops.rst
@@ -61,7 +61,7 @@ Setting the ramoops parameters can be done in several different manners:
 	mem=128M ramoops.mem_address=0x8000000 ramoops.ecc=1
 
  B. Use Device Tree bindings, as described in
- ``Documentation/device-tree/bindings/reserved-memory/admin-guide/ramoops.rst``.
+ ``Documentation/devicetree/bindings/reserved-memory/ramoops.txt``.
  For example::
 
 	reserved-memory {
diff --git a/Documentation/arm/Marvell/README b/Documentation/arm/Marvell/README
index b5bb7f5..56ada27c5 100644
--- a/Documentation/arm/Marvell/README
+++ b/Documentation/arm/Marvell/README
@@ -302,19 +302,15 @@ Berlin family (Multimedia Solutions)
 	88DE3010, Armada 1000 (no Linux support)
 		Core:		Marvell PJ1 (ARMv5TE), Dual-core
 		Product Brief:	http://www.marvell.com.cn/digital-entertainment/assets/armada_1000_pb.pdf
-	88DE3005, Armada 1500-mini
 	88DE3005, Armada 1500 Mini
 		Design name:	BG2CD
 		Core:		ARM Cortex-A9, PL310 L2CC
-		Homepage:	http://www.marvell.com/multimedia-solutions/armada-1500-mini/
-        88DE3006, Armada 1500 Mini Plus
-                Design name:    BG2CDP
-                Core:           Dual Core ARM Cortex-A7
-                Homepage:       http://www.marvell.com/multimedia-solutions/armada-1500-mini-plus/
+	88DE3006, Armada 1500 Mini Plus
+		Design name:	BG2CDP
+		Core:		Dual Core ARM Cortex-A7
 	88DE3100, Armada 1500
 		Design name:	BG2
 		Core:		Marvell PJ4B-MP (ARMv7), Tauros3 L2CC
-		Product Brief:	http://www.marvell.com/digital-entertainment/armada-1500/assets/Marvell-ARMADA-1500-Product-Brief.pdf
 	88DE3114, Armada 1500 Pro
 		Design name:	BG2Q
 		Core:		Quad Core ARM Cortex-A9, PL310 L2CC
@@ -324,13 +320,16 @@ Berlin family (Multimedia Solutions)
 	88DE3218, ARMADA 1500 Ultra
 		Core:		ARM Cortex-A53
 
-  Homepage: http://www.marvell.com/multimedia-solutions/
+  Homepage: https://www.synaptics.com/products/multimedia-solutions
   Directory: arch/arm/mach-berlin
 
   Comments:
+
    * This line of SoCs is based on Marvell Sheeva or ARM Cortex CPUs
      with Synopsys DesignWare (IRQ, GPIO, Timers, ...) and PXA IP (SDHCI, USB, ETH, ...).
 
+   * The Berlin family was acquired by Synaptics from Marvell in 2017.
+
 CPU Cores
 ---------
 
diff --git a/Documentation/block/cmdline-partition.txt b/Documentation/block/cmdline-partition.txt
index 525b9f6..760a3f7 100644
--- a/Documentation/block/cmdline-partition.txt
+++ b/Documentation/block/cmdline-partition.txt
@@ -1,7 +1,9 @@
 Embedded device command line partition parsing
 =====================================================================
 
-Support for reading the block device partition table from the command line.
+The "blkdevparts" command line option adds support for reading the
+block device partition table from the kernel command line.
+
 It is typically used for fixed block (eMMC) embedded devices.
 It has no MBR, so saves storage space. Bootloader can be easily accessed
 by absolute address of data on the block device.
@@ -14,22 +16,27 @@ blkdevparts=<blkdev-def>[;<blkdev-def>]
     <partdef> := <size>[@<offset>](part-name)
 
 <blkdev-id>
-    block device disk name, embedded device used fixed block device,
-    it's disk name also fixed. such as: mmcblk0, mmcblk1, mmcblk0boot0.
+    block device disk name. Embedded device uses fixed block device.
+    Its disk name is also fixed, such as: mmcblk0, mmcblk1, mmcblk0boot0.
 
 <size>
     partition size, in bytes, such as: 512, 1m, 1G.
+    size may contain an optional suffix of (upper or lower case):
+      K, M, G, T, P, E.
+    "-" is used to denote all remaining space.
 
 <offset>
     partition start address, in bytes.
+    offset may contain an optional suffix of (upper or lower case):
+      K, M, G, T, P, E.
 
 (part-name)
-    partition name, kernel send uevent with "PARTNAME". application can create
-    a link to block device partition with the name "PARTNAME".
-    user space application can access partition by partition name.
+    partition name. Kernel sends uevent with "PARTNAME". Application can
+    create a link to block device partition with the name "PARTNAME".
+    User space application can access partition by partition name.
 
 Example:
-    eMMC disk name is "mmcblk0" and "mmcblk0boot0"
+    eMMC disk names are "mmcblk0" and "mmcblk0boot0".
 
   bootargs:
     'blkdevparts=mmcblk0:1G(data0),1G(data1),-;mmcblk0boot0:1m(boot),-(kernel)'
diff --git a/Documentation/block/null_blk.txt b/Documentation/block/null_blk.txt
index 733927a..07f1473 100644
--- a/Documentation/block/null_blk.txt
+++ b/Documentation/block/null_blk.txt
@@ -71,13 +71,16 @@ use_per_node_hctx=[0/1]: Default: 0
   1: The multi-queue block layer is instantiated with a hardware dispatch
      queue for each CPU node in the system.
 
-use_lightnvm=[0/1]: Default: 0
-  Register device with LightNVM. Requires blk-mq and CONFIG_NVM to be enabled.
-
 no_sched=[0/1]: Default: 0
   0: nullb* use default blk-mq io scheduler.
   1: nullb* doesn't use io scheduler.
 
+blocking=[0/1]: Default: 0
+  0: Register as a non-blocking blk-mq driver device.
+  1: Register as a blocking blk-mq driver device, null_blk will set
+     the BLK_MQ_F_BLOCKING flag, indicating that it sometimes/always
+     needs to block in its ->queue_rq() function.
+
 shared_tags=[0/1]: Default: 0
   0: Tag set is not shared.
   1: Tag set shared between devices for blk-mq. Only makes sense with
diff --git a/Documentation/core-api/atomic_ops.rst b/Documentation/core-api/atomic_ops.rst
index fce9291..2e7165f 100644
--- a/Documentation/core-api/atomic_ops.rst
+++ b/Documentation/core-api/atomic_ops.rst
@@ -111,7 +111,6 @@ If the compiler can prove that do_something() does not store to the
 variable a, then the compiler is within its rights transforming this to
 the following::
 
-	tmp = a;
 	if (a > 0)
 		for (;;)
 			do_something();
@@ -119,7 +118,7 @@ the following::
 If you don't want the compiler to do this (and you probably don't), then
 you should use something like the following::
 
-	while (READ_ONCE(a) < 0)
+	while (READ_ONCE(a) > 0)
 		do_something();
 
 Alternatively, you could place a barrier() call in the loop.
@@ -467,10 +466,12 @@ Like the above, except that these routines return a boolean which
 indicates whether the changed bit was set _BEFORE_ the atomic bit
 operation.
 
-WARNING! It is incredibly important that the value be a boolean,
-ie. "0" or "1".  Do not try to be fancy and save a few instructions by
-declaring the above to return "long" and just returning something like
-"old_val & mask" because that will not work.
+
+.. warning::
+        It is incredibly important that the value be a boolean, ie. "0" or "1".
+        Do not try to be fancy and save a few instructions by declaring the
+        above to return "long" and just returning something like "old_val &
+        mask" because that will not work.
 
 For one thing, this return value gets truncated to int in many code
 paths using these interfaces, so on 64-bit if the bit is set in the
diff --git a/Documentation/cachetlb.txt b/Documentation/core-api/cachetlb.rst
index 6eb9d3f..6eb9d3f 100644
--- a/Documentation/cachetlb.txt
+++ b/Documentation/core-api/cachetlb.rst
diff --git a/Documentation/circular-buffers.txt b/Documentation/core-api/circular-buffers.rst
index 53e51ca..53e51ca 100644
--- a/Documentation/circular-buffers.txt
+++ b/Documentation/core-api/circular-buffers.rst
diff --git a/Documentation/core-api/gfp_mask-from-fs-io.rst b/Documentation/core-api/gfp_mask-from-fs-io.rst
new file mode 100644
index 0000000..e0df8f4
--- /dev/null
+++ b/Documentation/core-api/gfp_mask-from-fs-io.rst
@@ -0,0 +1,66 @@
+=================================
+GFP masks used from FS/IO context
+=================================
+
+:Date: May, 2018
+:Author: Michal Hocko <mhocko@kernel.org>
+
+Introduction
+============
+
+Code paths in the filesystem and IO stacks must be careful when
+allocating memory to prevent recursion deadlocks caused by direct
+memory reclaim calling back into the FS or IO paths and blocking on
+already held resources (e.g. locks - most commonly those used for the
+transaction context).
+
+The traditional way to avoid this deadlock problem is to clear __GFP_FS
+respectively __GFP_IO (note the latter implies clearing the first as well) in
+the gfp mask when calling an allocator. GFP_NOFS respectively GFP_NOIO can be
+used as shortcut. It turned out though that above approach has led to
+abuses when the restricted gfp mask is used "just in case" without a
+deeper consideration which leads to problems because an excessive use
+of GFP_NOFS/GFP_NOIO can lead to memory over-reclaim or other memory
+reclaim issues.
+
+New API
+========
+
+Since 4.12 we do have a generic scope API for both NOFS and NOIO context
+``memalloc_nofs_save``, ``memalloc_nofs_restore`` respectively ``memalloc_noio_save``,
+``memalloc_noio_restore`` which allow to mark a scope to be a critical
+section from a filesystem or I/O point of view. Any allocation from that
+scope will inherently drop __GFP_FS respectively __GFP_IO from the given
+mask so no memory allocation can recurse back in the FS/IO.
+
+.. kernel-doc:: include/linux/sched/mm.h
+   :functions: memalloc_nofs_save memalloc_nofs_restore
+.. kernel-doc:: include/linux/sched/mm.h
+   :functions: memalloc_noio_save memalloc_noio_restore
+
+FS/IO code then simply calls the appropriate save function before
+any critical section with respect to the reclaim is started - e.g.
+lock shared with the reclaim context or when a transaction context
+nesting would be possible via reclaim. The restore function should be
+called when the critical section ends. All that ideally along with an
+explanation what is the reclaim context for easier maintenance.
+
+Please note that the proper pairing of save/restore functions
+allows nesting so it is safe to call ``memalloc_noio_save`` or
+``memalloc_noio_restore`` respectively from an existing NOIO or NOFS
+scope.
+
+What about __vmalloc(GFP_NOFS)
+==============================
+
+vmalloc doesn't support GFP_NOFS semantic because there are hardcoded
+GFP_KERNEL allocations deep inside the allocator which are quite non-trivial
+to fix up. That means that calling ``vmalloc`` with GFP_NOFS/GFP_NOIO is
+almost always a bug. The good news is that the NOFS/NOIO semantic can be
+achieved by the scope API.
+
+In the ideal world, upper layers should already mark dangerous contexts
+and so no special care is required and vmalloc should be called without
+any problems. Sometimes if the context is not really clear or there are
+layering violations then the recommended way around that is to wrap ``vmalloc``
+by the scope API with a comment explaining the problem.
diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst
index c670a80..f5a66b7 100644
--- a/Documentation/core-api/index.rst
+++ b/Documentation/core-api/index.rst
@@ -14,6 +14,7 @@ Core utilities
    kernel-api
    assoc_array
    atomic_ops
+   cachetlb
    refcount-vs-atomic
    cpu_hotplug
    idr
@@ -25,6 +26,8 @@ Core utilities
    genalloc
    errseq
    printk-formats
+   circular-buffers
+   gfp_mask-from-fs-io
 
 Interfaces for kernel debugging
 ===============================
diff --git a/Documentation/core-api/kernel-api.rst b/Documentation/core-api/kernel-api.rst
index 92f3000..8e44aea 100644
--- a/Documentation/core-api/kernel-api.rst
+++ b/Documentation/core-api/kernel-api.rst
@@ -39,17 +39,17 @@ String Manipulation
 .. kernel-doc:: lib/string.c
    :export:
 
+Basic Kernel Library Functions
+==============================
+
+The Linux kernel provides more basic utility functions.
+
 Bit Operations
 --------------
 
 .. kernel-doc:: arch/x86/include/asm/bitops.h
    :internal:
 
-Basic Kernel Library Functions
-==============================
-
-The Linux kernel provides more basic utility functions.
-
 Bitmap Operations
 -----------------
 
@@ -80,6 +80,31 @@ Command-line Parsing
 .. kernel-doc:: lib/cmdline.c
    :export:
 
+Sorting
+-------
+
+.. kernel-doc:: lib/sort.c
+   :export:
+
+.. kernel-doc:: lib/list_sort.c
+   :export:
+
+Text Searching
+--------------
+
+.. kernel-doc:: lib/textsearch.c
+   :doc: ts_intro
+
+.. kernel-doc:: lib/textsearch.c
+   :export:
+
+.. kernel-doc:: include/linux/textsearch.h
+   :functions: textsearch_find textsearch_next \
+               textsearch_get_pattern textsearch_get_pattern_len
+
+CRC and Math Functions in Linux
+===============================
+
 CRC Functions
 -------------
 
@@ -103,9 +128,6 @@ CRC Functions
 .. kernel-doc:: lib/crc-itu-t.c
    :export:
 
-Math Functions in Linux
-=======================
-
 Base 2 log and power Functions
 ------------------------------
 
@@ -127,28 +149,6 @@ Division Functions
 .. kernel-doc:: lib/gcd.c
    :export:
 
-Sorting
--------
-
-.. kernel-doc:: lib/sort.c
-   :export:
-
-.. kernel-doc:: lib/list_sort.c
-   :export:
-
-Text Searching
---------------
-
-.. kernel-doc:: lib/textsearch.c
-   :doc: ts_intro
-
-.. kernel-doc:: lib/textsearch.c
-   :export:
-
-.. kernel-doc:: include/linux/textsearch.h
-   :functions: textsearch_find textsearch_next \
-               textsearch_get_pattern textsearch_get_pattern_len
-
 UUID/GUID
 ---------
 
diff --git a/Documentation/core-api/refcount-vs-atomic.rst b/Documentation/core-api/refcount-vs-atomic.rst
index 83351c2..322851b 100644
--- a/Documentation/core-api/refcount-vs-atomic.rst
+++ b/Documentation/core-api/refcount-vs-atomic.rst
@@ -17,7 +17,7 @@ in order to help maintainers validate their code against the change in
 these memory ordering guarantees.
 
 The terms used through this document try to follow the formal LKMM defined in
-github.com/aparri/memory-model/blob/master/Documentation/explanation.txt
+tools/memory-model/Documentation/explanation.txt.
 
 memory-barriers.txt and atomic_t.txt provide more background to the
 memory ordering in general and for atomic operations specifically.
diff --git a/Documentation/crypto/index.rst b/Documentation/crypto/index.rst
index 94c4786..c4ff5d7 100644
--- a/Documentation/crypto/index.rst
+++ b/Documentation/crypto/index.rst
@@ -20,5 +20,6 @@ for cryptographic use cases, as well as programming examples.
    architecture
    devel-algos
    userspace-if
+   crypto_engine
    api
    api-samples
diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst
index f7a18f2..aabc873 100644
--- a/Documentation/dev-tools/kasan.rst
+++ b/Documentation/dev-tools/kasan.rst
@@ -120,7 +120,7 @@ A typical out of bounds access report looks like this::
 
 The header of the report discribe what kind of bug happened and what kind of
 access caused it. It's followed by the description of the accessed slub object
-(see 'SLUB Debug output' section in Documentation/vm/slub.txt for details) and
+(see 'SLUB Debug output' section in Documentation/vm/slub.rst for details) and
 the description of the accessed memory page.
 
 In the last section the report shows memory state around the accessed address.
diff --git a/Documentation/dev-tools/kselftest.rst b/Documentation/dev-tools/kselftest.rst
index e80850e..3bf371a 100644
--- a/Documentation/dev-tools/kselftest.rst
+++ b/Documentation/dev-tools/kselftest.rst
@@ -151,6 +151,11 @@ Contributing new tests (details)
    TEST_FILES, TEST_GEN_FILES mean it is the file which is used by
    test.
 
+ * First use the headers inside the kernel source and/or git repo, and then the
+   system headers.  Headers for the kernel release as opposed to headers
+   installed by the distro on the system should be the primary focus to be able
+   to find regressions.
+
 Test Harness
 ============
 
diff --git a/Documentation/devicetree/bindings/hwmon/gpio-fan.txt b/Documentation/devicetree/bindings/hwmon/gpio-fan.txt
index 439a743..2becdcf 100644
--- a/Documentation/devicetree/bindings/hwmon/gpio-fan.txt
+++ b/Documentation/devicetree/bindings/hwmon/gpio-fan.txt
@@ -11,7 +11,7 @@ Optional properties:
   must have the RPM values in ascending order.
 - alarm-gpios: This pin going active indicates something is wrong with
   the fan, and a udev event will be fired.
-- cooling-cells: If used as a cooling device, must be <2>
+- #cooling-cells: If used as a cooling device, must be <2>
   Also see: Documentation/devicetree/bindings/thermal/thermal.txt
   min and max states are derived from the speed-map of the fan.
 
diff --git a/Documentation/devicetree/bindings/hwmon/ltc2990.txt b/Documentation/devicetree/bindings/hwmon/ltc2990.txt
new file mode 100644
index 0000000..f92f540
--- /dev/null
+++ b/Documentation/devicetree/bindings/hwmon/ltc2990.txt
@@ -0,0 +1,36 @@
+ltc2990: Linear Technology LTC2990 power monitor
+
+Required properties:
+- compatible: Must be "lltc,ltc2990"
+- reg: I2C slave address
+- lltc,meas-mode:
+	An array of two integers for configuring the chip measurement mode.
+
+	The first integer defines the bits 2..0 in the control register. In all
+	cases the internal temperature and supply voltage are measured. In
+	addition the following input measurements are enabled per mode:
+
+		0: V1, V2, TR2
+		1: V1-V2, TR2
+		2: V1-V2, V3, V4
+		3: TR1, V3, V4
+		4: TR1, V3-V4
+		5: TR1, TR2
+		6: V1-V2, V3-V4
+		7: V1, V2, V3, V4
+
+	The second integer defines the bits 4..3 in the control register. This
+	allows a subset of the measurements to be enabled:
+
+		0: Internal temperature and supply voltage only
+		1: TR1, V1 or V1-V2 only per mode
+		2: TR2, V3 or V3-V4 only per mode
+		3: All measurements per mode
+
+Example:
+
+ltc2990@4c {
+	compatible = "lltc,ltc2990";
+	reg = <0x4c>;
+	lltc,meas-mode = <7 3>;	/* V1, V2, V3, V4 */
+};
diff --git a/Documentation/clk.txt b/Documentation/driver-api/clk.rst
index 511628b..511628b 100644
--- a/Documentation/clk.txt
+++ b/Documentation/driver-api/clk.rst
diff --git a/Documentation/driver-api/device_connection.rst b/Documentation/driver-api/device_connection.rst
index affbc556..ba36422 100644
--- a/Documentation/driver-api/device_connection.rst
+++ b/Documentation/driver-api/device_connection.rst
@@ -40,4 +40,4 @@ API
 ---
 
 .. kernel-doc:: drivers/base/devcon.c
-   : functions: device_connection_find_match device_connection_find device_connection_add device_connection_remove
+   :functions: device_connection_find_match device_connection_find device_connection_add device_connection_remove
diff --git a/Documentation/driver-api/gpio/driver.rst b/Documentation/driver-api/gpio/driver.rst
index 505ee90..cbe0242 100644
--- a/Documentation/driver-api/gpio/driver.rst
+++ b/Documentation/driver-api/gpio/driver.rst
@@ -44,7 +44,7 @@ common to each controller of that type:
 
  - methods to establish GPIO line direction
  - methods used to access GPIO line values
- - method to set electrical configuration to a a given GPIO line
+ - method to set electrical configuration for a given GPIO line
  - method to return the IRQ number associated to a given GPIO line
  - flag saying whether calls to its methods may sleep
  - optional line names array to identify lines
@@ -143,7 +143,7 @@ resistor will make the line tend to high level unless one of the transistors on
 the rail actively pulls it down.
 
 The level on the line will go as high as the VDD on the pull-up resistor, which
-may be higher than the level supported by the transistor, achieveing a
+may be higher than the level supported by the transistor, achieving a
 level-shift to the higher VDD.
 
 Integrated electronics often have an output driver stage in the form of a CMOS
@@ -382,7 +382,7 @@ Real-Time compliance for GPIO IRQ chips
 
 Any provider of irqchips needs to be carefully tailored to support Real Time
 preemption. It is desirable that all irqchips in the GPIO subsystem keep this
-in mind and does the proper testing to assure they are real time-enabled.
+in mind and do the proper testing to assure they are real time-enabled.
 So, pay attention on above " RT_FULL:" notes, please.
 The following is a checklist to follow when preparing a driver for real
 time-compliance:
diff --git a/Documentation/driver-api/index.rst b/Documentation/driver-api/index.rst
index 6d8352c..5d04296 100644
--- a/Documentation/driver-api/index.rst
+++ b/Documentation/driver-api/index.rst
@@ -17,7 +17,9 @@ available subsections can be seen below.
    basics
    infrastructure
    pm/index
+   clk
    device-io
+   device_connection
    dma-buf
    device_link
    message-based
diff --git a/Documentation/driver-api/uio-howto.rst b/Documentation/driver-api/uio-howto.rst
index 92056c2..fb2eb73 100644
--- a/Documentation/driver-api/uio-howto.rst
+++ b/Documentation/driver-api/uio-howto.rst
@@ -711,7 +711,8 @@ The vmbus device regions are mapped into uio device resources:
 
 If a subchannel is created by a request to host, then the uio_hv_generic
 device driver will create a sysfs binary file for the per-channel ring buffer.
-For example:
+For example::
+
 	/sys/bus/vmbus/devices/3811fe4d-0fa0-4b62-981a-74fc1084c757/channels/21/ring
 
 Further information
diff --git a/Documentation/features/lib/strncasecmp/arch-support.txt b/Documentation/features/core/cBPF-JIT/arch-support.txt
index 4f3a6a0..90459cd 100644
--- a/Documentation/features/lib/strncasecmp/arch-support.txt
+++ b/Documentation/features/core/cBPF-JIT/arch-support.txt
@@ -1,7 +1,7 @@
 #
-# Feature name:          strncasecmp
-#         Kconfig:       __HAVE_ARCH_STRNCASECMP
-#         description:   arch provides an optimized strncasecmp() function
+# Feature name:          cBPF-JIT
+#         Kconfig:       HAVE_CBPF_JIT
+#         description:   arch supports cBPF JIT optimizations
 #
     -----------------------
     |         arch |status|
@@ -16,14 +16,16 @@
     |        ia64: | TODO |
     |        m68k: | TODO |
     |  microblaze: | TODO |
-    |        mips: | TODO |
+    |        mips: |  ok  |
+    |       nds32: | TODO |
     |       nios2: | TODO |
     |    openrisc: | TODO |
     |      parisc: | TODO |
-    |     powerpc: | TODO |
+    |     powerpc: |  ok  |
+    |       riscv: | TODO |
     |        s390: | TODO |
     |          sh: | TODO |
-    |       sparc: | TODO |
+    |       sparc: |  ok  |
     |          um: | TODO |
     |   unicore32: | TODO |
     |         x86: | TODO |
diff --git a/Documentation/features/core/BPF-JIT/arch-support.txt b/Documentation/features/core/eBPF-JIT/arch-support.txt
index 0b96b4e..c90a038 100644
--- a/Documentation/features/core/BPF-JIT/arch-support.txt
+++ b/Documentation/features/core/eBPF-JIT/arch-support.txt
@@ -1,7 +1,7 @@
 #
-# Feature name:          BPF-JIT
-#         Kconfig:       HAVE_BPF_JIT
-#         description:   arch supports BPF JIT optimizations
+# Feature name:          eBPF-JIT
+#         Kconfig:       HAVE_EBPF_JIT
+#         description:   arch supports eBPF JIT optimizations
 #
     -----------------------
     |         arch |status|
@@ -17,10 +17,12 @@
     |        m68k: | TODO |
     |  microblaze: | TODO |
     |        mips: |  ok  |
+    |       nds32: | TODO |
     |       nios2: | TODO |
     |    openrisc: | TODO |
     |      parisc: | TODO |
     |     powerpc: |  ok  |
+    |       riscv: | TODO |
     |        s390: |  ok  |
     |          sh: | TODO |
     |       sparc: |  ok  |
diff --git a/Documentation/features/core/generic-idle-thread/arch-support.txt b/Documentation/features/core/generic-idle-thread/arch-support.txt
index 372a2b1..0ef6acd 100644
--- a/Documentation/features/core/generic-idle-thread/arch-support.txt
+++ b/Documentation/features/core/generic-idle-thread/arch-support.txt
@@ -17,10 +17,12 @@
     |        m68k: | TODO |
     |  microblaze: | TODO |
     |        mips: |  ok  |
+    |       nds32: | TODO |
     |       nios2: | TODO |
-    |    openrisc: | TODO |
+    |    openrisc: |  ok  |
     |      parisc: |  ok  |
     |     powerpc: |  ok  |
+    |       riscv: |  ok  |
     |        s390: |  ok  |
     |          sh: |  ok  |
     |       sparc: |  ok  |
diff --git a/Documentation/features/core/jump-labels/arch-support.txt b/Documentation/features/core/jump-labels/arch-support.txt
index ad97217..27cbd63a 100644
--- a/Documentation/features/core/jump-labels/arch-support.txt
+++ b/Documentation/features/core/jump-labels/arch-support.txt
@@ -17,10 +17,12 @@
     |        m68k: | TODO |
     |  microblaze: | TODO |
     |        mips: |  ok  |
+    |       nds32: | TODO |
     |       nios2: | TODO |
     |    openrisc: | TODO |
     |      parisc: | TODO |
     |     powerpc: |  ok  |
+    |       riscv: | TODO |
     |        s390: |  ok  |
     |          sh: | TODO |
     |       sparc: |  ok  |
diff --git a/Documentation/features/core/tracehook/arch-support.txt b/Documentation/features/core/tracehook/arch-support.txt
index 36ee7be..f44c274 100644
--- a/Documentation/features/core/tracehook/arch-support.txt
+++ b/Documentation/features/core/tracehook/arch-support.txt
@@ -17,10 +17,12 @@
     |        m68k: | TODO |
     |  microblaze: | TODO |
     |        mips: |  ok  |
+    |       nds32: |  ok  |
     |       nios2: |  ok  |
     |    openrisc: |  ok  |
     |      parisc: |  ok  |
     |     powerpc: |  ok  |
+    |       riscv: |  ok  |
     |        s390: |  ok  |
     |          sh: |  ok  |
     |       sparc: |  ok  |
diff --git a/Documentation/features/debug/KASAN/arch-support.txt b/Documentation/features/debug/KASAN/arch-support.txt
index f5c99fa..282ecc8 100644
--- a/Documentation/features/debug/KASAN/arch-support.txt
+++ b/Documentation/features/debug/KASAN/arch-support.txt
@@ -17,15 +17,17 @@
     |        m68k: | TODO |
     |  microblaze: | TODO |
     |        mips: | TODO |
+    |       nds32: | TODO |
     |       nios2: | TODO |
     |    openrisc: | TODO |
     |      parisc: | TODO |
     |     powerpc: | TODO |
+    |       riscv: | TODO |
     |        s390: | TODO |
     |          sh: | TODO |
     |       sparc: | TODO |
     |          um: | TODO |
     |   unicore32: | TODO |
-    |         x86: |  ok  | 64-bit only
+    |         x86: |  ok  |
     |      xtensa: |  ok  |
     -----------------------
diff --git a/Documentation/features/debug/gcov-profile-all/arch-support.txt b/Documentation/features/debug/gcov-profile-all/arch-support.txt
index 5170a993..01b2b30 100644
--- a/Documentation/features/debug/gcov-profile-all/arch-support.txt
+++ b/Documentation/features/debug/gcov-profile-all/arch-support.txt
@@ -17,10 +17,12 @@
     |        m68k: | TODO |
     |  microblaze: |  ok  |
     |        mips: | TODO |
+    |       nds32: | TODO |
     |       nios2: | TODO |
     |    openrisc: | TODO |
     |      parisc: | TODO |
     |     powerpc: |  ok  |
+    |       riscv: | TODO |
     |        s390: |  ok  |
     |          sh: |  ok  |
     |       sparc: | TODO |
diff --git a/Documentation/features/debug/kgdb/arch-support.txt b/Documentation/features/debug/kgdb/arch-support.txt
index 13b6e99..3b4dff2 100644
--- a/Documentation/features/debug/kgdb/arch-support.txt
+++ b/Documentation/features/debug/kgdb/arch-support.txt
@@ -11,16 +11,18 @@
     |         arm: |  ok  |
     |       arm64: |  ok  |
     |         c6x: | TODO |
-    |       h8300: | TODO |
+    |       h8300: |  ok  |
     |     hexagon: |  ok  |
     |        ia64: | TODO |
     |        m68k: | TODO |
     |  microblaze: |  ok  |
     |        mips: |  ok  |
+    |       nds32: | TODO |
     |       nios2: |  ok  |
     |    openrisc: | TODO |
     |      parisc: | TODO |
     |     powerpc: |  ok  |
+    |       riscv: | TODO |
     |        s390: | TODO |
     |          sh: |  ok  |
     |       sparc: |  ok  |
diff --git a/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt b/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt
index 419bb38..7e963d0 100644
--- a/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt
+++ b/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt
@@ -17,10 +17,12 @@
     |        m68k: | TODO |
     |  microblaze: | TODO |
     |        mips: | TODO |
+    |       nds32: | TODO |
     |       nios2: | TODO |
     |    openrisc: | TODO |
     |      parisc: | TODO |
     |     powerpc: |  ok  |
+    |       riscv: | TODO |
     |        s390: | TODO |
     |          sh: | TODO |
     |       sparc: | TODO |
diff --git a/Documentation/features/debug/kprobes/arch-support.txt b/Documentation/features/debug/kprobes/arch-support.txt
index 52b3ace..4ada027 100644
--- a/Documentation/features/debug/kprobes/arch-support.txt
+++ b/Documentation/features/debug/kprobes/arch-support.txt
@@ -9,7 +9,7 @@
     |       alpha: | TODO |
     |         arc: |  ok  |
     |         arm: |  ok  |
-    |       arm64: | TODO |
+    |       arm64: |  ok  |
     |         c6x: | TODO |
     |       h8300: | TODO |
     |     hexagon: | TODO |
@@ -17,10 +17,12 @@
     |        m68k: | TODO |
     |  microblaze: | TODO |
     |        mips: |  ok  |
+    |       nds32: | TODO |
     |       nios2: | TODO |
     |    openrisc: | TODO |
     |      parisc: | TODO |
     |     powerpc: |  ok  |
+    |       riscv: |  ok  |
     |        s390: |  ok  |
     |          sh: |  ok  |
     |       sparc: |  ok  |
diff --git a/Documentation/features/debug/kretprobes/arch-support.txt b/Documentation/features/debug/kretprobes/arch-support.txt
index 180d244..044e13f 100644
--- a/Documentation/features/debug/kretprobes/arch-support.txt
+++ b/Documentation/features/debug/kretprobes/arch-support.txt
@@ -9,7 +9,7 @@
     |       alpha: | TODO |
     |         arc: |  ok  |
     |         arm: |  ok  |
-    |       arm64: | TODO |
+    |       arm64: |  ok  |
     |         c6x: | TODO |
     |       h8300: | TODO |
     |     hexagon: | TODO |
@@ -17,10 +17,12 @@
     |        m68k: | TODO |
     |  microblaze: | TODO |
     |        mips: |  ok  |
+    |       nds32: | TODO |
     |       nios2: | TODO |
     |    openrisc: | TODO |
     |      parisc: | TODO |
     |     powerpc: |  ok  |
+    |       riscv: | TODO |
     |        s390: |  ok  |
     |          sh: |  ok  |
     |       sparc: |  ok  |
diff --git a/Documentation/features/debug/optprobes/arch-support.txt b/Documentation/features/debug/optprobes/arch-support.txt
index 0a1241f..dce7669 100644
--- a/Documentation/features/debug/optprobes/arch-support.txt
+++ b/Documentation/features/debug/optprobes/arch-support.txt
@@ -17,10 +17,12 @@
     |        m68k: | TODO |
     |  microblaze: | TODO |
     |        mips: | TODO |
+    |       nds32: | TODO |
     |       nios2: | TODO |
     |    openrisc: | TODO |
     |      parisc: | TODO |
-    |     powerpc: | TODO |
+    |     powerpc: |  ok  |
+    |       riscv: | TODO |
     |        s390: | TODO |
     |          sh: | TODO |
     |       sparc: | TODO |
diff --git a/Documentation/features/debug/stackprotector/arch-support.txt b/Documentation/features/debug/stackprotector/arch-support.txt
index 5700195..74b89a9 100644
--- a/Documentation/features/debug/stackprotector/arch-support.txt
+++ b/Documentation/features/debug/stackprotector/arch-support.txt
@@ -17,10 +17,12 @@
     |        m68k: | TODO |
     |  microblaze: | TODO |
     |        mips: |  ok  |
+    |       nds32: | TODO |
     |       nios2: | TODO |
     |    openrisc: | TODO |
     |      parisc: | TODO |
     |     powerpc: | TODO |
+    |       riscv: | TODO |
     |        s390: | TODO |
     |          sh: |  ok  |
     |       sparc: | TODO |
diff --git a/Documentation/features/debug/uprobes/arch-support.txt b/Documentation/features/debug/uprobes/arch-support.txt
index 0b8d922..1a3f9d3 100644
--- a/Documentation/features/debug/uprobes/arch-support.txt
+++ b/Documentation/features/debug/uprobes/arch-support.txt
@@ -9,7 +9,7 @@
     |       alpha: | TODO |
     |         arc: | TODO |
     |         arm: |  ok  |
-    |       arm64: | TODO |
+    |       arm64: |  ok  |
     |         c6x: | TODO |
     |       h8300: | TODO |
     |     hexagon: | TODO |
@@ -17,13 +17,15 @@
     |        m68k: | TODO |
     |  microblaze: | TODO |
     |        mips: |  ok  |
+    |       nds32: | TODO |
     |       nios2: | TODO |
     |    openrisc: | TODO |
     |      parisc: | TODO |
     |     powerpc: |  ok  |
+    |       riscv: | TODO |
     |        s390: |  ok  |
     |          sh: | TODO |
-    |       sparc: | TODO |
+    |       sparc: |  ok  |
     |          um: | TODO |
     |   unicore32: | TODO |
     |         x86: |  ok  |
diff --git a/Documentation/features/debug/user-ret-profiler/arch-support.txt b/Documentation/features/debug/user-ret-profiler/arch-support.txt
index 13852ae..1d78d10 100644
--- a/Documentation/features/debug/user-ret-profiler/arch-support.txt
+++ b/Documentation/features/debug/user-ret-profiler/arch-support.txt
@@ -17,10 +17,12 @@
     |        m68k: | TODO |
     |  microblaze: | TODO |
     |        mips: | TODO |
+    |       nds32: | TODO |
     |       nios2: | TODO |
     |    openrisc: | TODO |
     |      parisc: | TODO |
     |     powerpc: | TODO |
+    |       riscv: | TODO |
     |        s390: | TODO |
     |          sh: | TODO |
     |       sparc: | TODO |
diff --git a/Documentation/features/io/dma-api-debug/arch-support.txt b/Documentation/features/io/dma-api-debug/arch-support.txt
deleted file mode 100644
index e438ed6..0000000
--- a/Documentation/features/io/dma-api-debug/arch-support.txt
+++ /dev/null
@@ -1,31 +0,0 @@
-#
-# Feature name:          dma-api-debug
-#         Kconfig:       HAVE_DMA_API_DEBUG
-#         description:   arch supports DMA debug facilities
-#
-    -----------------------
-    |         arch |status|
-    -----------------------
-    |       alpha: | TODO |
-    |         arc: | TODO |
-    |         arm: |  ok  |
-    |       arm64: |  ok  |
-    |         c6x: |  ok  |
-    |       h8300: | TODO |
-    |     hexagon: | TODO |
-    |        ia64: |  ok  |
-    |        m68k: | TODO |
-    |  microblaze: |  ok  |
-    |        mips: |  ok  |
-    |       nios2: | TODO |
-    |    openrisc: | TODO |
-    |      parisc: | TODO |
-    |     powerpc: |  ok  |
-    |        s390: |  ok  |
-    |          sh: |  ok  |
-    |       sparc: |  ok  |
-    |          um: | TODO |
-    |   unicore32: | TODO |
-    |         x86: |  ok  |
-    |      xtensa: |  ok  |
-    -----------------------
diff --git a/Documentation/features/io/dma-contiguous/arch-support.txt b/Documentation/features/io/dma-contiguous/arch-support.txt
index 47f64a4..30c072d 100644
--- a/Documentation/features/io/dma-contiguous/arch-support.txt
+++ b/Documentation/features/io/dma-contiguous/arch-support.txt
@@ -17,11 +17,13 @@
     |        m68k: | TODO |
     |  microblaze: | TODO |
     |        mips: |  ok  |
+    |       nds32: | TODO |
     |       nios2: | TODO |
     |    openrisc: | TODO |
     |      parisc: | TODO |
     |     powerpc: | TODO |
-    |        s390: | TODO |
+    |       riscv: |  ok  |
+    |        s390: |  ok  |
     |          sh: | TODO |
     |       sparc: | TODO |
     |          um: | TODO |
diff --git a/Documentation/features/io/sg-chain/arch-support.txt b/Documentation/features/io/sg-chain/arch-support.txt
index 07f357f..6554f03 100644
--- a/Documentation/features/io/sg-chain/arch-support.txt
+++ b/Documentation/features/io/sg-chain/arch-support.txt
@@ -17,10 +17,12 @@
     |        m68k: | TODO |
     |  microblaze: | TODO |
     |        mips: | TODO |
+    |       nds32: | TODO |
     |       nios2: | TODO |
     |    openrisc: | TODO |
     |      parisc: | TODO |
     |     powerpc: |  ok  |
+    |       riscv: | TODO |
     |        s390: |  ok  |
     |          sh: | TODO |
     |       sparc: |  ok  |
diff --git a/Documentation/features/locking/cmpxchg-local/arch-support.txt b/Documentation/features/locking/cmpxchg-local/arch-support.txt
index 482a0b0..51704a2 100644
--- a/Documentation/features/locking/cmpxchg-local/arch-support.txt
+++ b/Documentation/features/locking/cmpxchg-local/arch-support.txt
@@ -9,7 +9,7 @@
     |       alpha: | TODO |
     |         arc: | TODO |
     |         arm: | TODO |
-    |       arm64: | TODO |
+    |       arm64: |  ok  |
     |         c6x: | TODO |
     |       h8300: | TODO |
     |     hexagon: | TODO |
@@ -17,10 +17,12 @@
     |        m68k: | TODO |
     |  microblaze: | TODO |
     |        mips: | TODO |
+    |       nds32: | TODO |
     |       nios2: | TODO |
     |    openrisc: | TODO |
     |      parisc: | TODO |
     |     powerpc: | TODO |
+    |       riscv: | TODO |
     |        s390: |  ok  |
     |          sh: | TODO |
     |       sparc: | TODO |
diff --git a/Documentation/features/locking/lockdep/arch-support.txt b/Documentation/features/locking/lockdep/arch-support.txt
index bb35c5b..bd39c5e 100644
--- a/Documentation/features/locking/lockdep/arch-support.txt
+++ b/Documentation/features/locking/lockdep/arch-support.txt
@@ -17,10 +17,12 @@
     |        m68k: | TODO |
     |  microblaze: |  ok  |
     |        mips: |  ok  |
+    |       nds32: |  ok  |
     |       nios2: | TODO |
-    |    openrisc: | TODO |
+    |    openrisc: |  ok  |
     |      parisc: | TODO |
     |     powerpc: |  ok  |
+    |       riscv: | TODO |
     |        s390: |  ok  |
     |          sh: |  ok  |
     |       sparc: |  ok  |
diff --git a/Documentation/features/locking/queued-rwlocks/arch-support.txt b/Documentation/features/locking/queued-rwlocks/arch-support.txt
index 627e9a6..da7aff3 100644
--- a/Documentation/features/locking/queued-rwlocks/arch-support.txt
+++ b/Documentation/features/locking/queued-rwlocks/arch-support.txt
@@ -9,21 +9,23 @@
     |       alpha: | TODO |
     |         arc: | TODO |
     |         arm: | TODO |
-    |       arm64: | TODO |
+    |       arm64: |  ok  |
     |         c6x: | TODO |
     |       h8300: | TODO |
     |     hexagon: | TODO |
     |        ia64: | TODO |
     |        m68k: | TODO |
     |  microblaze: | TODO |
-    |        mips: | TODO |
+    |        mips: |  ok  |
+    |       nds32: | TODO |
     |       nios2: | TODO |
-    |    openrisc: | TODO |
+    |    openrisc: |  ok  |
     |      parisc: | TODO |
     |     powerpc: | TODO |
+    |       riscv: | TODO |
     |        s390: | TODO |
     |          sh: | TODO |
-    |       sparc: | TODO |
+    |       sparc: |  ok  |
     |          um: | TODO |
     |   unicore32: | TODO |
     |         x86: |  ok  |
diff --git a/Documentation/features/locking/queued-spinlocks/arch-support.txt b/Documentation/features/locking/queued-spinlocks/arch-support.txt
index 9edda21..478e910 100644
--- a/Documentation/features/locking/queued-spinlocks/arch-support.txt
+++ b/Documentation/features/locking/queued-spinlocks/arch-support.txt
@@ -16,14 +16,16 @@
     |        ia64: | TODO |
     |        m68k: | TODO |
     |  microblaze: | TODO |
-    |        mips: | TODO |
+    |        mips: |  ok  |
+    |       nds32: | TODO |
     |       nios2: | TODO |
-    |    openrisc: | TODO |
+    |    openrisc: |  ok  |
     |      parisc: | TODO |
     |     powerpc: | TODO |
+    |       riscv: | TODO |
     |        s390: | TODO |
     |          sh: | TODO |
-    |       sparc: | TODO |
+    |       sparc: |  ok  |
     |          um: | TODO |
     |   unicore32: | TODO |
     |         x86: |  ok  |
diff --git a/Documentation/features/locking/rwsem-optimized/arch-support.txt b/Documentation/features/locking/rwsem-optimized/arch-support.txt
index 8d9afb1..e54b1f1 100644
--- a/Documentation/features/locking/rwsem-optimized/arch-support.txt
+++ b/Documentation/features/locking/rwsem-optimized/arch-support.txt
@@ -1,6 +1,6 @@
 #
 # Feature name:          rwsem-optimized
-#         Kconfig:       Optimized asm/rwsem.h
+#         Kconfig:       !RWSEM_GENERIC_SPINLOCK
 #         description:   arch provides optimized rwsem APIs
 #
     -----------------------
@@ -8,8 +8,8 @@
     -----------------------
     |       alpha: |  ok  |
     |         arc: | TODO |
-    |         arm: | TODO |
-    |       arm64: | TODO |
+    |         arm: |  ok  |
+    |       arm64: |  ok  |
     |         c6x: | TODO |
     |       h8300: | TODO |
     |     hexagon: | TODO |
@@ -17,14 +17,16 @@
     |        m68k: | TODO |
     |  microblaze: | TODO |
     |        mips: | TODO |
+    |       nds32: | TODO |
     |       nios2: | TODO |
     |    openrisc: | TODO |
     |      parisc: | TODO |
     |     powerpc: | TODO |
+    |       riscv: | TODO |
     |        s390: |  ok  |
     |          sh: |  ok  |
     |       sparc: |  ok  |
-    |          um: | TODO |
+    |          um: |  ok  |
     |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: |  ok  |
diff --git a/Documentation/features/perf/kprobes-event/arch-support.txt b/Documentation/features/perf/kprobes-event/arch-support.txt
index d01239e..7331402 100644
--- a/Documentation/features/perf/kprobes-event/arch-support.txt
+++ b/Documentation/features/perf/kprobes-event/arch-support.txt
@@ -9,7 +9,7 @@
     |       alpha: | TODO |
     |         arc: | TODO |
     |         arm: |  ok  |
-    |       arm64: | TODO |
+    |       arm64: |  ok  |
     |         c6x: | TODO |
     |       h8300: | TODO |
     |     hexagon: |  ok  |
@@ -17,13 +17,15 @@
     |        m68k: | TODO |
     |  microblaze: | TODO |
     |        mips: |  ok  |
+    |       nds32: |  ok  |
     |       nios2: | TODO |
     |    openrisc: | TODO |
     |      parisc: | TODO |
     |     powerpc: |  ok  |
+    |       riscv: | TODO |
     |        s390: |  ok  |
     |          sh: |  ok  |
-    |       sparc: | TODO |
+    |       sparc: |  ok  |
     |          um: | TODO |
     |   unicore32: | TODO |
     |         x86: |  ok  |
diff --git a/Documentation/features/perf/perf-regs/arch-support.txt b/Documentation/features/perf/perf-regs/arch-support.txt
index 458faba..53feeee 100644
--- a/Documentation/features/perf/perf-regs/arch-support.txt
+++ b/Documentation/features/perf/perf-regs/arch-support.txt
@@ -17,11 +17,13 @@
     |        m68k: | TODO |
     |  microblaze: | TODO |
     |        mips: | TODO |
+    |       nds32: | TODO |
     |       nios2: | TODO |
     |    openrisc: | TODO |
     |      parisc: | TODO |
     |     powerpc: |  ok  |
-    |        s390: | TODO |
+    |       riscv: | TODO |
+    |        s390: |  ok  |
     |          sh: | TODO |
     |       sparc: | TODO |
     |          um: | TODO |
diff --git a/Documentation/features/perf/perf-stackdump/arch-support.txt b/Documentation/features/perf/perf-stackdump/arch-support.txt
index 545d01c..1616434 100644
--- a/Documentation/features/perf/perf-stackdump/arch-support.txt
+++ b/Documentation/features/perf/perf-stackdump/arch-support.txt
@@ -17,11 +17,13 @@
     |        m68k: | TODO |
     |  microblaze: | TODO |
     |        mips: | TODO |
+    |       nds32: | TODO |
     |       nios2: | TODO |
     |    openrisc: | TODO |
     |      parisc: | TODO |
     |     powerpc: |  ok  |
-    |        s390: | TODO |
+    |       riscv: | TODO |
+    |        s390: |  ok  |
     |          sh: | TODO |
     |       sparc: | TODO |
     |          um: | TODO |
diff --git a/Documentation/features/sched/membarrier-sync-core/arch-support.txt b/Documentation/features/sched/membarrier-sync-core/arch-support.txt
index 85a6c9d..dbdf629 100644
--- a/Documentation/features/sched/membarrier-sync-core/arch-support.txt
+++ b/Documentation/features/sched/membarrier-sync-core/arch-support.txt
@@ -40,10 +40,12 @@
     |        m68k: | TODO |
     |  microblaze: | TODO |
     |        mips: | TODO |
+    |       nds32: | TODO |
     |       nios2: | TODO |
     |    openrisc: | TODO |
     |      parisc: | TODO |
     |     powerpc: | TODO |
+    |       riscv: | TODO |
     |        s390: | TODO |
     |          sh: | TODO |
     |       sparc: | TODO |
diff --git a/Documentation/features/sched/numa-balancing/arch-support.txt b/Documentation/features/sched/numa-balancing/arch-support.txt
index 3475088..c68bb2c 100644
--- a/Documentation/features/sched/numa-balancing/arch-support.txt
+++ b/Documentation/features/sched/numa-balancing/arch-support.txt
@@ -9,7 +9,7 @@
     |       alpha: | TODO |
     |         arc: |  ..  |
     |         arm: |  ..  |
-    |       arm64: |  ..  |
+    |       arm64: |  ok  |
     |         c6x: |  ..  |
     |       h8300: |  ..  |
     |     hexagon: |  ..  |
@@ -17,11 +17,13 @@
     |        m68k: |  ..  |
     |  microblaze: |  ..  |
     |        mips: | TODO |
+    |       nds32: | TODO |
     |       nios2: |  ..  |
     |    openrisc: |  ..  |
     |      parisc: |  ..  |
     |     powerpc: |  ok  |
-    |        s390: |  ..  |
+    |       riscv: | TODO |
+    |        s390: |  ok  |
     |          sh: |  ..  |
     |       sparc: | TODO |
     |          um: |  ..  |
diff --git a/Documentation/features/scripts/features-refresh.sh b/Documentation/features/scripts/features-refresh.sh
new file mode 100755
index 0000000..9e72d38
--- /dev/null
+++ b/Documentation/features/scripts/features-refresh.sh
@@ -0,0 +1,98 @@
+#
+# Small script that refreshes the kernel feature support status in place.
+#
+
+for F_FILE in Documentation/features/*/*/arch-support.txt; do
+	F=$(grep "^#         Kconfig:" "$F_FILE" | cut -c26-)
+
+	#
+	# Each feature F is identified by a pair (O, K), where 'O' can
+	# be either the empty string (for 'nop') or "not" (the logical
+	# negation operator '!'); other operators are not supported.
+	#
+	O=""
+	K=$F
+	if [[ "$F" == !* ]]; then
+		O="not"
+		K=$(echo $F | sed -e 's/^!//g')
+	fi
+
+	#
+	# F := (O, K) is 'valid' iff there is a Kconfig file (for some
+	# arch) which contains K.
+	#
+	# Notice that this definition entails an 'asymmetry' between
+	# the case 'O = ""' and the case 'O = "not"'. E.g., F may be
+	# _invalid_ if:
+	#
+	# [case 'O = ""']
+	#   1) no arch provides support for F,
+	#   2) K does not exist (e.g., it was renamed/mis-typed);
+	#
+	# [case 'O = "not"']
+	#   3) all archs provide support for F,
+	#   4) as in (2).
+	#
+	# The rationale for adopting this definition (and, thus, for
+	# keeping the asymmetry) is:
+	#
+	#       We want to be able to 'detect' (2) (or (4)).
+	#
+	# (1) and (3) may further warn the developers about the fact
+	# that K can be removed.
+	#
+	F_VALID="false"
+	for ARCH_DIR in arch/*/; do
+		K_FILES=$(find $ARCH_DIR -name "Kconfig*")
+		K_GREP=$(grep "$K" $K_FILES)
+		if [ ! -z "$K_GREP" ]; then
+			F_VALID="true"
+			break
+		fi
+	done
+	if [ "$F_VALID" = "false" ]; then
+		printf "WARNING: '%s' is not a valid Kconfig\n" "$F"
+	fi
+
+	T_FILE="$F_FILE.tmp"
+	grep "^#" $F_FILE > $T_FILE
+	echo "    -----------------------" >> $T_FILE
+	echo "    |         arch |status|" >> $T_FILE
+	echo "    -----------------------" >> $T_FILE
+	for ARCH_DIR in arch/*/; do
+		ARCH=$(echo $ARCH_DIR | sed -e 's/arch//g' | sed -e 's/\///g')
+		K_FILES=$(find $ARCH_DIR -name "Kconfig*")
+		K_GREP=$(grep "$K" $K_FILES)
+		#
+		# Arch support status values for (O, K) are updated according
+		# to the following rules.
+		#
+		#   - ("", K) is 'supported by a given arch', if there is a
+		#     Kconfig file for that arch which contains K;
+		#
+		#   - ("not", K) is 'supported by a given arch', if there is
+		#     no Kconfig file for that arch which contains K;
+		#
+		#   - otherwise: preserve the previous status value (if any),
+		#                default to 'not yet supported'.
+		#
+		# Notice that, according these rules, invalid features may be
+		# updated/modified.
+		#
+		if [ "$O" = "" ] && [ ! -z "$K_GREP" ]; then
+			printf "    |%12s: |  ok  |\n" "$ARCH" >> $T_FILE
+		elif [ "$O" = "not" ] && [ -z "$K_GREP" ]; then
+			printf "    |%12s: |  ok  |\n" "$ARCH" >> $T_FILE
+		else
+			S=$(grep -v "^#" "$F_FILE" | grep " $ARCH:")
+			if [ ! -z "$S" ]; then
+				echo "$S" >> $T_FILE
+			else
+				printf "    |%12s: | TODO |\n" "$ARCH" \
+					>> $T_FILE
+			fi
+		fi
+	done
+	echo "    -----------------------" >> $T_FILE
+	mv $T_FILE $F_FILE
+done
diff --git a/Documentation/features/seccomp/seccomp-filter/arch-support.txt b/Documentation/features/seccomp/seccomp-filter/arch-support.txt
index e4fad58..d4271b4 100644
--- a/Documentation/features/seccomp/seccomp-filter/arch-support.txt
+++ b/Documentation/features/seccomp/seccomp-filter/arch-support.txt
@@ -17,10 +17,12 @@
     |        m68k: | TODO |
     |  microblaze: | TODO |
     |        mips: |  ok  |
+    |       nds32: | TODO |
     |       nios2: | TODO |
     |    openrisc: | TODO |
-    |      parisc: | TODO |
-    |     powerpc: | TODO |
+    |      parisc: |  ok  |
+    |     powerpc: |  ok  |
+    |       riscv: | TODO |
     |        s390: |  ok  |
     |          sh: | TODO |
     |       sparc: | TODO |
diff --git a/Documentation/features/time/arch-tick-broadcast/arch-support.txt b/Documentation/features/time/arch-tick-broadcast/arch-support.txt
index 8052904..83d9e68 100644
--- a/Documentation/features/time/arch-tick-broadcast/arch-support.txt
+++ b/Documentation/features/time/arch-tick-broadcast/arch-support.txt
@@ -17,12 +17,14 @@
     |        m68k: | TODO |
     |  microblaze: | TODO |
     |        mips: |  ok  |
+    |       nds32: | TODO |
     |       nios2: | TODO |
     |    openrisc: | TODO |
     |      parisc: | TODO |
     |     powerpc: |  ok  |
+    |       riscv: | TODO |
     |        s390: | TODO |
-    |          sh: | TODO |
+    |          sh: |  ok  |
     |       sparc: | TODO |
     |          um: | TODO |
     |   unicore32: | TODO |
diff --git a/Documentation/features/time/clockevents/arch-support.txt b/Documentation/features/time/clockevents/arch-support.txt
index 7c76b94..3d4908f 100644
--- a/Documentation/features/time/clockevents/arch-support.txt
+++ b/Documentation/features/time/clockevents/arch-support.txt
@@ -17,10 +17,12 @@
     |        m68k: |  ok  |
     |  microblaze: |  ok  |
     |        mips: |  ok  |
+    |       nds32: |  ok  |
     |       nios2: |  ok  |
     |    openrisc: |  ok  |
-    |      parisc: | TODO |
+    |      parisc: |  ok  |
     |     powerpc: |  ok  |
+    |       riscv: |  ok  |
     |        s390: |  ok  |
     |          sh: |  ok  |
     |       sparc: |  ok  |
diff --git a/Documentation/features/time/context-tracking/arch-support.txt b/Documentation/features/time/context-tracking/arch-support.txt
index 9433b3e..c29974a 100644
--- a/Documentation/features/time/context-tracking/arch-support.txt
+++ b/Documentation/features/time/context-tracking/arch-support.txt
@@ -17,10 +17,12 @@
     |        m68k: | TODO |
     |  microblaze: | TODO |
     |        mips: |  ok  |
+    |       nds32: | TODO |
     |       nios2: | TODO |
     |    openrisc: | TODO |
     |      parisc: | TODO |
     |     powerpc: |  ok  |
+    |       riscv: | TODO |
     |        s390: | TODO |
     |          sh: | TODO |
     |       sparc: |  ok  |
diff --git a/Documentation/features/time/irq-time-acct/arch-support.txt b/Documentation/features/time/irq-time-acct/arch-support.txt
index 212dde0..8d73c46 100644
--- a/Documentation/features/time/irq-time-acct/arch-support.txt
+++ b/Documentation/features/time/irq-time-acct/arch-support.txt
@@ -17,10 +17,12 @@
     |        m68k: | TODO |
     |  microblaze: | TODO |
     |        mips: |  ok  |
+    |       nds32: | TODO |
     |       nios2: | TODO |
     |    openrisc: | TODO |
     |      parisc: |  ..  |
-    |     powerpc: |  ..  |
+    |     powerpc: |  ok  |
+    |       riscv: | TODO |
     |        s390: |  ..  |
     |          sh: | TODO |
     |       sparc: |  ..  |
diff --git a/Documentation/features/time/modern-timekeeping/arch-support.txt b/Documentation/features/time/modern-timekeeping/arch-support.txt
index 4074028..e7c6ea6 100644
--- a/Documentation/features/time/modern-timekeeping/arch-support.txt
+++ b/Documentation/features/time/modern-timekeeping/arch-support.txt
@@ -17,10 +17,12 @@
     |        m68k: | TODO |
     |  microblaze: |  ok  |
     |        mips: |  ok  |
+    |       nds32: |  ok  |
     |       nios2: |  ok  |
     |    openrisc: |  ok  |
     |      parisc: |  ok  |
     |     powerpc: |  ok  |
+    |       riscv: |  ok  |
     |        s390: |  ok  |
     |          sh: |  ok  |
     |       sparc: |  ok  |
diff --git a/Documentation/features/time/virt-cpuacct/arch-support.txt b/Documentation/features/time/virt-cpuacct/arch-support.txt
index a394d88..4646457 100644
--- a/Documentation/features/time/virt-cpuacct/arch-support.txt
+++ b/Documentation/features/time/virt-cpuacct/arch-support.txt
@@ -17,10 +17,12 @@
     |        m68k: | TODO |
     |  microblaze: | TODO |
     |        mips: |  ok  |
+    |       nds32: | TODO |
     |       nios2: | TODO |
     |    openrisc: | TODO |
     |      parisc: |  ok  |
     |     powerpc: |  ok  |
+    |       riscv: | TODO |
     |        s390: |  ok  |
     |          sh: | TODO |
     |       sparc: |  ok  |
diff --git a/Documentation/features/vm/ELF-ASLR/arch-support.txt b/Documentation/features/vm/ELF-ASLR/arch-support.txt
index 082f93d..1f71d09 100644
--- a/Documentation/features/vm/ELF-ASLR/arch-support.txt
+++ b/Documentation/features/vm/ELF-ASLR/arch-support.txt
@@ -17,10 +17,12 @@
     |        m68k: | TODO |
     |  microblaze: | TODO |
     |        mips: |  ok  |
+    |       nds32: | TODO |
     |       nios2: | TODO |
     |    openrisc: | TODO |
-    |      parisc: | TODO |
+    |      parisc: |  ok  |
     |     powerpc: |  ok  |
+    |       riscv: | TODO |
     |        s390: |  ok  |
     |          sh: | TODO |
     |       sparc: | TODO |
diff --git a/Documentation/features/vm/PG_uncached/arch-support.txt b/Documentation/features/vm/PG_uncached/arch-support.txt
index 605e0ab..fbd5aa4 100644
--- a/Documentation/features/vm/PG_uncached/arch-support.txt
+++ b/Documentation/features/vm/PG_uncached/arch-support.txt
@@ -17,10 +17,12 @@
     |        m68k: | TODO |
     |  microblaze: | TODO |
     |        mips: | TODO |
+    |       nds32: | TODO |
     |       nios2: | TODO |
     |    openrisc: | TODO |
     |      parisc: | TODO |
     |     powerpc: | TODO |
+    |       riscv: | TODO |
     |        s390: | TODO |
     |          sh: | TODO |
     |       sparc: | TODO |
diff --git a/Documentation/features/vm/THP/arch-support.txt b/Documentation/features/vm/THP/arch-support.txt
index 7a8eb0b..5d7ecc3 100644
--- a/Documentation/features/vm/THP/arch-support.txt
+++ b/Documentation/features/vm/THP/arch-support.txt
@@ -17,10 +17,12 @@
     |        m68k: |  ..  |
     |  microblaze: |  ..  |
     |        mips: |  ok  |
+    |       nds32: | TODO |
     |       nios2: |  ..  |
     |    openrisc: |  ..  |
     |      parisc: | TODO |
     |     powerpc: |  ok  |
+    |       riscv: | TODO |
     |        s390: |  ok  |
     |          sh: |  ..  |
     |       sparc: |  ok  |
diff --git a/Documentation/features/vm/TLB/arch-support.txt b/Documentation/features/vm/TLB/arch-support.txt
index 35fb99b..f7af967 100644
--- a/Documentation/features/vm/TLB/arch-support.txt
+++ b/Documentation/features/vm/TLB/arch-support.txt
@@ -17,10 +17,12 @@
     |        m68k: |  ..  |
     |  microblaze: |  ..  |
     |        mips: | TODO |
+    |       nds32: | TODO |
     |       nios2: |  ..  |
     |    openrisc: |  ..  |
     |      parisc: | TODO |
     |     powerpc: | TODO |
+    |       riscv: | TODO |
     |        s390: | TODO |
     |          sh: | TODO |
     |       sparc: | TODO |
diff --git a/Documentation/features/vm/huge-vmap/arch-support.txt b/Documentation/features/vm/huge-vmap/arch-support.txt
index ed8b943..d0713cc 100644
--- a/Documentation/features/vm/huge-vmap/arch-support.txt
+++ b/Documentation/features/vm/huge-vmap/arch-support.txt
@@ -17,10 +17,12 @@
     |        m68k: | TODO |
     |  microblaze: | TODO |
     |        mips: | TODO |
+    |       nds32: | TODO |
     |       nios2: | TODO |
     |    openrisc: | TODO |
     |      parisc: | TODO |
     |     powerpc: | TODO |
+    |       riscv: | TODO |
     |        s390: | TODO |
     |          sh: | TODO |
     |       sparc: | TODO |
diff --git a/Documentation/features/vm/ioremap_prot/arch-support.txt b/Documentation/features/vm/ioremap_prot/arch-support.txt
index 589947b..8527601 100644
--- a/Documentation/features/vm/ioremap_prot/arch-support.txt
+++ b/Documentation/features/vm/ioremap_prot/arch-support.txt
@@ -17,10 +17,12 @@
     |        m68k: | TODO |
     |  microblaze: | TODO |
     |        mips: | TODO |
+    |       nds32: | TODO |
     |       nios2: | TODO |
     |    openrisc: | TODO |
     |      parisc: | TODO |
     |     powerpc: |  ok  |
+    |       riscv: | TODO |
     |        s390: | TODO |
     |          sh: |  ok  |
     |       sparc: | TODO |
diff --git a/Documentation/features/vm/numa-memblock/arch-support.txt b/Documentation/features/vm/numa-memblock/arch-support.txt
index 8b8bea0..1a98805 100644
--- a/Documentation/features/vm/numa-memblock/arch-support.txt
+++ b/Documentation/features/vm/numa-memblock/arch-support.txt
@@ -9,7 +9,7 @@
     |       alpha: | TODO |
     |         arc: |  ..  |
     |         arm: |  ..  |
-    |       arm64: |  ..  |
+    |       arm64: |  ok  |
     |         c6x: |  ..  |
     |       h8300: |  ..  |
     |     hexagon: |  ..  |
@@ -17,10 +17,12 @@
     |        m68k: |  ..  |
     |  microblaze: |  ok  |
     |        mips: |  ok  |
+    |       nds32: | TODO |
     |       nios2: |  ..  |
     |    openrisc: |  ..  |
     |      parisc: |  ..  |
     |     powerpc: |  ok  |
+    |       riscv: |  ok  |
     |        s390: |  ok  |
     |          sh: |  ok  |
     |       sparc: |  ok  |
diff --git a/Documentation/features/vm/pte_special/arch-support.txt b/Documentation/features/vm/pte_special/arch-support.txt
index 055004f..6a608a6 100644
--- a/Documentation/features/vm/pte_special/arch-support.txt
+++ b/Documentation/features/vm/pte_special/arch-support.txt
@@ -17,10 +17,12 @@
     |        m68k: | TODO |
     |  microblaze: | TODO |
     |        mips: | TODO |
+    |       nds32: | TODO |
     |       nios2: | TODO |
     |    openrisc: | TODO |
     |      parisc: | TODO |
     |     powerpc: |  ok  |
+    |       riscv: | TODO |
     |        s390: |  ok  |
     |          sh: |  ok  |
     |       sparc: |  ok  |
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 75d2d57..2c39133 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -69,31 +69,31 @@ prototypes:
 
 locking rules:
 	all may block
-		i_mutex(inode)
-lookup:		yes
-create:		yes
-link:		yes (both)
-mknod:		yes
-symlink:	yes
-mkdir:		yes
-unlink:		yes (both)
-rmdir:		yes (both)	(see below)
-rename:	yes (all)	(see below)
+		i_rwsem(inode)
+lookup:		shared
+create:		exclusive
+link:		exclusive (both)
+mknod:		exclusive
+symlink:	exclusive
+mkdir:		exclusive
+unlink:		exclusive (both)
+rmdir:		exclusive (both)(see below)
+rename:		exclusive (all)	(see below)
 readlink:	no
 get_link:	no
-setattr:	yes
+setattr:	exclusive
 permission:	no (may not block if called in rcu-walk mode)
 get_acl:	no
 getattr:	no
 listxattr:	no
 fiemap:		no
 update_time:	no
-atomic_open:	yes
+atomic_open:	exclusive
 tmpfile:	no
 
 
-	Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on
-victim.
+	Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_rwsem
+	exclusive on victim.
 	cross-directory ->rename() has (per-superblock) ->s_vfs_rename_sem.
 
 See Documentation/filesystems/directory-locking for more detailed discussion
@@ -111,10 +111,10 @@ prototypes:
 
 locking rules:
 	all may block
-		i_mutex(inode)
+		i_rwsem(inode)
 list:		no
 get:		no
-set:		yes
+set:		exclusive
 
 --------------------------- super_operations ---------------------------
 prototypes:
@@ -217,14 +217,14 @@ prototypes:
 locking rules:
 	All except set_page_dirty and freepage may block
 
-			PageLocked(page)	i_mutex
+			PageLocked(page)	i_rwsem
 writepage:		yes, unlocks (see below)
 readpage:		yes, unlocks
 writepages:
 set_page_dirty		no
 readpages:
-write_begin:		locks the page		yes
-write_end:		yes, unlocks		yes
+write_begin:		locks the page		exclusive
+write_end:		yes, unlocks		exclusive
 bmap:
 invalidatepage:		yes
 releasepage:		yes
@@ -439,7 +439,10 @@ prototypes:
 	ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
 	ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
 	int (*iterate) (struct file *, struct dir_context *);
-	unsigned int (*poll) (struct file *, struct poll_table_struct *);
+	int (*iterate_shared) (struct file *, struct dir_context *);
+	__poll_t (*poll) (struct file *, struct poll_table_struct *);
+	struct wait_queue_head * (*get_poll_head)(struct file *, __poll_t);
+	__poll_t (*poll_mask) (struct file *, __poll_t);
 	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
 	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
 	int (*mmap) (struct file *, struct vm_area_struct *);
@@ -470,7 +473,7 @@ prototypes:
 };
 
 locking rules:
-	All may block.
+	All except for ->poll_mask may block.
 
 ->llseek() locking has moved from llseek to the individual llseek
 implementations.  If your fs is not using generic_file_llseek, you
@@ -480,6 +483,10 @@ mutex or just to use i_size_read() instead.
 Note: this does not protect the file->f_pos against concurrent modifications
 since this is something the userspace has to take care about.
 
+->iterate() is called with i_rwsem exclusive.
+
+->iterate_shared() is called with i_rwsem at least shared.
+
 ->fasync() is responsible for maintaining the FASYNC bit in filp->f_flags.
 Most instances call fasync_helper(), which does that maintenance, so it's
 not normally something one needs to worry about.  Return values > 0 will be
@@ -498,6 +505,9 @@ in sys_read() and friends.
 the lease within the individual filesystem to record the result of the
 operation
 
+->poll_mask can be called with or without the waitqueue lock for the waitqueue
+returned from ->get_poll_head.
+
 --------------------------- dquot_operations -------------------------------
 prototypes:
 	int (*write_dquot) (struct dquot *);
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 2a84bb3..520f6a8 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -515,7 +515,8 @@ guarantees:
 
 The /proc/PID/clear_refs is used to reset the PG_Referenced and ACCESSED/YOUNG
 bits on both physical and virtual pages associated with a process, and the
-soft-dirty bit on pte (see Documentation/vm/soft-dirty.txt for details).
+soft-dirty bit on pte (see Documentation/admin-guide/mm/soft-dirty.rst
+for details).
 To clear the bits for all the pages associated with the process
     > echo 1 > /proc/PID/clear_refs
 
@@ -536,7 +537,8 @@ Any other value written to /proc/PID/clear_refs will have no effect.
 
 The /proc/pid/pagemap gives the PFN, which can be used to find the pageflags
 using /proc/kpageflags and number of times a page is mapped using
-/proc/kpagecount. For detailed explanation, see Documentation/vm/pagemap.txt.
+/proc/kpagecount. For detailed explanation, see
+Documentation/admin-guide/mm/pagemap.rst.
 
 The /proc/pid/numa_maps is an extension based on maps, showing the memory
 locality and binding policy, as well as the memory usage (in pages) of
@@ -564,7 +566,7 @@ address   policy    mapping details
 
 Where:
 "address" is the starting address for the mapping;
-"policy" reports the NUMA memory policy set for the mapping (see vm/numa_memory_policy.txt);
+"policy" reports the NUMA memory policy set for the mapping (see Documentation/admin-guide/mm/numa_memory_policy.rst);
 "mapping details" summarizes mapping data such as mapping type, page usage counters,
 node locality page counters (N0 == node0, N1 == node1, ...) and the kernel page
 size, in KB, that is backing the mapping up.
diff --git a/Documentation/filesystems/tmpfs.txt b/Documentation/filesystems/tmpfs.txt
index a85355c..d06e9a5 100644
--- a/Documentation/filesystems/tmpfs.txt
+++ b/Documentation/filesystems/tmpfs.txt
@@ -105,8 +105,9 @@ policy for the file will revert to "default" policy.
 NUMA memory allocation policies have optional flags that can be used in
 conjunction with their modes.  These optional flags can be specified
 when tmpfs is mounted by appending them to the mode before the NodeList.
-See Documentation/vm/numa_memory_policy.txt for a list of all available
-memory allocation policy mode flags and their effect on memory policy.
+See Documentation/admin-guide/mm/numa_memory_policy.rst for a list of
+all available memory allocation policy mode flags and their effect on
+memory policy.
 
 	=static		is equivalent to	MPOL_F_STATIC_NODES
 	=relative	is equivalent to	MPOL_F_RELATIVE_NODES
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 5fd325d..829a7b7 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -856,7 +856,9 @@ struct file_operations {
 	ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
 	ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
 	int (*iterate) (struct file *, struct dir_context *);
-	unsigned int (*poll) (struct file *, struct poll_table_struct *);
+	__poll_t (*poll) (struct file *, struct poll_table_struct *);
+	struct wait_queue_head * (*get_poll_head)(struct file *, __poll_t);
+	__poll_t (*poll_mask) (struct file *, __poll_t);
 	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
 	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
 	int (*mmap) (struct file *, struct vm_area_struct *);
@@ -901,6 +903,17 @@ otherwise noted.
 	activity on this file and (optionally) go to sleep until there
 	is activity. Called by the select(2) and poll(2) system calls
 
+  get_poll_head: Returns the struct wait_queue_head that callers can
+  wait on.  Callers need to check the returned events using ->poll_mask
+  once woken.  Can return NULL to indicate polling is not supported,
+  or any error code using the ERR_PTR convention to indicate that a
+  grave error occured and ->poll_mask shall not be called.
+
+  poll_mask: return the mask of EPOLL* values describing the file descriptor
+  state.  Called either before going to sleep on the waitqueue returned by
+  get_poll_head, or after it has been woken.  If ->get_poll_head and
+  ->poll_mask are implemented ->poll does not need to be implement.
+
   unlocked_ioctl: called by the ioctl(2) system call.
 
   compat_ioctl: called by the ioctl(2) system call when 32 bit system calls
diff --git a/Documentation/hwmon/hwmon-kernel-api.txt b/Documentation/hwmon/hwmon-kernel-api.txt
index 53a8066..eb7a78a 100644
--- a/Documentation/hwmon/hwmon-kernel-api.txt
+++ b/Documentation/hwmon/hwmon-kernel-api.txt
@@ -71,7 +71,8 @@ hwmon_device_register_with_info is the most comprehensive and preferred means
 to register a hardware monitoring device. It creates the standard sysfs
 attributes in the hardware monitoring core, letting the driver focus on reading
 from and writing to the chip instead of having to bother with sysfs attributes.
-Its parameters are described in more detail below.
+The parent device parameter cannot be NULL with non-NULL chip info. Its
+parameters are described in more detail below.
 
 devm_hwmon_device_register_with_info is similar to
 hwmon_device_register_with_info. However, it is device managed, meaning the
diff --git a/Documentation/hwmon/ltc2990 b/Documentation/hwmon/ltc2990
index c25211e..3ed68f6 100644
--- a/Documentation/hwmon/ltc2990
+++ b/Documentation/hwmon/ltc2990
@@ -8,6 +8,7 @@ Supported chips:
     Datasheet: http://www.linear.com/product/ltc2990
 
 Author: Mike Looijmans <mike.looijmans@topic.nl>
+        Tom Levens <tom.levens@cern.ch>
 
 
 Description
@@ -16,10 +17,8 @@ Description
 LTC2990 is a Quad I2C Voltage, Current and Temperature Monitor.
 The chip's inputs can measure 4 voltages, or two inputs together (1+2 and 3+4)
 can be combined to measure a differential voltage, which is typically used to
-measure current through a series resistor, or a temperature.
-
-This driver currently uses the 2x differential mode only. In order to support
-other modes, the driver will need to be expanded.
+measure current through a series resistor, or a temperature with an external
+diode.
 
 
 Usage Notes
@@ -32,12 +31,19 @@ devices explicitly.
 Sysfs attributes
 ----------------
 
+in0_input     Voltage at Vcc pin in millivolt (range 2.5V to 5V)
+temp1_input   Internal chip temperature in millidegrees Celcius
+
+A subset of the following attributes are visible, depending on the measurement
+mode of the chip.
+
+in[1-4]_input Voltage at V[1-4] pin in millivolt
+temp2_input   External temperature sensor TR1 in millidegrees Celcius
+temp3_input   External temperature sensor TR2 in millidegrees Celcius
+curr1_input   Current in mA across V1-V2 assuming a 1mOhm sense resistor
+curr2_input   Current in mA across V3-V4 assuming a 1mOhm sense resistor
+
 The "curr*_input" measurements actually report the voltage drop across the
 input pins in microvolts. This is equivalent to the current through a 1mOhm
 sense resistor. Divide the reported value by the actual sense resistor value
 in mOhm to get the actual value.
-
-in0_input     Voltage at Vcc pin in millivolt (range 2.5V to 5V)
-temp1_input   Internal chip temperature in millidegrees Celcius
-curr1_input   Current in mA across v1-v2 assuming a 1mOhm sense resistor.
-curr2_input   Current in mA across v3-v4 assuming a 1mOhm sense resistor.
diff --git a/Documentation/index.rst b/Documentation/index.rst
index 3b99ab9..fdc5857 100644
--- a/Documentation/index.rst
+++ b/Documentation/index.rst
@@ -45,7 +45,7 @@ the kernel interface as seen by application developers.
 .. toctree::
    :maxdepth: 2
 
-   userspace-api/index	      
+   userspace-api/index
 
 
 Introduction to kernel development
@@ -89,6 +89,7 @@ needed).
    sound/index
    crypto/index
    filesystems/index
+   vm/index
 
 Architecture-specific documentation
 -----------------------------------
diff --git a/Documentation/ioctl/botching-up-ioctls.txt b/Documentation/ioctl/botching-up-ioctls.txt
index d02cfb4..883fb03 100644
--- a/Documentation/ioctl/botching-up-ioctls.txt
+++ b/Documentation/ioctl/botching-up-ioctls.txt
@@ -73,7 +73,9 @@ will have a second iteration or at least an extension for any given interface.
    future extensions is going right down the gutters since someone will submit
    an ioctl struct with random stack garbage in the yet unused parts. Which
    then bakes in the ABI that those fields can never be used for anything else
-   but garbage.
+   but garbage. This is also the reason why you must explicitly pad all
+   structures, even if you never use them in an array - the padding the compiler
+   might insert could contain garbage.
 
  * Have simple testcases for all of the above.
 
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index 6dafc80..a02d6bb 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -1920,9 +1920,6 @@ There are some more advanced barrier functions:
 		/* assign ownership */
 		desc->status = DEVICE_OWN;
 
-		/* force memory to sync before notifying device via MMIO */
-		wmb();
-
 		/* notify device of new descriptors */
 		writel(DESC_NOTIFY, doorbell);
 	}
@@ -1930,11 +1927,15 @@ There are some more advanced barrier functions:
      The dma_rmb() allows us guarantee the device has released ownership
      before we read the data from the descriptor, and the dma_wmb() allows
      us to guarantee the data is written to the descriptor before the device
-     can see it now has ownership.  The wmb() is needed to guarantee that the
-     cache coherent memory writes have completed before attempting a write to
-     the cache incoherent MMIO region.
+     can see it now has ownership.  Note that, when using writel(), a prior
+     wmb() is not needed to guarantee that the cache coherent memory writes
+     have completed before writing to the MMIO region.  The cheaper
+     writel_relaxed() does not provide this guarantee and must not be used
+     here.
 
-     See Documentation/DMA-API.txt for more information on consistent memory.
+     See the subsection "Kernel I/O barrier effects" for more information on
+     relaxed I/O accessors and the Documentation/DMA-API.txt file for more
+     information on consistent memory.
 
 
 MMIO WRITE BARRIER
@@ -2903,7 +2904,7 @@ is discarded from the CPU's cache and reloaded.  To deal with this, the
 appropriate part of the kernel must invalidate the overlapping bits of the
 cache on each CPU.
 
-See Documentation/cachetlb.txt for more information on cache management.
+See Documentation/core-api/cachetlb.rst for more information on cache management.
 
 
 CACHE COHERENCY VS MMIO
@@ -3083,7 +3084,7 @@ CIRCULAR BUFFERS
 Memory barriers can be used to implement circular buffering without the need
 of a lock to serialise the producer with the consumer.  See:
 
-	Documentation/circular-buffers.txt
+	Documentation/core-api/circular-buffers.rst
 
 for details.
 
diff --git a/Documentation/process/2.Process.rst b/Documentation/process/2.Process.rst
index ce5561b..a9c46dd 100644
--- a/Documentation/process/2.Process.rst
+++ b/Documentation/process/2.Process.rst
@@ -18,17 +18,17 @@ major kernel release happening every two or three months.  The recent
 release history looks like this:
 
 	======  =================
-	2.6.38	March 14, 2011
-	2.6.37	January 4, 2011
-	2.6.36	October 20, 2010
-	2.6.35	August 1, 2010
-	2.6.34	May 15, 2010
-	2.6.33	February 24, 2010
+	4.11	April 30, 2017
+	4.12	July 2, 2017
+	4.13	September 3, 2017
+	4.14	November 12, 2017
+	4.15	January 28, 2018
+	4.16	April 1, 2018
 	======  =================
 
-Every 2.6.x release is a major kernel release with new features, internal
-API changes, and more.  A typical 2.6 release can contain nearly 10,000
-changesets with changes to several hundred thousand lines of code.  2.6 is
+Every 4.x release is a major kernel release with new features, internal
+API changes, and more.  A typical 4.x release contain about 13,000
+changesets with changes to several hundred thousand lines of code.  4.x is
 thus the leading edge of Linux kernel development; the kernel uses a
 rolling development model which is continually integrating major changes.
 
@@ -70,20 +70,19 @@ will get up to somewhere between -rc6 and -rc9 before the kernel is
 considered to be sufficiently stable and the final 2.6.x release is made.
 At that point the whole process starts over again.
 
-As an example, here is how the 2.6.38 development cycle went (all dates in
-2011):
+As an example, here is how the 4.16 development cycle went (all dates in
+2018):
 
 	==============  ===============================
-	January 4	2.6.37 stable release
-	January 18	2.6.38-rc1, merge window closes
-	January 21	2.6.38-rc2
-	February 1	2.6.38-rc3
-	February 7	2.6.38-rc4
-	February 15	2.6.38-rc5
-	February 21	2.6.38-rc6
-	March 1		2.6.38-rc7
-	March 7		2.6.38-rc8
-	March 14	2.6.38 stable release
+	January 28	4.15 stable release
+	February 11	4.16-rc1, merge window closes
+	February 18	4.16-rc2
+	February 25	4.16-rc3
+	March 4		4.16-rc4
+	March 11	4.16-rc5
+	March 18	4.16-rc6
+	March 25	4.16-rc7
+	April 1		4.17 stable release
 	==============  ===============================
 
 How do the developers decide when to close the development cycle and create
@@ -99,37 +98,42 @@ release is made.  In the real world, this kind of perfection is hard to
 achieve; there are just too many variables in a project of this size.
 There comes a point where delaying the final release just makes the problem
 worse; the pile of changes waiting for the next merge window will grow
-larger, creating even more regressions the next time around.  So most 2.6.x
+larger, creating even more regressions the next time around.  So most 4.x
 kernels go out with a handful of known regressions though, hopefully, none
 of them are serious.
 
 Once a stable release is made, its ongoing maintenance is passed off to the
 "stable team," currently consisting of Greg Kroah-Hartman.  The stable team
-will release occasional updates to the stable release using the 2.6.x.y
+will release occasional updates to the stable release using the 4.x.y
 numbering scheme.  To be considered for an update release, a patch must (1)
 fix a significant bug, and (2) already be merged into the mainline for the
 next development kernel.  Kernels will typically receive stable updates for
 a little more than one development cycle past their initial release.  So,
-for example, the 2.6.36 kernel's history looked like:
+for example, the 4.13 kernel's history looked like:
 
 	==============  ===============================
-	October 10	2.6.36 stable release
-	November 22	2.6.36.1
-	December 9	2.6.36.2
-	January 7	2.6.36.3
-	February 17	2.6.36.4
+	September 3 	4.13 stable release
+	September 13	4.13.1
+	September 20	4.13.2
+	September 27	4.13.3
+	October 5	4.13.4
+	October 12  	4.13.5
+	...		...
+	November 24	4.13.16
 	==============  ===============================
 
-2.6.36.4 was the final stable update for the 2.6.36 release.
+4.13.16 was the final stable update of the 4.13 release.
 
 Some kernels are designated "long term" kernels; they will receive support
 for a longer period.  As of this writing, the current long term kernels
 and their maintainers are:
 
-	======  ======================  ===========================
-	2.6.27	Willy Tarreau		(Deep-frozen stable kernel)
-	2.6.32	Greg Kroah-Hartman
-	2.6.35	Andi Kleen		(Embedded flag kernel)
+	======  ======================  ==============================
+	3.16	Ben Hutchings		(very long-term stable kernel)
+	4.1	Sasha Levin
+	4.4	Greg Kroah-Hartman	(very long-term stable kernel)
+	4.9	Greg Kroah-Hartman
+	4.14	Greg Kroah-Hartman
 	======  ======================  ===========================
 
 The selection of a kernel for long-term support is purely a matter of a
diff --git a/Documentation/process/5.Posting.rst b/Documentation/process/5.Posting.rst
index c209d70..c418c5d 100644
--- a/Documentation/process/5.Posting.rst
+++ b/Documentation/process/5.Posting.rst
@@ -10,8 +10,8 @@ of conventions and procedures which are used in the posting of patches;
 following them will make life much easier for everybody involved.  This
 document will attempt to cover these expectations in reasonable detail;
 more information can also be found in the files process/submitting-patches.rst,
-process/submitting-drivers.rst, and process/submit-checklist.rst in the kernel documentation
-directory.
+process/submitting-drivers.rst, and process/submit-checklist.rst in the kernel
+documentation directory.
 
 
 When to post
@@ -198,8 +198,8 @@ pass it to diff with the "-X" option.
 
 The tags mentioned above are used to describe how various developers have
 been associated with the development of this patch.  They are described in
-detail in the process/submitting-patches.rst document; what follows here is a brief
-summary.  Each of these lines has the format:
+detail in the process/submitting-patches.rst document; what follows here is a
+brief summary.  Each of these lines has the format:
 
 ::
 
@@ -210,8 +210,8 @@ The tags in common use are:
  - Signed-off-by: this is a developer's certification that he or she has
    the right to submit the patch for inclusion into the kernel.  It is an
    agreement to the Developer's Certificate of Origin, the full text of
-   which can be found in Documentation/process/submitting-patches.rst.  Code without a
-   proper signoff cannot be merged into the mainline.
+   which can be found in Documentation/process/submitting-patches.rst.  Code
+   without a proper signoff cannot be merged into the mainline.
 
  - Co-developed-by: states that the patch was also created by another developer
    along with the original author.  This is useful at times when multiple
@@ -226,8 +226,8 @@ The tags in common use are:
    it to work.
 
  - Reviewed-by: the named developer has reviewed the patch for correctness;
-   see the reviewer's statement in Documentation/process/submitting-patches.rst for more
-   detail.
+   see the reviewer's statement in Documentation/process/submitting-patches.rst
+   for more detail.
 
  - Reported-by: names a user who reported a problem which is fixed by this
    patch; this tag is used to give credit to the (often underappreciated)
diff --git a/Documentation/process/index.rst b/Documentation/process/index.rst
index 1c9fe65..37bd062 100644
--- a/Documentation/process/index.rst
+++ b/Documentation/process/index.rst
@@ -52,6 +52,7 @@ lack of a better place.
    adding-syscalls
    magic-number
    volatile-considered-harmful
+   clang-format
 
 .. only::  subproject and html
 
diff --git a/Documentation/process/maintainer-pgp-guide.rst b/Documentation/process/maintainer-pgp-guide.rst
index b453561..aff9b1a 100644
--- a/Documentation/process/maintainer-pgp-guide.rst
+++ b/Documentation/process/maintainer-pgp-guide.rst
@@ -219,7 +219,7 @@ Our goal is to protect your master key by moving it to offline media, so
 if you only have a combined **[SC]** key, then you should create a separate
 signing subkey::
 
-    $ gpg --quick-add-key [fpr] ed25519 sign
+    $ gpg --quick-addkey [fpr] ed25519 sign
 
 Remember to tell the keyservers about this change, so others can pull down
 your new subkey::
@@ -450,11 +450,18 @@ functionality.  There are several options available:
 others. If you want to use ECC keys, your best bet among commercially
 available devices is the Nitrokey Start.
 
+.. note::
+
+    If you are listed in MAINTAINERS or have an account at kernel.org,
+    you `qualify for a free Nitrokey Start`_ courtesy of The Linux
+    Foundation.
+
 .. _`Nitrokey Start`: https://shop.nitrokey.com/shop/product/nitrokey-start-6
 .. _`Nitrokey Pro`: https://shop.nitrokey.com/shop/product/nitrokey-pro-3
 .. _`Yubikey 4`: https://www.yubico.com/product/yubikey-4-series/
 .. _Gnuk: http://www.fsij.org/doc-gnuk/
 .. _`LWN has a good review`: https://lwn.net/Articles/736231/
+.. _`qualify for a free Nitrokey Start`: https://www.kernel.org/nitrokey-digital-tokens-for-kernel-developers.html
 
 Configure your smartcard device
 -------------------------------
@@ -482,7 +489,7 @@ there are no convenient command-line switches::
 You should set the user PIN (1), Admin PIN (3), and the Reset Code (4).
 Please make sure to record and store these in a safe place -- especially
 the Admin PIN and the Reset Code (which allows you to completely wipe
-the smartcard).  You so rarely need to use the Admin PIN, that you will
+the smartcard). You so rarely need to use the Admin PIN, that you will
 inevitably forget what it is if you do not record it.
 
 Getting back to the main card menu, you can also set other values (such
@@ -494,6 +501,12 @@ additionally leak information about your smartcard should you lose it.
     Despite having the name "PIN", neither the user PIN nor the admin
     PIN on the card need to be numbers.
 
+.. warning::
+
+    Some devices may require that you move the subkeys onto the device
+    before you can change the passphrase. Please check the documentation
+    provided by the device manufacturer.
+
 Move the subkeys to your smartcard
 ----------------------------------
 
@@ -655,6 +668,20 @@ want to import these changes back into your regular working directory::
     $ gpg --export | gpg --homedir ~/.gnupg --import
     $ unset GNUPGHOME
 
+Using gpg-agent over ssh
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+You can forward your gpg-agent over ssh if you need to sign tags or
+commits on a remote system. Please refer to the instructions provided
+on the GnuPG wiki:
+
+- `Agent Forwarding over SSH`_
+
+It works more smoothly if you can modify the sshd server settings on the
+remote end.
+
+.. _`Agent Forwarding over SSH`: https://wiki.gnupg.org/AgentForwarding
+
 
 Using PGP with Git
 ==================
@@ -692,6 +719,7 @@ should be used (``[fpr]`` is the fingerprint of your key)::
 tell git to always use it instead of the legacy ``gpg`` from version 1::
 
     $ git config --global gpg.program gpg2
+    $ git config --global gpgv.program gpgv2
 
 How to work with signed tags
 ----------------------------
@@ -731,6 +759,13 @@ If you are verifying someone else's git tag, then you will need to
 import their PGP key. Please refer to the
 ":ref:`verify_identities`" section below.
 
+.. note::
+
+    If you get "``gpg: Can't check signature: unknown pubkey
+    algorithm``" error, you need to tell git to use gpgv2 for
+    verification, so it properly processes signatures made by ECC keys.
+    See instructions at the start of this section.
+
 Configure git to always sign annotated tags
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/Documentation/process/submitting-patches.rst b/Documentation/process/submitting-patches.rst
index f7152ed..908bb55 100644
--- a/Documentation/process/submitting-patches.rst
+++ b/Documentation/process/submitting-patches.rst
@@ -761,7 +761,7 @@ requests, especially from new, unknown developers.  If in doubt you can use
 the pull request as the cover letter for a normal posting of the patch
 series, giving the maintainer the option of using either.
 
-A pull request should have [GIT] or [PULL] in the subject line.  The
+A pull request should have [GIT PULL] in the subject line.  The
 request itself should include the repository name and the branch of
 interest on a single line; it should look something like::
 
diff --git a/Documentation/scheduler/sched-deadline.txt b/Documentation/scheduler/sched-deadline.txt
index 8ce78f8..b14e03f 100644
--- a/Documentation/scheduler/sched-deadline.txt
+++ b/Documentation/scheduler/sched-deadline.txt
@@ -49,7 +49,7 @@ CONTENTS
 2.1 Main algorithm
 ------------------
 
- SCHED_DEADLINE uses three parameters, named "runtime", "period", and
+ SCHED_DEADLINE [18] uses three parameters, named "runtime", "period", and
  "deadline", to schedule tasks. A SCHED_DEADLINE task should receive
  "runtime" microseconds of execution time every "period" microseconds, and
  these "runtime" microseconds are available within "deadline" microseconds
@@ -117,6 +117,10 @@ CONTENTS
          scheduling deadline = scheduling deadline + period
          remaining runtime = remaining runtime + runtime
 
+ The SCHED_FLAG_DL_OVERRUN flag in sched_attr's sched_flags field allows a task
+ to get informed about runtime overruns through the delivery of SIGXCPU
+ signals.
+
 
 2.2 Bandwidth reclaiming
 ------------------------
@@ -279,6 +283,19 @@ CONTENTS
     running_bw is incremented.
 
 
+2.3 Energy-aware scheduling
+------------------------
+
+ When cpufreq's schedutil governor is selected, SCHED_DEADLINE implements the
+ GRUB-PA [19] algorithm, reducing the CPU operating frequency to the minimum
+ value that still allows to meet the deadlines. This behavior is currently
+ implemented only for ARM architectures.
+
+ A particular care must be taken in case the time needed for changing frequency
+ is of the same order of magnitude of the reservation period. In such cases,
+ setting a fixed CPU frequency results in a lower amount of deadline misses.
+
+
 3. Scheduling Real-Time Tasks
 =============================
 
@@ -505,6 +522,12 @@ CONTENTS
   17 - L. Abeni, G. Lipari, A. Parri, Y. Sun, Multicore CPU reclaiming: parallel
        or sequential?. In Proceedings of the 31st Annual ACM Symposium on Applied
        Computing, 2016.
+  18 - J. Lelli, C. Scordino, L. Abeni, D. Faggioli, Deadline scheduling in the
+       Linux kernel, Software: Practice and Experience, 46(6): 821-839, June
+       2016.
+  19 - C. Scordino, L. Abeni, J. Lelli, Energy-Aware Real-Time Scheduling in
+       the Linux Kernel, 33rd ACM/SIGAPP Symposium On Applied Computing (SAC
+       2018), Pau, France, April 2018.
 
 
 4. Bandwidth management
diff --git a/Documentation/scsi/scsi_eh.txt b/Documentation/scsi/scsi_eh.txt
index 11e447b..1b74369 100644
--- a/Documentation/scsi/scsi_eh.txt
+++ b/Documentation/scsi/scsi_eh.txt
@@ -82,24 +82,13 @@ function
  1. invokes optional hostt->eh_timed_out() callback.  Return value can
     be one of
 
-    - BLK_EH_HANDLED
-	This indicates that eh_timed_out() dealt with the timeout.
-	The command is passed back to the block layer and completed
-	via __blk_complete_requests().
-
-	*NOTE* After returning BLK_EH_HANDLED the SCSI layer is
-	assumed to be finished with the command, and no other
-	functions from the SCSI layer will be called. So this
-	should typically only be returned if the eh_timed_out()
-	handler raced with normal completion.
-
     - BLK_EH_RESET_TIMER
 	This indicates that more time is required to finish the
 	command.  Timer is restarted.  This action is counted as a
 	retry and only allowed scmd->allowed + 1(!) times.  Once the
-	limit is reached, action for BLK_EH_NOT_HANDLED is taken instead.
+	limit is reached, action for BLK_EH_DONE is taken instead.
 
-    - BLK_EH_NOT_HANDLED
+    - BLK_EH_DONE
         eh_timed_out() callback did not handle the command.
 	Step #2 is taken.
 
diff --git a/Documentation/security/index.rst b/Documentation/security/index.rst
index 298a94a..85492bf 100644
--- a/Documentation/security/index.rst
+++ b/Documentation/security/index.rst
@@ -9,5 +9,7 @@ Security Documentation
    IMA-templates
    keys/index
    LSM
+   LSM-sctp
+   SELinux-sctp
    self-protection
    tpm/index
diff --git a/Documentation/sound/alsa-configuration.rst b/Documentation/sound/alsa-configuration.rst
index aed6b4f..ab57611 100644
--- a/Documentation/sound/alsa-configuration.rst
+++ b/Documentation/sound/alsa-configuration.rst
@@ -1062,7 +1062,7 @@ output (with ``--no-upload`` option) to kernel bugzilla or alsa-devel
 ML (see the section `Links and Addresses`_).
 
 ``power_save`` and ``power_save_controller`` options are for power-saving
-mode.  See powersave.txt for details.
+mode.  See powersave.rst for details.
 
 Note 2: If you get click noises on output, try the module option
 ``position_fix=1`` or ``2``.  ``position_fix=1`` will use the SD_LPIB
@@ -1133,7 +1133,7 @@ line_outs_monitor
 enable_monitor
     Enable Analog Out on Channel 63/64 by default.
 
-See hdspm.txt for details.
+See hdspm.rst for details.
 
 Module snd-ice1712
 ------------------
diff --git a/Documentation/sound/soc/codec.rst b/Documentation/sound/soc/codec.rst
index f87612b..240770e 100644
--- a/Documentation/sound/soc/codec.rst
+++ b/Documentation/sound/soc/codec.rst
@@ -139,7 +139,7 @@ DAPM description
 ----------------
 The Dynamic Audio Power Management description describes the codec power
 components and their relationships and registers to the ASoC core.
-Please read dapm.txt for details of building the description.
+Please read dapm.rst for details of building the description.
 
 Please also see the examples in other codec drivers.
 
diff --git a/Documentation/sound/soc/platform.rst b/Documentation/sound/soc/platform.rst
index d557490..02c93a8 100644
--- a/Documentation/sound/soc/platform.rst
+++ b/Documentation/sound/soc/platform.rst
@@ -66,7 +66,7 @@ Each SoC DAI driver must provide the following features:-
 4. SYSCLK configuration
 5. Suspend and resume (optional)
 
-Please see codec.txt for a description of items 1 - 4.
+Please see codec.rst for a description of items 1 - 4.
 
 
 SoC DSP Drivers
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 17256f2..697ef8c 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -515,7 +515,7 @@ nr_hugepages
 
 Change the minimum size of the hugepage pool.
 
-See Documentation/vm/hugetlbpage.txt
+See Documentation/admin-guide/mm/hugetlbpage.rst
 
 ==============================================================
 
@@ -524,7 +524,7 @@ nr_overcommit_hugepages
 Change the maximum size of the hugepage pool. The maximum is
 nr_hugepages + nr_overcommit_hugepages.
 
-See Documentation/vm/hugetlbpage.txt
+See Documentation/admin-guide/mm/hugetlbpage.rst
 
 ==============================================================
 
@@ -667,7 +667,7 @@ and don't use much of it.
 
 The default value is 0.
 
-See Documentation/vm/overcommit-accounting and
+See Documentation/vm/overcommit-accounting.rst and
 mm/mmap.c::__vm_enough_memory() for more information.
 
 ==============================================================
diff --git a/Documentation/trace/coresight.txt b/Documentation/trace/coresight.txt
index 6f0120c..1d74ad0 100644
--- a/Documentation/trace/coresight.txt
+++ b/Documentation/trace/coresight.txt
@@ -187,13 +187,19 @@ that can be performed on them (see "struct coresight_ops").  The
 specific to that component only.  "Implementation defined" customisations are
 expected to be accessed and controlled using those entries.
 
-Last but not least, "struct module *owner" is expected to be set to reflect
-the information carried in "THIS_MODULE".
 
 How to use the tracer modules
 -----------------------------
 
-Before trace collection can start, a coresight sink needs to be identify.
+There are two ways to use the Coresight framework: 1) using the perf cmd line
+tools and 2) interacting directly with the Coresight devices using the sysFS
+interface.  Preference is given to the former as using the sysFS interface
+requires a deep understanding of the Coresight HW.  The following sections
+provide details on using both methods.
+
+1) Using the sysFS interface:
+
+Before trace collection can start, a coresight sink needs to be identified.
 There is no limit on the amount of sinks (nor sources) that can be enabled at
 any given moment.  As a generic operation, all device pertaining to the sink
 class will have an "active" entry in sysfs:
@@ -298,42 +304,48 @@ Instruction     13570831        0x8026B584      E28DD00C        false   ADD
 Instruction     0       0x8026B588      E8BD8000        true    LDM      sp!,{pc}
 Timestamp                                       Timestamp: 17107041535
 
-How to use the STM module
--------------------------
+2) Using perf framework:
 
-Using the System Trace Macrocell module is the same as the tracers - the only
-difference is that clients are driving the trace capture rather
-than the program flow through the code.
+Coresight tracers are represented using the Perf framework's Performance
+Monitoring Unit (PMU) abstraction.  As such the perf framework takes charge of
+controlling when tracing gets enabled based on when the process of interest is
+scheduled.  When configured in a system, Coresight PMUs will be listed when
+queried by the perf command line tool:
 
-As with any other CoreSight component, specifics about the STM tracer can be
-found in sysfs with more information on each entry being found in [1]:
+	linaro@linaro-nano:~$ ./perf list pmu
 
-root@genericarmv8:~# ls /sys/bus/coresight/devices/20100000.stm
-enable_source   hwevent_select  port_enable     subsystem       uevent
-hwevent_enable  mgmt            port_select     traceid
-root@genericarmv8:~#
+		List of pre-defined events (to be used in -e):
 
-Like any other source a sink needs to be identified and the STM enabled before
-being used:
+		cs_etm//                                    [Kernel PMU event]
 
-root@genericarmv8:~# echo 1 > /sys/bus/coresight/devices/20010000.etf/enable_sink
-root@genericarmv8:~# echo 1 > /sys/bus/coresight/devices/20100000.stm/enable_source
+	linaro@linaro-nano:~$
 
-From there user space applications can request and use channels using the devfs
-interface provided for that purpose by the generic STM API:
+Regardless of the number of tracers available in a system (usually equal to the
+amount of processor cores), the "cs_etm" PMU will be listed only once.
 
-root@genericarmv8:~# ls -l /dev/20100000.stm
-crw-------    1 root     root       10,  61 Jan  3 18:11 /dev/20100000.stm
-root@genericarmv8:~#
+A Coresight PMU works the same way as any other PMU, i.e the name of the PMU is
+listed along with configuration options within forward slashes '/'.  Since a
+Coresight system will typically have more than one sink, the name of the sink to
+work with needs to be specified as an event option.  Names for sink to choose
+from are listed in sysFS under ($SYSFS)/bus/coresight/devices:
 
-Details on how to use the generic STM API can be found here [2].
+	root@linaro-nano:~# ls /sys/bus/coresight/devices/
+		20010000.etf   20040000.funnel  20100000.stm  22040000.etm
+		22140000.etm  230c0000.funnel  23240000.etm 20030000.tpiu
+		20070000.etr     20120000.replicator  220c0000.funnel
+		23040000.etm  23140000.etm     23340000.etm
 
-[1]. Documentation/ABI/testing/sysfs-bus-coresight-devices-stm
-[2]. Documentation/trace/stm.txt
+	root@linaro-nano:~# perf record -e cs_etm/@20070000.etr/u --per-thread program
 
+The syntax within the forward slashes '/' is important.  The '@' character
+tells the parser that a sink is about to be specified and that this is the sink
+to use for the trace session.
 
-Using perf tools
-----------------
+More information on the above and other example on how to use Coresight with
+the perf tools can be found in the "HOWTO.md" file of the openCSD gitHub
+repository [3].
+
+2.1) AutoFDO analysis using the perf tools:
 
 perf can be used to record and analyze trace of programs.
 
@@ -381,3 +393,38 @@ sort example is from the AutoFDO tutorial (https://gcc.gnu.org/wiki/AutoFDO/Tuto
 	$ taskset -c 2 ./sort_autofdo
 	Bubble sorting array of 30000 elements
 	5806 ms
+
+
+How to use the STM module
+-------------------------
+
+Using the System Trace Macrocell module is the same as the tracers - the only
+difference is that clients are driving the trace capture rather
+than the program flow through the code.
+
+As with any other CoreSight component, specifics about the STM tracer can be
+found in sysfs with more information on each entry being found in [1]:
+
+root@genericarmv8:~# ls /sys/bus/coresight/devices/20100000.stm
+enable_source   hwevent_select  port_enable     subsystem       uevent
+hwevent_enable  mgmt            port_select     traceid
+root@genericarmv8:~#
+
+Like any other source a sink needs to be identified and the STM enabled before
+being used:
+
+root@genericarmv8:~# echo 1 > /sys/bus/coresight/devices/20010000.etf/enable_sink
+root@genericarmv8:~# echo 1 > /sys/bus/coresight/devices/20100000.stm/enable_source
+
+From there user space applications can request and use channels using the devfs
+interface provided for that purpose by the generic STM API:
+
+root@genericarmv8:~# ls -l /dev/20100000.stm
+crw-------    1 root     root       10,  61 Jan  3 18:11 /dev/20100000.stm
+root@genericarmv8:~#
+
+Details on how to use the generic STM API can be found here [2].
+
+[1]. Documentation/ABI/testing/sysfs-bus-coresight-devices-stm
+[2]. Documentation/trace/stm.txt
+[3]. https://github.com/Linaro/perf-opencsd
diff --git a/Documentation/trace/ftrace-uses.rst b/Documentation/trace/ftrace-uses.rst
index 998a60a..00283b6 100644
--- a/Documentation/trace/ftrace-uses.rst
+++ b/Documentation/trace/ftrace-uses.rst
@@ -12,7 +12,7 @@ Written for: 4.14
 Introduction
 ============
 
-The ftrace infrastructure was originially created to attach callbacks to the
+The ftrace infrastructure was originally created to attach callbacks to the
 beginning of functions in order to record and trace the flow of the kernel.
 But callbacks to the start of a function can have other use cases. Either
 for live kernel patching, or for security monitoring. This document describes
@@ -30,7 +30,7 @@ The ftrace context
   This requires extra care to what can be done inside a callback. A callback
   can be called outside the protective scope of RCU.
 
-The ftrace infrastructure has some protections agains recursions and RCU
+The ftrace infrastructure has some protections against recursions and RCU
 but one must still be very careful how they use the callbacks.
 
 
diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst
index 67d9c38..6b80ac4 100644
--- a/Documentation/trace/ftrace.rst
+++ b/Documentation/trace/ftrace.rst
@@ -224,6 +224,8 @@ of ftrace. Here is a list of some of the key files:
 	has a side effect of enabling or disabling specific functions
 	to be traced. Echoing names of functions into this file
 	will limit the trace to only those functions.
+	This influences the tracers "function" and "function_graph"
+	and thus also function profiling (see "function_profile_enabled").
 
 	The functions listed in "available_filter_functions" are what
 	can be written into this file.
@@ -265,6 +267,8 @@ of ftrace. Here is a list of some of the key files:
 	Functions listed in this file will cause the function graph
 	tracer to only trace these functions and the functions that
 	they call. (See the section "dynamic ftrace" for more details).
+	Note, set_ftrace_filter and set_ftrace_notrace still affects
+	what functions are being traced.
 
   set_graph_notrace:
 
@@ -277,7 +281,8 @@ of ftrace. Here is a list of some of the key files:
 
 	This lists the functions that ftrace has processed and can trace.
 	These are the function names that you can pass to
-	"set_ftrace_filter" or "set_ftrace_notrace".
+	"set_ftrace_filter", "set_ftrace_notrace",
+	"set_graph_function", or "set_graph_notrace".
 	(See the section "dynamic ftrace" below for more details.)
 
   dyn_ftrace_total_info:
diff --git a/Documentation/translations/ko_KR/memory-barriers.txt b/Documentation/translations/ko_KR/memory-barriers.txt
index 0a0930a..921739d 100644
--- a/Documentation/translations/ko_KR/memory-barriers.txt
+++ b/Documentation/translations/ko_KR/memory-barriers.txt
@@ -36,6 +36,9 @@ Documentation/memory-barriers.txt
 부분도 있고, 의도하진 않았지만 사람에 의해 쓰였다보니 불완전한 부분도 있습니다.
 이 문서는 리눅스에서 제공하는 다양한 메모리 배리어들을 사용하기 위한
 안내서입니다만, 뭔가 이상하다 싶으면 (그런게 많을 겁니다) 질문을 부탁드립니다.
+일부 이상한 점들은 공식적인 메모리 일관성 모델과 tools/memory-model/ 에 있는
+관련 문서를 참고해서 해결될 수 있을 겁니다.  그러나, 이 메모리 모델조차도 그
+관리자들의 의견의 집합으로 봐야지, 절대 옳은 예언자로 신봉해선 안될 겁니다.
 
 다시 말하지만, 이 문서는 리눅스가 하드웨어에 기대하는 사항에 대한 명세서가
 아닙니다.
@@ -77,7 +80,7 @@ Documentation/memory-barriers.txt
 
      - 메모리 배리어의 종류.
      - 메모리 배리어에 대해 가정해선 안될 것.
-     - 데이터 의존성 배리어.
+     - 데이터 의존성 배리어 (역사적).
      - 컨트롤 의존성.
      - SMP 배리어 짝맞추기.
      - 메모리 배리어 시퀀스의 예.
@@ -255,17 +258,20 @@ CPU 에게 기대할 수 있는 최소한의 보장사항 몇가지가 있습니
  (*) 어떤 CPU 든, 의존성이 존재하는 메모리 액세스들은 해당 CPU 자신에게
      있어서는 순서대로 메모리 시스템에 수행 요청됩니다. 즉, 다음에 대해서:
 
-	Q = READ_ONCE(P); smp_read_barrier_depends(); D = READ_ONCE(*Q);
+	Q = READ_ONCE(P); D = READ_ONCE(*Q);
 
      CPU 는 다음과 같은 메모리 오퍼레이션 시퀀스를 수행 요청합니다:
 
 	Q = LOAD P, D = LOAD *Q
 
-     그리고 그 시퀀스 내에서의 순서는 항상 지켜집니다.  대부분의 시스템에서
-     smp_read_barrier_depends() 는 아무일도 안하지만 DEC Alpha 에서는
-     명시적으로 사용되어야 합니다.  보통의 경우에는 smp_read_barrier_depends()
-     를 직접 사용하는 대신 rcu_dereference() 같은 것들을 사용해야 함을
-     알아두세요.
+     그리고 그 시퀀스 내에서의 순서는 항상 지켜집니다.  하지만, DEC Alpha 에서
+     READ_ONCE() 는 메모리 배리어 명령도 내게 되어 있어서, DEC Alpha CPU 는
+     다음과 같은 메모리 오퍼레이션들을 내놓게 됩니다:
+
+	Q = LOAD P, MEMORY_BARRIER, D = LOAD *Q, MEMORY_BARRIER
+
+     DEC Alpha 에서 수행되든 아니든, READ_ONCE() 는 컴파일러로부터의 악영향
+     또한 제거합니다.
 
  (*) 특정 CPU 내에서 겹치는 영역의 메모리에 행해지는 로드와 스토어 들은 해당
      CPU 안에서는 순서가 바뀌지 않은 것으로 보여집니다.  즉, 다음에 대해서:
@@ -421,8 +427,8 @@ CPU 에게 기대할 수 있는 최소한의 보장사항 몇가지가 있습니
      데이터 의존성 배리어는 읽기 배리어의 보다 완화된 형태입니다.  두개의 로드
      오퍼레이션이 있고 두번째 것이 첫번째 것의 결과에 의존하고 있을 때(예:
      두번째 로드가 참조할 주소를 첫번째 로드가 읽는 경우), 두번째 로드가 읽어올
-     데이터는 첫번째 로드에 의해 그 주소가 얻어지기 전에 업데이트 되어 있음을
-     보장하기 위해서 데이터 의존성 배리어가 필요할 수 있습니다.
+     데이터는 첫번째 로드에 의해 그 주소가 얻어진 뒤에 업데이트 됨을 보장하기
+     위해서 데이터 의존성 배리어가 필요할 수 있습니다.
 
      데이터 의존성 배리어는 상호 의존적인 로드 오퍼레이션들 사이의 부분적 순서
      세우기입니다; 스토어 오퍼레이션들이나 독립적인 로드들, 또는 중복되는
@@ -570,8 +576,14 @@ ACQUIRE 는 해당 오퍼레이션의 로드 부분에만 적용되고 RELEASE �
 	    Documentation/DMA-API.txt
 
 
-데이터 의존성 배리어
---------------------
+데이터 의존성 배리어 (역사적)
+-----------------------------
+
+리눅스 커널 v4.15 기준으로, smp_read_barrier_depends() 가 READ_ONCE() 에
+추가되었는데, 이는 이 섹션에 주의를 기울여야 하는 사람들은 DEC Alpha 아키텍쳐
+전용 코드를 만드는 사람들과 READ_ONCE() 자체를 만드는 사람들 뿐임을 의미합니다.
+그런 분들을 위해, 그리고 역사에 관심 있는 분들을 위해, 여기 데이터 의존성
+배리어에 대한 이야기를 적습니다.
 
 데이터 의존성 배리어의 사용에 있어 지켜야 하는 사항들은 약간 미묘하고, 데이터
 의존성 배리어가 사용되어야 하는 상황도 항상 명백하지는 않습니다.  설명을 위해
@@ -1787,7 +1799,7 @@ CPU 메모리 배리어
 	범용		mb()			smp_mb()
 	쓰기		wmb()			smp_wmb()
 	읽기		rmb()			smp_rmb()
-	데이터 의존성	read_barrier_depends()	smp_read_barrier_depends()
+	데이터 의존성				READ_ONCE()
 
 
 데이터 의존성 배리어를 제외한 모든 메모리 배리어는 컴파일러 배리어를
@@ -2796,8 +2808,9 @@ CPU 2 는 C/D 를 갖습니다)가 병렬로 연결되어 있는 시스템을 �
 
 
 여기에 개입하기 위해선, 데이터 의존성 배리어나 읽기 배리어를 로드 오퍼레이션들
-사이에 넣어야 합니다.  이렇게 함으로써 캐시가 다음 요청을 처리하기 전에 일관성
-큐를 처리하도록 강제하게 됩니다.
+사이에 넣어야 합니다 (v4.15 부터는 READ_ONCE() 매크로에 의해 무조건적으로
+그렇게 됩니다).  이렇게 함으로써 캐시가 다음 요청을 처리하기 전에 일관성 큐를
+처리하도록 강제하게 됩니다.
 
 	CPU 1		CPU 2		COMMENT
 	===============	===============	=======================================
@@ -2826,7 +2839,10 @@ CPU 2 는 C/D 를 갖습니다)가 병렬로 연결되어 있는 시스템을 �
 다른 CPU 들도 분할된 캐시를 가지고 있을 수 있지만, 그런 CPU 들은 평범한 메모리
 액세스를 위해서도 이 분할된 캐시들 사이의 조정을 해야만 합니다.  Alpha 는 가장
 약한 메모리 순서 시맨틱 (semantic) 을 선택함으로써 메모리 배리어가 명시적으로
-사용되지 않았을 때에는 그런 조정이 필요하지 않게 했습니다.
+사용되지 않았을 때에는 그런 조정이 필요하지 않게 했으며, 이는 Alpha 가 당시에
+더 높은 CPU 클락 속도를 가질 수 있게 했습니다.  하지만, (다시 말하건대, v4.15
+이후부터는) Alpha 아키텍쳐 전용 코드와 READ_ONCE() 매크로 내부에서를 제외하고는
+smp_read_barrier_depends() 가 사용되지 않아야 함을 알아두시기 바랍니다.
 
 
 캐시 일관성 VS DMA
@@ -2846,7 +2862,7 @@ CPU 의 캐시에서 RAM 으로 쓰여지는 더티 캐시 라인에 의해 덮�
 문제를 해결하기 위해선, 커널의 적절한 부분에서 각 CPU 의 캐시 안의 문제가 되는
 비트들을 무효화 시켜야 합니다.
 
-캐시 관리에 대한 더 많은 정보를 위해선 Documentation/cachetlb.txt 를
+캐시 관리에 대한 더 많은 정보를 위해선 Documentation/core-api/cachetlb.rst 를
 참고하세요.
 
 
@@ -2988,7 +3004,9 @@ Alpha CPU 의 일부 버전은 분할된 데이터 캐시를 가지고 있어서
 메모리 일관성 시스템과 함께 두개의 캐시를 동기화 시켜서, 포인터 변경과 새로운
 데이터의 발견을 올바른 순서로 일어나게 하기 때문입니다.
 
-리눅스 커널의 메모리 배리어 모델은 Alpha 에 기초해서 정의되었습니다.
+리눅스 커널의 메모리 배리어 모델은 Alpha 에 기초해서 정의되었습니다만, v4.15
+부터는 리눅스 커널이 READ_ONCE() 내에 smp_read_barrier_depends() 를 추가해서
+Alpha 의 메모리 모델로의 영향력이 크게 줄어들긴 했습니다.
 
 위의 "캐시 일관성" 서브섹션을 참고하세요.
 
@@ -3023,7 +3041,7 @@ smp_mb() 가 아니라 virt_mb() 를 사용해야 합니다.
 동기화에 락을 사용하지 않고 구현하는데에 사용될 수 있습니다.  더 자세한 내용을
 위해선 다음을 참고하세요:
 
-	Documentation/circular-buffers.txt
+	Documentation/core-api/circular-buffers.rst
 
 
 =========
diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
index ef6a511..f1a4d3c 100644
--- a/Documentation/vfio.txt
+++ b/Documentation/vfio.txt
@@ -252,15 +252,14 @@ into VFIO core.  When devices are bound and unbound to the driver,
 the driver should call vfio_add_group_dev() and vfio_del_group_dev()
 respectively::
 
-	extern int vfio_add_group_dev(struct iommu_group *iommu_group,
-	                              struct device *dev,
+	extern int vfio_add_group_dev(struct device *dev,
 				      const struct vfio_device_ops *ops,
 				      void *device_data);
 
 	extern void *vfio_del_group_dev(struct device *dev);
 
 vfio_add_group_dev() indicates to the core to begin tracking the
-specified iommu_group and register the specified dev as owned by
+iommu_group of the specified dev and register the dev as owned by
 a VFIO bus driver.  The driver provides an ops structure for callbacks
 similar to a file operations structure::
 
diff --git a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX
index 0278f2c..f4a4f3e 100644
--- a/Documentation/vm/00-INDEX
+++ b/Documentation/vm/00-INDEX
@@ -1,62 +1,50 @@
 00-INDEX
 	- this file.
-active_mm.txt
+active_mm.rst
 	- An explanation from Linus about tsk->active_mm vs tsk->mm.
-balance
+balance.rst
 	- various information on memory balancing.
-cleancache.txt
+cleancache.rst
 	- Intro to cleancache and page-granularity victim cache.
-frontswap.txt
+frontswap.rst
 	- Outline frontswap, part of the transcendent memory frontend.
-highmem.txt
+highmem.rst
 	- Outline of highmem and common issues.
-hmm.txt
+hmm.rst
 	- Documentation of heterogeneous memory management
-hugetlbpage.txt
-	- a brief summary of hugetlbpage support in the Linux kernel.
-hugetlbfs_reserv.txt
+hugetlbfs_reserv.rst
 	- A brief overview of hugetlbfs reservation design/implementation.
-hwpoison.txt
+hwpoison.rst
 	- explains what hwpoison is
-idle_page_tracking.txt
-	- description of the idle page tracking feature.
-ksm.txt
+ksm.rst
 	- how to use the Kernel Samepage Merging feature.
-mmu_notifier.txt
+mmu_notifier.rst
 	- a note about clearing pte/pmd and mmu notifications
-numa
+numa.rst
 	- information about NUMA specific code in the Linux vm.
-numa_memory_policy.txt
-	- documentation of concepts and APIs of the 2.6 memory policy support.
-overcommit-accounting
+overcommit-accounting.rst
 	- description of the Linux kernels overcommit handling modes.
-page_frags
+page_frags.rst
 	- description of page fragments allocator
-page_migration
+page_migration.rst
 	- description of page migration in NUMA systems.
-pagemap.txt
-	- pagemap, from the userspace perspective
-page_owner.txt
+page_owner.rst
 	- tracking about who allocated each page
-remap_file_pages.txt
+remap_file_pages.rst
 	- a note about remap_file_pages() system call
-slub.txt
+slub.rst
 	- a short users guide for SLUB.
-soft-dirty.txt
-	- short explanation for soft-dirty PTEs
-split_page_table_lock
+split_page_table_lock.rst
 	- Separate per-table lock to improve scalability of the old page_table_lock.
-swap_numa.txt
+swap_numa.rst
 	- automatic binding of swap device to numa node
-transhuge.txt
+transhuge.rst
 	- Transparent Hugepage Support, alternative way of using hugepages.
-unevictable-lru.txt
+unevictable-lru.rst
 	- Unevictable LRU infrastructure
-userfaultfd.txt
-	- description of userfaultfd system call
 z3fold.txt
 	- outline of z3fold allocator for storing compressed pages
-zsmalloc.txt
+zsmalloc.rst
 	- outline of zsmalloc allocator for storing compressed pages
-zswap.txt
+zswap.rst
 	- Intro to compressed cache for swap pages
diff --git a/Documentation/vm/active_mm.rst b/Documentation/vm/active_mm.rst
new file mode 100644
index 0000000..c84471b
--- /dev/null
+++ b/Documentation/vm/active_mm.rst
@@ -0,0 +1,91 @@
+.. _active_mm:
+
+=========
+Active MM
+=========
+
+::
+
+ List:       linux-kernel
+ Subject:    Re: active_mm
+ From:       Linus Torvalds <torvalds () transmeta ! com>
+ Date:       1999-07-30 21:36:24
+
+ Cc'd to linux-kernel, because I don't write explanations all that often,
+ and when I do I feel better about more people reading them.
+
+ On Fri, 30 Jul 1999, David Mosberger wrote:
+ >
+ > Is there a brief description someplace on how "mm" vs. "active_mm" in
+ > the task_struct are supposed to be used?  (My apologies if this was
+ > discussed on the mailing lists---I just returned from vacation and
+ > wasn't able to follow linux-kernel for a while).
+
+ Basically, the new setup is:
+
+  - we have "real address spaces" and "anonymous address spaces". The
+    difference is that an anonymous address space doesn't care about the
+    user-level page tables at all, so when we do a context switch into an
+    anonymous address space we just leave the previous address space
+    active.
+
+    The obvious use for a "anonymous address space" is any thread that
+    doesn't need any user mappings - all kernel threads basically fall into
+    this category, but even "real" threads can temporarily say that for
+    some amount of time they are not going to be interested in user space,
+    and that the scheduler might as well try to avoid wasting time on
+    switching the VM state around. Currently only the old-style bdflush
+    sync does that.
+
+  - "tsk->mm" points to the "real address space". For an anonymous process,
+    tsk->mm will be NULL, for the logical reason that an anonymous process
+    really doesn't _have_ a real address space at all.
+
+  - however, we obviously need to keep track of which address space we
+    "stole" for such an anonymous user. For that, we have "tsk->active_mm",
+    which shows what the currently active address space is.
+
+    The rule is that for a process with a real address space (ie tsk->mm is
+    non-NULL) the active_mm obviously always has to be the same as the real
+    one.
+
+    For a anonymous process, tsk->mm == NULL, and tsk->active_mm is the
+    "borrowed" mm while the anonymous process is running. When the
+    anonymous process gets scheduled away, the borrowed address space is
+    returned and cleared.
+
+ To support all that, the "struct mm_struct" now has two counters: a
+ "mm_users" counter that is how many "real address space users" there are,
+ and a "mm_count" counter that is the number of "lazy" users (ie anonymous
+ users) plus one if there are any real users.
+
+ Usually there is at least one real user, but it could be that the real
+ user exited on another CPU while a lazy user was still active, so you do
+ actually get cases where you have a address space that is _only_ used by
+ lazy users. That is often a short-lived state, because once that thread
+ gets scheduled away in favour of a real thread, the "zombie" mm gets
+ released because "mm_users" becomes zero.
+
+ Also, a new rule is that _nobody_ ever has "init_mm" as a real MM any
+ more. "init_mm" should be considered just a "lazy context when no other
+ context is available", and in fact it is mainly used just at bootup when
+ no real VM has yet been created. So code that used to check
+
+ 	if (current->mm == &init_mm)
+
+ should generally just do
+
+ 	if (!current->mm)
+
+ instead (which makes more sense anyway - the test is basically one of "do
+ we have a user context", and is generally done by the page fault handler
+ and things like that).
+
+ Anyway, I put a pre-patch-2.3.13-1 on ftp.kernel.org just a moment ago,
+ because it slightly changes the interfaces to accommodate the alpha (who
+ would have thought it, but the alpha actually ends up having one of the
+ ugliest context switch codes - unlike the other architectures where the MM
+ and register state is separate, the alpha PALcode joins the two, and you
+ need to switch both together).
+
+ (From http://marc.info/?l=linux-kernel&m=93337278602211&w=2)
diff --git a/Documentation/vm/active_mm.txt b/Documentation/vm/active_mm.txt
deleted file mode 100644
index dbf4581..0000000
--- a/Documentation/vm/active_mm.txt
+++ /dev/null
@@ -1,83 +0,0 @@
-List:       linux-kernel
-Subject:    Re: active_mm
-From:       Linus Torvalds <torvalds () transmeta ! com>
-Date:       1999-07-30 21:36:24
-
-Cc'd to linux-kernel, because I don't write explanations all that often,
-and when I do I feel better about more people reading them.
-
-On Fri, 30 Jul 1999, David Mosberger wrote:
->
-> Is there a brief description someplace on how "mm" vs. "active_mm" in
-> the task_struct are supposed to be used?  (My apologies if this was
-> discussed on the mailing lists---I just returned from vacation and
-> wasn't able to follow linux-kernel for a while).
-
-Basically, the new setup is:
-
- - we have "real address spaces" and "anonymous address spaces". The
-   difference is that an anonymous address space doesn't care about the
-   user-level page tables at all, so when we do a context switch into an
-   anonymous address space we just leave the previous address space
-   active.
-
-   The obvious use for a "anonymous address space" is any thread that
-   doesn't need any user mappings - all kernel threads basically fall into
-   this category, but even "real" threads can temporarily say that for
-   some amount of time they are not going to be interested in user space,
-   and that the scheduler might as well try to avoid wasting time on
-   switching the VM state around. Currently only the old-style bdflush
-   sync does that.
-
- - "tsk->mm" points to the "real address space". For an anonymous process,
-   tsk->mm will be NULL, for the logical reason that an anonymous process
-   really doesn't _have_ a real address space at all.
-
- - however, we obviously need to keep track of which address space we
-   "stole" for such an anonymous user. For that, we have "tsk->active_mm",
-   which shows what the currently active address space is.
-
-   The rule is that for a process with a real address space (ie tsk->mm is
-   non-NULL) the active_mm obviously always has to be the same as the real
-   one.
-
-   For a anonymous process, tsk->mm == NULL, and tsk->active_mm is the
-   "borrowed" mm while the anonymous process is running. When the
-   anonymous process gets scheduled away, the borrowed address space is
-   returned and cleared.
-
-To support all that, the "struct mm_struct" now has two counters: a
-"mm_users" counter that is how many "real address space users" there are,
-and a "mm_count" counter that is the number of "lazy" users (ie anonymous
-users) plus one if there are any real users.
-
-Usually there is at least one real user, but it could be that the real
-user exited on another CPU while a lazy user was still active, so you do
-actually get cases where you have a address space that is _only_ used by
-lazy users. That is often a short-lived state, because once that thread
-gets scheduled away in favour of a real thread, the "zombie" mm gets
-released because "mm_users" becomes zero.
-
-Also, a new rule is that _nobody_ ever has "init_mm" as a real MM any
-more. "init_mm" should be considered just a "lazy context when no other
-context is available", and in fact it is mainly used just at bootup when
-no real VM has yet been created. So code that used to check
-
-	if (current->mm == &init_mm)
-
-should generally just do
-
-	if (!current->mm)
-
-instead (which makes more sense anyway - the test is basically one of "do
-we have a user context", and is generally done by the page fault handler
-and things like that).
-
-Anyway, I put a pre-patch-2.3.13-1 on ftp.kernel.org just a moment ago,
-because it slightly changes the interfaces to accommodate the alpha (who
-would have thought it, but the alpha actually ends up having one of the
-ugliest context switch codes - unlike the other architectures where the MM
-and register state is separate, the alpha PALcode joins the two, and you
-need to switch both together).
-
-(From http://marc.info/?l=linux-kernel&m=93337278602211&w=2)
diff --git a/Documentation/vm/balance b/Documentation/vm/balance.rst
index 9645954..6a1fadf 100644
--- a/Documentation/vm/balance
+++ b/Documentation/vm/balance.rst
@@ -1,3 +1,9 @@
+.. _balance:
+
+================
+Memory Balancing
+================
+
 Started Jan 2000 by Kanoj Sarcar <kanoj@sgi.com>
 
 Memory balancing is needed for !__GFP_ATOMIC and !__GFP_KSWAPD_RECLAIM as
@@ -62,11 +68,11 @@ for non-sleepable allocations. Second, the HIGHMEM zone is also balanced,
 so as to give a fighting chance for replace_with_highmem() to get a
 HIGHMEM page, as well as to ensure that HIGHMEM allocations do not
 fall back into regular zone. This also makes sure that HIGHMEM pages
-are not leaked (for example, in situations where a HIGHMEM page is in 
+are not leaked (for example, in situations where a HIGHMEM page is in
 the swapcache but is not being used by anyone)
 
 kswapd also needs to know about the zones it should balance. kswapd is
-primarily needed in a situation where balancing can not be done, 
+primarily needed in a situation where balancing can not be done,
 probably because all allocation requests are coming from intr context
 and all process contexts are sleeping. For 2.3, kswapd does not really
 need to balance the highmem zone, since intr context does not request
@@ -89,7 +95,8 @@ pages is below watermark[WMARK_LOW]; in which case zone_wake_kswapd is also set.
 
 
 (Good) Ideas that I have heard:
+
 1. Dynamic experience should influence balancing: number of failed requests
-for a zone can be tracked and fed into the balancing scheme (jalvo@mbay.net)
+   for a zone can be tracked and fed into the balancing scheme (jalvo@mbay.net)
 2. Implement a replace_with_highmem()-like replace_with_regular() to preserve
-dma pages. (lkd@tantalophile.demon.co.uk)
+   dma pages. (lkd@tantalophile.demon.co.uk)
diff --git a/Documentation/vm/cleancache.txt b/Documentation/vm/cleancache.rst
index e4b49df..68cba91 100644
--- a/Documentation/vm/cleancache.txt
+++ b/Documentation/vm/cleancache.rst
@@ -1,4 +1,11 @@
-MOTIVATION
+.. _cleancache:
+
+==========
+Cleancache
+==========
+
+Motivation
+==========
 
 Cleancache is a new optional feature provided by the VFS layer that
 potentially dramatically increases page cache effectiveness for
@@ -21,9 +28,10 @@ Transcendent memory "drivers" for cleancache are currently implemented
 in Xen (using hypervisor memory) and zcache (using in-kernel compressed
 memory) and other implementations are in development.
 
-FAQs are included below.
+:ref:`FAQs <faq>` are included below.
 
-IMPLEMENTATION OVERVIEW
+Implementation Overview
+=======================
 
 A cleancache "backend" that provides transcendent memory registers itself
 to the kernel's cleancache "frontend" by calling cleancache_register_ops,
@@ -80,22 +88,33 @@ different Linux threads are simultaneously putting and invalidating a page
 with the same handle, the results are indeterminate.  Callers must
 lock the page to ensure serial behavior.
 
-CLEANCACHE PERFORMANCE METRICS
+Cleancache Performance Metrics
+==============================
 
 If properly configured, monitoring of cleancache is done via debugfs in
-the /sys/kernel/debug/cleancache directory.  The effectiveness of cleancache
+the `/sys/kernel/debug/cleancache` directory.  The effectiveness of cleancache
 can be measured (across all filesystems) with:
 
-succ_gets	- number of gets that were successful
-failed_gets	- number of gets that failed
-puts		- number of puts attempted (all "succeed")
-invalidates	- number of invalidates attempted
+``succ_gets``
+	number of gets that were successful
+
+``failed_gets``
+	number of gets that failed
+
+``puts``
+	number of puts attempted (all "succeed")
+
+``invalidates``
+	number of invalidates attempted
 
 A backend implementation may provide additional metrics.
 
+.. _faq:
+
 FAQ
+===
 
-1) Where's the value? (Andrew Morton)
+* Where's the value? (Andrew Morton)
 
 Cleancache provides a significant performance benefit to many workloads
 in many environments with negligible overhead by improving the
@@ -137,8 +156,8 @@ device that stores pages of data in a compressed state.  And
 the proposed "RAMster" driver shares RAM across multiple physical
 systems.
 
-2) Why does cleancache have its sticky fingers so deep inside the
-   filesystems and VFS? (Andrew Morton and Christoph Hellwig)
+* Why does cleancache have its sticky fingers so deep inside the
+  filesystems and VFS? (Andrew Morton and Christoph Hellwig)
 
 The core hooks for cleancache in VFS are in most cases a single line
 and the minimum set are placed precisely where needed to maintain
@@ -168,9 +187,9 @@ filesystems in the future.
 The total impact of the hooks to existing fs and mm files is only
 about 40 lines added (not counting comments and blank lines).
 
-3) Why not make cleancache asynchronous and batched so it can
-   more easily interface with real devices with DMA instead
-   of copying each individual page? (Minchan Kim)
+* Why not make cleancache asynchronous and batched so it can more
+  easily interface with real devices with DMA instead of copying each
+  individual page? (Minchan Kim)
 
 The one-page-at-a-time copy semantics simplifies the implementation
 on both the frontend and backend and also allows the backend to
@@ -182,8 +201,8 @@ are avoided.  While the interface seems odd for a "real device"
 or for real kernel-addressable RAM, it makes perfect sense for
 transcendent memory.
 
-4) Why is non-shared cleancache "exclusive"?  And where is the
-   page "invalidated" after a "get"? (Minchan Kim)
+* Why is non-shared cleancache "exclusive"?  And where is the
+  page "invalidated" after a "get"? (Minchan Kim)
 
 The main reason is to free up space in transcendent memory and
 to avoid unnecessary cleancache_invalidate calls.  If you want inclusive,
@@ -193,7 +212,7 @@ be easily extended to add a "get_no_invalidate" call.
 
 The invalidate is done by the cleancache backend implementation.
 
-5) What's the performance impact?
+* What's the performance impact?
 
 Performance analysis has been presented at OLS'09 and LCA'10.
 Briefly, performance gains can be significant on most workloads,
@@ -206,7 +225,7 @@ single-core systems with slow memory-copy speeds, cleancache
 has little value, but in newer multicore machines, especially
 consolidated/virtualized machines, it has great value.
 
-6) How do I add cleancache support for filesystem X? (Boaz Harrash)
+* How do I add cleancache support for filesystem X? (Boaz Harrash)
 
 Filesystems that are well-behaved and conform to certain
 restrictions can utilize cleancache simply by making a call to
@@ -217,26 +236,26 @@ not enable the optional cleancache.
 
 Some points for a filesystem to consider:
 
-- The FS should be block-device-based (e.g. a ram-based FS such
-  as tmpfs should not enable cleancache)
-- To ensure coherency/correctness, the FS must ensure that all
-  file removal or truncation operations either go through VFS or
-  add hooks to do the equivalent cleancache "invalidate" operations
-- To ensure coherency/correctness, either inode numbers must
-  be unique across the lifetime of the on-disk file OR the
-  FS must provide an "encode_fh" function.
-- The FS must call the VFS superblock alloc and deactivate routines
-  or add hooks to do the equivalent cleancache calls done there.
-- To maximize performance, all pages fetched from the FS should
-  go through the do_mpag_readpage routine or the FS should add
-  hooks to do the equivalent (cf. btrfs)
-- Currently, the FS blocksize must be the same as PAGESIZE.  This
-  is not an architectural restriction, but no backends currently
-  support anything different.
-- A clustered FS should invoke the "shared_init_fs" cleancache
-  hook to get best performance for some backends.
-
-7) Why not use the KVA of the inode as the key? (Christoph Hellwig)
+  - The FS should be block-device-based (e.g. a ram-based FS such
+    as tmpfs should not enable cleancache)
+  - To ensure coherency/correctness, the FS must ensure that all
+    file removal or truncation operations either go through VFS or
+    add hooks to do the equivalent cleancache "invalidate" operations
+  - To ensure coherency/correctness, either inode numbers must
+    be unique across the lifetime of the on-disk file OR the
+    FS must provide an "encode_fh" function.
+  - The FS must call the VFS superblock alloc and deactivate routines
+    or add hooks to do the equivalent cleancache calls done there.
+  - To maximize performance, all pages fetched from the FS should
+    go through the do_mpag_readpage routine or the FS should add
+    hooks to do the equivalent (cf. btrfs)
+  - Currently, the FS blocksize must be the same as PAGESIZE.  This
+    is not an architectural restriction, but no backends currently
+    support anything different.
+  - A clustered FS should invoke the "shared_init_fs" cleancache
+    hook to get best performance for some backends.
+
+* Why not use the KVA of the inode as the key? (Christoph Hellwig)
 
 If cleancache would use the inode virtual address instead of
 inode/filehandle, the pool id could be eliminated.  But, this
@@ -251,7 +270,7 @@ of cleancache would be lost because the cache of pages in cleanache
 is potentially much larger than the kernel pagecache and is most
 useful if the pages survive inode cache removal.
 
-8) Why is a global variable required?
+* Why is a global variable required?
 
 The cleancache_enabled flag is checked in all of the frequently-used
 cleancache hooks.  The alternative is a function call to check a static
@@ -262,14 +281,14 @@ global variable allows cleancache to be enabled by default at compile
 time, but have insignificant performance impact when cleancache remains
 disabled at runtime.
 
-9) Does cleanache work with KVM?
+* Does cleanache work with KVM?
 
 The memory model of KVM is sufficiently different that a cleancache
 backend may have less value for KVM.  This remains to be tested,
 especially in an overcommitted system.
 
-10) Does cleancache work in userspace?  It sounds useful for
-   memory hungry caches like web browsers.  (Jamie Lokier)
+* Does cleancache work in userspace?  It sounds useful for
+  memory hungry caches like web browsers.  (Jamie Lokier)
 
 No plans yet, though we agree it sounds useful, at least for
 apps that bypass the page cache (e.g. O_DIRECT).
diff --git a/Documentation/vm/conf.py b/Documentation/vm/conf.py
new file mode 100644
index 0000000..3b0b601
--- /dev/null
+++ b/Documentation/vm/conf.py
@@ -0,0 +1,10 @@
+# -*- coding: utf-8; mode: python -*-
+
+project = "Linux Memory Management Documentation"
+
+tags.add("subproject")
+
+latex_documents = [
+    ('index', 'memory-management.tex', project,
+     'The kernel development community', 'manual'),
+]
diff --git a/Documentation/vm/frontswap.txt b/Documentation/vm/frontswap.rst
index c71a019..1979f43 100644
--- a/Documentation/vm/frontswap.txt
+++ b/Documentation/vm/frontswap.rst
@@ -1,13 +1,20 @@
+.. _frontswap:
+
+=========
+Frontswap
+=========
+
 Frontswap provides a "transcendent memory" interface for swap pages.
 In some environments, dramatic performance savings may be obtained because
 swapped pages are saved in RAM (or a RAM-like device) instead of a swap disk.
 
-(Note, frontswap -- and cleancache (merged at 3.0) -- are the "frontends"
+(Note, frontswap -- and :ref:`cleancache` (merged at 3.0) -- are the "frontends"
 and the only necessary changes to the core kernel for transcendent memory;
 all other supporting code -- the "backends" -- is implemented as drivers.
-See the LWN.net article "Transcendent memory in a nutshell" for a detailed
-overview of frontswap and related kernel parts:
-https://lwn.net/Articles/454795/ )
+See the LWN.net article `Transcendent memory in a nutshell`_
+for a detailed overview of frontswap and related kernel parts)
+
+.. _Transcendent memory in a nutshell: https://lwn.net/Articles/454795/
 
 Frontswap is so named because it can be thought of as the opposite of
 a "backing" store for a swap device.  The storage is assumed to be
@@ -50,19 +57,27 @@ or the store fails AND the page is invalidated.  This ensures stale data may
 never be obtained from frontswap.
 
 If properly configured, monitoring of frontswap is done via debugfs in
-the /sys/kernel/debug/frontswap directory.  The effectiveness of
+the `/sys/kernel/debug/frontswap` directory.  The effectiveness of
 frontswap can be measured (across all swap devices) with:
 
-failed_stores	- how many store attempts have failed
-loads		- how many loads were attempted (all should succeed)
-succ_stores	- how many store attempts have succeeded
-invalidates	- how many invalidates were attempted
+``failed_stores``
+	how many store attempts have failed
+
+``loads``
+	how many loads were attempted (all should succeed)
+
+``succ_stores``
+	how many store attempts have succeeded
+
+``invalidates``
+	how many invalidates were attempted
 
 A backend implementation may provide additional metrics.
 
 FAQ
+===
 
-1) Where's the value?
+* Where's the value?
 
 When a workload starts swapping, performance falls through the floor.
 Frontswap significantly increases performance in many such workloads by
@@ -117,8 +132,8 @@ A KVM implementation is underway and has been RFC'ed to lkml.  And,
 using frontswap, investigation is also underway on the use of NVM as
 a memory extension technology.
 
-2) Sure there may be performance advantages in some situations, but
-   what's the space/time overhead of frontswap?
+* Sure there may be performance advantages in some situations, but
+  what's the space/time overhead of frontswap?
 
 If CONFIG_FRONTSWAP is disabled, every frontswap hook compiles into
 nothingness and the only overhead is a few extra bytes per swapon'ed
@@ -148,8 +163,8 @@ pressure that can potentially outweigh the other advantages.  A
 backend, such as zcache, must implement policies to carefully (but
 dynamically) manage memory limits to ensure this doesn't happen.
 
-3) OK, how about a quick overview of what this frontswap patch does
-   in terms that a kernel hacker can grok?
+* OK, how about a quick overview of what this frontswap patch does
+  in terms that a kernel hacker can grok?
 
 Let's assume that a frontswap "backend" has registered during
 kernel initialization; this registration indicates that this
@@ -188,9 +203,9 @@ and (potentially) a swap device write are replaced by a "frontswap backend
 store" and (possibly) a "frontswap backend loads", which are presumably much
 faster.
 
-4) Can't frontswap be configured as a "special" swap device that is
-   just higher priority than any real swap device (e.g. like zswap,
-   or maybe swap-over-nbd/NFS)?
+* Can't frontswap be configured as a "special" swap device that is
+  just higher priority than any real swap device (e.g. like zswap,
+  or maybe swap-over-nbd/NFS)?
 
 No.  First, the existing swap subsystem doesn't allow for any kind of
 swap hierarchy.  Perhaps it could be rewritten to accommodate a hierarchy,
@@ -240,9 +255,9 @@ installation, frontswap is useless.  Swapless portable devices
 can still use frontswap but a backend for such devices must configure
 some kind of "ghost" swap device and ensure that it is never used.
 
-5) Why this weird definition about "duplicate stores"?  If a page
-   has been previously successfully stored, can't it always be
-   successfully overwritten?
+* Why this weird definition about "duplicate stores"?  If a page
+  has been previously successfully stored, can't it always be
+  successfully overwritten?
 
 Nearly always it can, but no, sometimes it cannot.  Consider an example
 where data is compressed and the original 4K page has been compressed
@@ -254,7 +269,7 @@ the old data and ensure that it is no longer accessible.  Since the
 swap subsystem then writes the new data to the read swap device,
 this is the correct course of action to ensure coherency.
 
-6) What is frontswap_shrink for?
+* What is frontswap_shrink for?
 
 When the (non-frontswap) swap subsystem swaps out a page to a real
 swap device, that page is only taking up low-value pre-allocated disk
@@ -267,7 +282,7 @@ to "repatriate" pages sent to a remote machine back to the local machine;
 this is driven using the frontswap_shrink mechanism when memory pressure
 subsides.
 
-7) Why does the frontswap patch create the new include file swapfile.h?
+* Why does the frontswap patch create the new include file swapfile.h?
 
 The frontswap code depends on some swap-subsystem-internal data
 structures that have, over the years, moved back and forth between
diff --git a/Documentation/vm/highmem.txt b/Documentation/vm/highmem.rst
index 4324d24..0f69a9f 100644
--- a/Documentation/vm/highmem.txt
+++ b/Documentation/vm/highmem.rst
@@ -1,25 +1,14 @@
+.. _highmem:
 
-			     ====================
-			     HIGH MEMORY HANDLING
-			     ====================
+====================
+High Memory Handling
+====================
 
 By: Peter Zijlstra <a.p.zijlstra@chello.nl>
 
-Contents:
-
- (*) What is high memory?
-
- (*) Temporary virtual mappings.
-
- (*) Using kmap_atomic.
-
- (*) Cost of temporary mappings.
-
- (*) i386 PAE.
+.. contents:: :local:
 
-
-====================
-WHAT IS HIGH MEMORY?
+What Is High Memory?
 ====================
 
 High memory (highmem) is used when the size of physical memory approaches or
@@ -38,7 +27,7 @@ kernel entry/exit.  This means the available virtual memory space (4GiB on
 i386) has to be divided between user and kernel space.
 
 The traditional split for architectures using this approach is 3:1, 3GiB for
-userspace and the top 1GiB for kernel space:
+userspace and the top 1GiB for kernel space::
 
 		+--------+ 0xffffffff
 		| Kernel |
@@ -58,40 +47,38 @@ and user maps.  Some hardware (like some ARMs), however, have limited virtual
 space when they use mm context tags.
 
 
-==========================
-TEMPORARY VIRTUAL MAPPINGS
+Temporary Virtual Mappings
 ==========================
 
 The kernel contains several ways of creating temporary mappings:
 
- (*) vmap().  This can be used to make a long duration mapping of multiple
-     physical pages into a contiguous virtual space.  It needs global
-     synchronization to unmap.
+* vmap().  This can be used to make a long duration mapping of multiple
+  physical pages into a contiguous virtual space.  It needs global
+  synchronization to unmap.
 
- (*) kmap().  This permits a short duration mapping of a single page.  It needs
-     global synchronization, but is amortized somewhat.  It is also prone to
-     deadlocks when using in a nested fashion, and so it is not recommended for
-     new code.
+* kmap().  This permits a short duration mapping of a single page.  It needs
+  global synchronization, but is amortized somewhat.  It is also prone to
+  deadlocks when using in a nested fashion, and so it is not recommended for
+  new code.
 
- (*) kmap_atomic().  This permits a very short duration mapping of a single
-     page.  Since the mapping is restricted to the CPU that issued it, it
-     performs well, but the issuing task is therefore required to stay on that
-     CPU until it has finished, lest some other task displace its mappings.
+* kmap_atomic().  This permits a very short duration mapping of a single
+  page.  Since the mapping is restricted to the CPU that issued it, it
+  performs well, but the issuing task is therefore required to stay on that
+  CPU until it has finished, lest some other task displace its mappings.
 
-     kmap_atomic() may also be used by interrupt contexts, since it is does not
-     sleep and the caller may not sleep until after kunmap_atomic() is called.
+  kmap_atomic() may also be used by interrupt contexts, since it is does not
+  sleep and the caller may not sleep until after kunmap_atomic() is called.
 
-     It may be assumed that k[un]map_atomic() won't fail.
+  It may be assumed that k[un]map_atomic() won't fail.
 
 
-=================
-USING KMAP_ATOMIC
+Using kmap_atomic
 =================
 
 When and where to use kmap_atomic() is straightforward.  It is used when code
 wants to access the contents of a page that might be allocated from high memory
 (see __GFP_HIGHMEM), for example a page in the pagecache.  The API has two
-functions, and they can be used in a manner similar to the following:
+functions, and they can be used in a manner similar to the following::
 
 	/* Find the page of interest. */
 	struct page *page = find_get_page(mapping, offset);
@@ -109,7 +96,7 @@ Note that the kunmap_atomic() call takes the result of the kmap_atomic() call
 not the argument.
 
 If you need to map two pages because you want to copy from one page to
-another you need to keep the kmap_atomic calls strictly nested, like:
+another you need to keep the kmap_atomic calls strictly nested, like::
 
 	vaddr1 = kmap_atomic(page1);
 	vaddr2 = kmap_atomic(page2);
@@ -120,8 +107,7 @@ another you need to keep the kmap_atomic calls strictly nested, like:
 	kunmap_atomic(vaddr1);
 
 
-==========================
-COST OF TEMPORARY MAPPINGS
+Cost of Temporary Mappings
 ==========================
 
 The cost of creating temporary mappings can be quite high.  The arch has to
@@ -136,25 +122,24 @@ If CONFIG_MMU is not set, then there can be no temporary mappings and no
 highmem.  In such a case, the arithmetic approach will also be used.
 
 
-========
 i386 PAE
 ========
 
 The i386 arch, under some circumstances, will permit you to stick up to 64GiB
 of RAM into your 32-bit machine.  This has a number of consequences:
 
- (*) Linux needs a page-frame structure for each page in the system and the
-     pageframes need to live in the permanent mapping, which means:
+* Linux needs a page-frame structure for each page in the system and the
+  pageframes need to live in the permanent mapping, which means:
 
- (*) you can have 896M/sizeof(struct page) page-frames at most; with struct
-     page being 32-bytes that would end up being something in the order of 112G
-     worth of pages; the kernel, however, needs to store more than just
-     page-frames in that memory...
+* you can have 896M/sizeof(struct page) page-frames at most; with struct
+  page being 32-bytes that would end up being something in the order of 112G
+  worth of pages; the kernel, however, needs to store more than just
+  page-frames in that memory...
 
- (*) PAE makes your page tables larger - which slows the system down as more
-     data has to be accessed to traverse in TLB fills and the like.  One
-     advantage is that PAE has more PTE bits and can provide advanced features
-     like NX and PAT.
+* PAE makes your page tables larger - which slows the system down as more
+  data has to be accessed to traverse in TLB fills and the like.  One
+  advantage is that PAE has more PTE bits and can provide advanced features
+  like NX and PAT.
 
 The general recommendation is that you don't use more than 8GiB on a 32-bit
 machine - although more might work for you and your workload, you're pretty
diff --git a/Documentation/vm/hmm.txt b/Documentation/vm/hmm.rst
index 2d1d6f6..cdf3911 100644
--- a/Documentation/vm/hmm.txt
+++ b/Documentation/vm/hmm.rst
@@ -1,4 +1,8 @@
+.. hmm:
+
+=====================================
 Heterogeneous Memory Management (HMM)
+=====================================
 
 Provide infrastructure and helpers to integrate non-conventional memory (device
 memory like GPU on board memory) into regular kernel path, with the cornerstone
@@ -6,10 +10,10 @@ of this being specialized struct page for such memory (see sections 5 to 7 of
 this document).
 
 HMM also provides optional helpers for SVM (Share Virtual Memory), i.e.,
-allowing a device to transparently access program address coherently with the
-CPU meaning that any valid pointer on the CPU is also a valid pointer for the
-device. This is becoming mandatory to simplify the use of advanced hetero-
-geneous computing where GPU, DSP, or FPGA are used to perform various
+allowing a device to transparently access program address coherently with
+the CPU meaning that any valid pointer on the CPU is also a valid pointer
+for the device. This is becoming mandatory to simplify the use of advanced
+heterogeneous computing where GPU, DSP, or FPGA are used to perform various
 computations on behalf of a process.
 
 This document is divided as follows: in the first section I expose the problems
@@ -21,19 +25,10 @@ fifth section deals with how device memory is represented inside the kernel.
 Finally, the last section presents a new migration helper that allows lever-
 aging the device DMA engine.
 
+.. contents:: :local:
 
-1) Problems of using a device specific memory allocator:
-2) I/O bus, device memory characteristics
-3) Shared address space and migration
-4) Address space mirroring implementation and API
-5) Represent and manage device memory from core kernel point of view
-6) Migration to and from device memory
-7) Memory cgroup (memcg) and rss accounting
-
-
--------------------------------------------------------------------------------
-
-1) Problems of using a device specific memory allocator:
+Problems of using a device specific memory allocator
+====================================================
 
 Devices with a large amount of on board memory (several gigabytes) like GPUs
 have historically managed their memory through dedicated driver specific APIs.
@@ -77,9 +72,8 @@ are only do-able with a shared address space. It is also more reasonable to use
 a shared address space for all other patterns.
 
 
--------------------------------------------------------------------------------
-
-2) I/O bus, device memory characteristics
+I/O bus, device memory characteristics
+======================================
 
 I/O buses cripple shared address spaces due to a few limitations. Most I/O
 buses only allow basic memory access from device to main memory; even cache
@@ -109,9 +103,8 @@ access any memory but we must also permit any memory to be migrated to device
 memory while device is using it (blocking CPU access while it happens).
 
 
--------------------------------------------------------------------------------
-
-3) Shared address space and migration
+Shared address space and migration
+==================================
 
 HMM intends to provide two main features. First one is to share the address
 space by duplicating the CPU page table in the device page table so the same
@@ -148,23 +141,23 @@ ages device memory by migrating the part of the data set that is actively being
 used by the device.
 
 
--------------------------------------------------------------------------------
-
-4) Address space mirroring implementation and API
+Address space mirroring implementation and API
+==============================================
 
 Address space mirroring's main objective is to allow duplication of a range of
 CPU page table into a device page table; HMM helps keep both synchronized. A
 device driver that wants to mirror a process address space must start with the
-registration of an hmm_mirror struct:
+registration of an hmm_mirror struct::
 
  int hmm_mirror_register(struct hmm_mirror *mirror,
                          struct mm_struct *mm);
  int hmm_mirror_register_locked(struct hmm_mirror *mirror,
                                 struct mm_struct *mm);
 
+
 The locked variant is to be used when the driver is already holding mmap_sem
 of the mm in write mode. The mirror struct has a set of callbacks that are used
-to propagate CPU page tables:
+to propagate CPU page tables::
 
  struct hmm_mirror_ops {
      /* sync_cpu_device_pagetables() - synchronize page tables
@@ -193,10 +186,10 @@ The device driver must perform the update action to the range (mark range
 read only, or fully unmap, ...). The device must be done with the update before
 the driver callback returns.
 
-
 When the device driver wants to populate a range of virtual addresses, it can
-use either:
- int hmm_vma_get_pfns(struct vm_area_struct *vma,
+use either::
+
+  int hmm_vma_get_pfns(struct vm_area_struct *vma,
                       struct hmm_range *range,
                       unsigned long start,
                       unsigned long end,
@@ -221,7 +214,7 @@ provides a set of flags to help the driver identify special CPU page table
 entries.
 
 Locking with the update() callback is the most important aspect the driver must
-respect in order to keep things properly synchronized. The usage pattern is:
+respect in order to keep things properly synchronized. The usage pattern is::
 
  int driver_populate_range(...)
  {
@@ -262,9 +255,8 @@ report commands as executed is serialized (there is no point in doing this
 concurrently).
 
 
--------------------------------------------------------------------------------
-
-5) Represent and manage device memory from core kernel point of view
+Represent and manage device memory from core kernel point of view
+=================================================================
 
 Several different designs were tried to support device memory. First one used
 a device specific data structure to keep information about migrated memory and
@@ -280,14 +272,14 @@ unaware of the difference. We only need to make sure that no one ever tries to
 map those pages from the CPU side.
 
 HMM provides a set of helpers to register and hotplug device memory as a new
-region needing a struct page. This is offered through a very simple API:
+region needing a struct page. This is offered through a very simple API::
 
  struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
                                    struct device *device,
                                    unsigned long size);
  void hmm_devmem_remove(struct hmm_devmem *devmem);
 
-The hmm_devmem_ops is where most of the important things are:
+The hmm_devmem_ops is where most of the important things are::
 
  struct hmm_devmem_ops {
      void (*free)(struct hmm_devmem *devmem, struct page *page);
@@ -306,13 +298,12 @@ which it cannot do. This second callback must trigger a migration back to
 system memory.
 
 
--------------------------------------------------------------------------------
-
-6) Migration to and from device memory
+Migration to and from device memory
+===================================
 
 Because the CPU cannot access device memory, migration must use the device DMA
 engine to perform copy from and to device memory. For this we need a new
-migration helper:
+migration helper::
 
  int migrate_vma(const struct migrate_vma_ops *ops,
                  struct vm_area_struct *vma,
@@ -331,7 +322,7 @@ migration might be for a range of addresses the device is actively accessing.
 
 The migrate_vma_ops struct defines two callbacks. First one (alloc_and_copy())
 controls destination memory allocation and copy operation. Second one is there
-to allow the device driver to perform cleanup operations after migration.
+to allow the device driver to perform cleanup operations after migration::
 
  struct migrate_vma_ops {
      void (*alloc_and_copy)(struct vm_area_struct *vma,
@@ -365,9 +356,8 @@ bandwidth but this is considered as a rare event and a price that we are
 willing to pay to keep all the code simpler.
 
 
--------------------------------------------------------------------------------
-
-7) Memory cgroup (memcg) and rss accounting
+Memory cgroup (memcg) and rss accounting
+========================================
 
 For now device memory is accounted as any regular page in rss counters (either
 anonymous if device page is used for anonymous, file if device page is used for
diff --git a/Documentation/vm/hugetlbfs_reserv.txt b/Documentation/vm/hugetlbfs_reserv.rst
index 9aca09a..9d20076 100644
--- a/Documentation/vm/hugetlbfs_reserv.txt
+++ b/Documentation/vm/hugetlbfs_reserv.rst
@@ -1,6 +1,13 @@
-Hugetlbfs Reservation Overview
-------------------------------
-Huge pages as described at 'Documentation/vm/hugetlbpage.txt' are typically
+.. _hugetlbfs_reserve:
+
+=====================
+Hugetlbfs Reservation
+=====================
+
+Overview
+========
+
+Huge pages as described at :ref:`hugetlbpage` are typically
 preallocated for application use.  These huge pages are instantiated in a
 task's address space at page fault time if the VMA indicates huge pages are
 to be used.  If no huge page exists at page fault time, the task is sent
@@ -17,47 +24,55 @@ describe how huge page reserve processing is done in the v4.10 kernel.
 
 
 Audience
---------
+========
 This description is primarily targeted at kernel developers who are modifying
 hugetlbfs code.
 
 
 The Data Structures
--------------------
+===================
+
 resv_huge_pages
 	This is a global (per-hstate) count of reserved huge pages.  Reserved
 	huge pages are only available to the task which reserved them.
 	Therefore, the number of huge pages generally available is computed
-	as (free_huge_pages - resv_huge_pages).
+	as (``free_huge_pages - resv_huge_pages``).
 Reserve Map
-	A reserve map is described by the structure:
-	struct resv_map {
-		struct kref refs;
-		spinlock_t lock;
-		struct list_head regions;
-		long adds_in_progress;
-		struct list_head region_cache;
-		long region_cache_count;
-	};
+	A reserve map is described by the structure::
+
+		struct resv_map {
+			struct kref refs;
+			spinlock_t lock;
+			struct list_head regions;
+			long adds_in_progress;
+			struct list_head region_cache;
+			long region_cache_count;
+		};
+
 	There is one reserve map for each huge page mapping in the system.
 	The regions list within the resv_map describes the regions within
-	the mapping.  A region is described as:
-	struct file_region {
-		struct list_head link;
-		long from;
-		long to;
-	};
+	the mapping.  A region is described as::
+
+		struct file_region {
+			struct list_head link;
+			long from;
+			long to;
+		};
+
 	The 'from' and 'to' fields of the file region structure are huge page
 	indices into the mapping.  Depending on the type of mapping, a
 	region in the reserv_map may indicate reservations exist for the
 	range, or reservations do not exist.
 Flags for MAP_PRIVATE Reservations
 	These are stored in the bottom bits of the reservation map pointer.
-	#define HPAGE_RESV_OWNER    (1UL << 0) Indicates this task is the
-		owner of the reservations associated with the mapping.
-	#define HPAGE_RESV_UNMAPPED (1UL << 1) Indicates task originally
-		mapping this range (and creating reserves) has unmapped a
-		page from this task (the child) due to a failed COW.
+
+	``#define HPAGE_RESV_OWNER    (1UL << 0)``
+		Indicates this task is the owner of the reservations
+		associated with the mapping.
+	``#define HPAGE_RESV_UNMAPPED (1UL << 1)``
+		Indicates task originally mapping this range (and creating
+		reserves) has unmapped a page from this task (the child)
+		due to a failed COW.
 Page Flags
 	The PagePrivate page flag is used to indicate that a huge page
 	reservation must be restored when the huge page is freed.  More
@@ -65,12 +80,14 @@ Page Flags
 
 
 Reservation Map Location (Private or Shared)
---------------------------------------------
+============================================
+
 A huge page mapping or segment is either private or shared.  If private,
 it is typically only available to a single address space (task).  If shared,
 it can be mapped into multiple address spaces (tasks).  The location and
 semantics of the reservation map is significantly different for two types
 of mappings.  Location differences are:
+
 - For private mappings, the reservation map hangs off the the VMA structure.
   Specifically, vma->vm_private_data.  This reserve map is created at the
   time the mapping (mmap(MAP_PRIVATE)) is created.
@@ -82,15 +99,15 @@ of mappings.  Location differences are:
 
 
 Creating Reservations
----------------------
+=====================
 Reservations are created when a huge page backed shared memory segment is
 created (shmget(SHM_HUGETLB)) or a mapping is created via mmap(MAP_HUGETLB).
-These operations result in a call to the routine hugetlb_reserve_pages()
+These operations result in a call to the routine hugetlb_reserve_pages()::
 
-int hugetlb_reserve_pages(struct inode *inode,
-					long from, long to,
-					struct vm_area_struct *vma,
-					vm_flags_t vm_flags)
+	int hugetlb_reserve_pages(struct inode *inode,
+				  long from, long to,
+				  struct vm_area_struct *vma,
+				  vm_flags_t vm_flags)
 
 The first thing hugetlb_reserve_pages() does is check for the NORESERVE
 flag was specified in either the shmget() or mmap() call.  If NORESERVE
@@ -105,6 +122,7 @@ the 'from' and 'to' arguments have been adjusted by this offset.
 
 One of the big differences between PRIVATE and SHARED mappings is the way
 in which reservations are represented in the reservation map.
+
 - For shared mappings, an entry in the reservation map indicates a reservation
   exists or did exist for the corresponding page.  As reservations are
   consumed, the reservation map is not modified.
@@ -121,12 +139,13 @@ to indicate this VMA owns the reservations.
 The reservation map is consulted to determine how many huge page reservations
 are needed for the current mapping/segment.  For private mappings, this is
 always the value (to - from).  However, for shared mappings it is possible that some reservations may already exist within the range (to - from).  See the
-section "Reservation Map Modifications" for details on how this is accomplished.
+section :ref:`Reservation Map Modifications <resv_map_modifications>`
+for details on how this is accomplished.
 
 The mapping may be associated with a subpool.  If so, the subpool is consulted
 to ensure there is sufficient space for the mapping.  It is possible that the
 subpool has set aside reservations that can be used for the mapping.  See the
-section "Subpool Reservations" for more details.
+section :ref:`Subpool Reservations <sub_pool_resv>` for more details.
 
 After consulting the reservation map and subpool, the number of needed new
 reservations is known.  The routine hugetlb_acct_memory() is called to check
@@ -135,9 +154,11 @@ calls into routines that potentially allocate and adjust surplus page counts.
 However, within those routines the code is simply checking to ensure there
 are enough free huge pages to accommodate the reservation.  If there are,
 the global reservation count resv_huge_pages is adjusted something like the
-following.
+following::
+
 	if (resv_needed <= (resv_huge_pages - free_huge_pages))
 		resv_huge_pages += resv_needed;
+
 Note that the global lock hugetlb_lock is held when checking and adjusting
 these counters.
 
@@ -152,14 +173,18 @@ If hugetlb_reserve_pages() was successful, the global reservation count and
 reservation map associated with the mapping will be modified as required to
 ensure reservations exist for the range 'from' - 'to'.
 
+.. _consume_resv:
 
 Consuming Reservations/Allocating a Huge Page
----------------------------------------------
+=============================================
+
 Reservations are consumed when huge pages associated with the reservations
 are allocated and instantiated in the corresponding mapping.  The allocation
-is performed within the routine alloc_huge_page().
-struct page *alloc_huge_page(struct vm_area_struct *vma,
-                                    unsigned long addr, int avoid_reserve)
+is performed within the routine alloc_huge_page()::
+
+	struct page *alloc_huge_page(struct vm_area_struct *vma,
+				     unsigned long addr, int avoid_reserve)
+
 alloc_huge_page is passed a VMA pointer and a virtual address, so it can
 consult the reservation map to determine if a reservation exists.  In addition,
 alloc_huge_page takes the argument avoid_reserve which indicates reserves
@@ -170,8 +195,9 @@ page are being allocated.
 
 The helper routine vma_needs_reservation() is called to determine if a
 reservation exists for the address within the mapping(vma).  See the section
-"Reservation Map Helper Routines" for detailed information on what this
-routine does.  The value returned from vma_needs_reservation() is generally
+:ref:`Reservation Map Helper Routines <resv_map_helpers>` for detailed
+information on what this routine does.
+The value returned from vma_needs_reservation() is generally
 0 or 1.  0 if a reservation exists for the address, 1 if no reservation exists.
 If a reservation does not exist, and there is a subpool associated with the
 mapping the subpool is consulted to determine if it contains reservations.
@@ -180,21 +206,25 @@ However, in every case the avoid_reserve argument overrides the use of
 a reservation for the allocation.  After determining whether a reservation
 exists and can be used for the allocation, the routine dequeue_huge_page_vma()
 is called.  This routine takes two arguments related to reservations:
+
 - avoid_reserve, this is the same value/argument passed to alloc_huge_page()
 - chg, even though this argument is of type long only the values 0 or 1 are
   passed to dequeue_huge_page_vma.  If the value is 0, it indicates a
   reservation exists (see the section "Memory Policy and Reservations" for
   possible issues).  If the value is 1, it indicates a reservation does not
   exist and the page must be taken from the global free pool if possible.
+
 The free lists associated with the memory policy of the VMA are searched for
 a free page.  If a page is found, the value free_huge_pages is decremented
 when the page is removed from the free list.  If there was a reservation
-associated with the page, the following adjustments are made:
+associated with the page, the following adjustments are made::
+
 	SetPagePrivate(page);	/* Indicates allocating this page consumed
 				 * a reservation, and if an error is
 				 * encountered such that the page must be
 				 * freed, the reservation will be restored. */
 	resv_huge_pages--;	/* Decrement the global reservation count */
+
 Note, if no huge page can be found that satisfies the VMA's memory policy
 an attempt will be made to allocate one using the buddy allocator.  This
 brings up the issue of surplus huge pages and overcommit which is beyond
@@ -222,12 +252,14 @@ mapping.  In such cases, the reservation count and subpool free page count
 will be off by one.  This rare condition can be identified by comparing the
 return value from vma_needs_reservation and vma_commit_reservation.  If such
 a race is detected, the subpool and global reserve counts are adjusted to
-compensate.  See the section "Reservation Map Helper Routines" for more
+compensate.  See the section
+:ref:`Reservation Map Helper Routines <resv_map_helpers>` for more
 information on these routines.
 
 
 Instantiate Huge Pages
-----------------------
+======================
+
 After huge page allocation, the page is typically added to the page tables
 of the allocating task.  Before this, pages in a shared mapping are added
 to the page cache and pages in private mappings are added to an anonymous
@@ -237,7 +269,8 @@ to the global reservation count (resv_huge_pages).
 
 
 Freeing Huge Pages
-------------------
+==================
+
 Huge page freeing is performed by the routine free_huge_page().  This routine
 is the destructor for hugetlbfs compound pages.  As a result, it is only
 passed a pointer to the page struct.  When a huge page is freed, reservation
@@ -247,7 +280,8 @@ on an error path where a global reserve count must be restored.
 
 The page->private field points to any subpool associated with the page.
 If the PagePrivate flag is set, it indicates the global reserve count should
-be adjusted (see the section "Consuming Reservations/Allocating a Huge Page"
+be adjusted (see the section
+:ref:`Consuming Reservations/Allocating a Huge Page <consume_resv>`
 for information on how these are set).
 
 The routine first calls hugepage_subpool_put_pages() for the page.  If this
@@ -259,9 +293,11 @@ Therefore, the global resv_huge_pages counter is incremented in this case.
 If the PagePrivate flag was set in the page, the global resv_huge_pages counter
 will always be incremented.
 
+.. _sub_pool_resv:
 
 Subpool Reservations
---------------------
+====================
+
 There is a struct hstate associated with each huge page size.  The hstate
 tracks all huge pages of the specified size.  A subpool represents a subset
 of pages within a hstate that is associated with a mounted hugetlbfs
@@ -295,7 +331,8 @@ the global pools.
 
 
 COW and Reservations
---------------------
+====================
+
 Since shared mappings all point to and use the same underlying pages, the
 biggest reservation concern for COW is private mappings.  In this case,
 two tasks can be pointing at the same previously allocated page.  One task
@@ -326,30 +363,36 @@ faults on a non-present page.  But, the original owner of the
 mapping/reservation will behave as expected.
 
 
+.. _resv_map_modifications:
+
 Reservation Map Modifications
------------------------------
+=============================
+
 The following low level routines are used to make modifications to a
 reservation map.  Typically, these routines are not called directly.  Rather,
 a reservation map helper routine is called which calls one of these low level
 routines.  These low level routines are fairly well documented in the source
-code (mm/hugetlb.c).  These routines are:
-long region_chg(struct resv_map *resv, long f, long t);
-long region_add(struct resv_map *resv, long f, long t);
-void region_abort(struct resv_map *resv, long f, long t);
-long region_count(struct resv_map *resv, long f, long t);
+code (mm/hugetlb.c).  These routines are::
+
+	long region_chg(struct resv_map *resv, long f, long t);
+	long region_add(struct resv_map *resv, long f, long t);
+	void region_abort(struct resv_map *resv, long f, long t);
+	long region_count(struct resv_map *resv, long f, long t);
 
 Operations on the reservation map typically involve two operations:
+
 1) region_chg() is called to examine the reserve map and determine how
    many pages in the specified range [f, t) are NOT currently represented.
 
    The calling code performs global checks and allocations to determine if
    there are enough huge pages for the operation to succeed.
 
-2a) If the operation can succeed, region_add() is called to actually modify
-    the reservation map for the same range [f, t) previously passed to
-    region_chg().
-2b) If the operation can not succeed, region_abort is called for the same range
-    [f, t) to abort the operation.
+2)
+  a) If the operation can succeed, region_add() is called to actually modify
+     the reservation map for the same range [f, t) previously passed to
+     region_chg().
+  b) If the operation can not succeed, region_abort is called for the same
+     range [f, t) to abort the operation.
 
 Note that this is a two step process where region_add() and region_abort()
 are guaranteed to succeed after a prior call to region_chg() for the same
@@ -371,6 +414,7 @@ and make the appropriate adjustments.
 
 The routine region_del() is called to remove regions from a reservation map.
 It is typically called in the following situations:
+
 - When a file in the hugetlbfs filesystem is being removed, the inode will
   be released and the reservation map freed.  Before freeing the reservation
   map, all the individual file_region structures must be freed.  In this case
@@ -384,6 +428,7 @@ It is typically called in the following situations:
   removed, region_del() is called to remove the corresponding entry from the
   reservation map.  In this case, region_del is passed the range
   [page_idx, page_idx + 1).
+
 In every case, region_del() will return the number of pages removed from the
 reservation map.  In VERY rare cases, region_del() can fail.  This can only
 happen in the hole punch case where it has to split an existing file_region
@@ -403,9 +448,11 @@ outstanding (outstanding = (end - start) - region_count(resv, start, end)).
 Since the mapping is going away, the subpool and global reservation counts
 are decremented by the number of outstanding reservations.
 
+.. _resv_map_helpers:
 
 Reservation Map Helper Routines
--------------------------------
+===============================
+
 Several helper routines exist to query and modify the reservation maps.
 These routines are only interested with reservations for a specific huge
 page, so they just pass in an address instead of a range.  In addition,
@@ -414,32 +461,40 @@ or shared) and the location of the reservation map (inode or VMA) can be
 determined.  These routines simply call the underlying routines described
 in the section "Reservation Map Modifications".  However, they do take into
 account the 'opposite' meaning of reservation map entries for private and
-shared mappings and hide this detail from the caller.
+shared mappings and hide this detail from the caller::
+
+	long vma_needs_reservation(struct hstate *h,
+				   struct vm_area_struct *vma,
+				   unsigned long addr)
 
-long vma_needs_reservation(struct hstate *h,
-				struct vm_area_struct *vma, unsigned long addr)
 This routine calls region_chg() for the specified page.  If no reservation
-exists, 1 is returned.  If a reservation exists, 0 is returned.
+exists, 1 is returned.  If a reservation exists, 0 is returned::
+
+	long vma_commit_reservation(struct hstate *h,
+				    struct vm_area_struct *vma,
+				    unsigned long addr)
 
-long vma_commit_reservation(struct hstate *h,
-				struct vm_area_struct *vma, unsigned long addr)
 This calls region_add() for the specified page.  As in the case of region_chg
 and region_add, this routine is to be called after a previous call to
 vma_needs_reservation.  It will add a reservation entry for the page.  It
 returns 1 if the reservation was added and 0 if not.  The return value should
 be compared with the return value of the previous call to
 vma_needs_reservation.  An unexpected difference indicates the reservation
-map was modified between calls.
+map was modified between calls::
+
+	void vma_end_reservation(struct hstate *h,
+				 struct vm_area_struct *vma,
+				 unsigned long addr)
 
-void vma_end_reservation(struct hstate *h,
-				struct vm_area_struct *vma, unsigned long addr)
 This calls region_abort() for the specified page.  As in the case of region_chg
 and region_abort, this routine is to be called after a previous call to
 vma_needs_reservation.  It will abort/end the in progress reservation add
-operation.
+operation::
+
+	long vma_add_reservation(struct hstate *h,
+				 struct vm_area_struct *vma,
+				 unsigned long addr)
 
-long vma_add_reservation(struct hstate *h,
-				struct vm_area_struct *vma, unsigned long addr)
 This is a special wrapper routine to help facilitate reservation cleanup
 on error paths.  It is only called from the routine restore_reserve_on_error().
 This routine is used in conjunction with vma_needs_reservation in an attempt
@@ -453,8 +508,10 @@ be done on error paths.
 
 
 Reservation Cleanup in Error Paths
-----------------------------------
-As mentioned in the section "Reservation Map Helper Routines", reservation
+==================================
+
+As mentioned in the section
+:ref:`Reservation Map Helper Routines <resv_map_helpers>`, reservation
 map modifications are performed in two steps.  First vma_needs_reservation
 is called before a page is allocated.  If the allocation is successful,
 then vma_commit_reservation is called.  If not, vma_end_reservation is called.
@@ -494,13 +551,14 @@ so that a reservation will not be leaked when the huge page is freed.
 
 
 Reservations and Memory Policy
-------------------------------
+==============================
 Per-node huge page lists existed in struct hstate when git was first used
 to manage Linux code.  The concept of reservations was added some time later.
 When reservations were added, no attempt was made to take memory policy
 into account.  While cpusets are not exactly the same as memory policy, this
 comment in hugetlb_acct_memory sums up the interaction between reservations
-and cpusets/memory policy.
+and cpusets/memory policy::
+
 	/*
 	 * When cpuset is configured, it breaks the strict hugetlb page
 	 * reservation as the accounting is done on a global variable. Such
@@ -525,5 +583,13 @@ of cpusets or memory policy there is no guarantee that huge pages will be
 available on the required nodes.  This is true even if there are a sufficient
 number of global reservations.
 
+Hugetlbfs regression testing
+============================
+
+The most complete set of hugetlb tests are in the libhugetlbfs repository.
+If you modify any hugetlb related code, use the libhugetlbfs test suite
+to check for regressions.  In addition, if you add any new hugetlb
+functionality, please add appropriate tests to libhugetlbfs.
 
+--
 Mike Kravetz, 7 April 2017
diff --git a/Documentation/vm/hwpoison.txt b/Documentation/vm/hwpoison.rst
index e912d7e..09bd24a 100644
--- a/Documentation/vm/hwpoison.txt
+++ b/Documentation/vm/hwpoison.rst
@@ -1,7 +1,14 @@
+.. hwpoison:
+
+========
+hwpoison
+========
+
 What is hwpoison?
+=================
 
 Upcoming Intel CPUs have support for recovering from some memory errors
-(``MCA recovery''). This requires the OS to declare a page "poisoned",
+(``MCA recovery``). This requires the OS to declare a page "poisoned",
 kill the processes associated with it and avoid using it in the future.
 
 This patchkit implements the necessary infrastructure in the VM.
@@ -46,9 +53,10 @@ address. This in theory allows other applications to handle
 memory failures too. The expection is that near all applications
 won't do that, but some very specialized ones might.
 
----
+Failure recovery modes
+======================
 
-There are two (actually three) modi memory failure recovery can be in:
+There are two (actually three) modes memory failure recovery can be in:
 
 vm.memory_failure_recovery sysctl set to zero:
 	All memory failures cause a panic. Do not attempt recovery.
@@ -67,9 +75,8 @@ late kill
 	This is best for memory error unaware applications and default
 	Note some pages are always handled as late kill.
 
----
-
-User control:
+User control
+============
 
 vm.memory_failure_recovery
 	See sysctl.txt
@@ -79,11 +86,19 @@ vm.memory_failure_early_kill
 
 PR_MCE_KILL
 	Set early/late kill mode/revert to system default
-	arg1: PR_MCE_KILL_CLEAR: Revert to system default
-	arg1: PR_MCE_KILL_SET: arg2 defines thread specific mode
-		PR_MCE_KILL_EARLY: Early kill
-		PR_MCE_KILL_LATE:  Late kill
-		PR_MCE_KILL_DEFAULT: Use system global default
+
+	arg1: PR_MCE_KILL_CLEAR:
+		Revert to system default
+	arg1: PR_MCE_KILL_SET:
+		arg2 defines thread specific mode
+
+		PR_MCE_KILL_EARLY:
+			Early kill
+		PR_MCE_KILL_LATE:
+			Late kill
+		PR_MCE_KILL_DEFAULT
+			Use system global default
+
 	Note that if you want to have a dedicated thread which handles
 	the SIGBUS(BUS_MCEERR_AO) on behalf of the process, you should
 	call prctl(PR_MCE_KILL_EARLY) on the designated thread. Otherwise,
@@ -92,77 +107,64 @@ PR_MCE_KILL
 PR_MCE_KILL_GET
 	return current mode
 
+Testing
+=======
 
----
-
-Testing:
-
-madvise(MADV_HWPOISON, ....)
-	(as root)
-	Poison a page in the process for testing
-
+* madvise(MADV_HWPOISON, ....) (as root) - Poison a page in the
+  process for testing
 
-hwpoison-inject module through debugfs
+* hwpoison-inject module through debugfs ``/sys/kernel/debug/hwpoison/``
 
-/sys/kernel/debug/hwpoison/
+  corrupt-pfn
+	Inject hwpoison fault at PFN echoed into this file. This does
+	some early filtering to avoid corrupted unintended pages in test suites.
 
-corrupt-pfn
+  unpoison-pfn
+	Software-unpoison page at PFN echoed into this file. This way
+	a page can be reused again.  This only works for Linux
+	injected failures, not for real memory failures.
 
-Inject hwpoison fault at PFN echoed into this file. This does
-some early filtering to avoid corrupted unintended pages in test suites.
+  Note these injection interfaces are not stable and might change between
+  kernel versions
 
-unpoison-pfn
+  corrupt-filter-dev-major, corrupt-filter-dev-minor
+	Only handle memory failures to pages associated with the file
+	system defined by block device major/minor.  -1U is the
+	wildcard value.  This should be only used for testing with
+	artificial injection.
 
-Software-unpoison page at PFN echoed into this file. This
-way a page can be reused again.
-This only works for Linux injected failures, not for real
-memory failures.
+  corrupt-filter-memcg
+	Limit injection to pages owned by memgroup. Specified by inode
+	number of the memcg.
 
-Note these injection interfaces are not stable and might change between
-kernel versions
+	Example::
 
-corrupt-filter-dev-major
-corrupt-filter-dev-minor
+		mkdir /sys/fs/cgroup/mem/hwpoison
 
-Only handle memory failures to pages associated with the file system defined
-by block device major/minor.  -1U is the wildcard value.
-This should be only used for testing with artificial injection.
+	        usemem -m 100 -s 1000 &
+		echo `jobs -p` > /sys/fs/cgroup/mem/hwpoison/tasks
 
-corrupt-filter-memcg
+		memcg_ino=$(ls -id /sys/fs/cgroup/mem/hwpoison | cut -f1 -d' ')
+		echo $memcg_ino > /debug/hwpoison/corrupt-filter-memcg
 
-Limit injection to pages owned by memgroup. Specified by inode number
-of the memcg.
+		page-types -p `pidof init`   --hwpoison  # shall do nothing
+		page-types -p `pidof usemem` --hwpoison  # poison its pages
 
-Example:
-        mkdir /sys/fs/cgroup/mem/hwpoison
+  corrupt-filter-flags-mask, corrupt-filter-flags-value
+	When specified, only poison pages if ((page_flags & mask) ==
+	value).  This allows stress testing of many kinds of
+	pages. The page_flags are the same as in /proc/kpageflags. The
+	flag bits are defined in include/linux/kernel-page-flags.h and
+	documented in Documentation/admin-guide/mm/pagemap.rst
 
-        usemem -m 100 -s 1000 &
-        echo `jobs -p` > /sys/fs/cgroup/mem/hwpoison/tasks
+* Architecture specific MCE injector
 
-        memcg_ino=$(ls -id /sys/fs/cgroup/mem/hwpoison | cut -f1 -d' ')
-        echo $memcg_ino > /debug/hwpoison/corrupt-filter-memcg
+  x86 has mce-inject, mce-test
 
-        page-types -p `pidof init`   --hwpoison  # shall do nothing
-        page-types -p `pidof usemem` --hwpoison  # poison its pages
+  Some portable hwpoison test programs in mce-test, see below.
 
-corrupt-filter-flags-mask
-corrupt-filter-flags-value
-
-When specified, only poison pages if ((page_flags & mask) == value).
-This allows stress testing of many kinds of pages. The page_flags
-are the same as in /proc/kpageflags. The flag bits are defined in
-include/linux/kernel-page-flags.h and documented in
-Documentation/vm/pagemap.txt
-
-Architecture specific MCE injector
-
-x86 has mce-inject, mce-test
-
-Some portable hwpoison test programs in mce-test, see blow.
-
----
-
-References:
+References
+==========
 
 http://halobates.de/mce-lc09-2.pdf
 	Overview presentation from LinuxCon 09
@@ -174,14 +176,11 @@ git://git.kernel.org/pub/scm/utils/cpu/mce/mce-inject.git
 	x86 specific injector
 
 
----
-
-Limitations:
-
+Limitations
+===========
 - Not all page types are supported and never will. Most kernel internal
-objects cannot be recovered, only LRU pages for now.
+  objects cannot be recovered, only LRU pages for now.
 - Right now hugepage support is missing.
 
 ---
 Andi Kleen, Oct 2009
-
diff --git a/Documentation/vm/index.rst b/Documentation/vm/index.rst
new file mode 100644
index 0000000..c4ded22
--- /dev/null
+++ b/Documentation/vm/index.rst
@@ -0,0 +1,50 @@
+=====================================
+Linux Memory Management Documentation
+=====================================
+
+This is a collection of documents about Linux memory management (mm) subsystem.
+
+User guides for MM features
+===========================
+
+The following documents provide guides for controlling and tuning
+various features of the Linux memory management
+
+.. toctree::
+   :maxdepth: 1
+
+   swap_numa
+   zswap
+
+Kernel developers MM documentation
+==================================
+
+The below documents describe MM internals with different level of
+details ranging from notes and mailing list responses to elaborate
+descriptions of data structures and algorithms.
+
+.. toctree::
+   :maxdepth: 1
+
+   active_mm
+   balance
+   cleancache
+   frontswap
+   highmem
+   hmm
+   hwpoison
+   hugetlbfs_reserv
+   ksm
+   mmu_notifier
+   numa
+   overcommit-accounting
+   page_migration
+   page_frags
+   page_owner
+   remap_file_pages
+   slub
+   split_page_table_lock
+   transhuge
+   unevictable-lru
+   z3fold
+   zsmalloc
diff --git a/Documentation/vm/ksm.rst b/Documentation/vm/ksm.rst
new file mode 100644
index 0000000..d32016d
--- /dev/null
+++ b/Documentation/vm/ksm.rst
@@ -0,0 +1,87 @@
+.. _ksm:
+
+=======================
+Kernel Samepage Merging
+=======================
+
+KSM is a memory-saving de-duplication feature, enabled by CONFIG_KSM=y,
+added to the Linux kernel in 2.6.32.  See ``mm/ksm.c`` for its implementation,
+and http://lwn.net/Articles/306704/ and http://lwn.net/Articles/330589/
+
+The userspace interface of KSM is described in :ref:`Documentation/admin-guide/mm/ksm.rst <admin_guide_ksm>`
+
+Design
+======
+
+Overview
+--------
+
+.. kernel-doc:: mm/ksm.c
+   :DOC: Overview
+
+Reverse mapping
+---------------
+KSM maintains reverse mapping information for KSM pages in the stable
+tree.
+
+If a KSM page is shared between less than ``max_page_sharing`` VMAs,
+the node of the stable tree that represents such KSM page points to a
+list of :c:type:`struct rmap_item` and the ``page->mapping`` of the
+KSM page points to the stable tree node.
+
+When the sharing passes this threshold, KSM adds a second dimension to
+the stable tree. The tree node becomes a "chain" that links one or
+more "dups". Each "dup" keeps reverse mapping information for a KSM
+page with ``page->mapping`` pointing to that "dup".
+
+Every "chain" and all "dups" linked into a "chain" enforce the
+invariant that they represent the same write protected memory content,
+even if each "dup" will be pointed by a different KSM page copy of
+that content.
+
+This way the stable tree lookup computational complexity is unaffected
+if compared to an unlimited list of reverse mappings. It is still
+enforced that there cannot be KSM page content duplicates in the
+stable tree itself.
+
+The deduplication limit enforced by ``max_page_sharing`` is required
+to avoid the virtual memory rmap lists to grow too large. The rmap
+walk has O(N) complexity where N is the number of rmap_items
+(i.e. virtual mappings) that are sharing the page, which is in turn
+capped by ``max_page_sharing``. So this effectively spreads the linear
+O(N) computational complexity from rmap walk context over different
+KSM pages. The ksmd walk over the stable_node "chains" is also O(N),
+but N is the number of stable_node "dups", not the number of
+rmap_items, so it has not a significant impact on ksmd performance. In
+practice the best stable_node "dup" candidate will be kept and found
+at the head of the "dups" list.
+
+High values of ``max_page_sharing`` result in faster memory merging
+(because there will be fewer stable_node dups queued into the
+stable_node chain->hlist to check for pruning) and higher
+deduplication factor at the expense of slower worst case for rmap
+walks for any KSM page which can happen during swapping, compaction,
+NUMA balancing and page migration.
+
+The ``stable_node_dups/stable_node_chains`` ratio is also affected by the
+``max_page_sharing`` tunable, and an high ratio may indicate fragmentation
+in the stable_node dups, which could be solved by introducing
+fragmentation algorithms in ksmd which would refile rmap_items from
+one stable_node dup to another stable_node dup, in order to free up
+stable_node "dups" with few rmap_items in them, but that may increase
+the ksmd CPU usage and possibly slowdown the readonly computations on
+the KSM pages of the applications.
+
+The whole list of stable_node "dups" linked in the stable_node
+"chains" is scanned periodically in order to prune stale stable_nodes.
+The frequency of such scans is defined by
+``stable_node_chains_prune_millisecs`` sysfs tunable.
+
+Reference
+---------
+.. kernel-doc:: mm/ksm.c
+   :functions: mm_slot ksm_scan stable_node rmap_item
+
+--
+Izik Eidus,
+Hugh Dickins, 17 Nov 2009
diff --git a/Documentation/vm/ksm.txt b/Documentation/vm/ksm.txt
deleted file mode 100644
index 6686bd2..0000000
--- a/Documentation/vm/ksm.txt
+++ /dev/null
@@ -1,178 +0,0 @@
-How to use the Kernel Samepage Merging feature
-----------------------------------------------
-
-KSM is a memory-saving de-duplication feature, enabled by CONFIG_KSM=y,
-added to the Linux kernel in 2.6.32.  See mm/ksm.c for its implementation,
-and http://lwn.net/Articles/306704/ and http://lwn.net/Articles/330589/
-
-The KSM daemon ksmd periodically scans those areas of user memory which
-have been registered with it, looking for pages of identical content which
-can be replaced by a single write-protected page (which is automatically
-copied if a process later wants to update its content).
-
-KSM was originally developed for use with KVM (where it was known as
-Kernel Shared Memory), to fit more virtual machines into physical memory,
-by sharing the data common between them.  But it can be useful to any
-application which generates many instances of the same data.
-
-KSM only merges anonymous (private) pages, never pagecache (file) pages.
-KSM's merged pages were originally locked into kernel memory, but can now
-be swapped out just like other user pages (but sharing is broken when they
-are swapped back in: ksmd must rediscover their identity and merge again).
-
-KSM only operates on those areas of address space which an application
-has advised to be likely candidates for merging, by using the madvise(2)
-system call: int madvise(addr, length, MADV_MERGEABLE).
-
-The app may call int madvise(addr, length, MADV_UNMERGEABLE) to cancel
-that advice and restore unshared pages: whereupon KSM unmerges whatever
-it merged in that range.  Note: this unmerging call may suddenly require
-more memory than is available - possibly failing with EAGAIN, but more
-probably arousing the Out-Of-Memory killer.
-
-If KSM is not configured into the running kernel, madvise MADV_MERGEABLE
-and MADV_UNMERGEABLE simply fail with EINVAL.  If the running kernel was
-built with CONFIG_KSM=y, those calls will normally succeed: even if the
-the KSM daemon is not currently running, MADV_MERGEABLE still registers
-the range for whenever the KSM daemon is started; even if the range
-cannot contain any pages which KSM could actually merge; even if
-MADV_UNMERGEABLE is applied to a range which was never MADV_MERGEABLE.
-
-If a region of memory must be split into at least one new MADV_MERGEABLE
-or MADV_UNMERGEABLE region, the madvise may return ENOMEM if the process
-will exceed vm.max_map_count (see Documentation/sysctl/vm.txt).
-
-Like other madvise calls, they are intended for use on mapped areas of
-the user address space: they will report ENOMEM if the specified range
-includes unmapped gaps (though working on the intervening mapped areas),
-and might fail with EAGAIN if not enough memory for internal structures.
-
-Applications should be considerate in their use of MADV_MERGEABLE,
-restricting its use to areas likely to benefit.  KSM's scans may use a lot
-of processing power: some installations will disable KSM for that reason.
-
-The KSM daemon is controlled by sysfs files in /sys/kernel/mm/ksm/,
-readable by all but writable only by root:
-
-pages_to_scan    - how many present pages to scan before ksmd goes to sleep
-                   e.g. "echo 100 > /sys/kernel/mm/ksm/pages_to_scan"
-                   Default: 100 (chosen for demonstration purposes)
-
-sleep_millisecs  - how many milliseconds ksmd should sleep before next scan
-                   e.g. "echo 20 > /sys/kernel/mm/ksm/sleep_millisecs"
-                   Default: 20 (chosen for demonstration purposes)
-
-merge_across_nodes - specifies if pages from different numa nodes can be merged.
-                   When set to 0, ksm merges only pages which physically
-                   reside in the memory area of same NUMA node. That brings
-                   lower latency to access of shared pages. Systems with more
-                   nodes, at significant NUMA distances, are likely to benefit
-                   from the lower latency of setting 0. Smaller systems, which
-                   need to minimize memory usage, are likely to benefit from
-                   the greater sharing of setting 1 (default). You may wish to
-                   compare how your system performs under each setting, before
-                   deciding on which to use. merge_across_nodes setting can be
-                   changed only when there are no ksm shared pages in system:
-                   set run 2 to unmerge pages first, then to 1 after changing
-                   merge_across_nodes, to remerge according to the new setting.
-                   Default: 1 (merging across nodes as in earlier releases)
-
-run              - set 0 to stop ksmd from running but keep merged pages,
-                   set 1 to run ksmd e.g. "echo 1 > /sys/kernel/mm/ksm/run",
-                   set 2 to stop ksmd and unmerge all pages currently merged,
-                         but leave mergeable areas registered for next run
-                   Default: 0 (must be changed to 1 to activate KSM,
-                               except if CONFIG_SYSFS is disabled)
-
-use_zero_pages   - specifies whether empty pages (i.e. allocated pages
-                   that only contain zeroes) should be treated specially.
-                   When set to 1, empty pages are merged with the kernel
-                   zero page(s) instead of with each other as it would
-                   happen normally. This can improve the performance on
-                   architectures with coloured zero pages, depending on
-                   the workload. Care should be taken when enabling this
-                   setting, as it can potentially degrade the performance
-                   of KSM for some workloads, for example if the checksums
-                   of pages candidate for merging match the checksum of
-                   an empty page. This setting can be changed at any time,
-                   it is only effective for pages merged after the change.
-                   Default: 0 (normal KSM behaviour as in earlier releases)
-
-max_page_sharing - Maximum sharing allowed for each KSM page. This
-                   enforces a deduplication limit to avoid the virtual
-                   memory rmap lists to grow too large. The minimum
-                   value is 2 as a newly created KSM page will have at
-                   least two sharers. The rmap walk has O(N)
-                   complexity where N is the number of rmap_items
-                   (i.e. virtual mappings) that are sharing the page,
-                   which is in turn capped by max_page_sharing. So
-                   this effectively spread the the linear O(N)
-                   computational complexity from rmap walk context
-                   over different KSM pages. The ksmd walk over the
-                   stable_node "chains" is also O(N), but N is the
-                   number of stable_node "dups", not the number of
-                   rmap_items, so it has not a significant impact on
-                   ksmd performance. In practice the best stable_node
-                   "dup" candidate will be kept and found at the head
-                   of the "dups" list. The higher this value the
-                   faster KSM will merge the memory (because there
-                   will be fewer stable_node dups queued into the
-                   stable_node chain->hlist to check for pruning) and
-                   the higher the deduplication factor will be, but
-                   the slowest the worst case rmap walk could be for
-                   any given KSM page. Slowing down the rmap_walk
-                   means there will be higher latency for certain
-                   virtual memory operations happening during
-                   swapping, compaction, NUMA balancing and page
-                   migration, in turn decreasing responsiveness for
-                   the caller of those virtual memory operations. The
-                   scheduler latency of other tasks not involved with
-                   the VM operations doing the rmap walk is not
-                   affected by this parameter as the rmap walks are
-                   always schedule friendly themselves.
-
-stable_node_chains_prune_millisecs - How frequently to walk the whole
-                   list of stable_node "dups" linked in the
-                   stable_node "chains" in order to prune stale
-                   stable_nodes. Smaller milllisecs values will free
-                   up the KSM metadata with lower latency, but they
-                   will make ksmd use more CPU during the scan. This
-                   only applies to the stable_node chains so it's a
-                   noop if not a single KSM page hit the
-                   max_page_sharing yet (there would be no stable_node
-                   chains in such case).
-
-The effectiveness of KSM and MADV_MERGEABLE is shown in /sys/kernel/mm/ksm/:
-
-pages_shared     - how many shared pages are being used
-pages_sharing    - how many more sites are sharing them i.e. how much saved
-pages_unshared   - how many pages unique but repeatedly checked for merging
-pages_volatile   - how many pages changing too fast to be placed in a tree
-full_scans       - how many times all mergeable areas have been scanned
-
-stable_node_chains - number of stable node chains allocated, this is
-		     effectively the number of KSM pages that hit the
-		     max_page_sharing limit
-stable_node_dups   - number of stable node dups queued into the
-		     stable_node chains
-
-A high ratio of pages_sharing to pages_shared indicates good sharing, but
-a high ratio of pages_unshared to pages_sharing indicates wasted effort.
-pages_volatile embraces several different kinds of activity, but a high
-proportion there would also indicate poor use of madvise MADV_MERGEABLE.
-
-The maximum possible page_sharing/page_shared ratio is limited by the
-max_page_sharing tunable. To increase the ratio max_page_sharing must
-be increased accordingly.
-
-The stable_node_dups/stable_node_chains ratio is also affected by the
-max_page_sharing tunable, and an high ratio may indicate fragmentation
-in the stable_node dups, which could be solved by introducing
-fragmentation algorithms in ksmd which would refile rmap_items from
-one stable_node dup to another stable_node dup, in order to freeup
-stable_node "dups" with few rmap_items in them, but that may increase
-the ksmd CPU usage and possibly slowdown the readonly computations on
-the KSM pages of the applications.
-
-Izik Eidus,
-Hugh Dickins, 17 Nov 2009
diff --git a/Documentation/vm/mmu_notifier.rst b/Documentation/vm/mmu_notifier.rst
new file mode 100644
index 0000000..47baa1c
--- /dev/null
+++ b/Documentation/vm/mmu_notifier.rst
@@ -0,0 +1,99 @@
+.. _mmu_notifier:
+
+When do you need to notify inside page table lock ?
+===================================================
+
+When clearing a pte/pmd we are given a choice to notify the event through
+(notify version of \*_clear_flush call mmu_notifier_invalidate_range) under
+the page table lock. But that notification is not necessary in all cases.
+
+For secondary TLB (non CPU TLB) like IOMMU TLB or device TLB (when device use
+thing like ATS/PASID to get the IOMMU to walk the CPU page table to access a
+process virtual address space). There is only 2 cases when you need to notify
+those secondary TLB while holding page table lock when clearing a pte/pmd:
+
+  A) page backing address is free before mmu_notifier_invalidate_range_end()
+  B) a page table entry is updated to point to a new page (COW, write fault
+     on zero page, __replace_page(), ...)
+
+Case A is obvious you do not want to take the risk for the device to write to
+a page that might now be used by some completely different task.
+
+Case B is more subtle. For correctness it requires the following sequence to
+happen:
+
+  - take page table lock
+  - clear page table entry and notify ([pmd/pte]p_huge_clear_flush_notify())
+  - set page table entry to point to new page
+
+If clearing the page table entry is not followed by a notify before setting
+the new pte/pmd value then you can break memory model like C11 or C++11 for
+the device.
+
+Consider the following scenario (device use a feature similar to ATS/PASID):
+
+Two address addrA and addrB such that \|addrA - addrB\| >= PAGE_SIZE we assume
+they are write protected for COW (other case of B apply too).
+
+::
+
+ [Time N] --------------------------------------------------------------------
+ CPU-thread-0  {try to write to addrA}
+ CPU-thread-1  {try to write to addrB}
+ CPU-thread-2  {}
+ CPU-thread-3  {}
+ DEV-thread-0  {read addrA and populate device TLB}
+ DEV-thread-2  {read addrB and populate device TLB}
+ [Time N+1] ------------------------------------------------------------------
+ CPU-thread-0  {COW_step0: {mmu_notifier_invalidate_range_start(addrA)}}
+ CPU-thread-1  {COW_step0: {mmu_notifier_invalidate_range_start(addrB)}}
+ CPU-thread-2  {}
+ CPU-thread-3  {}
+ DEV-thread-0  {}
+ DEV-thread-2  {}
+ [Time N+2] ------------------------------------------------------------------
+ CPU-thread-0  {COW_step1: {update page table to point to new page for addrA}}
+ CPU-thread-1  {COW_step1: {update page table to point to new page for addrB}}
+ CPU-thread-2  {}
+ CPU-thread-3  {}
+ DEV-thread-0  {}
+ DEV-thread-2  {}
+ [Time N+3] ------------------------------------------------------------------
+ CPU-thread-0  {preempted}
+ CPU-thread-1  {preempted}
+ CPU-thread-2  {write to addrA which is a write to new page}
+ CPU-thread-3  {}
+ DEV-thread-0  {}
+ DEV-thread-2  {}
+ [Time N+3] ------------------------------------------------------------------
+ CPU-thread-0  {preempted}
+ CPU-thread-1  {preempted}
+ CPU-thread-2  {}
+ CPU-thread-3  {write to addrB which is a write to new page}
+ DEV-thread-0  {}
+ DEV-thread-2  {}
+ [Time N+4] ------------------------------------------------------------------
+ CPU-thread-0  {preempted}
+ CPU-thread-1  {COW_step3: {mmu_notifier_invalidate_range_end(addrB)}}
+ CPU-thread-2  {}
+ CPU-thread-3  {}
+ DEV-thread-0  {}
+ DEV-thread-2  {}
+ [Time N+5] ------------------------------------------------------------------
+ CPU-thread-0  {preempted}
+ CPU-thread-1  {}
+ CPU-thread-2  {}
+ CPU-thread-3  {}
+ DEV-thread-0  {read addrA from old page}
+ DEV-thread-2  {read addrB from new page}
+
+So here because at time N+2 the clear page table entry was not pair with a
+notification to invalidate the secondary TLB, the device see the new value for
+addrB before seing the new value for addrA. This break total memory ordering
+for the device.
+
+When changing a pte to write protect or to point to a new write protected page
+with same content (KSM) it is fine to delay the mmu_notifier_invalidate_range
+call to mmu_notifier_invalidate_range_end() outside the page table lock. This
+is true even if the thread doing the page table update is preempted right after
+releasing page table lock but before call mmu_notifier_invalidate_range_end().
diff --git a/Documentation/vm/mmu_notifier.txt b/Documentation/vm/mmu_notifier.txt
deleted file mode 100644
index 23b4625..0000000
--- a/Documentation/vm/mmu_notifier.txt
+++ /dev/null
@@ -1,93 +0,0 @@
-When do you need to notify inside page table lock ?
-
-When clearing a pte/pmd we are given a choice to notify the event through
-(notify version of *_clear_flush call mmu_notifier_invalidate_range) under
-the page table lock. But that notification is not necessary in all cases.
-
-For secondary TLB (non CPU TLB) like IOMMU TLB or device TLB (when device use
-thing like ATS/PASID to get the IOMMU to walk the CPU page table to access a
-process virtual address space). There is only 2 cases when you need to notify
-those secondary TLB while holding page table lock when clearing a pte/pmd:
-
-  A) page backing address is free before mmu_notifier_invalidate_range_end()
-  B) a page table entry is updated to point to a new page (COW, write fault
-     on zero page, __replace_page(), ...)
-
-Case A is obvious you do not want to take the risk for the device to write to
-a page that might now be used by some completely different task.
-
-Case B is more subtle. For correctness it requires the following sequence to
-happen:
-  - take page table lock
-  - clear page table entry and notify ([pmd/pte]p_huge_clear_flush_notify())
-  - set page table entry to point to new page
-
-If clearing the page table entry is not followed by a notify before setting
-the new pte/pmd value then you can break memory model like C11 or C++11 for
-the device.
-
-Consider the following scenario (device use a feature similar to ATS/PASID):
-
-Two address addrA and addrB such that |addrA - addrB| >= PAGE_SIZE we assume
-they are write protected for COW (other case of B apply too).
-
-[Time N] --------------------------------------------------------------------
-CPU-thread-0  {try to write to addrA}
-CPU-thread-1  {try to write to addrB}
-CPU-thread-2  {}
-CPU-thread-3  {}
-DEV-thread-0  {read addrA and populate device TLB}
-DEV-thread-2  {read addrB and populate device TLB}
-[Time N+1] ------------------------------------------------------------------
-CPU-thread-0  {COW_step0: {mmu_notifier_invalidate_range_start(addrA)}}
-CPU-thread-1  {COW_step0: {mmu_notifier_invalidate_range_start(addrB)}}
-CPU-thread-2  {}
-CPU-thread-3  {}
-DEV-thread-0  {}
-DEV-thread-2  {}
-[Time N+2] ------------------------------------------------------------------
-CPU-thread-0  {COW_step1: {update page table to point to new page for addrA}}
-CPU-thread-1  {COW_step1: {update page table to point to new page for addrB}}
-CPU-thread-2  {}
-CPU-thread-3  {}
-DEV-thread-0  {}
-DEV-thread-2  {}
-[Time N+3] ------------------------------------------------------------------
-CPU-thread-0  {preempted}
-CPU-thread-1  {preempted}
-CPU-thread-2  {write to addrA which is a write to new page}
-CPU-thread-3  {}
-DEV-thread-0  {}
-DEV-thread-2  {}
-[Time N+3] ------------------------------------------------------------------
-CPU-thread-0  {preempted}
-CPU-thread-1  {preempted}
-CPU-thread-2  {}
-CPU-thread-3  {write to addrB which is a write to new page}
-DEV-thread-0  {}
-DEV-thread-2  {}
-[Time N+4] ------------------------------------------------------------------
-CPU-thread-0  {preempted}
-CPU-thread-1  {COW_step3: {mmu_notifier_invalidate_range_end(addrB)}}
-CPU-thread-2  {}
-CPU-thread-3  {}
-DEV-thread-0  {}
-DEV-thread-2  {}
-[Time N+5] ------------------------------------------------------------------
-CPU-thread-0  {preempted}
-CPU-thread-1  {}
-CPU-thread-2  {}
-CPU-thread-3  {}
-DEV-thread-0  {read addrA from old page}
-DEV-thread-2  {read addrB from new page}
-
-So here because at time N+2 the clear page table entry was not pair with a
-notification to invalidate the secondary TLB, the device see the new value for
-addrB before seing the new value for addrA. This break total memory ordering
-for the device.
-
-When changing a pte to write protect or to point to a new write protected page
-with same content (KSM) it is fine to delay the mmu_notifier_invalidate_range
-call to mmu_notifier_invalidate_range_end() outside the page table lock. This
-is true even if the thread doing the page table update is preempted right after
-releasing page table lock but before call mmu_notifier_invalidate_range_end().
diff --git a/Documentation/vm/numa b/Documentation/vm/numa.rst
index a31b85b..185d8a5 100644
--- a/Documentation/vm/numa
+++ b/Documentation/vm/numa.rst
@@ -1,6 +1,10 @@
+.. _numa:
+
 Started Nov 1999 by Kanoj Sarcar <kanoj@sgi.com>
 
+=============
 What is NUMA?
+=============
 
 This question can be answered from a couple of perspectives:  the
 hardware view and the Linux software view.
@@ -106,7 +110,7 @@ to improve NUMA locality using various CPU affinity command line interfaces,
 such as taskset(1) and numactl(1), and program interfaces such as
 sched_setaffinity(2).  Further, one can modify the kernel's default local
 allocation behavior using Linux NUMA memory policy.
-[see Documentation/vm/numa_memory_policy.txt.]
+[see Documentation/admin-guide/mm/numa_memory_policy.rst.]
 
 System administrators can restrict the CPUs and nodes' memories that a non-
 privileged user can specify in the scheduling or NUMA commands and functions
diff --git a/Documentation/vm/numa_memory_policy.txt b/Documentation/vm/numa_memory_policy.txt
deleted file mode 100644
index 622b927..0000000
--- a/Documentation/vm/numa_memory_policy.txt
+++ /dev/null
@@ -1,452 +0,0 @@
-
-What is Linux Memory Policy?
-
-In the Linux kernel, "memory policy" determines from which node the kernel will
-allocate memory in a NUMA system or in an emulated NUMA system.  Linux has
-supported platforms with Non-Uniform Memory Access architectures since 2.4.?.
-The current memory policy support was added to Linux 2.6 around May 2004.  This
-document attempts to describe the concepts and APIs of the 2.6 memory policy
-support.
-
-Memory policies should not be confused with cpusets
-(Documentation/cgroup-v1/cpusets.txt)
-which is an administrative mechanism for restricting the nodes from which
-memory may be allocated by a set of processes. Memory policies are a
-programming interface that a NUMA-aware application can take advantage of.  When
-both cpusets and policies are applied to a task, the restrictions of the cpuset
-takes priority.  See "MEMORY POLICIES AND CPUSETS" below for more details.
-
-MEMORY POLICY CONCEPTS
-
-Scope of Memory Policies
-
-The Linux kernel supports _scopes_ of memory policy, described here from
-most general to most specific:
-
-    System Default Policy:  this policy is "hard coded" into the kernel.  It
-    is the policy that governs all page allocations that aren't controlled
-    by one of the more specific policy scopes discussed below.  When the
-    system is "up and running", the system default policy will use "local
-    allocation" described below.  However, during boot up, the system
-    default policy will be set to interleave allocations across all nodes
-    with "sufficient" memory, so as not to overload the initial boot node
-    with boot-time allocations.
-
-    Task/Process Policy:  this is an optional, per-task policy.  When defined
-    for a specific task, this policy controls all page allocations made by or
-    on behalf of the task that aren't controlled by a more specific scope.
-    If a task does not define a task policy, then all page allocations that
-    would have been controlled by the task policy "fall back" to the System
-    Default Policy.
-
-	The task policy applies to the entire address space of a task. Thus,
-	it is inheritable, and indeed is inherited, across both fork()
-	[clone() w/o the CLONE_VM flag] and exec*().  This allows a parent task
-	to establish the task policy for a child task exec()'d from an
-	executable image that has no awareness of memory policy.  See the
-	MEMORY POLICY APIS section, below, for an overview of the system call
-	that a task may use to set/change its task/process policy.
-
-	In a multi-threaded task, task policies apply only to the thread
-	[Linux kernel task] that installs the policy and any threads
-	subsequently created by that thread.  Any sibling threads existing
-	at the time a new task policy is installed retain their current
-	policy.
-
-	A task policy applies only to pages allocated after the policy is
-	installed.  Any pages already faulted in by the task when the task
-	changes its task policy remain where they were allocated based on
-	the policy at the time they were allocated.
-
-    VMA Policy:  A "VMA" or "Virtual Memory Area" refers to a range of a task's
-    virtual address space.  A task may define a specific policy for a range
-    of its virtual address space.   See the MEMORY POLICIES APIS section,
-    below, for an overview of the mbind() system call used to set a VMA
-    policy.
-
-    A VMA policy will govern the allocation of pages that back this region of
-    the address space.  Any regions of the task's address space that don't
-    have an explicit VMA policy will fall back to the task policy, which may
-    itself fall back to the System Default Policy.
-
-    VMA policies have a few complicating details:
-
-	VMA policy applies ONLY to anonymous pages.  These include pages
-	allocated for anonymous segments, such as the task stack and heap, and
-	any regions of the address space mmap()ed with the MAP_ANONYMOUS flag.
-	If a VMA policy is applied to a file mapping, it will be ignored if
-	the mapping used the MAP_SHARED flag.  If the file mapping used the
-	MAP_PRIVATE flag, the VMA policy will only be applied when an
-	anonymous page is allocated on an attempt to write to the mapping--
-	i.e., at Copy-On-Write.
-
-	VMA policies are shared between all tasks that share a virtual address
-	space--a.k.a. threads--independent of when the policy is installed; and
-	they are inherited across fork().  However, because VMA policies refer
-	to a specific region of a task's address space, and because the address
-	space is discarded and recreated on exec*(), VMA policies are NOT
-	inheritable across exec().  Thus, only NUMA-aware applications may
-	use VMA policies.
-
-	A task may install a new VMA policy on a sub-range of a previously
-	mmap()ed region.  When this happens, Linux splits the existing virtual
-	memory area into 2 or 3 VMAs, each with it's own policy.
-
-	By default, VMA policy applies only to pages allocated after the policy
-	is installed.  Any pages already faulted into the VMA range remain
-	where they were allocated based on the policy at the time they were
-	allocated.  However, since 2.6.16, Linux supports page migration via
-	the mbind() system call, so that page contents can be moved to match
-	a newly installed policy.
-
-    Shared Policy:  Conceptually, shared policies apply to "memory objects"
-    mapped shared into one or more tasks' distinct address spaces.  An
-    application installs a shared policies the same way as VMA policies--using
-    the mbind() system call specifying a range of virtual addresses that map
-    the shared object.  However, unlike VMA policies, which can be considered
-    to be an attribute of a range of a task's address space, shared policies
-    apply directly to the shared object.  Thus, all tasks that attach to the
-    object share the policy, and all pages allocated for the shared object,
-    by any task, will obey the shared policy.
-
-	As of 2.6.22, only shared memory segments, created by shmget() or
-	mmap(MAP_ANONYMOUS|MAP_SHARED), support shared policy.  When shared
-	policy support was added to Linux, the associated data structures were
-	added to hugetlbfs shmem segments.  At the time, hugetlbfs did not
-	support allocation at fault time--a.k.a lazy allocation--so hugetlbfs
-	shmem segments were never "hooked up" to the shared policy support.
-	Although hugetlbfs segments now support lazy allocation, their support
-	for shared policy has not been completed.
-
-	As mentioned above [re: VMA policies], allocations of page cache
-	pages for regular files mmap()ed with MAP_SHARED ignore any VMA
-	policy installed on the virtual address range backed by the shared
-	file mapping.  Rather, shared page cache pages, including pages backing
-	private mappings that have not yet been written by the task, follow
-	task policy, if any, else System Default Policy.
-
-	The shared policy infrastructure supports different policies on subset
-	ranges of the shared object.  However, Linux still splits the VMA of
-	the task that installs the policy for each range of distinct policy.
-	Thus, different tasks that attach to a shared memory segment can have
-	different VMA configurations mapping that one shared object.  This
-	can be seen by examining the /proc/<pid>/numa_maps of tasks sharing
-	a shared memory region, when one task has installed shared policy on
-	one or more ranges of the region.
-
-Components of Memory Policies
-
-    A Linux memory policy consists of a "mode", optional mode flags, and an
-    optional set of nodes.  The mode determines the behavior of the policy,
-    the optional mode flags determine the behavior of the mode, and the
-    optional set of nodes can be viewed as the arguments to the policy
-    behavior.
-
-   Internally, memory policies are implemented by a reference counted
-   structure, struct mempolicy.  Details of this structure will be discussed
-   in context, below, as required to explain the behavior.
-
-   Linux memory policy supports the following 4 behavioral modes:
-
-	Default Mode--MPOL_DEFAULT:  This mode is only used in the memory
-	policy APIs.  Internally, MPOL_DEFAULT is converted to the NULL
-	memory policy in all policy scopes.  Any existing non-default policy
-	will simply be removed when MPOL_DEFAULT is specified.  As a result,
-	MPOL_DEFAULT means "fall back to the next most specific policy scope."
-
-	    For example, a NULL or default task policy will fall back to the
-	    system default policy.  A NULL or default vma policy will fall
-	    back to the task policy.
-
-	    When specified in one of the memory policy APIs, the Default mode
-	    does not use the optional set of nodes.
-
-	    It is an error for the set of nodes specified for this policy to
-	    be non-empty.
-
-	MPOL_BIND:  This mode specifies that memory must come from the
-	set of nodes specified by the policy.  Memory will be allocated from
-	the node in the set with sufficient free memory that is closest to
-	the node where the allocation takes place.
-
-	MPOL_PREFERRED:  This mode specifies that the allocation should be
-	attempted from the single node specified in the policy.  If that
-	allocation fails, the kernel will search other nodes, in order of
-	increasing distance from the preferred node based on information
-	provided by the platform firmware.
-
-	    Internally, the Preferred policy uses a single node--the
-	    preferred_node member of struct mempolicy.  When the internal
-	    mode flag MPOL_F_LOCAL is set, the preferred_node is ignored and
-	    the policy is interpreted as local allocation.  "Local" allocation
-	    policy can be viewed as a Preferred policy that starts at the node
-	    containing the cpu where the allocation takes place.
-
-	    It is possible for the user to specify that local allocation is
-	    always preferred by passing an empty nodemask with this mode.
-	    If an empty nodemask is passed, the policy cannot use the
-	    MPOL_F_STATIC_NODES or MPOL_F_RELATIVE_NODES flags described
-	    below.
-
-	MPOL_INTERLEAVED:  This mode specifies that page allocations be
-	interleaved, on a page granularity, across the nodes specified in
-	the policy.  This mode also behaves slightly differently, based on
-	the context where it is used:
-
-	    For allocation of anonymous pages and shared memory pages,
-	    Interleave mode indexes the set of nodes specified by the policy
-	    using the page offset of the faulting address into the segment
-	    [VMA] containing the address modulo the number of nodes specified
-	    by the policy.  It then attempts to allocate a page, starting at
-	    the selected node, as if the node had been specified by a Preferred
-	    policy or had been selected by a local allocation.  That is,
-	    allocation will follow the per node zonelist.
-
-	    For allocation of page cache pages, Interleave mode indexes the set
-	    of nodes specified by the policy using a node counter maintained
-	    per task.  This counter wraps around to the lowest specified node
-	    after it reaches the highest specified node.  This will tend to
-	    spread the pages out over the nodes specified by the policy based
-	    on the order in which they are allocated, rather than based on any
-	    page offset into an address range or file.  During system boot up,
-	    the temporary interleaved system default policy works in this
-	    mode.
-
-   Linux memory policy supports the following optional mode flags:
-
-	MPOL_F_STATIC_NODES:  This flag specifies that the nodemask passed by
-	the user should not be remapped if the task or VMA's set of allowed
-	nodes changes after the memory policy has been defined.
-
-	    Without this flag, anytime a mempolicy is rebound because of a
-	    change in the set of allowed nodes, the node (Preferred) or
-	    nodemask (Bind, Interleave) is remapped to the new set of
-	    allowed nodes.  This may result in nodes being used that were
-	    previously undesired.
-
-	    With this flag, if the user-specified nodes overlap with the
-	    nodes allowed by the task's cpuset, then the memory policy is
-	    applied to their intersection.  If the two sets of nodes do not
-	    overlap, the Default policy is used.
-
-	    For example, consider a task that is attached to a cpuset with
-	    mems 1-3 that sets an Interleave policy over the same set.  If
-	    the cpuset's mems change to 3-5, the Interleave will now occur
-	    over nodes 3, 4, and 5.  With this flag, however, since only node
-	    3 is allowed from the user's nodemask, the "interleave" only
-	    occurs over that node.  If no nodes from the user's nodemask are
-	    now allowed, the Default behavior is used.
-
-	    MPOL_F_STATIC_NODES cannot be combined with the
-	    MPOL_F_RELATIVE_NODES flag.  It also cannot be used for
-	    MPOL_PREFERRED policies that were created with an empty nodemask
-	    (local allocation).
-
-	MPOL_F_RELATIVE_NODES:  This flag specifies that the nodemask passed
-	by the user will be mapped relative to the set of the task or VMA's
-	set of allowed nodes.  The kernel stores the user-passed nodemask,
-	and if the allowed nodes changes, then that original nodemask will
-	be remapped relative to the new set of allowed nodes.
-
-	    Without this flag (and without MPOL_F_STATIC_NODES), anytime a
-	    mempolicy is rebound because of a change in the set of allowed
-	    nodes, the node (Preferred) or nodemask (Bind, Interleave) is
-	    remapped to the new set of allowed nodes.  That remap may not
-	    preserve the relative nature of the user's passed nodemask to its
-	    set of allowed nodes upon successive rebinds: a nodemask of
-	    1,3,5 may be remapped to 7-9 and then to 1-3 if the set of
-	    allowed nodes is restored to its original state.
-
-	    With this flag, the remap is done so that the node numbers from
-	    the user's passed nodemask are relative to the set of allowed
-	    nodes.  In other words, if nodes 0, 2, and 4 are set in the user's
-	    nodemask, the policy will be effected over the first (and in the
-	    Bind or Interleave case, the third and fifth) nodes in the set of
-	    allowed nodes.  The nodemask passed by the user represents nodes
-	    relative to task or VMA's set of allowed nodes.
-
-	    If the user's nodemask includes nodes that are outside the range
-	    of the new set of allowed nodes (for example, node 5 is set in
-	    the user's nodemask when the set of allowed nodes is only 0-3),
-	    then the remap wraps around to the beginning of the nodemask and,
-	    if not already set, sets the node in the mempolicy nodemask.
-
-	    For example, consider a task that is attached to a cpuset with
-	    mems 2-5 that sets an Interleave policy over the same set with
-	    MPOL_F_RELATIVE_NODES.  If the cpuset's mems change to 3-7, the
-	    interleave now occurs over nodes 3,5-7.  If the cpuset's mems
-	    then change to 0,2-3,5, then the interleave occurs over nodes
-	    0,2-3,5.
-
-	    Thanks to the consistent remapping, applications preparing
-	    nodemasks to specify memory policies using this flag should
-	    disregard their current, actual cpuset imposed memory placement
-	    and prepare the nodemask as if they were always located on
-	    memory nodes 0 to N-1, where N is the number of memory nodes the
-	    policy is intended to manage.  Let the kernel then remap to the
-	    set of memory nodes allowed by the task's cpuset, as that may
-	    change over time.
-
-	    MPOL_F_RELATIVE_NODES cannot be combined with the
-	    MPOL_F_STATIC_NODES flag.  It also cannot be used for
-	    MPOL_PREFERRED policies that were created with an empty nodemask
-	    (local allocation).
-
-MEMORY POLICY REFERENCE COUNTING
-
-To resolve use/free races, struct mempolicy contains an atomic reference
-count field.  Internal interfaces, mpol_get()/mpol_put() increment and
-decrement this reference count, respectively.  mpol_put() will only free
-the structure back to the mempolicy kmem cache when the reference count
-goes to zero.
-
-When a new memory policy is allocated, its reference count is initialized
-to '1', representing the reference held by the task that is installing the
-new policy.  When a pointer to a memory policy structure is stored in another
-structure, another reference is added, as the task's reference will be dropped
-on completion of the policy installation.
-
-During run-time "usage" of the policy, we attempt to minimize atomic operations
-on the reference count, as this can lead to cache lines bouncing between cpus
-and NUMA nodes.  "Usage" here means one of the following:
-
-1) querying of the policy, either by the task itself [using the get_mempolicy()
-   API discussed below] or by another task using the /proc/<pid>/numa_maps
-   interface.
-
-2) examination of the policy to determine the policy mode and associated node
-   or node lists, if any, for page allocation.  This is considered a "hot
-   path".  Note that for MPOL_BIND, the "usage" extends across the entire
-   allocation process, which may sleep during page reclaimation, because the
-   BIND policy nodemask is used, by reference, to filter ineligible nodes.
-
-We can avoid taking an extra reference during the usages listed above as
-follows:
-
-1) we never need to get/free the system default policy as this is never
-   changed nor freed, once the system is up and running.
-
-2) for querying the policy, we do not need to take an extra reference on the
-   target task's task policy nor vma policies because we always acquire the
-   task's mm's mmap_sem for read during the query.  The set_mempolicy() and
-   mbind() APIs [see below] always acquire the mmap_sem for write when
-   installing or replacing task or vma policies.  Thus, there is no possibility
-   of a task or thread freeing a policy while another task or thread is
-   querying it.
-
-3) Page allocation usage of task or vma policy occurs in the fault path where
-   we hold them mmap_sem for read.  Again, because replacing the task or vma
-   policy requires that the mmap_sem be held for write, the policy can't be
-   freed out from under us while we're using it for page allocation.
-
-4) Shared policies require special consideration.  One task can replace a
-   shared memory policy while another task, with a distinct mmap_sem, is
-   querying or allocating a page based on the policy.  To resolve this
-   potential race, the shared policy infrastructure adds an extra reference
-   to the shared policy during lookup while holding a spin lock on the shared
-   policy management structure.  This requires that we drop this extra
-   reference when we're finished "using" the policy.  We must drop the
-   extra reference on shared policies in the same query/allocation paths
-   used for non-shared policies.  For this reason, shared policies are marked
-   as such, and the extra reference is dropped "conditionally"--i.e., only
-   for shared policies.
-
-   Because of this extra reference counting, and because we must lookup
-   shared policies in a tree structure under spinlock, shared policies are
-   more expensive to use in the page allocation path.  This is especially
-   true for shared policies on shared memory regions shared by tasks running
-   on different NUMA nodes.  This extra overhead can be avoided by always
-   falling back to task or system default policy for shared memory regions,
-   or by prefaulting the entire shared memory region into memory and locking
-   it down.  However, this might not be appropriate for all applications.
-
-MEMORY POLICY APIs
-
-Linux supports 3 system calls for controlling memory policy.  These APIS
-always affect only the calling task, the calling task's address space, or
-some shared object mapped into the calling task's address space.
-
-	Note:  the headers that define these APIs and the parameter data types
-	for user space applications reside in a package that is not part of
-	the Linux kernel.  The kernel system call interfaces, with the 'sys_'
-	prefix, are defined in <linux/syscalls.h>; the mode and flag
-	definitions are defined in <linux/mempolicy.h>.
-
-Set [Task] Memory Policy:
-
-	long set_mempolicy(int mode, const unsigned long *nmask,
-					unsigned long maxnode);
-
-	Set's the calling task's "task/process memory policy" to mode
-	specified by the 'mode' argument and the set of nodes defined
-	by 'nmask'.  'nmask' points to a bit mask of node ids containing
-	at least 'maxnode' ids.  Optional mode flags may be passed by
-	combining the 'mode' argument with the flag (for example:
-	MPOL_INTERLEAVE | MPOL_F_STATIC_NODES).
-
-	See the set_mempolicy(2) man page for more details
-
-
-Get [Task] Memory Policy or Related Information
-
-	long get_mempolicy(int *mode,
-			   const unsigned long *nmask, unsigned long maxnode,
-			   void *addr, int flags);
-
-	Queries the "task/process memory policy" of the calling task, or
-	the policy or location of a specified virtual address, depending
-	on the 'flags' argument.
-
-	See the get_mempolicy(2) man page for more details
-
-
-Install VMA/Shared Policy for a Range of Task's Address Space
-
-	long mbind(void *start, unsigned long len, int mode,
-		   const unsigned long *nmask, unsigned long maxnode,
-		   unsigned flags);
-
-	mbind() installs the policy specified by (mode, nmask, maxnodes) as
-	a VMA policy for the range of the calling task's address space
-	specified by the 'start' and 'len' arguments.  Additional actions
-	may be requested via the 'flags' argument.
-
-	See the mbind(2) man page for more details.
-
-MEMORY POLICY COMMAND LINE INTERFACE
-
-Although not strictly part of the Linux implementation of memory policy,
-a command line tool, numactl(8), exists that allows one to:
-
-+ set the task policy for a specified program via set_mempolicy(2), fork(2) and
-  exec(2)
-
-+ set the shared policy for a shared memory segment via mbind(2)
-
-The numactl(8) tool is packaged with the run-time version of the library
-containing the memory policy system call wrappers.  Some distributions
-package the headers and compile-time libraries in a separate development
-package.
-
-
-MEMORY POLICIES AND CPUSETS
-
-Memory policies work within cpusets as described above.  For memory policies
-that require a node or set of nodes, the nodes are restricted to the set of
-nodes whose memories are allowed by the cpuset constraints.  If the nodemask
-specified for the policy contains nodes that are not allowed by the cpuset and
-MPOL_F_RELATIVE_NODES is not used, the intersection of the set of nodes
-specified for the policy and the set of nodes with memory is used.  If the
-result is the empty set, the policy is considered invalid and cannot be
-installed.  If MPOL_F_RELATIVE_NODES is used, the policy's nodes are mapped
-onto and folded into the task's set of allowed nodes as previously described.
-
-The interaction of memory policies and cpusets can be problematic when tasks
-in two cpusets share access to a memory region, such as shared memory segments
-created by shmget() of mmap() with the MAP_ANONYMOUS and MAP_SHARED flags, and
-any of the tasks install shared policy on the region, only nodes whose
-memories are allowed in both cpusets may be used in the policies.  Obtaining
-this information requires "stepping outside" the memory policy APIs to use the
-cpuset information and requires that one know in what cpusets other task might
-be attaching to the shared region.  Furthermore, if the cpusets' allowed
-memory sets are disjoint, "local" allocation is the only valid policy.
diff --git a/Documentation/vm/overcommit-accounting b/Documentation/vm/overcommit-accounting
deleted file mode 100644
index cbfaaa6..0000000
--- a/Documentation/vm/overcommit-accounting
+++ /dev/null
@@ -1,80 +0,0 @@
-The Linux kernel supports the following overcommit handling modes
-
-0	-	Heuristic overcommit handling. Obvious overcommits of
-		address space are refused. Used for a typical system. It
-		ensures a seriously wild allocation fails while allowing
-		overcommit to reduce swap usage.  root is allowed to 
-		allocate slightly more memory in this mode. This is the 
-		default.
-
-1	-	Always overcommit. Appropriate for some scientific
-		applications. Classic example is code using sparse arrays
-		and just relying on the virtual memory consisting almost
-		entirely of zero pages.
-
-2	-	Don't overcommit. The total address space commit
-		for the system is not permitted to exceed swap + a
-		configurable amount (default is 50%) of physical RAM.
-		Depending on the amount you use, in most situations
-		this means a process will not be killed while accessing
-		pages but will receive errors on memory allocation as
-		appropriate.
-
-		Useful for applications that want to guarantee their
-		memory allocations will be available in the future
-		without having to initialize every page.
-
-The overcommit policy is set via the sysctl `vm.overcommit_memory'.
-
-The overcommit amount can be set via `vm.overcommit_ratio' (percentage)
-or `vm.overcommit_kbytes' (absolute value).
-
-The current overcommit limit and amount committed are viewable in
-/proc/meminfo as CommitLimit and Committed_AS respectively.
-
-Gotchas
--------
-
-The C language stack growth does an implicit mremap. If you want absolute
-guarantees and run close to the edge you MUST mmap your stack for the 
-largest size you think you will need. For typical stack usage this does
-not matter much but it's a corner case if you really really care
-
-In mode 2 the MAP_NORESERVE flag is ignored. 
-
-
-How It Works
-------------
-
-The overcommit is based on the following rules
-
-For a file backed map
-	SHARED or READ-only	-	0 cost (the file is the map not swap)
-	PRIVATE WRITABLE	-	size of mapping per instance
-
-For an anonymous or /dev/zero map
-	SHARED			-	size of mapping
-	PRIVATE READ-only	-	0 cost (but of little use)
-	PRIVATE WRITABLE	-	size of mapping per instance
-
-Additional accounting
-	Pages made writable copies by mmap
-	shmfs memory drawn from the same pool
-
-Status
-------
-
-o	We account mmap memory mappings
-o	We account mprotect changes in commit
-o	We account mremap changes in size
-o	We account brk
-o	We account munmap
-o	We report the commit status in /proc
-o	Account and check on fork
-o	Review stack handling/building on exec
-o	SHMfs accounting
-o	Implement actual limit enforcement
-
-To Do
------
-o	Account ptrace pages (this is hard)
diff --git a/Documentation/vm/overcommit-accounting.rst b/Documentation/vm/overcommit-accounting.rst
new file mode 100644
index 0000000..0dd54bb
--- /dev/null
+++ b/Documentation/vm/overcommit-accounting.rst
@@ -0,0 +1,87 @@
+.. _overcommit_accounting:
+
+=====================
+Overcommit Accounting
+=====================
+
+The Linux kernel supports the following overcommit handling modes
+
+0
+	Heuristic overcommit handling. Obvious overcommits of address
+	space are refused. Used for a typical system. It ensures a
+	seriously wild allocation fails while allowing overcommit to
+	reduce swap usage.  root is allowed to allocate slightly more
+	memory in this mode. This is the default.
+
+1
+	Always overcommit. Appropriate for some scientific
+	applications. Classic example is code using sparse arrays and
+	just relying on the virtual memory consisting almost entirely
+	of zero pages.
+
+2
+	Don't overcommit. The total address space commit for the
+	system is not permitted to exceed swap + a configurable amount
+	(default is 50%) of physical RAM.  Depending on the amount you
+	use, in most situations this means a process will not be
+	killed while accessing pages but will receive errors on memory
+	allocation as appropriate.
+
+	Useful for applications that want to guarantee their memory
+	allocations will be available in the future without having to
+	initialize every page.
+
+The overcommit policy is set via the sysctl ``vm.overcommit_memory``.
+
+The overcommit amount can be set via ``vm.overcommit_ratio`` (percentage)
+or ``vm.overcommit_kbytes`` (absolute value).
+
+The current overcommit limit and amount committed are viewable in
+``/proc/meminfo`` as CommitLimit and Committed_AS respectively.
+
+Gotchas
+=======
+
+The C language stack growth does an implicit mremap. If you want absolute
+guarantees and run close to the edge you MUST mmap your stack for the
+largest size you think you will need. For typical stack usage this does
+not matter much but it's a corner case if you really really care
+
+In mode 2 the MAP_NORESERVE flag is ignored.
+
+
+How It Works
+============
+
+The overcommit is based on the following rules
+
+For a file backed map
+	| SHARED or READ-only	-	0 cost (the file is the map not swap)
+	| PRIVATE WRITABLE	-	size of mapping per instance
+
+For an anonymous or ``/dev/zero`` map
+	| SHARED			-	size of mapping
+	| PRIVATE READ-only	-	0 cost (but of little use)
+	| PRIVATE WRITABLE	-	size of mapping per instance
+
+Additional accounting
+	| Pages made writable copies by mmap
+	| shmfs memory drawn from the same pool
+
+Status
+======
+
+*	We account mmap memory mappings
+*	We account mprotect changes in commit
+*	We account mremap changes in size
+*	We account brk
+*	We account munmap
+*	We report the commit status in /proc
+*	Account and check on fork
+*	Review stack handling/building on exec
+*	SHMfs accounting
+*	Implement actual limit enforcement
+
+To Do
+=====
+*	Account ptrace pages (this is hard)
diff --git a/Documentation/vm/page_frags b/Documentation/vm/page_frags.rst
index a671456..637cc49 100644
--- a/Documentation/vm/page_frags
+++ b/Documentation/vm/page_frags.rst
@@ -1,5 +1,8 @@
+.. _page_frags:
+
+==============
 Page fragments
---------------
+==============
 
 A page fragment is an arbitrary-length arbitrary-offset area of memory
 which resides within a 0 or higher order compound page.  Multiple
diff --git a/Documentation/vm/page_migration b/Documentation/vm/page_migration.rst
index 4968680..f68d613 100644
--- a/Documentation/vm/page_migration
+++ b/Documentation/vm/page_migration.rst
@@ -1,5 +1,8 @@
+.. _page_migration:
+
+==============
 Page migration
---------------
+==============
 
 Page migration allows the moving of the physical location of pages between
 nodes in a numa system while the process is running. This means that the
@@ -20,7 +23,7 @@ Page migration functions are provided by the numactl package by Andi Kleen
 (a version later than 0.9.3 is required. Get it from
 ftp://oss.sgi.com/www/projects/libnuma/download/). numactl provides libnuma
 which provides an interface similar to other numa functionality for page
-migration.  cat /proc/<pid>/numa_maps allows an easy review of where the
+migration.  cat ``/proc/<pid>/numa_maps`` allows an easy review of where the
 pages of a process are located. See also the numa_maps documentation in the
 proc(5) man page.
 
@@ -56,8 +59,8 @@ description for those trying to use migrate_pages() from the kernel
 (for userspace usage see the Andi Kleen's numactl package mentioned above)
 and then a low level description of how the low level details work.
 
-A. In kernel use of migrate_pages()
------------------------------------
+In kernel use of migrate_pages()
+================================
 
 1. Remove pages from the LRU.
 
@@ -78,8 +81,8 @@ A. In kernel use of migrate_pages()
    the new page for each page that is considered for
    moving.
 
-B. How migrate_pages() works
-----------------------------
+How migrate_pages() works
+=========================
 
 migrate_pages() does several passes over its list of pages. A page is moved
 if all references to a page are removable at the time. The page has
@@ -142,8 +145,8 @@ Steps:
 20. The new page is moved to the LRU and can be scanned by the swapper
     etc again.
 
-C. Non-LRU page migration
--------------------------
+Non-LRU page migration
+======================
 
 Although original migration aimed for reducing the latency of memory access
 for NUMA, compaction who want to create high-order page is also main customer.
@@ -164,89 +167,91 @@ migration path.
 If a driver want to make own pages movable, it should define three functions
 which are function pointers of struct address_space_operations.
 
-1. bool (*isolate_page) (struct page *page, isolate_mode_t mode);
+1. ``bool (*isolate_page) (struct page *page, isolate_mode_t mode);``
 
-What VM expects on isolate_page function of driver is to return *true*
-if driver isolates page successfully. On returing true, VM marks the page
-as PG_isolated so concurrent isolation in several CPUs skip the page
-for isolation. If a driver cannot isolate the page, it should return *false*.
+   What VM expects on isolate_page function of driver is to return *true*
+   if driver isolates page successfully. On returing true, VM marks the page
+   as PG_isolated so concurrent isolation in several CPUs skip the page
+   for isolation. If a driver cannot isolate the page, it should return *false*.
 
-Once page is successfully isolated, VM uses page.lru fields so driver
-shouldn't expect to preserve values in that fields.
+   Once page is successfully isolated, VM uses page.lru fields so driver
+   shouldn't expect to preserve values in that fields.
 
-2. int (*migratepage) (struct address_space *mapping,
-		struct page *newpage, struct page *oldpage, enum migrate_mode);
+2. ``int (*migratepage) (struct address_space *mapping,``
+|	``struct page *newpage, struct page *oldpage, enum migrate_mode);``
 
-After isolation, VM calls migratepage of driver with isolated page.
-The function of migratepage is to move content of the old page to new page
-and set up fields of struct page newpage. Keep in mind that you should
-indicate to the VM the oldpage is no longer movable via __ClearPageMovable()
-under page_lock if you migrated the oldpage successfully and returns
-MIGRATEPAGE_SUCCESS. If driver cannot migrate the page at the moment, driver
-can return -EAGAIN. On -EAGAIN, VM will retry page migration in a short time
-because VM interprets -EAGAIN as "temporal migration failure". On returning
-any error except -EAGAIN, VM will give up the page migration without retrying
-in this time.
+   After isolation, VM calls migratepage of driver with isolated page.
+   The function of migratepage is to move content of the old page to new page
+   and set up fields of struct page newpage. Keep in mind that you should
+   indicate to the VM the oldpage is no longer movable via __ClearPageMovable()
+   under page_lock if you migrated the oldpage successfully and returns
+   MIGRATEPAGE_SUCCESS. If driver cannot migrate the page at the moment, driver
+   can return -EAGAIN. On -EAGAIN, VM will retry page migration in a short time
+   because VM interprets -EAGAIN as "temporal migration failure". On returning
+   any error except -EAGAIN, VM will give up the page migration without retrying
+   in this time.
 
-Driver shouldn't touch page.lru field VM using in the functions.
+   Driver shouldn't touch page.lru field VM using in the functions.
 
-3. void (*putback_page)(struct page *);
+3. ``void (*putback_page)(struct page *);``
 
-If migration fails on isolated page, VM should return the isolated page
-to the driver so VM calls driver's putback_page with migration failed page.
-In this function, driver should put the isolated page back to the own data
-structure.
+   If migration fails on isolated page, VM should return the isolated page
+   to the driver so VM calls driver's putback_page with migration failed page.
+   In this function, driver should put the isolated page back to the own data
+   structure.
 
 4. non-lru movable page flags
 
-There are two page flags for supporting non-lru movable page.
+   There are two page flags for supporting non-lru movable page.
 
-* PG_movable
+   * PG_movable
 
-Driver should use the below function to make page movable under page_lock.
+     Driver should use the below function to make page movable under page_lock::
 
 	void __SetPageMovable(struct page *page, struct address_space *mapping)
 
-It needs argument of address_space for registering migration family functions
-which will be called by VM. Exactly speaking, PG_movable is not a real flag of
-struct page. Rather than, VM reuses page->mapping's lower bits to represent it.
+     It needs argument of address_space for registering migration
+     family functions which will be called by VM. Exactly speaking,
+     PG_movable is not a real flag of struct page. Rather than, VM
+     reuses page->mapping's lower bits to represent it.
 
+::
 	#define PAGE_MAPPING_MOVABLE 0x2
 	page->mapping = page->mapping | PAGE_MAPPING_MOVABLE;
 
-so driver shouldn't access page->mapping directly. Instead, driver should
-use page_mapping which mask off the low two bits of page->mapping under
-page lock so it can get right struct address_space.
-
-For testing of non-lru movable page, VM supports __PageMovable function.
-However, it doesn't guarantee to identify non-lru movable page because
-page->mapping field is unified with other variables in struct page.
-As well, if driver releases the page after isolation by VM, page->mapping
-doesn't have stable value although it has PAGE_MAPPING_MOVABLE
-(Look at __ClearPageMovable). But __PageMovable is cheap to catch whether
-page is LRU or non-lru movable once the page has been isolated. Because
-LRU pages never can have PAGE_MAPPING_MOVABLE in page->mapping. It is also
-good for just peeking to test non-lru movable pages before more expensive
-checking with lock_page in pfn scanning to select victim.
-
-For guaranteeing non-lru movable page, VM provides PageMovable function.
-Unlike __PageMovable, PageMovable functions validates page->mapping and
-mapping->a_ops->isolate_page under lock_page. The lock_page prevents sudden
-destroying of page->mapping.
-
-Driver using __SetPageMovable should clear the flag via __ClearMovablePage
-under page_lock before the releasing the page.
-
-* PG_isolated
-
-To prevent concurrent isolation among several CPUs, VM marks isolated page
-as PG_isolated under lock_page. So if a CPU encounters PG_isolated non-lru
-movable page, it can skip it. Driver doesn't need to manipulate the flag
-because VM will set/clear it automatically. Keep in mind that if driver
-sees PG_isolated page, it means the page have been isolated by VM so it
-shouldn't touch page.lru field.
-PG_isolated is alias with PG_reclaim flag so driver shouldn't use the flag
-for own purpose.
+     so driver shouldn't access page->mapping directly. Instead, driver should
+     use page_mapping which mask off the low two bits of page->mapping under
+     page lock so it can get right struct address_space.
+
+     For testing of non-lru movable page, VM supports __PageMovable function.
+     However, it doesn't guarantee to identify non-lru movable page because
+     page->mapping field is unified with other variables in struct page.
+     As well, if driver releases the page after isolation by VM, page->mapping
+     doesn't have stable value although it has PAGE_MAPPING_MOVABLE
+     (Look at __ClearPageMovable). But __PageMovable is cheap to catch whether
+     page is LRU or non-lru movable once the page has been isolated. Because
+     LRU pages never can have PAGE_MAPPING_MOVABLE in page->mapping. It is also
+     good for just peeking to test non-lru movable pages before more expensive
+     checking with lock_page in pfn scanning to select victim.
+
+     For guaranteeing non-lru movable page, VM provides PageMovable function.
+     Unlike __PageMovable, PageMovable functions validates page->mapping and
+     mapping->a_ops->isolate_page under lock_page. The lock_page prevents sudden
+     destroying of page->mapping.
+
+     Driver using __SetPageMovable should clear the flag via __ClearMovablePage
+     under page_lock before the releasing the page.
+
+   * PG_isolated
+
+     To prevent concurrent isolation among several CPUs, VM marks isolated page
+     as PG_isolated under lock_page. So if a CPU encounters PG_isolated non-lru
+     movable page, it can skip it. Driver doesn't need to manipulate the flag
+     because VM will set/clear it automatically. Keep in mind that if driver
+     sees PG_isolated page, it means the page have been isolated by VM so it
+     shouldn't touch page.lru field.
+     PG_isolated is alias with PG_reclaim flag so driver shouldn't use the flag
+     for own purpose.
 
 Christoph Lameter, May 8, 2006.
 Minchan Kim, Mar 28, 2016.
diff --git a/Documentation/vm/page_owner.txt b/Documentation/vm/page_owner.rst
index ffff143..0ed5ab8 100644
--- a/Documentation/vm/page_owner.txt
+++ b/Documentation/vm/page_owner.rst
@@ -1,7 +1,11 @@
+.. _page_owner:
+
+==================================================
 page owner: Tracking about who allocated each page
------------------------------------------------------------
+==================================================
 
-* Introduction
+Introduction
+============
 
 page owner is for the tracking about who allocated each page.
 It can be used to debug memory leak or to find a memory hogger.
@@ -34,13 +38,15 @@ not affect to allocation performance, especially if the static keys jump
 label patching functionality is available. Following is the kernel's code
 size change due to this facility.
 
-- Without page owner
+- Without page owner::
+
    text    data     bss     dec     hex filename
-  40662    1493     644   42799    a72f mm/page_alloc.o
+   40662   1493     644   42799    a72f mm/page_alloc.o
+
+- With page owner::
 
-- With page owner
    text    data     bss     dec     hex filename
-  40892    1493     644   43029    a815 mm/page_alloc.o
+   40892   1493     644   43029    a815 mm/page_alloc.o
    1427      24       8    1459     5b3 mm/page_ext.o
    2722      50       0    2772     ad4 mm/page_owner.o
 
@@ -62,21 +68,23 @@ are catched and marked, although they are mostly allocated from struct
 page extension feature. Anyway, after that, no page is left in
 un-tracking state.
 
-* Usage
+Usage
+=====
+
+1) Build user-space helper::
 
-1) Build user-space helper
 	cd tools/vm
 	make page_owner_sort
 
-2) Enable page owner
-	Add "page_owner=on" to boot cmdline.
+2) Enable page owner: add "page_owner=on" to boot cmdline.
 
 3) Do the job what you want to debug
 
-4) Analyze information from page owner
+4) Analyze information from page owner::
+
 	cat /sys/kernel/debug/page_owner > page_owner_full.txt
 	grep -v ^PFN page_owner_full.txt > page_owner.txt
 	./page_owner_sort page_owner.txt sorted_page_owner.txt
 
-	See the result about who allocated each page
-	in the sorted_page_owner.txt.
+   See the result about who allocated each page
+   in the ``sorted_page_owner.txt``.
diff --git a/Documentation/vm/remap_file_pages.txt b/Documentation/vm/remap_file_pages.rst
index f609142..7bef671 100644
--- a/Documentation/vm/remap_file_pages.txt
+++ b/Documentation/vm/remap_file_pages.rst
@@ -1,3 +1,9 @@
+.. _remap_file_pages:
+
+==============================
+remap_file_pages() system call
+==============================
+
 The remap_file_pages() system call is used to create a nonlinear mapping,
 that is, a mapping in which the pages of the file are mapped into a
 nonsequential order in memory. The advantage of using remap_file_pages()
diff --git a/Documentation/vm/slub.rst b/Documentation/vm/slub.rst
new file mode 100644
index 0000000..3a775fd
--- /dev/null
+++ b/Documentation/vm/slub.rst
@@ -0,0 +1,361 @@
+.. _slub:
+
+==========================
+Short users guide for SLUB
+==========================
+
+The basic philosophy of SLUB is very different from SLAB. SLAB
+requires rebuilding the kernel to activate debug options for all
+slab caches. SLUB always includes full debugging but it is off by default.
+SLUB can enable debugging only for selected slabs in order to avoid
+an impact on overall system performance which may make a bug more
+difficult to find.
+
+In order to switch debugging on one can add an option ``slub_debug``
+to the kernel command line. That will enable full debugging for
+all slabs.
+
+Typically one would then use the ``slabinfo`` command to get statistical
+data and perform operation on the slabs. By default ``slabinfo`` only lists
+slabs that have data in them. See "slabinfo -h" for more options when
+running the command. ``slabinfo`` can be compiled with
+::
+
+	gcc -o slabinfo tools/vm/slabinfo.c
+
+Some of the modes of operation of ``slabinfo`` require that slub debugging
+be enabled on the command line. F.e. no tracking information will be
+available without debugging on and validation can only partially
+be performed if debugging was not switched on.
+
+Some more sophisticated uses of slub_debug:
+-------------------------------------------
+
+Parameters may be given to ``slub_debug``. If none is specified then full
+debugging is enabled. Format:
+
+slub_debug=<Debug-Options>
+	Enable options for all slabs
+slub_debug=<Debug-Options>,<slab name>
+	Enable options only for select slabs
+
+
+Possible debug options are::
+
+	F		Sanity checks on (enables SLAB_DEBUG_CONSISTENCY_CHECKS
+			Sorry SLAB legacy issues)
+	Z		Red zoning
+	P		Poisoning (object and padding)
+	U		User tracking (free and alloc)
+	T		Trace (please only use on single slabs)
+	A		Toggle failslab filter mark for the cache
+	O		Switch debugging off for caches that would have
+			caused higher minimum slab orders
+	-		Switch all debugging off (useful if the kernel is
+			configured with CONFIG_SLUB_DEBUG_ON)
+
+F.e. in order to boot just with sanity checks and red zoning one would specify::
+
+	slub_debug=FZ
+
+Trying to find an issue in the dentry cache? Try::
+
+	slub_debug=,dentry
+
+to only enable debugging on the dentry cache.
+
+Red zoning and tracking may realign the slab.  We can just apply sanity checks
+to the dentry cache with::
+
+	slub_debug=F,dentry
+
+Debugging options may require the minimum possible slab order to increase as
+a result of storing the metadata (for example, caches with PAGE_SIZE object
+sizes).  This has a higher liklihood of resulting in slab allocation errors
+in low memory situations or if there's high fragmentation of memory.  To
+switch off debugging for such caches by default, use::
+
+	slub_debug=O
+
+In case you forgot to enable debugging on the kernel command line: It is
+possible to enable debugging manually when the kernel is up. Look at the
+contents of::
+
+	/sys/kernel/slab/<slab name>/
+
+Look at the writable files. Writing 1 to them will enable the
+corresponding debug option. All options can be set on a slab that does
+not contain objects. If the slab already contains objects then sanity checks
+and tracing may only be enabled. The other options may cause the realignment
+of objects.
+
+Careful with tracing: It may spew out lots of information and never stop if
+used on the wrong slab.
+
+Slab merging
+============
+
+If no debug options are specified then SLUB may merge similar slabs together
+in order to reduce overhead and increase cache hotness of objects.
+``slabinfo -a`` displays which slabs were merged together.
+
+Slab validation
+===============
+
+SLUB can validate all object if the kernel was booted with slub_debug. In
+order to do so you must have the ``slabinfo`` tool. Then you can do
+::
+
+	slabinfo -v
+
+which will test all objects. Output will be generated to the syslog.
+
+This also works in a more limited way if boot was without slab debug.
+In that case ``slabinfo -v`` simply tests all reachable objects. Usually
+these are in the cpu slabs and the partial slabs. Full slabs are not
+tracked by SLUB in a non debug situation.
+
+Getting more performance
+========================
+
+To some degree SLUB's performance is limited by the need to take the
+list_lock once in a while to deal with partial slabs. That overhead is
+governed by the order of the allocation for each slab. The allocations
+can be influenced by kernel parameters:
+
+.. slub_min_objects=x		(default 4)
+.. slub_min_order=x		(default 0)
+.. slub_max_order=x		(default 3 (PAGE_ALLOC_COSTLY_ORDER))
+
+``slub_min_objects``
+	allows to specify how many objects must at least fit into one
+	slab in order for the allocation order to be acceptable.  In
+	general slub will be able to perform this number of
+	allocations on a slab without consulting centralized resources
+	(list_lock) where contention may occur.
+
+``slub_min_order``
+	specifies a minim order of slabs. A similar effect like
+	``slub_min_objects``.
+
+``slub_max_order``
+	specified the order at which ``slub_min_objects`` should no
+	longer be checked. This is useful to avoid SLUB trying to
+	generate super large order pages to fit ``slub_min_objects``
+	of a slab cache with large object sizes into one high order
+	page. Setting command line parameter
+	``debug_guardpage_minorder=N`` (N > 0), forces setting
+	``slub_max_order`` to 0, what cause minimum possible order of
+	slabs allocation.
+
+SLUB Debug output
+=================
+
+Here is a sample of slub debug output::
+
+ ====================================================================
+ BUG kmalloc-8: Redzone overwritten
+ --------------------------------------------------------------------
+
+ INFO: 0xc90f6d28-0xc90f6d2b. First byte 0x00 instead of 0xcc
+ INFO: Slab 0xc528c530 flags=0x400000c3 inuse=61 fp=0xc90f6d58
+ INFO: Object 0xc90f6d20 @offset=3360 fp=0xc90f6d58
+ INFO: Allocated in get_modalias+0x61/0xf5 age=53 cpu=1 pid=554
+
+ Bytes b4 0xc90f6d10:  00 00 00 00 00 00 00 00 5a 5a 5a 5a 5a 5a 5a 5a ........ZZZZZZZZ
+   Object 0xc90f6d20:  31 30 31 39 2e 30 30 35                         1019.005
+  Redzone 0xc90f6d28:  00 cc cc cc                                     .
+  Padding 0xc90f6d50:  5a 5a 5a 5a 5a 5a 5a 5a                         ZZZZZZZZ
+
+   [<c010523d>] dump_trace+0x63/0x1eb
+   [<c01053df>] show_trace_log_lvl+0x1a/0x2f
+   [<c010601d>] show_trace+0x12/0x14
+   [<c0106035>] dump_stack+0x16/0x18
+   [<c017e0fa>] object_err+0x143/0x14b
+   [<c017e2cc>] check_object+0x66/0x234
+   [<c017eb43>] __slab_free+0x239/0x384
+   [<c017f446>] kfree+0xa6/0xc6
+   [<c02e2335>] get_modalias+0xb9/0xf5
+   [<c02e23b7>] dmi_dev_uevent+0x27/0x3c
+   [<c027866a>] dev_uevent+0x1ad/0x1da
+   [<c0205024>] kobject_uevent_env+0x20a/0x45b
+   [<c020527f>] kobject_uevent+0xa/0xf
+   [<c02779f1>] store_uevent+0x4f/0x58
+   [<c027758e>] dev_attr_store+0x29/0x2f
+   [<c01bec4f>] sysfs_write_file+0x16e/0x19c
+   [<c0183ba7>] vfs_write+0xd1/0x15a
+   [<c01841d7>] sys_write+0x3d/0x72
+   [<c0104112>] sysenter_past_esp+0x5f/0x99
+   [<b7f7b410>] 0xb7f7b410
+   =======================
+
+ FIX kmalloc-8: Restoring Redzone 0xc90f6d28-0xc90f6d2b=0xcc
+
+If SLUB encounters a corrupted object (full detection requires the kernel
+to be booted with slub_debug) then the following output will be dumped
+into the syslog:
+
+1. Description of the problem encountered
+
+   This will be a message in the system log starting with::
+
+     ===============================================
+     BUG <slab cache affected>: <What went wrong>
+     -----------------------------------------------
+
+     INFO: <corruption start>-<corruption_end> <more info>
+     INFO: Slab <address> <slab information>
+     INFO: Object <address> <object information>
+     INFO: Allocated in <kernel function> age=<jiffies since alloc> cpu=<allocated by
+	cpu> pid=<pid of the process>
+     INFO: Freed in <kernel function> age=<jiffies since free> cpu=<freed by cpu>
+	pid=<pid of the process>
+
+   (Object allocation / free information is only available if SLAB_STORE_USER is
+   set for the slab. slub_debug sets that option)
+
+2. The object contents if an object was involved.
+
+   Various types of lines can follow the BUG SLUB line:
+
+   Bytes b4 <address> : <bytes>
+	Shows a few bytes before the object where the problem was detected.
+	Can be useful if the corruption does not stop with the start of the
+	object.
+
+   Object <address> : <bytes>
+	The bytes of the object. If the object is inactive then the bytes
+	typically contain poison values. Any non-poison value shows a
+	corruption by a write after free.
+
+   Redzone <address> : <bytes>
+	The Redzone following the object. The Redzone is used to detect
+	writes after the object. All bytes should always have the same
+	value. If there is any deviation then it is due to a write after
+	the object boundary.
+
+	(Redzone information is only available if SLAB_RED_ZONE is set.
+	slub_debug sets that option)
+
+   Padding <address> : <bytes>
+	Unused data to fill up the space in order to get the next object
+	properly aligned. In the debug case we make sure that there are
+	at least 4 bytes of padding. This allows the detection of writes
+	before the object.
+
+3. A stackdump
+
+   The stackdump describes the location where the error was detected. The cause
+   of the corruption is may be more likely found by looking at the function that
+   allocated or freed the object.
+
+4. Report on how the problem was dealt with in order to ensure the continued
+   operation of the system.
+
+   These are messages in the system log beginning with::
+
+	FIX <slab cache affected>: <corrective action taken>
+
+   In the above sample SLUB found that the Redzone of an active object has
+   been overwritten. Here a string of 8 characters was written into a slab that
+   has the length of 8 characters. However, a 8 character string needs a
+   terminating 0. That zero has overwritten the first byte of the Redzone field.
+   After reporting the details of the issue encountered the FIX SLUB message
+   tells us that SLUB has restored the Redzone to its proper value and then
+   system operations continue.
+
+Emergency operations
+====================
+
+Minimal debugging (sanity checks alone) can be enabled by booting with::
+
+	slub_debug=F
+
+This will be generally be enough to enable the resiliency features of slub
+which will keep the system running even if a bad kernel component will
+keep corrupting objects. This may be important for production systems.
+Performance will be impacted by the sanity checks and there will be a
+continual stream of error messages to the syslog but no additional memory
+will be used (unlike full debugging).
+
+No guarantees. The kernel component still needs to be fixed. Performance
+may be optimized further by locating the slab that experiences corruption
+and enabling debugging only for that cache
+
+I.e.::
+
+	slub_debug=F,dentry
+
+If the corruption occurs by writing after the end of the object then it
+may be advisable to enable a Redzone to avoid corrupting the beginning
+of other objects::
+
+	slub_debug=FZ,dentry
+
+Extended slabinfo mode and plotting
+===================================
+
+The ``slabinfo`` tool has a special 'extended' ('-X') mode that includes:
+ - Slabcache Totals
+ - Slabs sorted by size (up to -N <num> slabs, default 1)
+ - Slabs sorted by loss (up to -N <num> slabs, default 1)
+
+Additionally, in this mode ``slabinfo`` does not dynamically scale
+sizes (G/M/K) and reports everything in bytes (this functionality is
+also available to other slabinfo modes via '-B' option) which makes
+reporting more precise and accurate. Moreover, in some sense the `-X'
+mode also simplifies the analysis of slabs' behaviour, because its
+output can be plotted using the ``slabinfo-gnuplot.sh`` script. So it
+pushes the analysis from looking through the numbers (tons of numbers)
+to something easier -- visual analysis.
+
+To generate plots:
+
+a) collect slabinfo extended records, for example::
+
+	while [ 1 ]; do slabinfo -X >> FOO_STATS; sleep 1; done
+
+b) pass stats file(-s) to ``slabinfo-gnuplot.sh`` script::
+
+	slabinfo-gnuplot.sh FOO_STATS [FOO_STATS2 .. FOO_STATSN]
+
+   The ``slabinfo-gnuplot.sh`` script will pre-processes the collected records
+   and generates 3 png files (and 3 pre-processing cache files) per STATS
+   file:
+   - Slabcache Totals: FOO_STATS-totals.png
+   - Slabs sorted by size: FOO_STATS-slabs-by-size.png
+   - Slabs sorted by loss: FOO_STATS-slabs-by-loss.png
+
+Another use case, when ``slabinfo-gnuplot.sh`` can be useful, is when you
+need to compare slabs' behaviour "prior to" and "after" some code
+modification.  To help you out there, ``slabinfo-gnuplot.sh`` script
+can 'merge' the `Slabcache Totals` sections from different
+measurements. To visually compare N plots:
+
+a) Collect as many STATS1, STATS2, .. STATSN files as you need::
+
+	while [ 1 ]; do slabinfo -X >> STATS<X>; sleep 1; done
+
+b) Pre-process those STATS files::
+
+	slabinfo-gnuplot.sh STATS1 STATS2 .. STATSN
+
+c) Execute ``slabinfo-gnuplot.sh`` in '-t' mode, passing all of the
+   generated pre-processed \*-totals::
+
+	slabinfo-gnuplot.sh -t STATS1-totals STATS2-totals .. STATSN-totals
+
+   This will produce a single plot (png file).
+
+   Plots, expectedly, can be large so some fluctuations or small spikes
+   can go unnoticed. To deal with that, ``slabinfo-gnuplot.sh`` has two
+   options to 'zoom-in'/'zoom-out':
+
+   a) ``-s %d,%d`` -- overwrites the default image width and heigh
+   b) ``-r %d,%d`` -- specifies a range of samples to use (for example,
+      in ``slabinfo -X >> FOO_STATS; sleep 1;`` case, using a ``-r
+      40,60`` range will plot only samples collected between 40th and
+      60th seconds).
+
+Christoph Lameter, May 30, 2007
+Sergey Senozhatsky, October 23, 2015
diff --git a/Documentation/vm/slub.txt b/Documentation/vm/slub.txt
deleted file mode 100644
index 8465241..0000000
--- a/Documentation/vm/slub.txt
+++ /dev/null
@@ -1,342 +0,0 @@
-Short users guide for SLUB
---------------------------
-
-The basic philosophy of SLUB is very different from SLAB. SLAB
-requires rebuilding the kernel to activate debug options for all
-slab caches. SLUB always includes full debugging but it is off by default.
-SLUB can enable debugging only for selected slabs in order to avoid
-an impact on overall system performance which may make a bug more
-difficult to find.
-
-In order to switch debugging on one can add an option "slub_debug"
-to the kernel command line. That will enable full debugging for
-all slabs.
-
-Typically one would then use the "slabinfo" command to get statistical
-data and perform operation on the slabs. By default slabinfo only lists
-slabs that have data in them. See "slabinfo -h" for more options when
-running the command. slabinfo can be compiled with
-
-gcc -o slabinfo tools/vm/slabinfo.c
-
-Some of the modes of operation of slabinfo require that slub debugging
-be enabled on the command line. F.e. no tracking information will be
-available without debugging on and validation can only partially
-be performed if debugging was not switched on.
-
-Some more sophisticated uses of slub_debug:
--------------------------------------------
-
-Parameters may be given to slub_debug. If none is specified then full
-debugging is enabled. Format:
-
-slub_debug=<Debug-Options>       Enable options for all slabs
-slub_debug=<Debug-Options>,<slab name>
-				Enable options only for select slabs
-
-Possible debug options are
-	F		Sanity checks on (enables SLAB_DEBUG_CONSISTENCY_CHECKS
-			Sorry SLAB legacy issues)
-	Z		Red zoning
-	P		Poisoning (object and padding)
-	U		User tracking (free and alloc)
-	T		Trace (please only use on single slabs)
-	A		Toggle failslab filter mark for the cache
-	O		Switch debugging off for caches that would have
-			caused higher minimum slab orders
-	-		Switch all debugging off (useful if the kernel is
-			configured with CONFIG_SLUB_DEBUG_ON)
-
-F.e. in order to boot just with sanity checks and red zoning one would specify:
-
-	slub_debug=FZ
-
-Trying to find an issue in the dentry cache? Try
-
-	slub_debug=,dentry
-
-to only enable debugging on the dentry cache.
-
-Red zoning and tracking may realign the slab.  We can just apply sanity checks
-to the dentry cache with
-
-	slub_debug=F,dentry
-
-Debugging options may require the minimum possible slab order to increase as
-a result of storing the metadata (for example, caches with PAGE_SIZE object
-sizes).  This has a higher liklihood of resulting in slab allocation errors
-in low memory situations or if there's high fragmentation of memory.  To
-switch off debugging for such caches by default, use
-
-	slub_debug=O
-
-In case you forgot to enable debugging on the kernel command line: It is
-possible to enable debugging manually when the kernel is up. Look at the
-contents of:
-
-/sys/kernel/slab/<slab name>/
-
-Look at the writable files. Writing 1 to them will enable the
-corresponding debug option. All options can be set on a slab that does
-not contain objects. If the slab already contains objects then sanity checks
-and tracing may only be enabled. The other options may cause the realignment
-of objects.
-
-Careful with tracing: It may spew out lots of information and never stop if
-used on the wrong slab.
-
-Slab merging
-------------
-
-If no debug options are specified then SLUB may merge similar slabs together
-in order to reduce overhead and increase cache hotness of objects.
-slabinfo -a displays which slabs were merged together.
-
-Slab validation
----------------
-
-SLUB can validate all object if the kernel was booted with slub_debug. In
-order to do so you must have the slabinfo tool. Then you can do
-
-slabinfo -v
-
-which will test all objects. Output will be generated to the syslog.
-
-This also works in a more limited way if boot was without slab debug.
-In that case slabinfo -v simply tests all reachable objects. Usually
-these are in the cpu slabs and the partial slabs. Full slabs are not
-tracked by SLUB in a non debug situation.
-
-Getting more performance
-------------------------
-
-To some degree SLUB's performance is limited by the need to take the
-list_lock once in a while to deal with partial slabs. That overhead is
-governed by the order of the allocation for each slab. The allocations
-can be influenced by kernel parameters:
-
-slub_min_objects=x		(default 4)
-slub_min_order=x		(default 0)
-slub_max_order=x		(default 3 (PAGE_ALLOC_COSTLY_ORDER))
-
-slub_min_objects allows to specify how many objects must at least fit
-into one slab in order for the allocation order to be acceptable.
-In general slub will be able to perform this number of allocations
-on a slab without consulting centralized resources (list_lock) where
-contention may occur.
-
-slub_min_order specifies a minim order of slabs. A similar effect like
-slub_min_objects.
-
-slub_max_order specified the order at which slub_min_objects should no
-longer be checked. This is useful to avoid SLUB trying to generate
-super large order pages to fit slub_min_objects of a slab cache with
-large object sizes into one high order page. Setting command line
-parameter debug_guardpage_minorder=N (N > 0), forces setting
-slub_max_order to 0, what cause minimum possible order of slabs
-allocation.
-
-SLUB Debug output
------------------
-
-Here is a sample of slub debug output:
-
-====================================================================
-BUG kmalloc-8: Redzone overwritten
---------------------------------------------------------------------
-
-INFO: 0xc90f6d28-0xc90f6d2b. First byte 0x00 instead of 0xcc
-INFO: Slab 0xc528c530 flags=0x400000c3 inuse=61 fp=0xc90f6d58
-INFO: Object 0xc90f6d20 @offset=3360 fp=0xc90f6d58
-INFO: Allocated in get_modalias+0x61/0xf5 age=53 cpu=1 pid=554
-
-Bytes b4 0xc90f6d10:  00 00 00 00 00 00 00 00 5a 5a 5a 5a 5a 5a 5a 5a ........ZZZZZZZZ
-  Object 0xc90f6d20:  31 30 31 39 2e 30 30 35                         1019.005
- Redzone 0xc90f6d28:  00 cc cc cc                                     .
- Padding 0xc90f6d50:  5a 5a 5a 5a 5a 5a 5a 5a                         ZZZZZZZZ
-
-  [<c010523d>] dump_trace+0x63/0x1eb
-  [<c01053df>] show_trace_log_lvl+0x1a/0x2f
-  [<c010601d>] show_trace+0x12/0x14
-  [<c0106035>] dump_stack+0x16/0x18
-  [<c017e0fa>] object_err+0x143/0x14b
-  [<c017e2cc>] check_object+0x66/0x234
-  [<c017eb43>] __slab_free+0x239/0x384
-  [<c017f446>] kfree+0xa6/0xc6
-  [<c02e2335>] get_modalias+0xb9/0xf5
-  [<c02e23b7>] dmi_dev_uevent+0x27/0x3c
-  [<c027866a>] dev_uevent+0x1ad/0x1da
-  [<c0205024>] kobject_uevent_env+0x20a/0x45b
-  [<c020527f>] kobject_uevent+0xa/0xf
-  [<c02779f1>] store_uevent+0x4f/0x58
-  [<c027758e>] dev_attr_store+0x29/0x2f
-  [<c01bec4f>] sysfs_write_file+0x16e/0x19c
-  [<c0183ba7>] vfs_write+0xd1/0x15a
-  [<c01841d7>] sys_write+0x3d/0x72
-  [<c0104112>] sysenter_past_esp+0x5f/0x99
-  [<b7f7b410>] 0xb7f7b410
-  =======================
-
-FIX kmalloc-8: Restoring Redzone 0xc90f6d28-0xc90f6d2b=0xcc
-
-If SLUB encounters a corrupted object (full detection requires the kernel
-to be booted with slub_debug) then the following output will be dumped
-into the syslog:
-
-1. Description of the problem encountered
-
-This will be a message in the system log starting with
-
-===============================================
-BUG <slab cache affected>: <What went wrong>
------------------------------------------------
-
-INFO: <corruption start>-<corruption_end> <more info>
-INFO: Slab <address> <slab information>
-INFO: Object <address> <object information>
-INFO: Allocated in <kernel function> age=<jiffies since alloc> cpu=<allocated by
-	cpu> pid=<pid of the process>
-INFO: Freed in <kernel function> age=<jiffies since free> cpu=<freed by cpu>
-	 pid=<pid of the process>
-
-(Object allocation / free information is only available if SLAB_STORE_USER is
-set for the slab. slub_debug sets that option)
-
-2. The object contents if an object was involved.
-
-Various types of lines can follow the BUG SLUB line:
-
-Bytes b4 <address> : <bytes>
-	Shows a few bytes before the object where the problem was detected.
-	Can be useful if the corruption does not stop with the start of the
-	object.
-
-Object <address> : <bytes>
-	The bytes of the object. If the object is inactive then the bytes
-	typically contain poison values. Any non-poison value shows a
-	corruption by a write after free.
-
-Redzone <address> : <bytes>
-	The Redzone following the object. The Redzone is used to detect
-	writes after the object. All bytes should always have the same
-	value. If there is any deviation then it is due to a write after
-	the object boundary.
-
-	(Redzone information is only available if SLAB_RED_ZONE is set.
-	slub_debug sets that option)
-
-Padding <address> : <bytes>
-	Unused data to fill up the space in order to get the next object
-	properly aligned. In the debug case we make sure that there are
-	at least 4 bytes of padding. This allows the detection of writes
-	before the object.
-
-3. A stackdump
-
-The stackdump describes the location where the error was detected. The cause
-of the corruption is may be more likely found by looking at the function that
-allocated or freed the object.
-
-4. Report on how the problem was dealt with in order to ensure the continued
-operation of the system.
-
-These are messages in the system log beginning with
-
-FIX <slab cache affected>: <corrective action taken>
-
-In the above sample SLUB found that the Redzone of an active object has
-been overwritten. Here a string of 8 characters was written into a slab that
-has the length of 8 characters. However, a 8 character string needs a
-terminating 0. That zero has overwritten the first byte of the Redzone field.
-After reporting the details of the issue encountered the FIX SLUB message
-tells us that SLUB has restored the Redzone to its proper value and then
-system operations continue.
-
-Emergency operations:
----------------------
-
-Minimal debugging (sanity checks alone) can be enabled by booting with
-
-	slub_debug=F
-
-This will be generally be enough to enable the resiliency features of slub
-which will keep the system running even if a bad kernel component will
-keep corrupting objects. This may be important for production systems.
-Performance will be impacted by the sanity checks and there will be a
-continual stream of error messages to the syslog but no additional memory
-will be used (unlike full debugging).
-
-No guarantees. The kernel component still needs to be fixed. Performance
-may be optimized further by locating the slab that experiences corruption
-and enabling debugging only for that cache
-
-I.e.
-
-	slub_debug=F,dentry
-
-If the corruption occurs by writing after the end of the object then it
-may be advisable to enable a Redzone to avoid corrupting the beginning
-of other objects.
-
-	slub_debug=FZ,dentry
-
-Extended slabinfo mode and plotting
------------------------------------
-
-The slabinfo tool has a special 'extended' ('-X') mode that includes:
- - Slabcache Totals
- - Slabs sorted by size (up to -N <num> slabs, default 1)
- - Slabs sorted by loss (up to -N <num> slabs, default 1)
-
-Additionally, in this mode slabinfo does not dynamically scale sizes (G/M/K)
-and reports everything in bytes (this functionality is also available to
-other slabinfo modes via '-B' option) which makes reporting more precise and
-accurate. Moreover, in some sense the `-X' mode also simplifies the analysis
-of slabs' behaviour, because its output can be plotted using the
-slabinfo-gnuplot.sh script. So it pushes the analysis from looking through
-the numbers (tons of numbers) to something easier -- visual analysis.
-
-To generate plots:
-a) collect slabinfo extended records, for example:
-
-  while [ 1 ]; do slabinfo -X >> FOO_STATS; sleep 1; done
-
-b) pass stats file(-s) to slabinfo-gnuplot.sh script:
-  slabinfo-gnuplot.sh FOO_STATS [FOO_STATS2 .. FOO_STATSN]
-
-The slabinfo-gnuplot.sh script will pre-processes the collected records
-and generates 3 png files (and 3 pre-processing cache files) per STATS
-file:
- - Slabcache Totals: FOO_STATS-totals.png
- - Slabs sorted by size: FOO_STATS-slabs-by-size.png
- - Slabs sorted by loss: FOO_STATS-slabs-by-loss.png
-
-Another use case, when slabinfo-gnuplot can be useful, is when you need
-to compare slabs' behaviour "prior to" and "after" some code modification.
-To help you out there, slabinfo-gnuplot.sh script can 'merge' the
-`Slabcache Totals` sections from different measurements. To visually
-compare N plots:
-
-a) Collect as many STATS1, STATS2, .. STATSN files as you need
-  while [ 1 ]; do slabinfo -X >> STATS<X>; sleep 1; done
-
-b) Pre-process those STATS files
-  slabinfo-gnuplot.sh STATS1 STATS2 .. STATSN
-
-c) Execute slabinfo-gnuplot.sh in '-t' mode, passing all of the
-generated pre-processed *-totals
-  slabinfo-gnuplot.sh -t STATS1-totals STATS2-totals .. STATSN-totals
-
-This will produce a single plot (png file).
-
-Plots, expectedly, can be large so some fluctuations or small spikes
-can go unnoticed. To deal with that, `slabinfo-gnuplot.sh' has two
-options to 'zoom-in'/'zoom-out':
- a) -s %d,%d  overwrites the default image width and heigh
- b) -r %d,%d  specifies a range of samples to use (for example,
-              in `slabinfo -X >> FOO_STATS; sleep 1;' case, using
-              a "-r 40,60" range will plot only samples collected
-              between 40th and 60th seconds).
-
-Christoph Lameter, May 30, 2007
-Sergey Senozhatsky, October 23, 2015
diff --git a/Documentation/vm/split_page_table_lock b/Documentation/vm/split_page_table_lock.rst
index 62842a8..889b00b 100644
--- a/Documentation/vm/split_page_table_lock
+++ b/Documentation/vm/split_page_table_lock.rst
@@ -1,3 +1,6 @@
+.. _split_page_table_lock:
+
+=====================
 Split page table lock
 =====================
 
@@ -11,6 +14,7 @@ access to the table. At the moment we use split lock for PTE and PMD
 tables. Access to higher level tables protected by mm->page_table_lock.
 
 There are helpers to lock/unlock a table and other accessor functions:
+
  - pte_offset_map_lock()
 	maps pte and takes PTE table lock, returns pointer to the taken
 	lock;
@@ -34,12 +38,13 @@ Split page table lock for PMD tables is enabled, if it's enabled for PTE
 tables and the architecture supports it (see below).
 
 Hugetlb and split page table lock
----------------------------------
+=================================
 
 Hugetlb can support several page sizes. We use split lock only for PMD
 level, but not for PUD.
 
 Hugetlb-specific helpers:
+
  - huge_pte_lock()
 	takes pmd split lock for PMD_SIZE page, mm->page_table_lock
 	otherwise;
@@ -47,7 +52,7 @@ Hugetlb-specific helpers:
 	returns pointer to table lock;
 
 Support of split page table lock by an architecture
----------------------------------------------------
+===================================================
 
 There's no need in special enabling of PTE split page table lock:
 everything required is done by pgtable_page_ctor() and pgtable_page_dtor(),
@@ -73,7 +78,7 @@ NOTE: pgtable_page_ctor() and pgtable_pmd_page_ctor() can fail -- it must
 be handled properly.
 
 page->ptl
----------
+=========
 
 page->ptl is used to access split page table lock, where 'page' is struct
 page of page containing the table. It shares storage with page->private
@@ -81,6 +86,7 @@ page of page containing the table. It shares storage with page->private
 
 To avoid increasing size of struct page and have best performance, we use a
 trick:
+
  - if spinlock_t fits into long, we use page->ptr as spinlock, so we
    can avoid indirect access and save a cache line.
  - if size of spinlock_t is bigger then size of long, we use page->ptl as
diff --git a/Documentation/vm/swap_numa.txt b/Documentation/vm/swap_numa.rst
index d5960c9..e0466f2 100644
--- a/Documentation/vm/swap_numa.txt
+++ b/Documentation/vm/swap_numa.rst
@@ -1,5 +1,8 @@
+.. _swap_numa:
+
+===========================================
 Automatically bind swap device to numa node
--------------------------------------------
+===========================================
 
 If the system has more than one swap device and swap device has the node
 information, we can make use of this information to decide which swap
@@ -7,15 +10,16 @@ device to use in get_swap_pages() to get better performance.
 
 
 How to use this feature
------------------------
+=======================
 
 Swap device has priority and that decides the order of it to be used. To make
 use of automatically binding, there is no need to manipulate priority settings
 for swap devices. e.g. on a 2 node machine, assume 2 swap devices swapA and
 swapB, with swapA attached to node 0 and swapB attached to node 1, are going
-to be swapped on. Simply swapping them on by doing:
-# swapon /dev/swapA
-# swapon /dev/swapB
+to be swapped on. Simply swapping them on by doing::
+
+	# swapon /dev/swapA
+	# swapon /dev/swapB
 
 Then node 0 will use the two swap devices in the order of swapA then swapB and
 node 1 will use the two swap devices in the order of swapB then swapA. Note
@@ -24,32 +28,39 @@ that the order of them being swapped on doesn't matter.
 A more complex example on a 4 node machine. Assume 6 swap devices are going to
 be swapped on: swapA and swapB are attached to node 0, swapC is attached to
 node 1, swapD and swapE are attached to node 2 and swapF is attached to node3.
-The way to swap them on is the same as above:
-# swapon /dev/swapA
-# swapon /dev/swapB
-# swapon /dev/swapC
-# swapon /dev/swapD
-# swapon /dev/swapE
-# swapon /dev/swapF
-
-Then node 0 will use them in the order of:
-swapA/swapB -> swapC -> swapD -> swapE -> swapF
+The way to swap them on is the same as above::
+
+	# swapon /dev/swapA
+	# swapon /dev/swapB
+	# swapon /dev/swapC
+	# swapon /dev/swapD
+	# swapon /dev/swapE
+	# swapon /dev/swapF
+
+Then node 0 will use them in the order of::
+
+	swapA/swapB -> swapC -> swapD -> swapE -> swapF
+
 swapA and swapB will be used in a round robin mode before any other swap device.
 
-node 1 will use them in the order of:
-swapC -> swapA -> swapB -> swapD -> swapE -> swapF
+node 1 will use them in the order of::
+
+	swapC -> swapA -> swapB -> swapD -> swapE -> swapF
+
+node 2 will use them in the order of::
+
+	swapD/swapE -> swapA -> swapB -> swapC -> swapF
 
-node 2 will use them in the order of:
-swapD/swapE -> swapA -> swapB -> swapC -> swapF
 Similaly, swapD and swapE will be used in a round robin mode before any
 other swap devices.
 
-node 3 will use them in the order of:
-swapF -> swapA -> swapB -> swapC -> swapD -> swapE
+node 3 will use them in the order of::
+
+	swapF -> swapA -> swapB -> swapC -> swapD -> swapE
 
 
 Implementation details
-----------------------
+======================
 
 The current code uses a priority based list, swap_avail_list, to decide
 which swap device to use and if multiple swap devices share the same
diff --git a/Documentation/vm/transhuge.rst b/Documentation/vm/transhuge.rst
new file mode 100644
index 0000000..a8cf680
--- /dev/null
+++ b/Documentation/vm/transhuge.rst
@@ -0,0 +1,197 @@
+.. _transhuge:
+
+============================
+Transparent Hugepage Support
+============================
+
+This document describes design principles Transparent Hugepage (THP)
+Support and its interaction with other parts of the memory management.
+
+Design principles
+=================
+
+- "graceful fallback": mm components which don't have transparent hugepage
+  knowledge fall back to breaking huge pmd mapping into table of ptes and,
+  if necessary, split a transparent hugepage. Therefore these components
+  can continue working on the regular pages or regular pte mappings.
+
+- if a hugepage allocation fails because of memory fragmentation,
+  regular pages should be gracefully allocated instead and mixed in
+  the same vma without any failure or significant delay and without
+  userland noticing
+
+- if some task quits and more hugepages become available (either
+  immediately in the buddy or through the VM), guest physical memory
+  backed by regular pages should be relocated on hugepages
+  automatically (with khugepaged)
+
+- it doesn't require memory reservation and in turn it uses hugepages
+  whenever possible (the only possible reservation here is kernelcore=
+  to avoid unmovable pages to fragment all the memory but such a tweak
+  is not specific to transparent hugepage support and it's a generic
+  feature that applies to all dynamic high order allocations in the
+  kernel)
+
+get_user_pages and follow_page
+==============================
+
+get_user_pages and follow_page if run on a hugepage, will return the
+head or tail pages as usual (exactly as they would do on
+hugetlbfs). Most gup users will only care about the actual physical
+address of the page and its temporary pinning to release after the I/O
+is complete, so they won't ever notice the fact the page is huge. But
+if any driver is going to mangle over the page structure of the tail
+page (like for checking page->mapping or other bits that are relevant
+for the head page and not the tail page), it should be updated to jump
+to check head page instead. Taking reference on any head/tail page would
+prevent page from being split by anyone.
+
+.. note::
+   these aren't new constraints to the GUP API, and they match the
+   same constrains that applies to hugetlbfs too, so any driver capable
+   of handling GUP on hugetlbfs will also work fine on transparent
+   hugepage backed mappings.
+
+In case you can't handle compound pages if they're returned by
+follow_page, the FOLL_SPLIT bit can be specified as parameter to
+follow_page, so that it will split the hugepages before returning
+them. Migration for example passes FOLL_SPLIT as parameter to
+follow_page because it's not hugepage aware and in fact it can't work
+at all on hugetlbfs (but it instead works fine on transparent
+hugepages thanks to FOLL_SPLIT). migration simply can't deal with
+hugepages being returned (as it's not only checking the pfn of the
+page and pinning it during the copy but it pretends to migrate the
+memory in regular page sizes and with regular pte/pmd mappings).
+
+Graceful fallback
+=================
+
+Code walking pagetables but unaware about huge pmds can simply call
+split_huge_pmd(vma, pmd, addr) where the pmd is the one returned by
+pmd_offset. It's trivial to make the code transparent hugepage aware
+by just grepping for "pmd_offset" and adding split_huge_pmd where
+missing after pmd_offset returns the pmd. Thanks to the graceful
+fallback design, with a one liner change, you can avoid to write
+hundred if not thousand of lines of complex code to make your code
+hugepage aware.
+
+If you're not walking pagetables but you run into a physical hugepage
+but you can't handle it natively in your code, you can split it by
+calling split_huge_page(page). This is what the Linux VM does before
+it tries to swapout the hugepage for example. split_huge_page() can fail
+if the page is pinned and you must handle this correctly.
+
+Example to make mremap.c transparent hugepage aware with a one liner
+change::
+
+	diff --git a/mm/mremap.c b/mm/mremap.c
+	--- a/mm/mremap.c
+	+++ b/mm/mremap.c
+	@@ -41,6 +41,7 @@ static pmd_t *get_old_pmd(struct mm_stru
+			return NULL;
+
+		pmd = pmd_offset(pud, addr);
+	+	split_huge_pmd(vma, pmd, addr);
+		if (pmd_none_or_clear_bad(pmd))
+			return NULL;
+
+Locking in hugepage aware code
+==============================
+
+We want as much code as possible hugepage aware, as calling
+split_huge_page() or split_huge_pmd() has a cost.
+
+To make pagetable walks huge pmd aware, all you need to do is to call
+pmd_trans_huge() on the pmd returned by pmd_offset. You must hold the
+mmap_sem in read (or write) mode to be sure an huge pmd cannot be
+created from under you by khugepaged (khugepaged collapse_huge_page
+takes the mmap_sem in write mode in addition to the anon_vma lock). If
+pmd_trans_huge returns false, you just fallback in the old code
+paths. If instead pmd_trans_huge returns true, you have to take the
+page table lock (pmd_lock()) and re-run pmd_trans_huge. Taking the
+page table lock will prevent the huge pmd to be converted into a
+regular pmd from under you (split_huge_pmd can run in parallel to the
+pagetable walk). If the second pmd_trans_huge returns false, you
+should just drop the page table lock and fallback to the old code as
+before. Otherwise you can proceed to process the huge pmd and the
+hugepage natively. Once finished you can drop the page table lock.
+
+Refcounts and transparent huge pages
+====================================
+
+Refcounting on THP is mostly consistent with refcounting on other compound
+pages:
+
+  - get_page()/put_page() and GUP operate in head page's ->_refcount.
+
+  - ->_refcount in tail pages is always zero: get_page_unless_zero() never
+    succeed on tail pages.
+
+  - map/unmap of the pages with PTE entry increment/decrement ->_mapcount
+    on relevant sub-page of the compound page.
+
+  - map/unmap of the whole compound page accounted in compound_mapcount
+    (stored in first tail page). For file huge pages, we also increment
+    ->_mapcount of all sub-pages in order to have race-free detection of
+    last unmap of subpages.
+
+PageDoubleMap() indicates that the page is *possibly* mapped with PTEs.
+
+For anonymous pages PageDoubleMap() also indicates ->_mapcount in all
+subpages is offset up by one. This additional reference is required to
+get race-free detection of unmap of subpages when we have them mapped with
+both PMDs and PTEs.
+
+This is optimization required to lower overhead of per-subpage mapcount
+tracking. The alternative is alter ->_mapcount in all subpages on each
+map/unmap of the whole compound page.
+
+For anonymous pages, we set PG_double_map when a PMD of the page got split
+for the first time, but still have PMD mapping. The additional references
+go away with last compound_mapcount.
+
+File pages get PG_double_map set on first map of the page with PTE and
+goes away when the page gets evicted from page cache.
+
+split_huge_page internally has to distribute the refcounts in the head
+page to the tail pages before clearing all PG_head/tail bits from the page
+structures. It can be done easily for refcounts taken by page table
+entries. But we don't have enough information on how to distribute any
+additional pins (i.e. from get_user_pages). split_huge_page() fails any
+requests to split pinned huge page: it expects page count to be equal to
+sum of mapcount of all sub-pages plus one (split_huge_page caller must
+have reference for head page).
+
+split_huge_page uses migration entries to stabilize page->_refcount and
+page->_mapcount of anonymous pages. File pages just got unmapped.
+
+We safe against physical memory scanners too: the only legitimate way
+scanner can get reference to a page is get_page_unless_zero().
+
+All tail pages have zero ->_refcount until atomic_add(). This prevents the
+scanner from getting a reference to the tail page up to that point. After the
+atomic_add() we don't care about the ->_refcount value. We already known how
+many references should be uncharged from the head page.
+
+For head page get_page_unless_zero() will succeed and we don't mind. It's
+clear where reference should go after split: it will stay on head page.
+
+Note that split_huge_pmd() doesn't have any limitation on refcounting:
+pmd can be split at any point and never fails.
+
+Partial unmap and deferred_split_huge_page()
+============================================
+
+Unmapping part of THP (with munmap() or other way) is not going to free
+memory immediately. Instead, we detect that a subpage of THP is not in use
+in page_remove_rmap() and queue the THP for splitting if memory pressure
+comes. Splitting will free up unused subpages.
+
+Splitting the page right away is not an option due to locking context in
+the place where we can detect partial unmap. It's also might be
+counterproductive since in many cases partial unmap happens during exit(2) if
+a THP crosses a VMA boundary.
+
+Function deferred_split_huge_page() is used to queue page for splitting.
+The splitting itself will happen when we get memory pressure via shrinker
+interface.
diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt
deleted file mode 100644
index 4dde03b..0000000
--- a/Documentation/vm/transhuge.txt
+++ /dev/null
@@ -1,527 +0,0 @@
-= Transparent Hugepage Support =
-
-== Objective ==
-
-Performance critical computing applications dealing with large memory
-working sets are already running on top of libhugetlbfs and in turn
-hugetlbfs. Transparent Hugepage Support is an alternative means of
-using huge pages for the backing of virtual memory with huge pages
-that supports the automatic promotion and demotion of page sizes and
-without the shortcomings of hugetlbfs.
-
-Currently it only works for anonymous memory mappings and tmpfs/shmem.
-But in the future it can expand to other filesystems.
-
-The reason applications are running faster is because of two
-factors. The first factor is almost completely irrelevant and it's not
-of significant interest because it'll also have the downside of
-requiring larger clear-page copy-page in page faults which is a
-potentially negative effect. The first factor consists in taking a
-single page fault for each 2M virtual region touched by userland (so
-reducing the enter/exit kernel frequency by a 512 times factor). This
-only matters the first time the memory is accessed for the lifetime of
-a memory mapping. The second long lasting and much more important
-factor will affect all subsequent accesses to the memory for the whole
-runtime of the application. The second factor consist of two
-components: 1) the TLB miss will run faster (especially with
-virtualization using nested pagetables but almost always also on bare
-metal without virtualization) and 2) a single TLB entry will be
-mapping a much larger amount of virtual memory in turn reducing the
-number of TLB misses. With virtualization and nested pagetables the
-TLB can be mapped of larger size only if both KVM and the Linux guest
-are using hugepages but a significant speedup already happens if only
-one of the two is using hugepages just because of the fact the TLB
-miss is going to run faster.
-
-== Design ==
-
-- "graceful fallback": mm components which don't have transparent hugepage
-  knowledge fall back to breaking huge pmd mapping into table of ptes and,
-  if necessary, split a transparent hugepage. Therefore these components
-  can continue working on the regular pages or regular pte mappings.
-
-- if a hugepage allocation fails because of memory fragmentation,
-  regular pages should be gracefully allocated instead and mixed in
-  the same vma without any failure or significant delay and without
-  userland noticing
-
-- if some task quits and more hugepages become available (either
-  immediately in the buddy or through the VM), guest physical memory
-  backed by regular pages should be relocated on hugepages
-  automatically (with khugepaged)
-
-- it doesn't require memory reservation and in turn it uses hugepages
-  whenever possible (the only possible reservation here is kernelcore=
-  to avoid unmovable pages to fragment all the memory but such a tweak
-  is not specific to transparent hugepage support and it's a generic
-  feature that applies to all dynamic high order allocations in the
-  kernel)
-
-Transparent Hugepage Support maximizes the usefulness of free memory
-if compared to the reservation approach of hugetlbfs by allowing all
-unused memory to be used as cache or other movable (or even unmovable
-entities). It doesn't require reservation to prevent hugepage
-allocation failures to be noticeable from userland. It allows paging
-and all other advanced VM features to be available on the
-hugepages. It requires no modifications for applications to take
-advantage of it.
-
-Applications however can be further optimized to take advantage of
-this feature, like for example they've been optimized before to avoid
-a flood of mmap system calls for every malloc(4k). Optimizing userland
-is by far not mandatory and khugepaged already can take care of long
-lived page allocations even for hugepage unaware applications that
-deals with large amounts of memory.
-
-In certain cases when hugepages are enabled system wide, application
-may end up allocating more memory resources. An application may mmap a
-large region but only touch 1 byte of it, in that case a 2M page might
-be allocated instead of a 4k page for no good. This is why it's
-possible to disable hugepages system-wide and to only have them inside
-MADV_HUGEPAGE madvise regions.
-
-Embedded systems should enable hugepages only inside madvise regions
-to eliminate any risk of wasting any precious byte of memory and to
-only run faster.
-
-Applications that gets a lot of benefit from hugepages and that don't
-risk to lose memory by using hugepages, should use
-madvise(MADV_HUGEPAGE) on their critical mmapped regions.
-
-== sysfs ==
-
-Transparent Hugepage Support for anonymous memory can be entirely disabled
-(mostly for debugging purposes) or only enabled inside MADV_HUGEPAGE
-regions (to avoid the risk of consuming more memory resources) or enabled
-system wide. This can be achieved with one of:
-
-echo always >/sys/kernel/mm/transparent_hugepage/enabled
-echo madvise >/sys/kernel/mm/transparent_hugepage/enabled
-echo never >/sys/kernel/mm/transparent_hugepage/enabled
-
-It's also possible to limit defrag efforts in the VM to generate
-anonymous hugepages in case they're not immediately free to madvise
-regions or to never try to defrag memory and simply fallback to regular
-pages unless hugepages are immediately available. Clearly if we spend CPU
-time to defrag memory, we would expect to gain even more by the fact we
-use hugepages later instead of regular pages. This isn't always
-guaranteed, but it may be more likely in case the allocation is for a
-MADV_HUGEPAGE region.
-
-echo always >/sys/kernel/mm/transparent_hugepage/defrag
-echo defer >/sys/kernel/mm/transparent_hugepage/defrag
-echo defer+madvise >/sys/kernel/mm/transparent_hugepage/defrag
-echo madvise >/sys/kernel/mm/transparent_hugepage/defrag
-echo never >/sys/kernel/mm/transparent_hugepage/defrag
-
-"always" means that an application requesting THP will stall on allocation
-failure and directly reclaim pages and compact memory in an effort to
-allocate a THP immediately. This may be desirable for virtual machines
-that benefit heavily from THP use and are willing to delay the VM start
-to utilise them.
-
-"defer" means that an application will wake kswapd in the background
-to reclaim pages and wake kcompactd to compact memory so that THP is
-available in the near future. It's the responsibility of khugepaged
-to then install the THP pages later.
-
-"defer+madvise" will enter direct reclaim and compaction like "always", but
-only for regions that have used madvise(MADV_HUGEPAGE); all other regions
-will wake kswapd in the background to reclaim pages and wake kcompactd to
-compact memory so that THP is available in the near future.
-
-"madvise" will enter direct reclaim like "always" but only for regions
-that are have used madvise(MADV_HUGEPAGE). This is the default behaviour.
-
-"never" should be self-explanatory.
-
-By default kernel tries to use huge zero page on read page fault to
-anonymous mapping. It's possible to disable huge zero page by writing 0
-or enable it back by writing 1:
-
-echo 0 >/sys/kernel/mm/transparent_hugepage/use_zero_page
-echo 1 >/sys/kernel/mm/transparent_hugepage/use_zero_page
-
-Some userspace (such as a test program, or an optimized memory allocation
-library) may want to know the size (in bytes) of a transparent hugepage:
-
-cat /sys/kernel/mm/transparent_hugepage/hpage_pmd_size
-
-khugepaged will be automatically started when
-transparent_hugepage/enabled is set to "always" or "madvise, and it'll
-be automatically shutdown if it's set to "never".
-
-khugepaged runs usually at low frequency so while one may not want to
-invoke defrag algorithms synchronously during the page faults, it
-should be worth invoking defrag at least in khugepaged. However it's
-also possible to disable defrag in khugepaged by writing 0 or enable
-defrag in khugepaged by writing 1:
-
-echo 0 >/sys/kernel/mm/transparent_hugepage/khugepaged/defrag
-echo 1 >/sys/kernel/mm/transparent_hugepage/khugepaged/defrag
-
-You can also control how many pages khugepaged should scan at each
-pass:
-
-/sys/kernel/mm/transparent_hugepage/khugepaged/pages_to_scan
-
-and how many milliseconds to wait in khugepaged between each pass (you
-can set this to 0 to run khugepaged at 100% utilization of one core):
-
-/sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs
-
-and how many milliseconds to wait in khugepaged if there's an hugepage
-allocation failure to throttle the next allocation attempt.
-
-/sys/kernel/mm/transparent_hugepage/khugepaged/alloc_sleep_millisecs
-
-The khugepaged progress can be seen in the number of pages collapsed:
-
-/sys/kernel/mm/transparent_hugepage/khugepaged/pages_collapsed
-
-for each pass:
-
-/sys/kernel/mm/transparent_hugepage/khugepaged/full_scans
-
-max_ptes_none specifies how many extra small pages (that are
-not already mapped) can be allocated when collapsing a group
-of small pages into one large page.
-
-/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none
-
-A higher value leads to use additional memory for programs.
-A lower value leads to gain less thp performance. Value of
-max_ptes_none can waste cpu time very little, you can
-ignore it.
-
-max_ptes_swap specifies how many pages can be brought in from
-swap when collapsing a group of pages into a transparent huge page.
-
-/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_swap
-
-A higher value can cause excessive swap IO and waste
-memory. A lower value can prevent THPs from being
-collapsed, resulting fewer pages being collapsed into
-THPs, and lower memory access performance.
-
-== Boot parameter ==
-
-You can change the sysfs boot time defaults of Transparent Hugepage
-Support by passing the parameter "transparent_hugepage=always" or
-"transparent_hugepage=madvise" or "transparent_hugepage=never"
-(without "") to the kernel command line.
-
-== Hugepages in tmpfs/shmem ==
-
-You can control hugepage allocation policy in tmpfs with mount option
-"huge=". It can have following values:
-
-  - "always":
-    Attempt to allocate huge pages every time we need a new page;
-
-  - "never":
-    Do not allocate huge pages;
-
-  - "within_size":
-    Only allocate huge page if it will be fully within i_size.
-    Also respect fadvise()/madvise() hints;
-
-  - "advise:
-    Only allocate huge pages if requested with fadvise()/madvise();
-
-The default policy is "never".
-
-"mount -o remount,huge= /mountpoint" works fine after mount: remounting
-huge=never will not attempt to break up huge pages at all, just stop more
-from being allocated.
-
-There's also sysfs knob to control hugepage allocation policy for internal
-shmem mount: /sys/kernel/mm/transparent_hugepage/shmem_enabled. The mount
-is used for SysV SHM, memfds, shared anonymous mmaps (of /dev/zero or
-MAP_ANONYMOUS), GPU drivers' DRM objects, Ashmem.
-
-In addition to policies listed above, shmem_enabled allows two further
-values:
-
-  - "deny":
-    For use in emergencies, to force the huge option off from
-    all mounts;
-  - "force":
-    Force the huge option on for all - very useful for testing;
-
-== Need of application restart ==
-
-The transparent_hugepage/enabled values and tmpfs mount option only affect
-future behavior. So to make them effective you need to restart any
-application that could have been using hugepages. This also applies to the
-regions registered in khugepaged.
-
-== Monitoring usage ==
-
-The number of anonymous transparent huge pages currently used by the
-system is available by reading the AnonHugePages field in /proc/meminfo.
-To identify what applications are using anonymous transparent huge pages,
-it is necessary to read /proc/PID/smaps and count the AnonHugePages fields
-for each mapping.
-
-The number of file transparent huge pages mapped to userspace is available
-by reading ShmemPmdMapped and ShmemHugePages fields in /proc/meminfo.
-To identify what applications are mapping file transparent huge pages, it
-is necessary to read /proc/PID/smaps and count the FileHugeMapped fields
-for each mapping.
-
-Note that reading the smaps file is expensive and reading it
-frequently will incur overhead.
-
-There are a number of counters in /proc/vmstat that may be used to
-monitor how successfully the system is providing huge pages for use.
-
-thp_fault_alloc is incremented every time a huge page is successfully
-	allocated to handle a page fault. This applies to both the
-	first time a page is faulted and for COW faults.
-
-thp_collapse_alloc is incremented by khugepaged when it has found
-	a range of pages to collapse into one huge page and has
-	successfully allocated a new huge page to store the data.
-
-thp_fault_fallback is incremented if a page fault fails to allocate
-	a huge page and instead falls back to using small pages.
-
-thp_collapse_alloc_failed is incremented if khugepaged found a range
-	of pages that should be collapsed into one huge page but failed
-	the allocation.
-
-thp_file_alloc is incremented every time a file huge page is successfully
-	allocated.
-
-thp_file_mapped is incremented every time a file huge page is mapped into
-	user address space.
-
-thp_split_page is incremented every time a huge page is split into base
-	pages. This can happen for a variety of reasons but a common
-	reason is that a huge page is old and is being reclaimed.
-	This action implies splitting all PMD the page mapped with.
-
-thp_split_page_failed is incremented if kernel fails to split huge
-	page. This can happen if the page was pinned by somebody.
-
-thp_deferred_split_page is incremented when a huge page is put onto split
-	queue. This happens when a huge page is partially unmapped and
-	splitting it would free up some memory. Pages on split queue are
-	going to be split under memory pressure.
-
-thp_split_pmd is incremented every time a PMD split into table of PTEs.
-	This can happen, for instance, when application calls mprotect() or
-	munmap() on part of huge page. It doesn't split huge page, only
-	page table entry.
-
-thp_zero_page_alloc is incremented every time a huge zero page is
-	successfully allocated. It includes allocations which where
-	dropped due race with other allocation. Note, it doesn't count
-	every map of the huge zero page, only its allocation.
-
-thp_zero_page_alloc_failed is incremented if kernel fails to allocate
-	huge zero page and falls back to using small pages.
-
-As the system ages, allocating huge pages may be expensive as the
-system uses memory compaction to copy data around memory to free a
-huge page for use. There are some counters in /proc/vmstat to help
-monitor this overhead.
-
-compact_stall is incremented every time a process stalls to run
-	memory compaction so that a huge page is free for use.
-
-compact_success is incremented if the system compacted memory and
-	freed a huge page for use.
-
-compact_fail is incremented if the system tries to compact memory
-	but failed.
-
-compact_pages_moved is incremented each time a page is moved. If
-	this value is increasing rapidly, it implies that the system
-	is copying a lot of data to satisfy the huge page allocation.
-	It is possible that the cost of copying exceeds any savings
-	from reduced TLB misses.
-
-compact_pagemigrate_failed is incremented when the underlying mechanism
-	for moving a page failed.
-
-compact_blocks_moved is incremented each time memory compaction examines
-	a huge page aligned range of pages.
-
-It is possible to establish how long the stalls were using the function
-tracer to record how long was spent in __alloc_pages_nodemask and
-using the mm_page_alloc tracepoint to identify which allocations were
-for huge pages.
-
-== get_user_pages and follow_page ==
-
-get_user_pages and follow_page if run on a hugepage, will return the
-head or tail pages as usual (exactly as they would do on
-hugetlbfs). Most gup users will only care about the actual physical
-address of the page and its temporary pinning to release after the I/O
-is complete, so they won't ever notice the fact the page is huge. But
-if any driver is going to mangle over the page structure of the tail
-page (like for checking page->mapping or other bits that are relevant
-for the head page and not the tail page), it should be updated to jump
-to check head page instead. Taking reference on any head/tail page would
-prevent page from being split by anyone.
-
-NOTE: these aren't new constraints to the GUP API, and they match the
-same constrains that applies to hugetlbfs too, so any driver capable
-of handling GUP on hugetlbfs will also work fine on transparent
-hugepage backed mappings.
-
-In case you can't handle compound pages if they're returned by
-follow_page, the FOLL_SPLIT bit can be specified as parameter to
-follow_page, so that it will split the hugepages before returning
-them. Migration for example passes FOLL_SPLIT as parameter to
-follow_page because it's not hugepage aware and in fact it can't work
-at all on hugetlbfs (but it instead works fine on transparent
-hugepages thanks to FOLL_SPLIT). migration simply can't deal with
-hugepages being returned (as it's not only checking the pfn of the
-page and pinning it during the copy but it pretends to migrate the
-memory in regular page sizes and with regular pte/pmd mappings).
-
-== Optimizing the applications ==
-
-To be guaranteed that the kernel will map a 2M page immediately in any
-memory region, the mmap region has to be hugepage naturally
-aligned. posix_memalign() can provide that guarantee.
-
-== Hugetlbfs ==
-
-You can use hugetlbfs on a kernel that has transparent hugepage
-support enabled just fine as always. No difference can be noted in
-hugetlbfs other than there will be less overall fragmentation. All
-usual features belonging to hugetlbfs are preserved and
-unaffected. libhugetlbfs will also work fine as usual.
-
-== Graceful fallback ==
-
-Code walking pagetables but unaware about huge pmds can simply call
-split_huge_pmd(vma, pmd, addr) where the pmd is the one returned by
-pmd_offset. It's trivial to make the code transparent hugepage aware
-by just grepping for "pmd_offset" and adding split_huge_pmd where
-missing after pmd_offset returns the pmd. Thanks to the graceful
-fallback design, with a one liner change, you can avoid to write
-hundred if not thousand of lines of complex code to make your code
-hugepage aware.
-
-If you're not walking pagetables but you run into a physical hugepage
-but you can't handle it natively in your code, you can split it by
-calling split_huge_page(page). This is what the Linux VM does before
-it tries to swapout the hugepage for example. split_huge_page() can fail
-if the page is pinned and you must handle this correctly.
-
-Example to make mremap.c transparent hugepage aware with a one liner
-change:
-
-diff --git a/mm/mremap.c b/mm/mremap.c
---- a/mm/mremap.c
-+++ b/mm/mremap.c
-@@ -41,6 +41,7 @@ static pmd_t *get_old_pmd(struct mm_stru
-		return NULL;
-
-	pmd = pmd_offset(pud, addr);
-+	split_huge_pmd(vma, pmd, addr);
-	if (pmd_none_or_clear_bad(pmd))
-		return NULL;
-
-== Locking in hugepage aware code ==
-
-We want as much code as possible hugepage aware, as calling
-split_huge_page() or split_huge_pmd() has a cost.
-
-To make pagetable walks huge pmd aware, all you need to do is to call
-pmd_trans_huge() on the pmd returned by pmd_offset. You must hold the
-mmap_sem in read (or write) mode to be sure an huge pmd cannot be
-created from under you by khugepaged (khugepaged collapse_huge_page
-takes the mmap_sem in write mode in addition to the anon_vma lock). If
-pmd_trans_huge returns false, you just fallback in the old code
-paths. If instead pmd_trans_huge returns true, you have to take the
-page table lock (pmd_lock()) and re-run pmd_trans_huge. Taking the
-page table lock will prevent the huge pmd to be converted into a
-regular pmd from under you (split_huge_pmd can run in parallel to the
-pagetable walk). If the second pmd_trans_huge returns false, you
-should just drop the page table lock and fallback to the old code as
-before. Otherwise you can proceed to process the huge pmd and the
-hugepage natively. Once finished you can drop the page table lock.
-
-== Refcounts and transparent huge pages ==
-
-Refcounting on THP is mostly consistent with refcounting on other compound
-pages:
-
-  - get_page()/put_page() and GUP operate in head page's ->_refcount.
-
-  - ->_refcount in tail pages is always zero: get_page_unless_zero() never
-    succeed on tail pages.
-
-  - map/unmap of the pages with PTE entry increment/decrement ->_mapcount
-    on relevant sub-page of the compound page.
-
-  - map/unmap of the whole compound page accounted in compound_mapcount
-    (stored in first tail page). For file huge pages, we also increment
-    ->_mapcount of all sub-pages in order to have race-free detection of
-    last unmap of subpages.
-
-PageDoubleMap() indicates that the page is *possibly* mapped with PTEs.
-
-For anonymous pages PageDoubleMap() also indicates ->_mapcount in all
-subpages is offset up by one. This additional reference is required to
-get race-free detection of unmap of subpages when we have them mapped with
-both PMDs and PTEs.
-
-This is optimization required to lower overhead of per-subpage mapcount
-tracking. The alternative is alter ->_mapcount in all subpages on each
-map/unmap of the whole compound page.
-
-For anonymous pages, we set PG_double_map when a PMD of the page got split
-for the first time, but still have PMD mapping. The additional references
-go away with last compound_mapcount.
-
-File pages get PG_double_map set on first map of the page with PTE and
-goes away when the page gets evicted from page cache.
-
-split_huge_page internally has to distribute the refcounts in the head
-page to the tail pages before clearing all PG_head/tail bits from the page
-structures. It can be done easily for refcounts taken by page table
-entries. But we don't have enough information on how to distribute any
-additional pins (i.e. from get_user_pages). split_huge_page() fails any
-requests to split pinned huge page: it expects page count to be equal to
-sum of mapcount of all sub-pages plus one (split_huge_page caller must
-have reference for head page).
-
-split_huge_page uses migration entries to stabilize page->_refcount and
-page->_mapcount of anonymous pages. File pages just got unmapped.
-
-We safe against physical memory scanners too: the only legitimate way
-scanner can get reference to a page is get_page_unless_zero().
-
-All tail pages have zero ->_refcount until atomic_add(). This prevents the
-scanner from getting a reference to the tail page up to that point. After the
-atomic_add() we don't care about the ->_refcount value. We already known how
-many references should be uncharged from the head page.
-
-For head page get_page_unless_zero() will succeed and we don't mind. It's
-clear where reference should go after split: it will stay on head page.
-
-Note that split_huge_pmd() doesn't have any limitation on refcounting:
-pmd can be split at any point and never fails.
-
-== Partial unmap and deferred_split_huge_page() ==
-
-Unmapping part of THP (with munmap() or other way) is not going to free
-memory immediately. Instead, we detect that a subpage of THP is not in use
-in page_remove_rmap() and queue the THP for splitting if memory pressure
-comes. Splitting will free up unused subpages.
-
-Splitting the page right away is not an option due to locking context in
-the place where we can detect partial unmap. It's also might be
-counterproductive since in many cases partial unmap happens during exit(2) if
-a THP crosses a VMA boundary.
-
-Function deferred_split_huge_page() is used to queue page for splitting.
-The splitting itself will happen when we get memory pressure via shrinker
-interface.
diff --git a/Documentation/vm/unevictable-lru.txt b/Documentation/vm/unevictable-lru.rst
index e147185..fdd84cb 100644
--- a/Documentation/vm/unevictable-lru.txt
+++ b/Documentation/vm/unevictable-lru.rst
@@ -1,37 +1,13 @@
-			==============================
-			UNEVICTABLE LRU INFRASTRUCTURE
-			==============================
-
-========
-CONTENTS
-========
-
- (*) The Unevictable LRU
-
-     - The unevictable page list.
-     - Memory control group interaction.
-     - Marking address spaces unevictable.
-     - Detecting Unevictable Pages.
-     - vmscan's handling of unevictable pages.
-
- (*) mlock()'d pages.
-
-     - History.
-     - Basic management.
-     - mlock()/mlockall() system call handling.
-     - Filtering special vmas.
-     - munlock()/munlockall() system call handling.
-     - Migrating mlocked pages.
-     - Compacting mlocked pages.
-     - mmap(MAP_LOCKED) system call handling.
-     - munmap()/exit()/exec() system call handling.
-     - try_to_unmap().
-     - try_to_munlock() reverse map scan.
-     - Page reclaim in shrink_*_list().
+.. _unevictable_lru:
 
+==============================
+Unevictable LRU Infrastructure
+==============================
 
-============
-INTRODUCTION
+.. contents:: :local:
+
+
+Introduction
 ============
 
 This document describes the Linux memory manager's "Unevictable LRU"
@@ -46,8 +22,8 @@ details - the "what does it do?" - by reading the code.  One hopes that the
 descriptions below add value by provide the answer to "why does it do that?".
 
 
-===================
-THE UNEVICTABLE LRU
+
+The Unevictable LRU
 ===================
 
 The Unevictable LRU facility adds an additional LRU list to track unevictable
@@ -66,17 +42,17 @@ completely unresponsive.
 
 The unevictable list addresses the following classes of unevictable pages:
 
- (*) Those owned by ramfs.
+ * Those owned by ramfs.
 
- (*) Those mapped into SHM_LOCK'd shared memory regions.
+ * Those mapped into SHM_LOCK'd shared memory regions.
 
- (*) Those mapped into VM_LOCKED [mlock()ed] VMAs.
+ * Those mapped into VM_LOCKED [mlock()ed] VMAs.
 
 The infrastructure may also be able to handle other conditions that make pages
 unevictable, either by definition or by circumstance, in the future.
 
 
-THE UNEVICTABLE PAGE LIST
+The Unevictable Page List
 -------------------------
 
 The Unevictable LRU infrastructure consists of an additional, per-zone, LRU list
@@ -118,7 +94,7 @@ the unevictable list when one task has the page isolated from the LRU and other
 tasks are changing the "evictability" state of the page.
 
 
-MEMORY CONTROL GROUP INTERACTION
+Memory Control Group Interaction
 --------------------------------
 
 The unevictable LRU facility interacts with the memory control group [aka
@@ -144,7 +120,9 @@ effects:
      the control group to thrash or to OOM-kill tasks.
 
 
-MARKING ADDRESS SPACES UNEVICTABLE
+.. _mark_addr_space_unevict:
+
+Marking Address Spaces Unevictable
 ----------------------------------
 
 For facilities such as ramfs none of the pages attached to the address space
@@ -152,15 +130,15 @@ may be evicted.  To prevent eviction of any such pages, the AS_UNEVICTABLE
 address space flag is provided, and this can be manipulated by a filesystem
 using a number of wrapper functions:
 
- (*) void mapping_set_unevictable(struct address_space *mapping);
+ * ``void mapping_set_unevictable(struct address_space *mapping);``
 
 	Mark the address space as being completely unevictable.
 
- (*) void mapping_clear_unevictable(struct address_space *mapping);
+ * ``void mapping_clear_unevictable(struct address_space *mapping);``
 
 	Mark the address space as being evictable.
 
- (*) int mapping_unevictable(struct address_space *mapping);
+ * ``int mapping_unevictable(struct address_space *mapping);``
 
 	Query the address space, and return true if it is completely
 	unevictable.
@@ -177,12 +155,13 @@ These are currently used in two places in the kernel:
      ensure they're in memory.
 
 
-DETECTING UNEVICTABLE PAGES
+Detecting Unevictable Pages
 ---------------------------
 
 The function page_evictable() in vmscan.c determines whether a page is
-evictable or not using the query function outlined above [see section "Marking
-address spaces unevictable"] to check the AS_UNEVICTABLE flag.
+evictable or not using the query function outlined above [see section
+:ref:`Marking address spaces unevictable <mark_addr_space_unevict>`]
+to check the AS_UNEVICTABLE flag.
 
 For address spaces that are so marked after being populated (as SHM regions
 might be), the lock action (eg: SHM_LOCK) can be lazy, and need not populate
@@ -202,7 +181,7 @@ flag, PG_mlocked (as wrapped by PageMlocked()), which is set when a page is
 faulted into a VM_LOCKED vma, or found in a vma being VM_LOCKED.
 
 
-VMSCAN'S HANDLING OF UNEVICTABLE PAGES
+Vmscan's Handling of Unevictable Pages
 --------------------------------------
 
 If unevictable pages are culled in the fault path, or moved to the unevictable
@@ -233,8 +212,7 @@ extra evictabilty checks should not occur in the majority of calls to
 putback_lru_page().
 
 
-=============
-MLOCKED PAGES
+MLOCKED Pages
 =============
 
 The unevictable page list is also useful for mlock(), in addition to ramfs and
@@ -242,7 +220,7 @@ SYSV SHM.  Note that mlock() is only available in CONFIG_MMU=y situations; in
 NOMMU situations, all mappings are effectively mlocked.
 
 
-HISTORY
+History
 -------
 
 The "Unevictable mlocked Pages" infrastructure is based on work originally
@@ -263,7 +241,7 @@ replaced by walking the reverse map to determine whether any VM_LOCKED VMAs
 mapped the page.  More on this below.
 
 
-BASIC MANAGEMENT
+Basic Management
 ----------------
 
 mlocked pages - pages mapped into a VM_LOCKED VMA - are a class of unevictable
@@ -304,10 +282,10 @@ mlocked pages become unlocked and rescued from the unevictable list when:
  (4) before a page is COW'd in a VM_LOCKED VMA.
 
 
-mlock()/mlockall() SYSTEM CALL HANDLING
+mlock()/mlockall() System Call Handling
 ---------------------------------------
 
-Both [do_]mlock() and [do_]mlockall() system call handlers call mlock_fixup()
+Both [do\_]mlock() and [do\_]mlockall() system call handlers call mlock_fixup()
 for each VMA in the range specified by the call.  In the case of mlockall(),
 this is the entire active address space of the task.  Note that mlock_fixup()
 is used for both mlocking and munlocking a range of memory.  A call to mlock()
@@ -351,7 +329,7 @@ mlock_vma_page() is unable to isolate the page from the LRU, vmscan will handle
 it later if and when it attempts to reclaim the page.
 
 
-FILTERING SPECIAL VMAS
+Filtering Special VMAs
 ----------------------
 
 mlock_fixup() filters several classes of "special" VMAs:
@@ -379,8 +357,9 @@ VM_LOCKED flag.  Therefore, we won't have to deal with them later during
 munlock(), munmap() or task exit.  Neither does mlock_fixup() account these
 VMAs against the task's "locked_vm".
 
+.. _munlock_munlockall_handling:
 
-munlock()/munlockall() SYSTEM CALL HANDLING
+munlock()/munlockall() System Call Handling
 -------------------------------------------
 
 The munlock() and munlockall() system calls are handled by the same functions -
@@ -426,7 +405,7 @@ This is fine, because we'll catch it later if and if vmscan tries to reclaim
 the page.  This should be relatively rare.
 
 
-MIGRATING MLOCKED PAGES
+Migrating MLOCKED Pages
 -----------------------
 
 A page that is being migrated has been isolated from the LRU lists and is held
@@ -451,7 +430,7 @@ list because of a race between munlock and migration, page migration uses the
 putback_lru_page() function to add migrated pages back to the LRU.
 
 
-COMPACTING MLOCKED PAGES
+Compacting MLOCKED Pages
 ------------------------
 
 The unevictable LRU can be scanned for compactable regions and the default
@@ -461,7 +440,7 @@ unevictable LRU is enabled, the work of compaction is mostly handled by
 the page migration code and the same work flow as described in MIGRATING
 MLOCKED PAGES will apply.
 
-MLOCKING TRANSPARENT HUGE PAGES
+MLOCKING Transparent Huge Pages
 -------------------------------
 
 A transparent huge page is represented by a single entry on an LRU list.
@@ -483,7 +462,7 @@ to unevictable LRU and the rest can be reclaimed.
 
 See also comment in follow_trans_huge_pmd().
 
-mmap(MAP_LOCKED) SYSTEM CALL HANDLING
+mmap(MAP_LOCKED) System Call Handling
 -------------------------------------
 
 In addition the mlock()/mlockall() system calls, an application can request
@@ -514,7 +493,7 @@ memory range accounted as locked_vm, as the protections could be changed later
 and pages allocated into that region.
 
 
-munmap()/exit()/exec() SYSTEM CALL HANDLING
+munmap()/exit()/exec() System Call Handling
 -------------------------------------------
 
 When unmapping an mlocked region of memory, whether by an explicit call to
@@ -568,16 +547,18 @@ munlock or munmap system calls, mm teardown (munlock_vma_pages_all), reclaim,
 holepunching, and truncation of file pages and their anonymous COWed pages.
 
 
-try_to_munlock() REVERSE MAP SCAN
+try_to_munlock() Reverse Map Scan
 ---------------------------------
 
- [!] TODO/FIXME: a better name might be page_mlocked() - analogous to the
-     page_referenced() reverse map walker.
+.. warning::
+   [!] TODO/FIXME: a better name might be page_mlocked() - analogous to the
+   page_referenced() reverse map walker.
 
-When munlock_vma_page() [see section "munlock()/munlockall() System Call
-Handling" above] tries to munlock a page, it needs to determine whether or not
-the page is mapped by any VM_LOCKED VMA without actually attempting to unmap
-all PTEs from the page.  For this purpose, the unevictable/mlock infrastructure
+When munlock_vma_page() [see section :ref:`munlock()/munlockall() System Call
+Handling <munlock_munlockall_handling>` above] tries to munlock a
+page, it needs to determine whether or not the page is mapped by any
+VM_LOCKED VMA without actually attempting to unmap all PTEs from the
+page.  For this purpose, the unevictable/mlock infrastructure
 introduced a variant of try_to_unmap() called try_to_munlock().
 
 try_to_munlock() calls the same functions as try_to_unmap() for anonymous and
@@ -595,7 +576,7 @@ large region or tearing down a large address space that has been mlocked via
 mlockall(), overall this is a fairly rare event.
 
 
-PAGE RECLAIM IN shrink_*_list()
+Page Reclaim in shrink_*_list()
 -------------------------------
 
 shrink_active_list() culls any obviously unevictable pages - i.e.
diff --git a/Documentation/vm/z3fold.txt b/Documentation/vm/z3fold.rst
index 38e4dac..224e3c6 100644
--- a/Documentation/vm/z3fold.txt
+++ b/Documentation/vm/z3fold.rst
@@ -1,5 +1,8 @@
+.. _z3fold:
+
+======
 z3fold
-------
+======
 
 z3fold is a special purpose allocator for storing compressed pages.
 It is designed to store up to three compressed pages per physical page.
@@ -7,6 +10,7 @@ It is a zbud derivative which allows for higher compression
 ratio keeping the simplicity and determinism of its predecessor.
 
 The main differences between z3fold and zbud are:
+
 * unlike zbud, z3fold allows for up to PAGE_SIZE allocations
 * z3fold can hold up to 3 compressed pages in its page
 * z3fold doesn't export any API itself and is thus intended to be used
diff --git a/Documentation/vm/zsmalloc.txt b/Documentation/vm/zsmalloc.rst
index 64ed63c..6e79893 100644
--- a/Documentation/vm/zsmalloc.txt
+++ b/Documentation/vm/zsmalloc.rst
@@ -1,5 +1,8 @@
+.. _zsmalloc:
+
+========
 zsmalloc
---------
+========
 
 This allocator is designed for use with zram. Thus, the allocator is
 supposed to work well under low memory conditions. In particular, it
@@ -31,40 +34,49 @@ be mapped using zs_map_object() to get a usable pointer and subsequently
 unmapped using zs_unmap_object().
 
 stat
-----
+====
 
 With CONFIG_ZSMALLOC_STAT, we could see zsmalloc internal information via
-/sys/kernel/debug/zsmalloc/<user name>. Here is a sample of stat output:
+``/sys/kernel/debug/zsmalloc/<user name>``. Here is a sample of stat output::
 
-# cat /sys/kernel/debug/zsmalloc/zram0/classes
+ # cat /sys/kernel/debug/zsmalloc/zram0/classes
 
  class  size almost_full almost_empty obj_allocated   obj_used pages_used pages_per_zspage
-    ..
-    ..
+    ...
+    ...
      9   176           0            1           186        129          8                4
     10   192           1            0          2880       2872        135                3
     11   208           0            1           819        795         42                2
     12   224           0            1           219        159         12                4
-    ..
-    ..
+    ...
+    ...
+
 
+class
+	index
+size
+	object size zspage stores
+almost_empty
+	the number of ZS_ALMOST_EMPTY zspages(see below)
+almost_full
+	the number of ZS_ALMOST_FULL zspages(see below)
+obj_allocated
+	the number of objects allocated
+obj_used
+	the number of objects allocated to the user
+pages_used
+	the number of pages allocated for the class
+pages_per_zspage
+	the number of 0-order pages to make a zspage
 
-class: index
-size: object size zspage stores
-almost_empty: the number of ZS_ALMOST_EMPTY zspages(see below)
-almost_full: the number of ZS_ALMOST_FULL zspages(see below)
-obj_allocated: the number of objects allocated
-obj_used: the number of objects allocated to the user
-pages_used: the number of pages allocated for the class
-pages_per_zspage: the number of 0-order pages to make a zspage
+We assign a zspage to ZS_ALMOST_EMPTY fullness group when n <= N / f, where
 
-We assign a zspage to ZS_ALMOST_EMPTY fullness group when:
-      n <= N / f, where
-n = number of allocated objects
-N = total number of objects zspage can store
-f = fullness_threshold_frac(ie, 4 at the moment)
+* n = number of allocated objects
+* N = total number of objects zspage can store
+* f = fullness_threshold_frac(ie, 4 at the moment)
 
 Similarly, we assign zspage to:
-      ZS_ALMOST_FULL  when n > N / f
-      ZS_EMPTY        when n == 0
-      ZS_FULL         when n == N
+
+* ZS_ALMOST_FULL  when n > N / f
+* ZS_EMPTY        when n == 0
+* ZS_FULL         when n == N
diff --git a/Documentation/vm/zswap.txt b/Documentation/vm/zswap.rst
index 0b3a114..1444ecd 100644
--- a/Documentation/vm/zswap.txt
+++ b/Documentation/vm/zswap.rst
@@ -1,4 +1,11 @@
-Overview:
+.. _zswap:
+
+=====
+zswap
+=====
+
+Overview
+========
 
 Zswap is a lightweight compressed cache for swap pages. It takes pages that are
 in the process of being swapped out and attempts to compress them into a
@@ -7,32 +14,34 @@ for potentially reduced swap I/O.  This trade-off can also result in a
 significant performance improvement if reads from the compressed cache are
 faster than reads from a swap device.
 
-NOTE: Zswap is a new feature as of v3.11 and interacts heavily with memory
-reclaim.  This interaction has not been fully explored on the large set of
-potential configurations and workloads that exist.  For this reason, zswap
-is a work in progress and should be considered experimental.
+.. note::
+   Zswap is a new feature as of v3.11 and interacts heavily with memory
+   reclaim.  This interaction has not been fully explored on the large set of
+   potential configurations and workloads that exist.  For this reason, zswap
+   is a work in progress and should be considered experimental.
+
+   Some potential benefits:
 
-Some potential benefits:
 * Desktop/laptop users with limited RAM capacities can mitigate the
-    performance impact of swapping.
+  performance impact of swapping.
 * Overcommitted guests that share a common I/O resource can
-    dramatically reduce their swap I/O pressure, avoiding heavy handed I/O
-    throttling by the hypervisor. This allows more work to get done with less
-    impact to the guest workload and guests sharing the I/O subsystem
+  dramatically reduce their swap I/O pressure, avoiding heavy handed I/O
+  throttling by the hypervisor. This allows more work to get done with less
+  impact to the guest workload and guests sharing the I/O subsystem
 * Users with SSDs as swap devices can extend the life of the device by
-    drastically reducing life-shortening writes.
+  drastically reducing life-shortening writes.
 
 Zswap evicts pages from compressed cache on an LRU basis to the backing swap
 device when the compressed pool reaches its size limit.  This requirement had
 been identified in prior community discussions.
 
 Zswap is disabled by default but can be enabled at boot time by setting
-the "enabled" attribute to 1 at boot time. ie: zswap.enabled=1.  Zswap
+the ``enabled`` attribute to 1 at boot time. ie: ``zswap.enabled=1``.  Zswap
 can also be enabled and disabled at runtime using the sysfs interface.
 An example command to enable zswap at runtime, assuming sysfs is mounted
-at /sys, is:
+at ``/sys``, is::
 
-echo 1 > /sys/module/zswap/parameters/enabled
+	echo 1 > /sys/module/zswap/parameters/enabled
 
 When zswap is disabled at runtime it will stop storing pages that are
 being swapped out.  However, it will _not_ immediately write out or fault
@@ -43,7 +52,8 @@ pages out of the compressed pool, a swapoff on the swap device(s) will
 fault back into memory all swapped out pages, including those in the
 compressed pool.
 
-Design:
+Design
+======
 
 Zswap receives pages for compression through the Frontswap API and is able to
 evict pages from its own compressed pool on an LRU basis and write them back to
@@ -53,12 +63,12 @@ Zswap makes use of zpool for the managing the compressed memory pool.  Each
 allocation in zpool is not directly accessible by address.  Rather, a handle is
 returned by the allocation routine and that handle must be mapped before being
 accessed.  The compressed memory pool grows on demand and shrinks as compressed
-pages are freed.  The pool is not preallocated.  By default, a zpool of type
-zbud is created, but it can be selected at boot time by setting the "zpool"
-attribute, e.g. zswap.zpool=zbud.  It can also be changed at runtime using the
-sysfs "zpool" attribute, e.g.
+pages are freed.  The pool is not preallocated.  By default, a zpool
+of type zbud is created, but it can be selected at boot time by
+setting the ``zpool`` attribute, e.g. ``zswap.zpool=zbud``. It can
+also be changed at runtime using the sysfs ``zpool`` attribute, e.g.::
 
-echo zbud > /sys/module/zswap/parameters/zpool
+	echo zbud > /sys/module/zswap/parameters/zpool
 
 The zbud type zpool allocates exactly 1 page to store 2 compressed pages, which
 means the compression ratio will always be 2:1 or worse (because of half-full
@@ -83,14 +93,16 @@ via frontswap, to free the compressed entry.
 
 Zswap seeks to be simple in its policies.  Sysfs attributes allow for one user
 controlled policy:
+
 * max_pool_percent - The maximum percentage of memory that the compressed
-    pool can occupy.
+  pool can occupy.
 
-The default compressor is lzo, but it can be selected at boot time by setting
-the “compressor” attribute, e.g. zswap.compressor=lzo.  It can also be changed
-at runtime using the sysfs "compressor" attribute, e.g.
+The default compressor is lzo, but it can be selected at boot time by
+setting the ``compressor`` attribute, e.g. ``zswap.compressor=lzo``.
+It can also be changed at runtime using the sysfs "compressor"
+attribute, e.g.::
 
-echo lzo > /sys/module/zswap/parameters/compressor
+	echo lzo > /sys/module/zswap/parameters/compressor
 
 When the zpool and/or compressor parameter is changed at runtime, any existing
 compressed pages are not modified; they are left in their own zpool.  When a
@@ -106,11 +118,12 @@ compressed length of the page is set to zero and the pattern or same-filled
 value is stored.
 
 Same-value filled pages identification feature is enabled by default and can be
-disabled at boot time by setting the "same_filled_pages_enabled" attribute to 0,
-e.g. zswap.same_filled_pages_enabled=0. It can also be enabled and disabled at
-runtime using the sysfs "same_filled_pages_enabled" attribute, e.g.
+disabled at boot time by setting the ``same_filled_pages_enabled`` attribute
+to 0, e.g. ``zswap.same_filled_pages_enabled=0``. It can also be enabled and
+disabled at runtime using the sysfs ``same_filled_pages_enabled``
+attribute, e.g.::
 
-echo 1 > /sys/module/zswap/parameters/same_filled_pages_enabled
+	echo 1 > /sys/module/zswap/parameters/same_filled_pages_enabled
 
 When zswap same-filled page identification is disabled at runtime, it will stop
 checking for the same-value filled pages during store operation. However, the
diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt
index b297c48..8d109ef 100644
--- a/Documentation/x86/x86_64/boot-options.txt
+++ b/Documentation/x86/x86_64/boot-options.txt
@@ -187,9 +187,9 @@ PCI
 
 IOMMU (input/output memory management unit)
 
- Currently four x86-64 PCI-DMA mapping implementations exist:
+ Multiple x86-64 PCI-DMA mapping implementations exist, for example:
 
-   1. <arch/x86_64/kernel/pci-nommu.c>: use no hardware/software IOMMU at all
+   1. <lib/dma-direct.c>: use no hardware/software IOMMU at all
       (e.g. because you have < 3 GB memory).
       Kernel boot message: "PCI-DMA: Disabling IOMMU"
 
@@ -208,7 +208,7 @@ IOMMU (input/output memory management unit)
       Kernel boot message: "PCI-DMA: Using Calgary IOMMU"
 
  iommu=[<size>][,noagp][,off][,force][,noforce][,leak[=<nr_of_leak_pages>]
-	[,memaper[=<order>]][,merge][,forcesac][,fullflush][,nomerge]
+	[,memaper[=<order>]][,merge][,fullflush][,nomerge]
 	[,noaperture][,calgary]
 
   General iommu options:
@@ -235,14 +235,7 @@ IOMMU (input/output memory management unit)
                        (experimental).
     nomerge            Don't do scatter-gather (SG) merging.
     noaperture         Ask the IOMMU not to touch the aperture for AGP.
-    forcesac           Force single-address cycle (SAC) mode for masks <40bits
-                       (experimental).
     noagp              Don't initialize the AGP driver and use full aperture.
-    allowdac           Allow double-address cycle (DAC) mode, i.e. DMA >4GB.
-                       DAC is used with 32-bit PCI to push a 64-bit address in
-                       two cycles. When off all DMA over >4GB is forced through
-                       an IOMMU or software bounce buffering.
-    nodac              Forbid DAC mode, i.e. DMA >4GB.
     panic              Always panic when IOMMU overflows.
     calgary            Use the Calgary IOMMU if it is available
 
diff --git a/LICENSES/exceptions/Linux-syscall-note b/LICENSES/exceptions/Linux-syscall-note
index 6b60b61..9abdad7 100644
--- a/LICENSES/exceptions/Linux-syscall-note
+++ b/LICENSES/exceptions/Linux-syscall-note
@@ -1,6 +1,6 @@
 SPDX-Exception-Identifier: Linux-syscall-note
 SPDX-URL: https://spdx.org/licenses/Linux-syscall-note.html
-SPDX-Licenses: GPL-2.0, GPL-2.0+, GPL-1.0+, LGPL-2.0, LGPL-2.0+, LGPL-2.1, LGPL-2.1+
+SPDX-Licenses: GPL-2.0, GPL-2.0+, GPL-1.0+, LGPL-2.0, LGPL-2.0+, LGPL-2.1, LGPL-2.1+, GPL-2.0-only, GPL-2.0-or-later
 Usage-Guide:
   This exception is used together with one of the above SPDX-Licenses
   to mark user space API (uapi) header files so they can be included
diff --git a/LICENSES/other/Apache-2.0 b/LICENSES/other/Apache-2.0
new file mode 100644
index 0000000..7cd903f
--- /dev/null
+++ b/LICENSES/other/Apache-2.0
@@ -0,0 +1,183 @@
+Valid-License-Identifier: Apache-2.0
+SPDX-URL: https://spdx.org/licenses/Apache-2.0.html
+Usage-Guide:
+  To use the Apache License version 2.0 put the following SPDX tag/value
+  pair into a comment according to the placement guidelines in the
+  licensing rules documentation:
+    SPDX-License-Identifier: Apache-2.0
+License-Text:
+
+Apache License
+
+Version 2.0, January 2004
+
+http://www.apache.org/licenses/
+
+TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+1. Definitions.
+
+"License" shall mean the terms and conditions for use, reproduction, and
+distribution as defined by Sections 1 through 9 of this document.
+
+"Licensor" shall mean the copyright owner or entity authorized by the
+copyright owner that is granting the License.
+
+"Legal Entity" shall mean the union of the acting entity and all other
+entities that control, are controlled by, or are under common control with
+that entity. For the purposes of this definition, "control" means (i) the
+power, direct or indirect, to cause the direction or management of such
+entity, whether by contract or otherwise, or (ii) ownership of fifty
+percent (50%) or more of the outstanding shares, or (iii) beneficial
+ownership of such entity.
+
+"You" (or "Your") shall mean an individual or Legal Entity exercising
+permissions granted by this License.
+
+"Source" form shall mean the preferred form for making modifications,
+including but not limited to software source code, documentation source,
+and configuration files.
+
+"Object" form shall mean any form resulting from mechanical transformation
+or translation of a Source form, including but not limited to compiled
+object code, generated documentation, and conversions to other media types.
+
+"Work" shall mean the work of authorship, whether in Source or Object form,
+made available under the License, as indicated by a copyright notice that
+is included in or attached to the work (an example is provided in the
+Appendix below).
+
+"Derivative Works" shall mean any work, whether in Source or Object form,
+that is based on (or derived from) the Work and for which the editorial
+revisions, annotations, elaborations, or other modifications represent, as
+a whole, an original work of authorship. For the purposes of this License,
+Derivative Works shall not include works that remain separable from, or
+merely link (or bind by name) to the interfaces of, the Work and Derivative
+Works thereof.
+
+"Contribution" shall mean any work of authorship, including the original
+version of the Work and any modifications or additions to that Work or
+Derivative Works thereof, that is intentionally submitted to Licensor for
+inclusion in the Work by the copyright owner or by an individual or Legal
+Entity authorized to submit on behalf of the copyright owner. For the
+purposes of this definition, "submitted" means any form of electronic,
+verbal, or written communication sent to the Licensor or its
+representatives, including but not limited to communication on electronic
+mailing lists, source code control systems, and issue tracking systems that
+are managed by, or on behalf of, the Licensor for the purpose of discussing
+and improving the Work, but excluding communication that is conspicuously
+marked or otherwise designated in writing by the copyright owner as "Not a
+Contribution."
+
+"Contributor" shall mean Licensor and any individual or Legal Entity on
+behalf of whom a Contribution has been received by Licensor and
+subsequently incorporated within the Work.
+
+2. Grant of Copyright License. Subject to the terms and conditions of this
+   License, each Contributor hereby grants to You a perpetual, worldwide,
+   non-exclusive, no-charge, royalty-free, irrevocable copyright license to
+   reproduce, prepare Derivative Works of, publicly display, publicly
+   perform, sublicense, and distribute the Work and such Derivative Works
+   in Source or Object form.
+
+3. Grant of Patent License. Subject to the terms and conditions of this
+   License, each Contributor hereby grants to You a perpetual, worldwide,
+   non-exclusive, no-charge, royalty-free, irrevocable (except as stated in
+   this section) patent license to make, have made, use, offer to sell,
+   sell, import, and otherwise transfer the Work, where such license
+   applies only to those patent claims licensable by such Contributor that
+   are necessarily infringed by their Contribution(s) alone or by
+   combination of their Contribution(s) with the Work to which such
+   Contribution(s) was submitted. If You institute patent litigation
+   against any entity (including a cross-claim or counterclaim in a
+   lawsuit) alleging that the Work or a Contribution incorporated within
+   the Work constitutes direct or contributory patent infringement, then
+   any patent licenses granted to You under this License for that Work
+   shall terminate as of the date such litigation is filed.
+
+4. Redistribution. You may reproduce and distribute copies of the Work or
+   Derivative Works thereof in any medium, with or without modifications,
+   and in Source or Object form, provided that You meet the following
+   conditions:
+
+   a. You must give any other recipients of the Work or Derivative Works a
+      copy of this License; and
+
+   b. You must cause any modified files to carry prominent notices stating
+      that You changed the files; and
+
+   c. You must retain, in the Source form of any Derivative Works that You
+      distribute, all copyright, patent, trademark, and attribution notices
+      from the Source form of the Work, excluding those notices that do not
+      pertain to any part of the Derivative Works; and
+
+   d. If the Work includes a "NOTICE" text file as part of its
+      distribution, then any Derivative Works that You distribute must
+      include a readable copy of the attribution notices contained within
+      such NOTICE file, excluding those notices that do not pertain to any
+      part of the Derivative Works, in at least one of the following
+      places: within a NOTICE text file distributed as part of the
+      Derivative Works; within the Source form or documentation, if
+      provided along with the Derivative Works; or, within a display
+      generated by the Derivative Works, if and wherever such third-party
+      notices normally appear. The contents of the NOTICE file are for
+      informational purposes only and do not modify the License. You may
+      add Your own attribution notices within Derivative Works that You
+      distribute, alongside or as an addendum to the NOTICE text from the
+      Work, provided that such additional attribution notices cannot be
+      construed as modifying the License.
+
+    You may add Your own copyright statement to Your modifications and may
+    provide additional or different license terms and conditions for use,
+    reproduction, or distribution of Your modifications, or for any such
+    Derivative Works as a whole, provided Your use, reproduction, and
+    distribution of the Work otherwise complies with the conditions stated
+    in this License.
+
+5. Submission of Contributions. Unless You explicitly state otherwise, any
+   Contribution intentionally submitted for inclusion in the Work by You to
+   the Licensor shall be under the terms and conditions of this License,
+   without any additional terms or conditions. Notwithstanding the above,
+   nothing herein shall supersede or modify the terms of any separate
+   license agreement you may have executed with Licensor regarding such
+   Contributions.
+
+6. Trademarks. This License does not grant permission to use the trade
+   names, trademarks, service marks, or product names of the Licensor,
+   except as required for reasonable and customary use in describing the
+   origin of the Work and reproducing the content of the NOTICE file.
+
+7. Disclaimer of Warranty. Unless required by applicable law or agreed to
+   in writing, Licensor provides the Work (and each Contributor provides
+   its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+   OF ANY KIND, either express or implied, including, without limitation,
+   any warranties or conditions of TITLE, NON-INFRINGEMENT,
+   MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely
+   responsible for determining the appropriateness of using or
+   redistributing the Work and assume any risks associated with Your
+   exercise of permissions under this License.
+
+8. Limitation of Liability. In no event and under no legal theory, whether
+   in tort (including negligence), contract, or otherwise, unless required
+   by applicable law (such as deliberate and grossly negligent acts) or
+   agreed to in writing, shall any Contributor be liable to You for
+   damages, including any direct, indirect, special, incidental, or
+   consequential damages of any character arising as a result of this
+   License or out of the use or inability to use the Work (including but
+   not limited to damages for loss of goodwill, work stoppage, computer
+   failure or malfunction, or any and all other commercial damages or
+   losses), even if such Contributor has been advised of the possibility of
+   such damages.
+
+9. Accepting Warranty or Additional Liability. While redistributing the
+   Work or Derivative Works thereof, You may choose to offer, and charge a
+   fee for, acceptance of support, warranty, indemnity, or other liability
+   obligations and/or rights consistent with this License. However, in
+   accepting such obligations, You may act only on Your own behalf and on
+   Your sole responsibility, not on behalf of any other Contributor, and
+   only if You agree to indemnify, defend, and hold each Contributor
+   harmless for any liability incurred by, or claims asserted against, such
+   Contributor by reason of your accepting any such warranty or additional
+   liability.
+
+END OF TERMS AND CONDITIONS
diff --git a/LICENSES/other/CC-BY-SA-4.0 b/LICENSES/other/CC-BY-SA-4.0
new file mode 100644
index 0000000..f9158e8
--- /dev/null
+++ b/LICENSES/other/CC-BY-SA-4.0
@@ -0,0 +1,397 @@
+Valid-License-Identifier: CC-BY-SA-4.0
+SPDX-URL: https://spdx.org/licenses/CC-BY-SA-4.0
+Usage-Guide:
+  To use the Creative Commons Attribution Share Alike 4.0 International
+  license put the following SPDX tag/value pair into a comment according to
+  the placement guidelines in the licensing rules documentation:
+    SPDX-License-Identifier: CC-BY-SA-4.0
+License-Text:
+
+Creative Commons Attribution-ShareAlike 4.0 International
+
+Creative Commons Corporation ("Creative Commons") is not a law firm and
+does not provide legal services or legal advice. Distribution of Creative
+Commons public licenses does not create a lawyer-client or other
+relationship. Creative Commons makes its licenses and related information
+available on an "as-is" basis. Creative Commons gives no warranties
+regarding its licenses, any material licensed under their terms and
+conditions, or any related information. Creative Commons disclaims all
+liability for damages resulting from their use to the fullest extent
+possible.
+
+Using Creative Commons Public Licenses
+
+Creative Commons public licenses provide a standard set of terms and
+conditions that creators and other rights holders may use to share original
+works of authorship and other material subject to copyright and certain
+other rights specified in the public license below. The following
+considerations are for informational purposes only, are not exhaustive, and
+do not form part of our licenses.
+
+Considerations for licensors: Our public licenses are intended for use by
+those authorized to give the public permission to use material in ways
+otherwise restricted by copyright and certain other rights. Our licenses
+are irrevocable. Licensors should read and understand the terms and
+conditions of the license they choose before applying it. Licensors should
+also secure all rights necessary before applying our licenses so that the
+public can reuse the material as expected. Licensors should clearly mark
+any material not subject to the license. This includes other CC-licensed
+material, or material used under an exception or limitation to
+copyright. More considerations for licensors :
+wiki.creativecommons.org/Considerations_for_licensors
+
+Considerations for the public: By using one of our public licenses, a
+licensor grants the public permission to use the licensed material under
+specified terms and conditions. If the licensor's permission is not
+necessary for any reason - for example, because of any applicable exception
+or limitation to copyright - then that use is not regulated by the
+license. Our licenses grant only permissions under copyright and certain
+other rights that a licensor has authority to grant. Use of the licensed
+material may still be restricted for other reasons, including because
+others have copyright or other rights in the material. A licensor may make
+special requests, such as asking that all changes be marked or described.
+
+Although not required by our licenses, you are encouraged to respect those
+requests where reasonable. More considerations for the public :
+wiki.creativecommons.org/Considerations_for_licensees
+
+Creative Commons Attribution-ShareAlike 4.0 International Public License
+
+By exercising the Licensed Rights (defined below), You accept and agree to
+be bound by the terms and conditions of this Creative Commons
+Attribution-ShareAlike 4.0 International Public License ("Public
+License"). To the extent this Public License may be interpreted as a
+contract, You are granted the Licensed Rights in consideration of Your
+acceptance of these terms and conditions, and the Licensor grants You such
+rights in consideration of benefits the Licensor receives from making the
+Licensed Material available under these terms and conditions.
+
+Section 1 - Definitions.
+
+    a. Adapted Material means material subject to Copyright and Similar
+       Rights that is derived from or based upon the Licensed Material and
+       in which the Licensed Material is translated, altered, arranged,
+       transformed, or otherwise modified in a manner requiring permission
+       under the Copyright and Similar Rights held by the Licensor. For
+       purposes of this Public License, where the Licensed Material is a
+       musical work, performance, or sound recording, Adapted Material is
+       always produced where the Licensed Material is synched in timed
+       relation with a moving image.
+
+    b. Adapter's License means the license You apply to Your Copyright and
+       Similar Rights in Your contributions to Adapted Material in
+       accordance with the terms and conditions of this Public License.
+
+    c. BY-SA Compatible License means a license listed at
+       creativecommons.org/compatiblelicenses, approved by Creative Commons
+       as essentially the equivalent of this Public License.
+
+    d. Copyright and Similar Rights means copyright and/or similar rights
+       closely related to copyright including, without limitation,
+       performance, broadcast, sound recording, and Sui Generis Database
+       Rights, without regard to how the rights are labeled or
+       categorized. For purposes of this Public License, the rights
+       specified in Section 2(b)(1)-(2) are not Copyright and Similar
+       Rights.
+
+    e. Effective Technological Measures means those measures that, in the
+       absence of proper authority, may not be circumvented under laws
+       fulfilling obligations under Article 11 of the WIPO Copyright Treaty
+       adopted on December 20, 1996, and/or similar international
+       agreements.
+
+    f. Exceptions and Limitations means fair use, fair dealing, and/or any
+       other exception or limitation to Copyright and Similar Rights that
+       applies to Your use of the Licensed Material.
+
+    g. License Elements means the license attributes listed in the name of
+       a Creative Commons Public License. The License Elements of this
+       Public License are Attribution and ShareAlike.
+
+    h. Licensed Material means the artistic or literary work, database, or
+       other material to which the Licensor applied this Public License.
+
+    i. Licensed Rights means the rights granted to You subject to the terms
+       and conditions of this Public License, which are limited to all
+       Copyright and Similar Rights that apply to Your use of the Licensed
+       Material and that the Licensor has authority to license.
+
+    j. Licensor means the individual(s) or entity(ies) granting rights
+       under this Public License.
+
+    k. Share means to provide material to the public by any means or
+       process that requires permission under the Licensed Rights, such as
+       reproduction, public display, public performance, distribution,
+       dissemination, communication, or importation, and to make material
+       available to the public including in ways that members of the public
+       may access the material from a place and at a time individually
+       chosen by them.
+
+    l. Sui Generis Database Rights means rights other than copyright
+       resulting from Directive 96/9/EC of the European Parliament and of
+       the Council of 11 March 1996 on the legal protection of databases,
+       as amended and/or succeeded, as well as other essentially equivalent
+       rights anywhere in the world.  m. You means the individual or entity
+       exercising the Licensed Rights under this Public License. Your has a
+       corresponding meaning.
+
+Section 2 - Scope.
+
+    a. License grant.
+
+        1. Subject to the terms and conditions of this Public License, the
+           Licensor hereby grants You a worldwide, royalty-free,
+           non-sublicensable, non-exclusive, irrevocable license to
+           exercise the Licensed Rights in the Licensed Material to:
+
+            A. reproduce and Share the Licensed Material, in whole or in part; and
+
+            B. produce, reproduce, and Share Adapted Material.
+
+        2. Exceptions and Limitations. For the avoidance of doubt, where
+           Exceptions and Limitations apply to Your use, this Public
+           License does not apply, and You do not need to comply with its
+           terms and conditions.
+
+        3. Term. The term of this Public License is specified in Section 6(a).
+
+        4. Media and formats; technical modifications allowed. The Licensor
+           authorizes You to exercise the Licensed Rights in all media and
+           formats whether now known or hereafter created, and to make
+           technical modifications necessary to do so. The Licensor waives
+           and/or agrees not to assert any right or authority to forbid You
+           from making technical modifications necessary to exercise the
+           Licensed Rights, including technical modifications necessary to
+           circumvent Effective Technological Measures. For purposes of
+           this Public License, simply making modifications authorized by
+           this Section 2(a)(4) never produces Adapted Material.
+
+        5. Downstream recipients.
+
+            A. Offer from the Licensor - Licensed Material. Every recipient
+               of the Licensed Material automatically receives an offer
+               from the Licensor to exercise the Licensed Rights under the
+               terms and conditions of this Public License.
+
+            B. Additional offer from the Licensor - Adapted Material. Every
+               recipient of Adapted Material from You automatically
+               receives an offer from the Licensor to exercise the Licensed
+               Rights in the Adapted Material under the conditions of the
+               Adapter's License You apply.
+
+            C. No downstream restrictions. You may not offer or impose any
+               additional or different terms or conditions on, or apply any
+               Effective Technological Measures to, the Licensed Material
+               if doing so restricts exercise of the Licensed Rights by any
+               recipient of the Licensed Material.
+
+        6. No endorsement. Nothing in this Public License constitutes or
+           may be construed as permission to assert or imply that You are,
+           or that Your use of the Licensed Material is, connected with, or
+           sponsored, endorsed, or granted official status by, the Licensor
+           or others designated to receive attribution as provided in
+           Section 3(a)(1)(A)(i).
+
+    b. Other rights.
+
+        1. Moral rights, such as the right of integrity, are not licensed
+           under this Public License, nor are publicity, privacy, and/or
+           other similar personality rights; however, to the extent
+           possible, the Licensor waives and/or agrees not to assert any
+           such rights held by the Licensor to the limited extent necessary
+           to allow You to exercise the Licensed Rights, but not otherwise.
+
+        2. Patent and trademark rights are not licensed under this Public
+           License.
+
+        3. To the extent possible, the Licensor waives any right to collect
+           royalties from You for the exercise of the Licensed Rights,
+           whether directly or through a collecting society under any
+           voluntary or waivable statutory or compulsory licensing
+           scheme. In all other cases the Licensor expressly reserves any
+           right to collect such royalties.
+
+Section 3 - License Conditions.
+
+Your exercise of the Licensed Rights is expressly made subject to the
+following conditions.
+
+    a. Attribution.
+
+        1. If You Share the Licensed Material (including in modified form),
+           You must:
+
+            A. retain the following if it is supplied by the Licensor with
+               the Licensed Material:
+
+                i. identification of the creator(s) of the Licensed
+                   Material and any others designated to receive
+                   attribution, in any reasonable manner requested by the
+                   Licensor (including by pseudonym if designated);
+
+                ii. a copyright notice;
+
+                iii. a notice that refers to this Public License;
+
+                iv. a notice that refers to the disclaimer of warranties;
+
+                v. a URI or hyperlink to the Licensed Material to the extent reasonably practicable;
+
+            B. indicate if You modified the Licensed Material and retain an
+               indication of any previous modifications; and
+
+            C. indicate the Licensed Material is licensed under this Public
+            License, and include the text of, or the URI or hyperlink to,
+            this Public License.
+
+        2. You may satisfy the conditions in Section 3(a)(1) in any
+           reasonable manner based on the medium, means, and context in
+           which You Share the Licensed Material. For example, it may be
+           reasonable to satisfy the conditions by providing a URI or
+           hyperlink to a resource that includes the required information.
+
+        3. If requested by the Licensor, You must remove any of the
+           information required by Section 3(a)(1)(A) to the extent
+           reasonably practicable.  b. ShareAlike.In addition to the
+           conditions in Section 3(a), if You Share Adapted Material You
+           produce, the following conditions also apply.
+
+           1. The Adapter's License You apply must be a Creative Commons
+              license with the same License Elements, this version or
+              later, or a BY-SA Compatible License.
+
+           2. You must include the text of, or the URI or hyperlink to, the
+              Adapter's License You apply. You may satisfy this condition
+              in any reasonable manner based on the medium, means, and
+              context in which You Share Adapted Material.
+
+           3. You may not offer or impose any additional or different terms
+              or conditions on, or apply any Effective Technological
+              Measures to, Adapted Material that restrict exercise of the
+              rights granted under the Adapter's License You apply.
+
+Section 4 - Sui Generis Database Rights.
+
+Where the Licensed Rights include Sui Generis Database Rights that apply to
+Your use of the Licensed Material:
+
+    a. for the avoidance of doubt, Section 2(a)(1) grants You the right to
+       extract, reuse, reproduce, and Share all or a substantial portion of
+       the contents of the database;
+
+    b. if You include all or a substantial portion of the database contents
+       in a database in which You have Sui Generis Database Rights, then
+       the database in which You have Sui Generis Database Rights (but not
+       its individual contents) is Adapted Material, including for purposes
+       of Section 3(b); and
+
+    c. You must comply with the conditions in Section 3(a) if You Share all
+       or a substantial portion of the contents of the database.
+
+    For the avoidance of doubt, this Section 4 supplements and does not
+    replace Your obligations under this Public License where the Licensed
+    Rights include other Copyright and Similar Rights.
+
+Section 5 - Disclaimer of Warranties and Limitation of Liability.
+
+    a. Unless otherwise separately undertaken by the Licensor, to the
+       extent possible, the Licensor offers the Licensed Material as-is and
+       as-available, and makes no representations or warranties of any kind
+       concerning the Licensed Material, whether express, implied,
+       statutory, or other. This includes, without limitation, warranties
+       of title, merchantability, fitness for a particular purpose,
+       non-infringement, absence of latent or other defects, accuracy, or
+       the presence or absence of errors, whether or not known or
+       discoverable. Where disclaimers of warranties are not allowed in
+       full or in part, this disclaimer may not apply to You.
+
+    b. To the extent possible, in no event will the Licensor be liable to
+       You on any legal theory (including, without limitation, negligence)
+       or otherwise for any direct, special, indirect, incidental,
+       consequential, punitive, exemplary, or other losses, costs,
+       expenses, or damages arising out of this Public License or use of
+       the Licensed Material, even if the Licensor has been advised of the
+       possibility of such losses, costs, expenses, or damages. Where a
+       limitation of liability is not allowed in full or in part, this
+       limitation may not apply to You.
+
+    c. The disclaimer of warranties and limitation of liability provided
+       above shall be interpreted in a manner that, to the extent possible,
+       most closely approximates an absolute disclaimer and waiver of all
+       liability.
+
+Section 6 - Term and Termination.
+
+    a. This Public License applies for the term of the Copyright and
+       Similar Rights licensed here. However, if You fail to comply with
+       this Public License, then Your rights under this Public License
+       terminate automatically.
+
+    b. Where Your right to use the Licensed Material has terminated under
+       Section 6(a), it reinstates:
+
+        1. automatically as of the date the violation is cured, provided it
+           is cured within 30 days of Your discovery of the violation; or
+
+        2. upon express reinstatement by the Licensor.
+
+    c. For the avoidance of doubt, this Section 6(b) does not affect any
+       right the Licensor may have to seek remedies for Your violations of
+       this Public License.
+
+    d. For the avoidance of doubt, the Licensor may also offer the Licensed
+       Material under separate terms or conditions or stop distributing the
+       Licensed Material at any time; however, doing so will not terminate
+       this Public License.
+
+    e. Sections 1, 5, 6, 7, and 8 survive termination of this Public License.
+
+Section 7 - Other Terms and Conditions.
+
+    a. The Licensor shall not be bound by any additional or different terms
+       or conditions communicated by You unless expressly agreed.
+
+    b. Any arrangements, understandings, or agreements regarding the
+       Licensed Material not stated herein are separate from and
+       independent of the terms and conditions of this Public License.
+
+Section 8 - Interpretation.
+
+    a. For the avoidance of doubt, this Public License does not, and shall
+       not be interpreted to, reduce, limit, restrict, or impose conditions
+       on any use of the Licensed Material that could lawfully be made
+       without permission under this Public License.
+
+    b. To the extent possible, if any provision of this Public License is
+       deemed unenforceable, it shall be automatically reformed to the
+       minimum extent necessary to make it enforceable. If the provision
+       cannot be reformed, it shall be severed from this Public License
+       without affecting the enforceability of the remaining terms and
+       conditions.
+
+    c. No term or condition of this Public License will be waived and no
+       failure to comply consented to unless expressly agreed to by the
+       Licensor.
+
+    d. Nothing in this Public License constitutes or may be interpreted as
+       a limitation upon, or waiver of, any privileges and immunities that
+       apply to the Licensor or You, including from the legal processes of
+       any jurisdiction or authority.
+
+Creative Commons is not a party to its public licenses. Notwithstanding,
+Creative Commons may elect to apply one of its public licenses to material
+it publishes and in those instances will be considered the "Licensor." The
+text of the Creative Commons public licenses is dedicated to the public
+domain under the CC0 Public Domain Dedication. Except for the limited
+purpose of indicating that material is shared under a Creative Commons
+public license or as otherwise permitted by the Creative Commons policies
+published at creativecommons.org/policies, Creative Commons does not
+authorize the use of the trademark "Creative Commons" or any other
+trademark or logo of Creative Commons without its prior written consent
+including, without limitation, in connection with any unauthorized
+modifications to any of its public licenses or any other arrangements,
+understandings, or agreements concerning use of licensed material. For the
+avoidance of doubt, this paragraph does not form part of the public
+licenses.
+
+Creative Commons may be contacted at creativecommons.org.
diff --git a/LICENSES/other/CDDL-1.0 b/LICENSES/other/CDDL-1.0
new file mode 100644
index 0000000..195a168
--- /dev/null
+++ b/LICENSES/other/CDDL-1.0
@@ -0,0 +1,364 @@
+Valid-License-Identifier: CDDL-1.0
+SPDX-URL: https://spdx.org/licenses/CDDL-1.0.html
+Usage-Guide:
+  To use the Common Development and Distribution License 1.0 put the
+  following SPDX tag/value pair into a comment according to the placement
+  guidelines in the licensing rules documentation:
+    SPDX-License-Identifier: CDDL-1.0
+
+License-Text:
+
+COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL)
+Version 1.0
+
+    1. Definitions.
+
+        1.1. "Contributor" means each individual or entity that creates or
+             contributes to the creation of Modifications.
+
+        1.2. "Contributor Version" means the combination of the Original
+	     Software, prior Modifications used by a Contributor (if any),
+	     and the Modifications made by that particular Contributor.
+
+        1.3. "Covered Software" means (a) the Original Software, or (b)
+             Modifications, or (c) the combination of files containing
+             Original Software with files containing Modifications, in each
+             case including portions thereof.
+
+	1.4. "Executable" means the Covered Software in any form other than
+             Source Code.
+
+        1.5. "Initial Developer" means the individual or entity that first
+             makes Original Software available under this License.
+
+        1.6. "Larger Work" means a work which combines Covered Software or
+             portions thereof with code not governed by the terms of this
+             License.
+
+        1.7. "License" means this document.
+
+        1.8. "Licensable" means having the right to grant, to the maximum
+             extent possible, whether at the time of the initial grant or
+             subsequently acquired, any and all of the rights conveyed herein.
+
+        1.9. "Modifications" means the Source Code and Executable form of
+             any of the following:
+
+            A. Any file that results from an addition to, deletion from or
+               modification of the contents of a file containing Original
+               Software or previous Modifications;
+
+            B. Any new file that contains any part of the Original Software
+               or previous Modification; or
+
+            C. Any new file that is contributed or otherwise made available
+               under the terms of this License.
+
+        1.10. "Original Software" means the Source Code and Executable form
+              of computer software code that is originally released under
+              this License.
+
+        1.11. "Patent Claims" means any patent claim(s), now owned or
+              hereafter acquired, including without limitation, method,
+              process, and apparatus claims, in any patent Licensable by
+              grantor.
+
+        1.12. "Source Code" means (a) the common form of computer software
+	      code in which modifications are made and (b) associated
+              documentation included in or with such code.
+
+        1.13. "You" (or "Your") means an individual or a legal entity
+              exercising rights under, and complying with all of the terms
+              of, this License. For legal entities, "You" includes any
+              entity which controls, is controlled by, or is under common
+              control with You. For purposes of this definition, "control"
+              means (a) the power, direct or indirect, to cause the
+              direction or management of such entity, whether by contract
+              or otherwise, or (b) ownership of more than fifty percent
+              (50%) of the outstanding shares or beneficial ownership of
+              such entity.
+
+    2. License Grants.
+        2.1. The Initial Developer Grant.
+
+        Conditioned upon Your compliance with Section 3.1 below and subject
+        to third party intellectual property claims, the Initial Developer
+        hereby grants You a world-wide, royalty-free, non-exclusive
+        license:
+
+            (a) under intellectual property rights (other than patent or
+                trademark) Licensable by Initial Developer, to use,
+                reproduce, modify, display, perform, sublicense and
+                distribute the Original Software (or portions thereof),
+                with or without Modifications, and/or as part of a Larger
+                Work; and
+
+            (b) under Patent Claims infringed by the making, using or
+                selling of Original Software, to make, have made, use,
+                practice, sell, and offer for sale, and/or otherwise
+                dispose of the Original Software (or portions thereof).
+
+            (c) The licenses granted in Sections 2.1(a) and (b) are
+                effective on the date Initial Developer first distributes
+                or otherwise makes the Original Software available to a
+                third party under the terms of this License.
+
+            (d) Notwithstanding Section 2.1(b) above, no patent license is
+                granted: (1) for code that You delete from the Original
+                Software, or (2) for infringements caused by: (i) the
+                modification of the Original Software, or (ii) the
+                combination of the Original Software with other software or
+                devices.
+
+        2.2. Contributor Grant.
+
+        Conditioned upon Your compliance with Section 3.1 below and subject
+        to third party intellectual property claims, each Contributor
+        hereby grants You a world-wide, royalty-free, non-exclusive
+        license:
+
+            (a) under intellectual property rights (other than patent or
+	        trademark) Licensable by Contributor to use, reproduce,
+	        modify, display, perform, sublicense and distribute the
+	        Modifications created by such Contributor (or portions
+	        thereof), either on an unmodified basis, with other
+	        Modifications, as Covered Software and/or as part of a
+	        Larger Work; and
+
+            (b) under Patent Claims infringed by the making, using, or
+                selling of Modifications made by that Contributor either
+                alone and/or in combination with its Contributor Version
+                (or portions of such combination), to make, use, sell,
+                offer for sale, have made, and/or otherwise dispose of: (1)
+                Modifications made by that Contributor (or portions
+                thereof); and (2) the combination of Modifications made by
+                that Contributor with its Contributor Version (or portions
+                of such combination).
+
+            (c) The licenses granted in Sections 2.2(a) and 2.2(b) are
+                effective on the date Contributor first distributes or
+                otherwise makes the Modifications available to a third
+                party.
+
+            (d) Notwithstanding Section 2.2(b) above, no patent license is
+                granted: (1) for any code that Contributor has deleted from
+                the Contributor Version; (2) for infringements caused by:
+                (i) third party modifications of Contributor Version, or
+                (ii) the combination of Modifications made by that
+                Contributor with other software (except as part of the
+                Contributor Version) or other devices; or (3) under Patent
+                Claims infringed by Covered Software in the absence of
+                Modifications made by that Contributor.
+
+    3. Distribution Obligations.
+        3.1. Availability of Source Code.
+
+        Any Covered Software that You distribute or otherwise make
+        available in Executable form must also be made available in Source
+        Code form and that Source Code form must be distributed only under
+        the terms of this License. You must include a copy of this License
+        with every copy of the Source Code form of the Covered Software You
+        distribute or otherwise make available. You must inform recipients
+        of any such Covered Software in Executable form as to how they can
+        obtain such Covered Software in Source Code form in a reasonable
+        manner on or through a medium customarily used for software
+        exchange.
+
+        3.2. Modifications.
+
+        The Modifications that You create or to which You contribute are
+        governed by the terms of this License. You represent that You
+        believe Your Modifications are Your original creation(s) and/or You
+        have sufficient rights to grant the rights conveyed by this
+        License.
+
+        3.3. Required Notices.
+
+        You must include a notice in each of Your Modifications that
+        identifies You as the Contributor of the Modification. You may not
+        remove or alter any copyright, patent or trademark notices
+        contained within the Covered Software, or any notices of licensing
+        or any descriptive text giving attribution to any Contributor or
+        the Initial Developer.
+
+        3.4. Application of Additional Terms.
+
+        You may not offer or impose any terms on any Covered Software in
+        Source Code form that alters or restricts the applicable version of
+        this License or the recipients' rights hereunder. You may choose to
+        offer, and to charge a fee for, warranty, support, indemnity or
+        liability obligations to one or more recipients of Covered
+        Software. However, you may do so only on Your own behalf, and not
+        on behalf of the Initial Developer or any Contributor. You must
+        make it absolutely clear that any such warranty, support, indemnity
+        or liability obligation is offered by You alone, and You hereby
+        agree to indemnify the Initial Developer and every Contributor for
+        any liability incurred by the Initial Developer or such Contributor
+        as a result of warranty, support, indemnity or liability terms You
+        offer.
+
+        3.5. Distribution of Executable Versions.
+
+        You may distribute the Executable form of the Covered Software
+        under the terms of this License or under the terms of a license of
+        Your choice, which may contain terms different from this License,
+        provided that You are in compliance with the terms of this License
+        and that the license for the Executable form does not attempt to
+        limit or alter the recipient's rights in the Source Code form from
+        the rights set forth in this License. If You distribute the Covered
+        Software in Executable form under a different license, You must
+        make it absolutely clear that any terms which differ from this
+        License are offered by You alone, not by the Initial Developer or
+        Contributor. You hereby agree to indemnify the Initial Developer
+        and every Contributor for any liability incurred by the Initial
+        Developer or such Contributor as a result of any such terms You
+        offer.
+
+        3.6. Larger Works.
+
+        You may create a Larger Work by combining Covered Software with
+        other code not governed by the terms of this License and distribute
+        the Larger Work as a single product. In such a case, You must make
+        sure the requirements of this License are fulfilled for the Covered
+        Software.
+
+    4. Versions of the License.
+        4.1. New Versions.
+
+        Sun Microsystems, Inc. is the initial license steward and may
+        publish revised and/or new versions of this License from time to
+        time. Each version will be given a distinguishing version
+        number. Except as provided in Section 4.3, no one other than the
+        license steward has the right to modify this License.
+
+        4.2. Effect of New Versions.
+
+        You may always continue to use, distribute or otherwise make the
+        Covered Software available under the terms of the version of the
+        License under which You originally received the Covered
+        Software. If the Initial Developer includes a notice in the
+        Original Software prohibiting it from being distributed or
+        otherwise made available under any subsequent version of the
+        License, You must distribute and make the Covered Software
+        available under the terms of the version of the License under which
+        You originally received the Covered Software. Otherwise, You may
+        also choose to use, distribute or otherwise make the Covered
+        Software available under the terms of any subsequent version of the
+        License published by the license steward.
+
+        4.3. Modified Versions.
+
+        When You are an Initial Developer and You want to create a new
+        license for Your Original Software, You may create and use a
+        modified version of this License if You: (a) rename the license and
+        remove any references to the name of the license steward (except to
+        note that the license differs from this License); and (b) otherwise
+        make it clear that the license contains terms which differ from
+        this License.
+
+    5. DISCLAIMER OF WARRANTY.
+
+    COVERED SOFTWARE IS PROVIDED UNDER THIS LICENSE ON AN "AS IS" BASIS,
+    WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
+    WITHOUT LIMITATION, WARRANTIES THAT THE COVERED SOFTWARE IS FREE OF
+    DEFECTS, MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE OR
+    NON-INFRINGING. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF
+    THE COVERED SOFTWARE IS WITH YOU. SHOULD ANY COVERED SOFTWARE PROVE
+    DEFECTIVE IN ANY RESPECT, YOU (NOT THE INITIAL DEVELOPER OR ANY OTHER
+    CONTRIBUTOR) ASSUME THE COST OF ANY NECESSARY SERVICING, REPAIR OR
+    CORRECTION. THIS DISCLAIMER OF WARRANTY CONSTITUTES AN ESSENTIAL PART
+    OF THIS LICENSE. NO USE OF ANY COVERED SOFTWARE IS AUTHORIZED HEREUNDER
+    EXCEPT UNDER THIS DISCLAIMER.
+
+    6. TERMINATION.
+
+        6.1. This License and the rights granted hereunder will terminate
+        automatically if You fail to comply with terms herein and fail to
+        cure such breach within 30 days of becoming aware of the
+        breach. Provisions which, by their nature, must remain in effect
+        beyond the termination of this License shall survive.
+
+        6.2. If You assert a patent infringement claim (excluding
+        declaratory judgment actions) against Initial Developer or a
+        Contributor (the Initial Developer or Contributor against whom You
+        assert such claim is referred to as "Participant") alleging that
+        the Participant Software (meaning the Contributor Version where the
+        Participant is a Contributor or the Original Software where the
+        Participant is the Initial Developer) directly or indirectly
+        infringes any patent, then any and all rights granted directly or
+        indirectly to You by such Participant, the Initial Developer (if
+        the Initial Developer is not the Participant) and all Contributors
+        under Sections 2.1 and/or 2.2 of this License shall, upon 60 days
+        notice from Participant terminate prospectively and automatically
+        at the expiration of such 60 day notice period, unless if within
+        such 60 day period You withdraw Your claim with respect to the
+        Participant Software against such Participant either unilaterally
+        or pursuant to a written agreement with Participant.
+
+        6.3. In the event of termination under Sections 6.1 or 6.2 above,
+        all end user licenses that have been validly granted by You or any
+        distributor hereunder prior to termination (excluding licenses
+        granted to You by any distributor) shall survive termination.
+
+    7. LIMITATION OF LIABILITY.
+
+    UNDER NO CIRCUMSTANCES AND UNDER NO LEGAL THEORY, WHETHER TORT
+    (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE, SHALL YOU, THE INITIAL
+    DEVELOPER, ANY OTHER CONTRIBUTOR, OR ANY DISTRIBUTOR OF COVERED
+    SOFTWARE, OR ANY SUPPLIER OF ANY OF SUCH PARTIES, BE LIABLE TO ANY
+    PERSON FOR ANY INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
+    OF ANY CHARACTER INCLUDING, WITHOUT LIMITATION, DAMAGES FOR LOST
+    PROFITS, LOSS OF GOODWILL, WORK STOPPAGE, COMPUTER FAILURE OR
+    MALFUNCTION, OR ANY AND ALL OTHER COMMERCIAL DAMAGES OR LOSSES, EVEN IF
+    SUCH PARTY SHALL HAVE BEEN INFORMED OF THE POSSIBILITY OF SUCH
+    DAMAGES. THIS LIMITATION OF LIABILITY SHALL NOT APPLY TO LIABILITY FOR
+    DEATH OR PERSONAL INJURY RESULTING FROM SUCH PARTY'S NEGLIGENCE TO THE
+    EXTENT APPLICABLE LAW PROHIBITS SUCH LIMITATION. SOME JURISDICTIONS DO
+    NOT ALLOW THE EXCLUSION OR LIMITATION OF INCIDENTAL OR CONSEQUENTIAL
+    DAMAGES, SO THIS EXCLUSION AND LIMITATION MAY NOT APPLY TO YOU.
+
+    8. U.S. GOVERNMENT END USERS.
+
+    The Covered Software is a "commercial item," as that term is defined in
+    48 C.F.R. 2.101 (Oct. 1995), consisting of "commercial computer
+    software" (as that term is defined at 48 C.F.R. $ 252.227-7014(a)(1))
+    and "commercial computer software documentation" as such terms are used
+    in 48 C.F.R. 12.212 (Sept. 1995). Consistent with 48 C.F.R. 12.212 and
+    48 C.F.R. 227.7202-1 through 227.7202-4 (June 1995), all
+    U.S. Government End Users acquire Covered Software with only those
+    rights set forth herein. This U.S. Government Rights clause is in lieu
+    of, and supersedes, any other FAR, DFAR, or other clause or provision
+    that addresses Government rights in computer software under this
+    License.
+
+    9. MISCELLANEOUS.
+
+    This License represents the complete agreement concerning subject
+    matter hereof. If any provision of this License is held to be
+    unenforceable, such provision shall be reformed only to the extent
+    necessary to make it enforceable. This License shall be governed by the
+    law of the jurisdiction specified in a notice contained within the
+    Original Software (except to the extent applicable law, if any,
+    provides otherwise), excluding such jurisdiction's conflict-of-law
+    provisions. Any litigation relating to this License shall be subject to
+    the jurisdiction of the courts located in the jurisdiction and venue
+    specified in a notice contained within the Original Software, with the
+    losing party responsible for costs, including, without limitation,
+    court costs and reasonable attorneys' fees and expenses. The
+    application of the United Nations Convention on Contracts for the
+    International Sale of Goods is expressly excluded. Any law or
+    regulation which provides that the language of a contract shall be
+    construed against the drafter shall not apply to this License. You
+    agree that You alone are responsible for compliance with the United
+    States export administration regulations (and the export control laws
+    and regulation of any other countries) when You use, distribute or
+    otherwise make available any Covered Software.
+
+    10. RESPONSIBILITY FOR CLAIMS.
+
+    As between Initial Developer and the Contributors, each party is
+    responsible for claims and damages arising, directly or indirectly, out
+    of its utilization of rights under this License and You agree to work
+    with Initial Developer and Contributors to distribute such
+    responsibility on an equitable basis. Nothing herein is intended or
+    shall be deemed to constitute any admission of liability.
diff --git a/LICENSES/other/Linux-OpenIB b/LICENSES/other/Linux-OpenIB
new file mode 100644
index 0000000..1ad85f6
--- /dev/null
+++ b/LICENSES/other/Linux-OpenIB
@@ -0,0 +1,26 @@
+Valid-License-Identifier: Linux-OpenIB
+SPDX-URL: https://spdx.org/licenses/Linux-OpenIB.html
+Usage-Guide:
+  To use the Linux Kernel Variant of OpenIB.org license put the following
+  SPDX tag/value pair into a comment according to the placement guidelines
+  in the licensing rules documentation:
+    SPDX-License-Identifier: Linux-OpenIB
+License-Text:
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
diff --git a/LICENSES/other/X11 b/LICENSES/other/X11
new file mode 100644
index 0000000..fe4353f
--- /dev/null
+++ b/LICENSES/other/X11
@@ -0,0 +1,37 @@
+Valid-License-Identifier: X11
+SPDX-URL: https://spdx.org/licenses/X11.html
+Usage-Guide:
+  To use the X11 put the following SPDX tag/value pair into a comment
+  according to the placement guidelines in the licensing rules
+  documentation:
+    SPDX-License-Identifier: X11
+License-Text:
+
+
+X11 License
+
+Copyright (C) 1996 X Consortium
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+the rights to use, copy, modify, merge, publish, distribute, sublicense,
+and/or sell copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+X CONSORTIUM BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+Except as contained in this notice, the name of the X Consortium shall not
+be used in advertising or otherwise to promote the sale, use or other
+dealings in this Software without prior written authorization from the X
+Consortium.
+
+X Window System is a trademark of X Consortium, Inc.
diff --git a/LICENSES/preferred/GPL-2.0 b/LICENSES/preferred/GPL-2.0
index b8db91d..ff0812f 100644
--- a/LICENSES/preferred/GPL-2.0
+++ b/LICENSES/preferred/GPL-2.0
@@ -1,5 +1,7 @@
 Valid-License-Identifier: GPL-2.0
+Valid-License-Identifier: GPL-2.0-only
 Valid-License-Identifier: GPL-2.0+
+Valid-License-Identifier: GPL-2.0-or-later
 SPDX-URL: https://spdx.org/licenses/GPL-2.0.html
 Usage-Guide:
   To use this license in source code, put one of the following SPDX
@@ -7,8 +9,12 @@ Usage-Guide:
   guidelines in the licensing rules documentation.
   For 'GNU General Public License (GPL) version 2 only' use:
     SPDX-License-Identifier: GPL-2.0
+  or
+    SPDX-License-Identifier: GPL-2.0-only
   For 'GNU General Public License (GPL) version 2 or any later version' use:
     SPDX-License-Identifier: GPL-2.0+
+  or
+    SPDX-License-Identifier: GPL-2.0-or-later
 License-Text:
 
 		    GNU GENERAL PUBLIC LICENSE
diff --git a/MAINTAINERS b/MAINTAINERS
index 9c125f7..aa63583 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4330,12 +4330,14 @@ W:	http://git.infradead.org/users/hch/dma-mapping.git
 S:	Supported
 F:	lib/dma-debug.c
 F:	lib/dma-direct.c
+F:	lib/dma-noncoherent.c
 F:	lib/dma-virt.c
 F:	drivers/base/dma-mapping.c
 F:	drivers/base/dma-coherent.c
 F:	include/asm-generic/dma-mapping.h
 F:	include/linux/dma-direct.h
 F:	include/linux/dma-mapping.h
+F:	include/linux/dma-noncoherent.h
 
 DME1737 HARDWARE MONITOR DRIVER
 M:	Juerg Haefliger <juergh@gmail.com>
@@ -5948,8 +5950,8 @@ S:	Maintained
 F:	scripts/get_maintainer.pl
 
 GFS2 FILE SYSTEM
-M:	Steven Whitehouse <swhiteho@redhat.com>
 M:	Bob Peterson <rpeterso@redhat.com>
+M:	Andreas Gruenbacher <agruenba@redhat.com>
 L:	cluster-devel@redhat.com
 W:	http://sources.redhat.com/cluster/
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/gfs2/linux-gfs2.git
@@ -6217,6 +6219,7 @@ L:	linux-hwmon@vger.kernel.org
 W:	http://hwmon.wiki.kernel.org/
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/groeck/linux-staging.git
 S:	Maintained
+F:	Documentation/devicetree/bindings/hwmon/
 F:	Documentation/hwmon/
 F:	drivers/hwmon/
 F:	include/linux/hwmon*.h
@@ -8209,7 +8212,7 @@ F:	drivers/misc/lkdtm/*
 
 LINUX KERNEL MEMORY CONSISTENCY MODEL (LKMM)
 M:	Alan Stern <stern@rowland.harvard.edu>
-M:	Andrea Parri <parri.andrea@gmail.com>
+M:	Andrea Parri <andrea.parri@amarulasolutions.com>
 M:	Will Deacon <will.deacon@arm.com>
 M:	Peter Zijlstra <peterz@infradead.org>
 M:	Boqun Feng <boqun.feng@gmail.com>
@@ -8316,6 +8319,7 @@ F:	Documentation/admin-guide/LSM/LoadPin.rst
 LOCKING PRIMITIVES
 M:	Peter Zijlstra <peterz@infradead.org>
 M:	Ingo Molnar <mingo@redhat.com>
+M:	Will Deacon <will.deacon@arm.com>
 L:	linux-kernel@vger.kernel.org
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git locking/core
 S:	Maintained
@@ -9700,7 +9704,7 @@ S:	Maintained
 F:	drivers/net/ethernet/netronome/
 
 NETWORK BLOCK DEVICE (NBD)
-M:	Josef Bacik <jbacik@fb.com>
+M:	Josef Bacik <josef@toxicpanda.com>
 S:	Maintained
 L:	linux-block@vger.kernel.org
 L:	nbd@other.debian.org
@@ -15639,7 +15643,7 @@ L:	linux-mm@kvack.org
 S:	Maintained
 F:	mm/zsmalloc.c
 F:	include/linux/zsmalloc.h
-F:	Documentation/vm/zsmalloc.txt
+F:	Documentation/vm/zsmalloc.rst
 
 ZSWAP COMPRESSED SWAP CACHING
 M:	Seth Jennings <sjenning@redhat.com>
diff --git a/Makefile b/Makefile
index 56ba070..554dcad 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@
 VERSION = 4
 PATCHLEVEL = 17
 SUBLEVEL = 0
-EXTRAVERSION = -rc7
+EXTRAVERSION =
 NAME = Merciless Moray
 
 # *DOCUMENTATION*
diff --git a/arch/Kconfig b/arch/Kconfig
index 75dd23a..b624634 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -278,9 +278,6 @@ config HAVE_CLK
 	  The <linux/clk.h> calls support software clock gating and
 	  thus are a key power management tool on many systems.
 
-config HAVE_DMA_API_DEBUG
-	bool
-
 config HAVE_HW_BREAKPOINT
 	bool
 	depends on PERF_EVENTS
diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index f19dc31..0c4805a 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -10,6 +10,8 @@ config ALPHA
 	select HAVE_OPROFILE
 	select HAVE_PCSPKR_PLATFORM
 	select HAVE_PERF_EVENTS
+	select NEED_DMA_MAP_STATE
+	select NEED_SG_DMA_LENGTH
 	select VIRT_TO_BUS
 	select GENERIC_IRQ_PROBE
 	select AUTO_IRQ_AFFINITY if SMP
@@ -64,15 +66,6 @@ config ZONE_DMA
 	bool
 	default y
 
-config ARCH_DMA_ADDR_T_64BIT
-	def_bool y
-
-config NEED_DMA_MAP_STATE
-       def_bool y
-
-config NEED_SG_DMA_LENGTH
-	def_bool y
-
 config GENERIC_ISA_DMA
 	bool
 	default y
@@ -346,9 +339,6 @@ config PCI_DOMAINS
 config PCI_SYSCALL
 	def_bool PCI
 
-config IOMMU_HELPER
-	def_bool PCI
-
 config ALPHA_NONAME
 	bool
 	depends on ALPHA_BOOK1 || ALPHA_NONAME_CH
@@ -586,7 +576,7 @@ config ARCH_DISCONTIGMEM_ENABLE
 	  Say Y to support efficient handling of discontiguous physical memory,
 	  for architectures which are either NUMA (Non-Uniform Memory Access)
 	  or have huge holes in the physical address space for other reasons.
-	  See <file:Documentation/vm/numa> for more.
+	  See <file:Documentation/vm/numa.rst> for more.
 
 source "mm/Kconfig"
 
diff --git a/arch/alpha/include/asm/pci.h b/arch/alpha/include/asm/pci.h
index b9ec553..cf6bc1e 100644
--- a/arch/alpha/include/asm/pci.h
+++ b/arch/alpha/include/asm/pci.h
@@ -56,11 +56,6 @@ struct pci_controller {
 
 /* IOMMU controls.  */
 
-/* The PCI address space does not equal the physical memory address space.
-   The networking and block device layers use this boolean for bounce buffer
-   decisions.  */
-#define PCI_DMA_BUS_IS_PHYS  0
-
 /* TODO: integrate with include/asm-generic/pci.h ? */
 static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel)
 {
diff --git a/arch/alpha/include/uapi/asm/siginfo.h b/arch/alpha/include/uapi/asm/siginfo.h
index 0cf3b52..db3f013 100644
--- a/arch/alpha/include/uapi/asm/siginfo.h
+++ b/arch/alpha/include/uapi/asm/siginfo.h
@@ -7,18 +7,4 @@
 
 #include <asm-generic/siginfo.h>
 
-/*
- * SIGFPE si_codes
- */
-#ifdef __KERNEL__
-#define FPE_FIXME	0	/* Broken dup of SI_USER */
-#endif /* __KERNEL__ */
-
-/*
- * SIGTRAP si_codes
- */
-#ifdef __KERNEL__
-#define TRAP_FIXME	0	/* Broken dup of SI_USER */
-#endif /* __KERNEL__ */
-
 #endif
diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c
index 89faa6f..6e92175 100644
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c
@@ -871,8 +871,7 @@ SYSCALL_DEFINE5(osf_setsysinfo, unsigned long, op, void __user *, buffer,
 		   send a signal.  Old exceptions are not signaled.  */
 		fex = (exc >> IEEE_STATUS_TO_EXCSUM_SHIFT) & swcr;
  		if (fex) {
-			siginfo_t info;
-			int si_code = 0;
+			int si_code = FPE_FLTUNK;
 
 			if (fex & IEEE_TRAP_ENABLE_DNO) si_code = FPE_FLTUND;
 			if (fex & IEEE_TRAP_ENABLE_INE) si_code = FPE_FLTRES;
@@ -881,11 +880,9 @@ SYSCALL_DEFINE5(osf_setsysinfo, unsigned long, op, void __user *, buffer,
 			if (fex & IEEE_TRAP_ENABLE_DZE) si_code = FPE_FLTDIV;
 			if (fex & IEEE_TRAP_ENABLE_INV) si_code = FPE_FLTINV;
 
-			info.si_signo = SIGFPE;
-			info.si_errno = 0;
-			info.si_code = si_code;
-			info.si_addr = NULL;  /* FIXME */
- 			send_sig_info(SIGFPE, &info, current);
+			send_sig_fault(SIGFPE, si_code,
+				       (void __user *)NULL,  /* FIXME */
+				       0, current);
  		}
 		return 0;
 	}
diff --git a/arch/alpha/kernel/signal.c b/arch/alpha/kernel/signal.c
index 9ebb3bc..8c0c4ee 100644
--- a/arch/alpha/kernel/signal.c
+++ b/arch/alpha/kernel/signal.c
@@ -219,14 +219,8 @@ do_sigreturn(struct sigcontext __user *sc)
 
 	/* Send SIGTRAP if we're single-stepping: */
 	if (ptrace_cancel_bpt (current)) {
-		siginfo_t info;
-
-		info.si_signo = SIGTRAP;
-		info.si_errno = 0;
-		info.si_code = TRAP_BRKPT;
-		info.si_addr = (void __user *) regs->pc;
-		info.si_trapno = 0;
-		send_sig_info(SIGTRAP, &info, current);
+		send_sig_fault(SIGTRAP, TRAP_BRKPT, (void __user *) regs->pc, 0,
+			       current);
 	}
 	return;
 
@@ -253,14 +247,8 @@ do_rt_sigreturn(struct rt_sigframe __user *frame)
 
 	/* Send SIGTRAP if we're single-stepping: */
 	if (ptrace_cancel_bpt (current)) {
-		siginfo_t info;
-
-		info.si_signo = SIGTRAP;
-		info.si_errno = 0;
-		info.si_code = TRAP_BRKPT;
-		info.si_addr = (void __user *) regs->pc;
-		info.si_trapno = 0;
-		send_sig_info(SIGTRAP, &info, current);
+		send_sig_fault(SIGTRAP, TRAP_BRKPT, (void __user *) regs->pc, 0,
+			       current);
 	}
 	return;
 
diff --git a/arch/alpha/kernel/traps.c b/arch/alpha/kernel/traps.c
index f43bd05..bc96276 100644
--- a/arch/alpha/kernel/traps.c
+++ b/arch/alpha/kernel/traps.c
@@ -213,7 +213,6 @@ do_entArith(unsigned long summary, unsigned long write_mask,
 	    struct pt_regs *regs)
 {
 	long si_code = FPE_FLTINV;
-	siginfo_t info;
 
 	if (summary & 1) {
 		/* Software-completion summary bit is set, so try to
@@ -228,17 +227,12 @@ do_entArith(unsigned long summary, unsigned long write_mask,
 	}
 	die_if_kernel("Arithmetic fault", regs, 0, NULL);
 
-	info.si_signo = SIGFPE;
-	info.si_errno = 0;
-	info.si_code = si_code;
-	info.si_addr = (void __user *) regs->pc;
-	send_sig_info(SIGFPE, &info, current);
+	send_sig_fault(SIGFPE, si_code, (void __user *) regs->pc, 0, current);
 }
 
 asmlinkage void
 do_entIF(unsigned long type, struct pt_regs *regs)
 {
-	siginfo_t info;
 	int signo, code;
 
 	if ((regs->ps & ~IPL_MAX) == 0) {
@@ -270,31 +264,20 @@ do_entIF(unsigned long type, struct pt_regs *regs)
 
 	switch (type) {
 	      case 0: /* breakpoint */
-		info.si_signo = SIGTRAP;
-		info.si_errno = 0;
-		info.si_code = TRAP_BRKPT;
-		info.si_trapno = 0;
-		info.si_addr = (void __user *) regs->pc;
-
 		if (ptrace_cancel_bpt(current)) {
 			regs->pc -= 4;	/* make pc point to former bpt */
 		}
 
-		send_sig_info(SIGTRAP, &info, current);
+		send_sig_fault(SIGTRAP, TRAP_BRKPT, (void __user *)regs->pc, 0,
+			       current);
 		return;
 
 	      case 1: /* bugcheck */
-		info.si_signo = SIGTRAP;
-		info.si_errno = 0;
-		info.si_code = TRAP_FIXME;
-		info.si_addr = (void __user *) regs->pc;
-		info.si_trapno = 0;
-		send_sig_info(SIGTRAP, &info, current);
+		send_sig_fault(SIGTRAP, TRAP_UNK, (void __user *) regs->pc, 0,
+			       current);
 		return;
 		
 	      case 2: /* gentrap */
-		info.si_addr = (void __user *) regs->pc;
-		info.si_trapno = regs->r16;
 		switch ((long) regs->r16) {
 		case GEN_INTOVF:
 			signo = SIGFPE;
@@ -326,7 +309,7 @@ do_entIF(unsigned long type, struct pt_regs *regs)
 			break;
 		case GEN_ROPRAND:
 			signo = SIGFPE;
-			code = FPE_FIXME;
+			code = FPE_FLTUNK;
 			break;
 
 		case GEN_DECOVF:
@@ -348,15 +331,12 @@ do_entIF(unsigned long type, struct pt_regs *regs)
 		case GEN_SUBRNG7:
 		default:
 			signo = SIGTRAP;
-			code = TRAP_FIXME;
+			code = TRAP_UNK;
 			break;
 		}
 
-		info.si_signo = signo;
-		info.si_errno = 0;
-		info.si_code = code;
-		info.si_addr = (void __user *) regs->pc;
-		send_sig_info(signo, &info, current);
+		send_sig_fault(signo, code, (void __user *) regs->pc, regs->r16,
+			       current);
 		return;
 
 	      case 4: /* opDEC */
@@ -380,11 +360,9 @@ do_entIF(unsigned long type, struct pt_regs *regs)
 			if (si_code == 0)
 				return;
 			if (si_code > 0) {
-				info.si_signo = SIGFPE;
-				info.si_errno = 0;
-				info.si_code = si_code;
-				info.si_addr = (void __user *) regs->pc;
-				send_sig_info(SIGFPE, &info, current);
+				send_sig_fault(SIGFPE, si_code,
+					       (void __user *) regs->pc, 0,
+					       current);
 				return;
 			}
 		}
@@ -409,11 +387,7 @@ do_entIF(unsigned long type, struct pt_regs *regs)
 		      ;
 	}
 
-	info.si_signo = SIGILL;
-	info.si_errno = 0;
-	info.si_code = ILL_ILLOPC;
-	info.si_addr = (void __user *) regs->pc;
-	send_sig_info(SIGILL, &info, current);
+	send_sig_fault(SIGILL, ILL_ILLOPC, (void __user *)regs->pc, 0, current);
 }
 
 /* There is an ifdef in the PALcode in MILO that enables a 
@@ -426,15 +400,9 @@ do_entIF(unsigned long type, struct pt_regs *regs)
 asmlinkage void
 do_entDbg(struct pt_regs *regs)
 {
-	siginfo_t info;
-
 	die_if_kernel("Instruction fault", regs, 0, NULL);
 
-	info.si_signo = SIGILL;
-	info.si_errno = 0;
-	info.si_code = ILL_ILLOPC;
-	info.si_addr = (void __user *) regs->pc;
-	force_sig_info(SIGILL, &info, current);
+	force_sig_fault(SIGILL, ILL_ILLOPC, (void __user *)regs->pc, 0, current);
 }
 
 
@@ -758,7 +726,7 @@ do_entUnaUser(void __user * va, unsigned long opcode,
 
 	unsigned long tmp1, tmp2, tmp3, tmp4;
 	unsigned long fake_reg, *reg_addr = &fake_reg;
-	siginfo_t info;
+	int si_code;
 	long error;
 
 	/* Check the UAC bits to decide what the user wants us to do
@@ -981,34 +949,27 @@ do_entUnaUser(void __user * va, unsigned long opcode,
 
 give_sigsegv:
 	regs->pc -= 4;  /* make pc point to faulting insn */
-	info.si_signo = SIGSEGV;
-	info.si_errno = 0;
 
 	/* We need to replicate some of the logic in mm/fault.c,
 	   since we don't have access to the fault code in the
 	   exception handling return path.  */
 	if ((unsigned long)va >= TASK_SIZE)
-		info.si_code = SEGV_ACCERR;
+		si_code = SEGV_ACCERR;
 	else {
 		struct mm_struct *mm = current->mm;
 		down_read(&mm->mmap_sem);
 		if (find_vma(mm, (unsigned long)va))
-			info.si_code = SEGV_ACCERR;
+			si_code = SEGV_ACCERR;
 		else
-			info.si_code = SEGV_MAPERR;
+			si_code = SEGV_MAPERR;
 		up_read(&mm->mmap_sem);
 	}
-	info.si_addr = va;
-	send_sig_info(SIGSEGV, &info, current);
+	send_sig_fault(SIGSEGV, si_code, va, 0, current);
 	return;
 
 give_sigbus:
 	regs->pc -= 4;
-	info.si_signo = SIGBUS;
-	info.si_errno = 0;
-	info.si_code = BUS_ADRALN;
-	info.si_addr = va;
-	send_sig_info(SIGBUS, &info, current);
+	send_sig_fault(SIGBUS, BUS_ADRALN, va, 0, current);
 	return;
 }
 
diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c
index cd3c572..de2bd21 100644
--- a/arch/alpha/mm/fault.c
+++ b/arch/alpha/mm/fault.c
@@ -88,7 +88,6 @@ do_page_fault(unsigned long address, unsigned long mmcsr,
 	struct mm_struct *mm = current->mm;
 	const struct exception_table_entry *fixup;
 	int fault, si_code = SEGV_MAPERR;
-	siginfo_t info;
 	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
 	/* As of EV6, a load into $31/$f31 is a prefetch, and never faults
@@ -221,21 +220,13 @@ retry:
 	up_read(&mm->mmap_sem);
 	/* Send a sigbus, regardless of whether we were in kernel
 	   or user mode.  */
-	info.si_signo = SIGBUS;
-	info.si_errno = 0;
-	info.si_code = BUS_ADRERR;
-	info.si_addr = (void __user *) address;
-	force_sig_info(SIGBUS, &info, current);
+	force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *) address, 0, current);
 	if (!user_mode(regs))
 		goto no_context;
 	return;
 
  do_sigsegv:
-	info.si_signo = SIGSEGV;
-	info.si_errno = 0;
-	info.si_code = si_code;
-	info.si_addr = (void __user *) address;
-	force_sig_info(SIGSEGV, &info, current);
+	force_sig_fault(SIGSEGV, si_code, (void __user *) address, 0, current);
 	return;
 
 #ifdef CONFIG_ALPHA_LARGE_VMALLOC
diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index d76bf4a..89d47ea 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -9,11 +9,15 @@
 config ARC
 	def_bool y
 	select ARC_TIMERS
+	select ARCH_HAS_SYNC_DMA_FOR_CPU
+	select ARCH_HAS_SYNC_DMA_FOR_DEVICE
 	select ARCH_HAS_SG_CHAIN
 	select ARCH_SUPPORTS_ATOMIC_RMW if ARC_HAS_LLSC
 	select BUILDTIME_EXTABLE_SORT
 	select CLONE_BACKWARDS
 	select COMMON_CLK
+	select DMA_NONCOHERENT_OPS
+	select DMA_NONCOHERENT_MMAP
 	select GENERIC_ATOMIC64 if !ISA_ARCV2 || !(ARC_HAS_LL64 && ARC_HAS_LLSC)
 	select GENERIC_CLOCKEVENTS
 	select GENERIC_FIND_FIRST_BIT
@@ -453,16 +457,11 @@ config ARC_HAS_PAE40
 	default n
 	depends on ISA_ARCV2
 	select HIGHMEM
+	select PHYS_ADDR_T_64BIT
 	help
 	  Enable access to physical memory beyond 4G, only supported on
 	  ARC cores with 40 bit Physical Addressing support
 
-config ARCH_PHYS_ADDR_T_64BIT
-	def_bool ARC_HAS_PAE40
-
-config ARCH_DMA_ADDR_T_64BIT
-	bool
-
 config ARC_KVADDR_SIZE
 	int "Kernel Virtual Address Space size (MB)"
 	range 0 512
diff --git a/arch/arc/include/asm/Kbuild b/arch/arc/include/asm/Kbuild
index 4bd5d43..bbdcb95 100644
--- a/arch/arc/include/asm/Kbuild
+++ b/arch/arc/include/asm/Kbuild
@@ -2,6 +2,7 @@
 generic-y += bugs.h
 generic-y += device.h
 generic-y += div64.h
+generic-y += dma-mapping.h
 generic-y += emergency-restart.h
 generic-y += extable.h
 generic-y += fb.h
diff --git a/arch/arc/include/asm/dma-mapping.h b/arch/arc/include/asm/dma-mapping.h
deleted file mode 100644
index 7a16824..0000000
--- a/arch/arc/include/asm/dma-mapping.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/*
- * DMA Mapping glue for ARC
- *
- * Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#ifndef ASM_ARC_DMA_MAPPING_H
-#define ASM_ARC_DMA_MAPPING_H
-
-extern const struct dma_map_ops arc_dma_ops;
-
-static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
-{
-	return &arc_dma_ops;
-}
-
-#endif
diff --git a/arch/arc/include/asm/pci.h b/arch/arc/include/asm/pci.h
index ba56c23..4ff53c0 100644
--- a/arch/arc/include/asm/pci.h
+++ b/arch/arc/include/asm/pci.h
@@ -16,12 +16,6 @@
 #define PCIBIOS_MIN_MEM 0x100000
 
 #define pcibios_assign_all_busses()	1
-/*
- * The PCI address space does equal the physical memory address space.
- * The networking and block device layers use this boolean for bounce
- * buffer decisions.
- */
-#define PCI_DMA_BUS_IS_PHYS	1
 
 #endif /* __KERNEL__ */
 
diff --git a/arch/arc/mm/dma.c b/arch/arc/mm/dma.c
index 1dcc404..8c10718 100644
--- a/arch/arc/mm/dma.c
+++ b/arch/arc/mm/dma.c
@@ -16,13 +16,12 @@
  * The default DMA address == Phy address which is 0x8000_0000 based.
  */
 
-#include <linux/dma-mapping.h>
+#include <linux/dma-noncoherent.h>
 #include <asm/cache.h>
 #include <asm/cacheflush.h>
 
-
-static void *arc_dma_alloc(struct device *dev, size_t size,
-		dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
+void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
+		gfp_t gfp, unsigned long attrs)
 {
 	unsigned long order = get_order(size);
 	struct page *page;
@@ -89,7 +88,7 @@ static void *arc_dma_alloc(struct device *dev, size_t size,
 	return kvaddr;
 }
 
-static void arc_dma_free(struct device *dev, size_t size, void *vaddr,
+void arch_dma_free(struct device *dev, size_t size, void *vaddr,
 		dma_addr_t dma_handle, unsigned long attrs)
 {
 	phys_addr_t paddr = dma_handle;
@@ -105,9 +104,9 @@ static void arc_dma_free(struct device *dev, size_t size, void *vaddr,
 	__free_pages(page, get_order(size));
 }
 
-static int arc_dma_mmap(struct device *dev, struct vm_area_struct *vma,
-			void *cpu_addr, dma_addr_t dma_addr, size_t size,
-			unsigned long attrs)
+int arch_dma_mmap(struct device *dev, struct vm_area_struct *vma,
+		void *cpu_addr, dma_addr_t dma_addr, size_t size,
+		unsigned long attrs)
 {
 	unsigned long user_count = vma_pages(vma);
 	unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT;
@@ -130,149 +129,14 @@ static int arc_dma_mmap(struct device *dev, struct vm_area_struct *vma,
 	return ret;
 }
 
-/*
- * streaming DMA Mapping API...
- * CPU accesses page via normal paddr, thus needs to explicitly made
- * consistent before each use
- */
-static void _dma_cache_sync(phys_addr_t paddr, size_t size,
-		enum dma_data_direction dir)
+void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr,
+		size_t size, enum dma_data_direction dir)
 {
-	switch (dir) {
-	case DMA_FROM_DEVICE:
-		dma_cache_inv(paddr, size);
-		break;
-	case DMA_TO_DEVICE:
-		dma_cache_wback(paddr, size);
-		break;
-	case DMA_BIDIRECTIONAL:
-		dma_cache_wback_inv(paddr, size);
-		break;
-	default:
-		pr_err("Invalid DMA dir [%d] for OP @ %pa[p]\n", dir, &paddr);
-	}
+	dma_cache_wback(paddr, size);
 }
 
-/*
- * arc_dma_map_page - map a portion of a page for streaming DMA
- *
- * Ensure that any data held in the cache is appropriately discarded
- * or written back.
- *
- * The device owns this memory once this call has completed.  The CPU
- * can regain ownership by calling dma_unmap_page().
- *
- * Note: while it takes struct page as arg, caller can "abuse" it to pass
- * a region larger than PAGE_SIZE, provided it is physically contiguous
- * and this still works correctly
- */
-static dma_addr_t arc_dma_map_page(struct device *dev, struct page *page,
-		unsigned long offset, size_t size, enum dma_data_direction dir,
-		unsigned long attrs)
+void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr,
+		size_t size, enum dma_data_direction dir)
 {
-	phys_addr_t paddr = page_to_phys(page) + offset;
-
-	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
-		_dma_cache_sync(paddr, size, dir);
-
-	return paddr;
-}
-
-/*
- * arc_dma_unmap_page - unmap a buffer previously mapped through dma_map_page()
- *
- * After this call, reads by the CPU to the buffer are guaranteed to see
- * whatever the device wrote there.
- *
- * Note: historically this routine was not implemented for ARC
- */
-static void arc_dma_unmap_page(struct device *dev, dma_addr_t handle,
-			       size_t size, enum dma_data_direction dir,
-			       unsigned long attrs)
-{
-	phys_addr_t paddr = handle;
-
-	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
-		_dma_cache_sync(paddr, size, dir);
+	dma_cache_inv(paddr, size);
 }
-
-static int arc_dma_map_sg(struct device *dev, struct scatterlist *sg,
-	   int nents, enum dma_data_direction dir, unsigned long attrs)
-{
-	struct scatterlist *s;
-	int i;
-
-	for_each_sg(sg, s, nents, i)
-		s->dma_address = dma_map_page(dev, sg_page(s), s->offset,
-					       s->length, dir);
-
-	return nents;
-}
-
-static void arc_dma_unmap_sg(struct device *dev, struct scatterlist *sg,
-			     int nents, enum dma_data_direction dir,
-			     unsigned long attrs)
-{
-	struct scatterlist *s;
-	int i;
-
-	for_each_sg(sg, s, nents, i)
-		arc_dma_unmap_page(dev, sg_dma_address(s), sg_dma_len(s), dir,
-				   attrs);
-}
-
-static void arc_dma_sync_single_for_cpu(struct device *dev,
-		dma_addr_t dma_handle, size_t size, enum dma_data_direction dir)
-{
-	_dma_cache_sync(dma_handle, size, DMA_FROM_DEVICE);
-}
-
-static void arc_dma_sync_single_for_device(struct device *dev,
-		dma_addr_t dma_handle, size_t size, enum dma_data_direction dir)
-{
-	_dma_cache_sync(dma_handle, size, DMA_TO_DEVICE);
-}
-
-static void arc_dma_sync_sg_for_cpu(struct device *dev,
-		struct scatterlist *sglist, int nelems,
-		enum dma_data_direction dir)
-{
-	int i;
-	struct scatterlist *sg;
-
-	for_each_sg(sglist, sg, nelems, i)
-		_dma_cache_sync(sg_phys(sg), sg->length, dir);
-}
-
-static void arc_dma_sync_sg_for_device(struct device *dev,
-		struct scatterlist *sglist, int nelems,
-		enum dma_data_direction dir)
-{
-	int i;
-	struct scatterlist *sg;
-
-	for_each_sg(sglist, sg, nelems, i)
-		_dma_cache_sync(sg_phys(sg), sg->length, dir);
-}
-
-static int arc_dma_supported(struct device *dev, u64 dma_mask)
-{
-	/* Support 32 bit DMA mask exclusively */
-	return dma_mask == DMA_BIT_MASK(32);
-}
-
-const struct dma_map_ops arc_dma_ops = {
-	.alloc			= arc_dma_alloc,
-	.free			= arc_dma_free,
-	.mmap			= arc_dma_mmap,
-	.map_page		= arc_dma_map_page,
-	.unmap_page		= arc_dma_unmap_page,
-	.map_sg			= arc_dma_map_sg,
-	.unmap_sg		= arc_dma_unmap_sg,
-	.sync_single_for_device	= arc_dma_sync_single_for_device,
-	.sync_single_for_cpu	= arc_dma_sync_single_for_cpu,
-	.sync_sg_for_cpu	= arc_dma_sync_sg_for_cpu,
-	.sync_sg_for_device	= arc_dma_sync_sg_for_device,
-	.dma_supported		= arc_dma_supported,
-};
-EXPORT_SYMBOL(arc_dma_ops);
diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c
index a0b7bd6..b884bbd 100644
--- a/arch/arc/mm/fault.c
+++ b/arch/arc/mm/fault.c
@@ -70,6 +70,8 @@ void do_page_fault(unsigned long address, struct pt_regs *regs)
 	int write = regs->ecr_cause & ECR_C_PROTV_STORE;  /* ST/EX */
 	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
+	clear_siginfo(&info);
+
 	/*
 	 * We fault-in kernel-space virtual memory on-demand. The
 	 * 'reference' page table is init_mm.pgd.
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index a7f8e7f..c43f5bb 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -60,7 +60,6 @@ config ARM
 	select HAVE_CONTEXT_TRACKING
 	select HAVE_C_RECORDMCOUNT
 	select HAVE_DEBUG_KMEMLEAK
-	select HAVE_DMA_API_DEBUG
 	select HAVE_DMA_CONTIGUOUS if MMU
 	select HAVE_DYNAMIC_FTRACE if (!XIP_KERNEL) && !CPU_ENDIAN_BE32 && MMU
 	select HAVE_DYNAMIC_FTRACE_WITH_REGS if HAVE_DYNAMIC_FTRACE
@@ -96,6 +95,7 @@ config ARM
 	select HAVE_VIRT_CPU_ACCOUNTING_GEN
 	select IRQ_FORCED_THREADING
 	select MODULES_USE_ELF_REL
+	select NEED_DMA_MAP_STATE
 	select NO_BOOTMEM
 	select OF_EARLY_FLATTREE if OF
 	select OF_RESERVED_MEM if OF
@@ -119,9 +119,6 @@ config ARM_HAS_SG_CHAIN
 	select ARCH_HAS_SG_CHAIN
 	bool
 
-config NEED_SG_DMA_LENGTH
-	bool
-
 config ARM_DMA_USE_IOMMU
 	bool
 	select ARM_HAS_SG_CHAIN
@@ -224,9 +221,6 @@ config ARCH_MAY_HAVE_PC_FDC
 config ZONE_DMA
 	bool
 
-config NEED_DMA_MAP_STATE
-       def_bool y
-
 config ARCH_SUPPORTS_UPROBES
 	def_bool y
 
@@ -1778,12 +1772,6 @@ config SECCOMP
 	  and the task is only allowed to execute a few safe syscalls
 	  defined by each seccomp mode.
 
-config SWIOTLB
-	def_bool y
-
-config IOMMU_HELPER
-	def_bool SWIOTLB
-
 config PARAVIRT
 	bool "Enable paravirtualization code"
 	help
@@ -1815,6 +1803,7 @@ config XEN
 	depends on MMU
 	select ARCH_DMA_ADDR_T_64BIT
 	select ARM_PSCI
+	select SWIOTLB
 	select SWIOTLB_XEN
 	select PARAVIRT
 	help
diff --git a/arch/arm/include/asm/pci.h b/arch/arm/include/asm/pci.h
index 1f0de80..0abd389 100644
--- a/arch/arm/include/asm/pci.h
+++ b/arch/arm/include/asm/pci.h
@@ -19,13 +19,6 @@ static inline int pci_proc_domain(struct pci_bus *bus)
 }
 #endif /* CONFIG_PCI_DOMAINS */
 
-/*
- * The PCI address space does equal the physical memory address space.
- * The networking and block device layers use this boolean for bounce
- * buffer decisions.
- */
-#define PCI_DMA_BUS_IS_PHYS     (1)
-
 #define HAVE_PCI_MMAP
 #define ARCH_GENERIC_PCI_MMAP_RESOURCE
 
diff --git a/arch/arm/kernel/dma.c b/arch/arm/kernel/dma.c
index e651c4d..6739d37 100644
--- a/arch/arm/kernel/dma.c
+++ b/arch/arm/kernel/dma.c
@@ -276,21 +276,9 @@ static int proc_dma_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int proc_dma_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, proc_dma_show, NULL);
-}
-
-static const struct file_operations proc_dma_operations = {
-	.open		= proc_dma_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int __init proc_dma_init(void)
 {
-	proc_create("dma", 0, NULL, &proc_dma_operations);
+	proc_create_single("dma", 0, NULL, proc_dma_show);
 	return 0;
 }
 
diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c
index 7724b0f..36718a4 100644
--- a/arch/arm/kernel/ptrace.c
+++ b/arch/arm/kernel/ptrace.c
@@ -205,6 +205,7 @@ void ptrace_break(struct task_struct *tsk, struct pt_regs *regs)
 {
 	siginfo_t info;
 
+	clear_siginfo(&info);
 	info.si_signo = SIGTRAP;
 	info.si_errno = 0;
 	info.si_code  = TRAP_BRKPT;
diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index fc40a2b..35ca494 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -754,7 +754,7 @@ int __init arm_add_memory(u64 start, u64 size)
 	else
 		size -= aligned_start - start;
 
-#ifndef CONFIG_ARCH_PHYS_ADDR_T_64BIT
+#ifndef CONFIG_PHYS_ADDR_T_64BIT
 	if (aligned_start > ULONG_MAX) {
 		pr_crit("Ignoring memory at 0x%08llx outside 32-bit physical address space\n",
 			(long long)start);
diff --git a/arch/arm/kernel/swp_emulate.c b/arch/arm/kernel/swp_emulate.c
index 3bda08b..80517f2 100644
--- a/arch/arm/kernel/swp_emulate.c
+++ b/arch/arm/kernel/swp_emulate.c
@@ -91,18 +91,6 @@ static int proc_status_show(struct seq_file *m, void *v)
 		seq_printf(m, "Last process:\t\t%d\n", previous_pid);
 	return 0;
 }
-
-static int proc_status_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, proc_status_show, PDE_DATA(inode));
-}
-
-static const struct file_operations proc_status_fops = {
-	.open		= proc_status_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
 #endif
 
 /*
@@ -112,6 +100,7 @@ static void set_segfault(struct pt_regs *regs, unsigned long addr)
 {
 	siginfo_t info;
 
+	clear_siginfo(&info);
 	down_read(&current->mm->mmap_sem);
 	if (find_vma(current->mm, addr) == NULL)
 		info.si_code = SEGV_MAPERR;
@@ -260,7 +249,8 @@ static int __init swp_emulation_init(void)
 		return 0;
 
 #ifdef CONFIG_PROC_FS
-	if (!proc_create("cpu/swp_emulation", S_IRUGO, NULL, &proc_status_fops))
+	if (!proc_create_single("cpu/swp_emulation", S_IRUGO, NULL,
+			proc_status_show))
 		return -ENOMEM;
 #endif /* CONFIG_PROC_FS */
 
diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c
index 2fe8710..badf02c 100644
--- a/arch/arm/kernel/traps.c
+++ b/arch/arm/kernel/traps.c
@@ -441,6 +441,7 @@ asmlinkage void do_undefinstr(struct pt_regs *regs)
 	siginfo_t info;
 	void __user *pc;
 
+	clear_siginfo(&info);
 	pc = (void __user *)instruction_pointer(regs);
 
 	if (processor_mode(regs) == SVC_MODE) {
@@ -540,6 +541,7 @@ static int bad_syscall(int n, struct pt_regs *regs)
 {
 	siginfo_t info;
 
+	clear_siginfo(&info);
 	if ((current->personality & PER_MASK) != PER_LINUX) {
 		send_sig(SIGSEGV, current, 1);
 		return regs->ARM_r0;
@@ -607,6 +609,7 @@ asmlinkage int arm_syscall(int no, struct pt_regs *regs)
 {
 	siginfo_t info;
 
+	clear_siginfo(&info);
 	if ((no >> 16) != (__ARM_NR_BASE>> 16))
 		return bad_syscall(no, regs);
 
@@ -743,6 +746,8 @@ baddataabort(int code, unsigned long instr, struct pt_regs *regs)
 	unsigned long addr = instruction_pointer(regs);
 	siginfo_t info;
 
+	clear_siginfo(&info);
+
 #ifdef CONFIG_DEBUG_USER
 	if (user_debug & UDBG_BADABORT) {
 		pr_err("[%d] %s: bad data abort: code %d instr 0x%08lx\n",
diff --git a/arch/arm/mach-axxia/Kconfig b/arch/arm/mach-axxia/Kconfig
index bb2ce1c..d3eae60 100644
--- a/arch/arm/mach-axxia/Kconfig
+++ b/arch/arm/mach-axxia/Kconfig
@@ -2,7 +2,6 @@
 config ARCH_AXXIA
 	bool "LSI Axxia platforms"
 	depends on ARCH_MULTI_V7 && ARM_LPAE
-	select ARCH_DMA_ADDR_T_64BIT
 	select ARM_AMBA
 	select ARM_GIC
 	select ARM_TIMER_SP804
diff --git a/arch/arm/mach-bcm/Kconfig b/arch/arm/mach-bcm/Kconfig
index c2f3b0d..c46a728 100644
--- a/arch/arm/mach-bcm/Kconfig
+++ b/arch/arm/mach-bcm/Kconfig
@@ -211,7 +211,6 @@ config ARCH_BRCMSTB
 	select BRCMSTB_L2_IRQ
 	select BCM7120_L2_IRQ
 	select ARCH_HAS_HOLES_MEMORYMODEL
-	select ARCH_DMA_ADDR_T_64BIT if ARM_LPAE
 	select ZONE_DMA if ARM_LPAE
 	select SOC_BRCMSTB
 	select SOC_BUS
diff --git a/arch/arm/mach-exynos/Kconfig b/arch/arm/mach-exynos/Kconfig
index 647c319..2ca4058 100644
--- a/arch/arm/mach-exynos/Kconfig
+++ b/arch/arm/mach-exynos/Kconfig
@@ -112,7 +112,6 @@ config SOC_EXYNOS5440
 	bool "SAMSUNG EXYNOS5440"
 	default y
 	depends on ARCH_EXYNOS5
-	select ARCH_DMA_ADDR_T_64BIT if ARM_LPAE
 	select HAVE_ARM_ARCH_TIMER
 	select AUTO_ZRELADDR
 	select PINCTRL_EXYNOS5440
diff --git a/arch/arm/mach-highbank/Kconfig b/arch/arm/mach-highbank/Kconfig
index 81110ec..5552968 100644
--- a/arch/arm/mach-highbank/Kconfig
+++ b/arch/arm/mach-highbank/Kconfig
@@ -1,7 +1,6 @@
 config ARCH_HIGHBANK
 	bool "Calxeda ECX-1000/2000 (Highbank/Midway)"
 	depends on ARCH_MULTI_V7
-	select ARCH_DMA_ADDR_T_64BIT if ARM_LPAE
 	select ARCH_HAS_HOLES_MEMORYMODEL
 	select ARCH_SUPPORTS_BIG_ENDIAN
 	select ARM_AMBA
diff --git a/arch/arm/mach-rockchip/Kconfig b/arch/arm/mach-rockchip/Kconfig
index a406596..fafd3d7 100644
--- a/arch/arm/mach-rockchip/Kconfig
+++ b/arch/arm/mach-rockchip/Kconfig
@@ -3,7 +3,6 @@ config ARCH_ROCKCHIP
 	depends on ARCH_MULTI_V7
 	select PINCTRL
 	select PINCTRL_ROCKCHIP
-	select ARCH_DMA_ADDR_T_64BIT if ARM_LPAE
 	select ARCH_HAS_RESET_CONTROLLER
 	select ARM_AMBA
 	select ARM_GIC
diff --git a/arch/arm/mach-rpc/ecard.c b/arch/arm/mach-rpc/ecard.c
index bdb5ec1..39aef48 100644
--- a/arch/arm/mach-rpc/ecard.c
+++ b/arch/arm/mach-rpc/ecard.c
@@ -657,25 +657,13 @@ static int ecard_devices_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int ecard_devices_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, ecard_devices_proc_show, NULL);
-}
-
-static const struct file_operations bus_ecard_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= ecard_devices_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static struct proc_dir_entry *proc_bus_ecard_dir = NULL;
 
 static void ecard_proc_init(void)
 {
 	proc_bus_ecard_dir = proc_mkdir("bus/ecard", NULL);
-	proc_create("devices", 0, proc_bus_ecard_dir, &bus_ecard_proc_fops);
+	proc_create_single("devices", 0, proc_bus_ecard_dir,
+			ecard_devices_proc_show);
 }
 
 #define ec_set_resource(ec,nr,st,sz)				\
diff --git a/arch/arm/mach-shmobile/Kconfig b/arch/arm/mach-shmobile/Kconfig
index 280e731..fe60cd09 100644
--- a/arch/arm/mach-shmobile/Kconfig
+++ b/arch/arm/mach-shmobile/Kconfig
@@ -29,7 +29,6 @@ config ARCH_RMOBILE
 menuconfig ARCH_RENESAS
 	bool "Renesas ARM SoCs"
 	depends on ARCH_MULTI_V7 && MMU
-	select ARCH_DMA_ADDR_T_64BIT if ARM_LPAE
 	select ARCH_SHMOBILE
 	select ARM_GIC
 	select GPIOLIB
diff --git a/arch/arm/mach-tegra/Kconfig b/arch/arm/mach-tegra/Kconfig
index 1e0aeb4..7f3b83e 100644
--- a/arch/arm/mach-tegra/Kconfig
+++ b/arch/arm/mach-tegra/Kconfig
@@ -15,6 +15,5 @@ menuconfig ARCH_TEGRA
 	select RESET_CONTROLLER
 	select SOC_BUS
 	select ZONE_DMA if ARM_LPAE
-	select ARCH_DMA_ADDR_T_64BIT if ARM_LPAE
 	help
 	  This enables support for NVIDIA Tegra based systems.
diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig
index 7f14acf..5a016bc 100644
--- a/arch/arm/mm/Kconfig
+++ b/arch/arm/mm/Kconfig
@@ -661,6 +661,7 @@ config ARM_LPAE
 	bool "Support for the Large Physical Address Extension"
 	depends on MMU && CPU_32v7 && !CPU_32v6 && !CPU_32v5 && \
 		!CPU_32v4 && !CPU_32v3
+	select PHYS_ADDR_T_64BIT
 	help
 	  Say Y if you have an ARMv7 processor supporting the LPAE page
 	  table format and you would like to access memory beyond the
@@ -673,12 +674,6 @@ config ARM_PV_FIXUP
 	def_bool y
 	depends on ARM_LPAE && ARM_PATCH_PHYS_VIRT && ARCH_KEYSTONE
 
-config ARCH_PHYS_ADDR_T_64BIT
-	def_bool ARM_LPAE
-
-config ARCH_DMA_ADDR_T_64BIT
-	bool
-
 config ARM_THUMB
 	bool "Support Thumb user binaries" if !CPU_THUMBONLY && EXPERT
 	depends on CPU_THUMB_CAPABLE
diff --git a/arch/arm/mm/alignment.c b/arch/arm/mm/alignment.c
index 2c96190..bd2c739 100644
--- a/arch/arm/mm/alignment.c
+++ b/arch/arm/mm/alignment.c
@@ -950,6 +950,7 @@ do_alignment(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 	if (ai_usermode & UM_SIGNAL) {
 		siginfo_t si;
 
+		clear_siginfo(&si);
 		si.si_signo = SIGBUS;
 		si.si_errno = 0;
 		si.si_code = BUS_ADRALN;
diff --git a/arch/arm/mm/dma-mapping-nommu.c b/arch/arm/mm/dma-mapping-nommu.c
index 619f24a..f448a06 100644
--- a/arch/arm/mm/dma-mapping-nommu.c
+++ b/arch/arm/mm/dma-mapping-nommu.c
@@ -241,12 +241,3 @@ void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
 void arch_teardown_dma_ops(struct device *dev)
 {
 }
-
-#define PREALLOC_DMA_DEBUG_ENTRIES	4096
-
-static int __init dma_debug_do_init(void)
-{
-	dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
-	return 0;
-}
-core_initcall(dma_debug_do_init);
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index ada8eb2..4b6613b 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -1151,15 +1151,6 @@ int arm_dma_supported(struct device *dev, u64 mask)
 	return __dma_supported(dev, mask, false);
 }
 
-#define PREALLOC_DMA_DEBUG_ENTRIES	4096
-
-static int __init dma_debug_do_init(void)
-{
-	dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
-	return 0;
-}
-core_initcall(dma_debug_do_init);
-
 #ifdef CONFIG_ARM_DMA_USE_IOMMU
 
 static int __dma_info_to_prot(enum dma_data_direction dir, unsigned long attrs)
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index b75eada..3203454 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -163,6 +163,8 @@ __do_user_fault(struct task_struct *tsk, unsigned long addr,
 {
 	struct siginfo si;
 
+	clear_siginfo(&si);
+
 #ifdef CONFIG_DEBUG_USER
 	if (((user_debug & UDBG_SEGV) && (sig == SIGSEGV)) ||
 	    ((user_debug & UDBG_BUS)  && (sig == SIGBUS))) {
@@ -557,6 +559,7 @@ do_DataAbort(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 		inf->name, fsr, addr);
 	show_pte(current->mm, addr);
 
+	clear_siginfo(&info);
 	info.si_signo = inf->sig;
 	info.si_errno = 0;
 	info.si_code  = inf->code;
@@ -589,6 +592,7 @@ do_PrefetchAbort(unsigned long addr, unsigned int ifsr, struct pt_regs *regs)
 	pr_alert("Unhandled prefetch abort: %s (0x%03x) at 0x%08lx\n",
 		inf->name, ifsr, addr);
 
+	clear_siginfo(&info);
 	info.si_signo = inf->sig;
 	info.si_errno = 0;
 	info.si_code  = inf->code;
diff --git a/arch/arm/vfp/vfpmodule.c b/arch/arm/vfp/vfpmodule.c
index af4ee2c..35d0f82 100644
--- a/arch/arm/vfp/vfpmodule.c
+++ b/arch/arm/vfp/vfpmodule.c
@@ -218,8 +218,7 @@ static void vfp_raise_sigfpe(unsigned int sicode, struct pt_regs *regs)
 {
 	siginfo_t info;
 
-	memset(&info, 0, sizeof(info));
-
+	clear_siginfo(&info);
 	info.si_signo = SIGFPE;
 	info.si_code = sicode;
 	info.si_addr = (void __user *)(instruction_pointer(regs) - 4);
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index eb2cf49..b25ed78 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -105,7 +105,6 @@ config ARM64
 	select HAVE_CONTEXT_TRACKING
 	select HAVE_DEBUG_BUGVERBOSE
 	select HAVE_DEBUG_KMEMLEAK
-	select HAVE_DMA_API_DEBUG
 	select HAVE_DMA_CONTIGUOUS
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_EFFICIENT_UNALIGNED_ACCESS
@@ -133,6 +132,8 @@ config ARM64
 	select IRQ_FORCED_THREADING
 	select MODULES_USE_ELF_RELA
 	select MULTI_IRQ_HANDLER
+	select NEED_DMA_MAP_STATE
+	select NEED_SG_DMA_LENGTH
 	select NO_BOOTMEM
 	select OF
 	select OF_EARLY_FLATTREE
@@ -142,6 +143,7 @@ config ARM64
 	select POWER_SUPPLY
 	select REFCOUNT_FULL
 	select SPARSE_IRQ
+	select SWIOTLB
 	select SYSCTL_EXCEPTION_TRACE
 	select THREAD_INFO_IN_TASK
 	help
@@ -150,9 +152,6 @@ config ARM64
 config 64BIT
 	def_bool y
 
-config ARCH_PHYS_ADDR_T_64BIT
-	def_bool y
-
 config MMU
 	def_bool y
 
@@ -237,24 +236,9 @@ config ZONE_DMA32
 config HAVE_GENERIC_GUP
 	def_bool y
 
-config ARCH_DMA_ADDR_T_64BIT
-	def_bool y
-
-config NEED_DMA_MAP_STATE
-	def_bool y
-
-config NEED_SG_DMA_LENGTH
-	def_bool y
-
 config SMP
 	def_bool y
 
-config SWIOTLB
-	def_bool y
-
-config IOMMU_HELPER
-	def_bool SWIOTLB
-
 config KERNEL_MODE_NEON
 	def_bool y
 
diff --git a/arch/arm64/include/asm/pci.h b/arch/arm64/include/asm/pci.h
index 8747f7c..9e69068 100644
--- a/arch/arm64/include/asm/pci.h
+++ b/arch/arm64/include/asm/pci.h
@@ -18,11 +18,6 @@
 #define pcibios_assign_all_busses() \
 	(pci_has_flag(PCI_REASSIGN_ALL_BUS))
 
-/*
- * PCI address space differs from physical memory address space
- */
-#define PCI_DMA_BUS_IS_PHYS	(0)
-
 #define ARCH_GENERIC_PCI_MMAP_RESOURCE	1
 
 extern int isa_dma_bridge_buggy;
diff --git a/arch/arm64/include/asm/spinlock.h b/arch/arm64/include/asm/spinlock.h
index ebdae15..26c5bd7 100644
--- a/arch/arm64/include/asm/spinlock.h
+++ b/arch/arm64/include/asm/spinlock.h
@@ -122,11 +122,6 @@ static inline int arch_spin_value_unlocked(arch_spinlock_t lock)
 
 static inline int arch_spin_is_locked(arch_spinlock_t *lock)
 {
-	/*
-	 * Ensure prior spin_lock operations to other locks have completed
-	 * on this CPU before we test whether "lock" is locked.
-	 */
-	smp_mb(); /* ^^^ */
 	return !arch_spin_value_unlocked(READ_ONCE(*lock));
 }
 
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 87a3536..4bcdd03 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -882,7 +882,7 @@ asmlinkage void do_fpsimd_exc(unsigned int esr, struct pt_regs *regs)
 			si_code = FPE_FLTRES;
 	}
 
-	memset(&info, 0, sizeof(info));
+	clear_siginfo(&info);
 	info.si_signo = SIGFPE;
 	info.si_code = si_code;
 	info.si_addr = (void __user *)instruction_pointer(regs);
diff --git a/arch/arm64/kernel/sys_compat.c b/arch/arm64/kernel/sys_compat.c
index 93ab57d..a610982 100644
--- a/arch/arm64/kernel/sys_compat.c
+++ b/arch/arm64/kernel/sys_compat.c
@@ -112,6 +112,7 @@ long compat_arm_syscall(struct pt_regs *regs)
 		break;
 	}
 
+	clear_siginfo(&info);
 	info.si_signo = SIGILL;
 	info.si_errno = 0;
 	info.si_code  = ILL_ILLTRP;
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index 8bbdc17..d399d45 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -635,6 +635,7 @@ asmlinkage void bad_el0_sync(struct pt_regs *regs, int reason, unsigned int esr)
 	siginfo_t info;
 	void __user *pc = (void __user *)instruction_pointer(regs);
 
+	clear_siginfo(&info);
 	info.si_signo = SIGILL;
 	info.si_errno = 0;
 	info.si_code  = ILL_ILLOPC;
diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index a96ec01..db01f27 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -508,16 +508,6 @@ static int __init arm64_dma_init(void)
 }
 arch_initcall(arm64_dma_init);
 
-#define PREALLOC_DMA_DEBUG_ENTRIES	4096
-
-static int __init dma_debug_do_init(void)
-{
-	dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
-	return 0;
-}
-fs_initcall(dma_debug_do_init);
-
-
 #ifdef CONFIG_IOMMU_DMA
 #include <linux/dma-iommu.h>
 #include <linux/platform_device.h>
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 2af3dd8..576f151 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -356,11 +356,12 @@ static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *re
 	 */
 	if (user_mode(regs)) {
 		const struct fault_info *inf = esr_to_fault_info(esr);
-		struct siginfo si = {
-			.si_signo	= inf->sig,
-			.si_code	= inf->code,
-			.si_addr	= (void __user *)addr,
-		};
+		struct siginfo si;
+
+		clear_siginfo(&si);
+		si.si_signo	= inf->sig;
+		si.si_code	= inf->code;
+		si.si_addr	= (void __user *)addr;
 
 		__do_user_fault(&si, esr);
 	} else {
@@ -634,6 +635,7 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
 			nmi_exit();
 	}
 
+	clear_siginfo(&info);
 	info.si_signo = inf->sig;
 	info.si_errno = 0;
 	info.si_code  = inf->code;
@@ -738,6 +740,7 @@ asmlinkage void __exception do_mem_abort(unsigned long addr, unsigned int esr,
 		show_pte(addr);
 	}
 
+	clear_siginfo(&info);
 	info.si_signo = inf->sig;
 	info.si_errno = 0;
 	info.si_code  = inf->code;
@@ -780,6 +783,7 @@ asmlinkage void __exception do_sp_pc_abort(unsigned long addr,
 		local_irq_enable();
 	}
 
+	clear_siginfo(&info);
 	info.si_signo = SIGBUS;
 	info.si_errno = 0;
 	info.si_code  = BUS_ADRALN;
@@ -823,7 +827,6 @@ asmlinkage int __exception do_debug_exception(unsigned long addr,
 					      struct pt_regs *regs)
 {
 	const struct fault_info *inf = debug_fault_info + DBG_ESR_EVT(esr);
-	struct siginfo info;
 	int rv;
 
 	/*
@@ -839,6 +842,9 @@ asmlinkage int __exception do_debug_exception(unsigned long addr,
 	if (!inf->fn(addr, esr, regs)) {
 		rv = 1;
 	} else {
+		struct siginfo info;
+
+		clear_siginfo(&info);
 		info.si_signo = inf->sig;
 		info.si_errno = 0;
 		info.si_code  = inf->code;
diff --git a/arch/c6x/Kconfig b/arch/c6x/Kconfig
index c6b4dd1..bf598556 100644
--- a/arch/c6x/Kconfig
+++ b/arch/c6x/Kconfig
@@ -6,11 +6,13 @@
 
 config C6X
 	def_bool y
+	select ARCH_HAS_SYNC_DMA_FOR_CPU
+	select ARCH_HAS_SYNC_DMA_FOR_DEVICE
 	select CLKDEV_LOOKUP
+	select DMA_NONCOHERENT_OPS
 	select GENERIC_ATOMIC64
 	select GENERIC_IRQ_SHOW
 	select HAVE_ARCH_TRACEHOOK
-	select HAVE_DMA_API_DEBUG
 	select HAVE_MEMBLOCK
 	select SPARSE_IRQ
 	select IRQ_DOMAIN
diff --git a/arch/c6x/include/asm/Kbuild b/arch/c6x/include/asm/Kbuild
index fd4c840..434600e 100644
--- a/arch/c6x/include/asm/Kbuild
+++ b/arch/c6x/include/asm/Kbuild
@@ -5,6 +5,7 @@ generic-y += current.h
 generic-y += device.h
 generic-y += div64.h
 generic-y += dma.h
+generic-y += dma-mapping.h
 generic-y += emergency-restart.h
 generic-y += exec.h
 generic-y += extable.h
diff --git a/arch/c6x/include/asm/dma-mapping.h b/arch/c6x/include/asm/dma-mapping.h
deleted file mode 100644
index 05daf10..0000000
--- a/arch/c6x/include/asm/dma-mapping.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- *  Port on Texas Instruments TMS320C6x architecture
- *
- *  Copyright (C) 2004, 2009, 2010, 2011 Texas Instruments Incorporated
- *  Author: Aurelien Jacquiot <aurelien.jacquiot@ti.com>
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License version 2 as
- *  published by the Free Software Foundation.
- *
- */
-#ifndef _ASM_C6X_DMA_MAPPING_H
-#define _ASM_C6X_DMA_MAPPING_H
-
-extern const struct dma_map_ops c6x_dma_ops;
-
-static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
-{
-	return &c6x_dma_ops;
-}
-
-extern void coherent_mem_init(u32 start, u32 size);
-void *c6x_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
-		gfp_t gfp, unsigned long attrs);
-void c6x_dma_free(struct device *dev, size_t size, void *vaddr,
-		dma_addr_t dma_handle, unsigned long attrs);
-
-#endif	/* _ASM_C6X_DMA_MAPPING_H */
diff --git a/arch/c6x/include/asm/setup.h b/arch/c6x/include/asm/setup.h
index 852afb2..350f34d 100644
--- a/arch/c6x/include/asm/setup.h
+++ b/arch/c6x/include/asm/setup.h
@@ -28,5 +28,7 @@ extern unsigned char c6x_fuse_mac[6];
 extern void machine_init(unsigned long dt_ptr);
 extern void time_init(void);
 
+extern void coherent_mem_init(u32 start, u32 size);
+
 #endif /* !__ASSEMBLY__ */
 #endif /* _ASM_C6X_SETUP_H */
diff --git a/arch/c6x/kernel/Makefile b/arch/c6x/kernel/Makefile
index 02f340d..fbe7417 100644
--- a/arch/c6x/kernel/Makefile
+++ b/arch/c6x/kernel/Makefile
@@ -8,6 +8,6 @@ extra-y := head.o vmlinux.lds
 obj-y := process.o traps.o irq.o signal.o ptrace.o
 obj-y += setup.o sys_c6x.o time.o devicetree.o
 obj-y += switch_to.o entry.o vectors.o c6x_ksyms.o
-obj-y += soc.o dma.o
+obj-y += soc.o
 
 obj-$(CONFIG_MODULES)           += module.o
diff --git a/arch/c6x/kernel/dma.c b/arch/c6x/kernel/dma.c
deleted file mode 100644
index 9fff8be..0000000
--- a/arch/c6x/kernel/dma.c
+++ /dev/null
@@ -1,149 +0,0 @@
-/*
- *  Copyright (C) 2011 Texas Instruments Incorporated
- *  Author: Mark Salter <msalter@redhat.com>
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License version 2 as
- *  published by the Free Software Foundation.
- */
-#include <linux/module.h>
-#include <linux/dma-mapping.h>
-#include <linux/mm.h>
-#include <linux/mm_types.h>
-#include <linux/scatterlist.h>
-
-#include <asm/cacheflush.h>
-
-static void c6x_dma_sync(dma_addr_t handle, size_t size,
-			 enum dma_data_direction dir)
-{
-	unsigned long paddr = handle;
-
-	BUG_ON(!valid_dma_direction(dir));
-
-	switch (dir) {
-	case DMA_FROM_DEVICE:
-		L2_cache_block_invalidate(paddr, paddr + size);
-		break;
-	case DMA_TO_DEVICE:
-		L2_cache_block_writeback(paddr, paddr + size);
-		break;
-	case DMA_BIDIRECTIONAL:
-		L2_cache_block_writeback_invalidate(paddr, paddr + size);
-		break;
-	default:
-		break;
-	}
-}
-
-static dma_addr_t c6x_dma_map_page(struct device *dev, struct page *page,
-		unsigned long offset, size_t size, enum dma_data_direction dir,
-		unsigned long attrs)
-{
-	dma_addr_t handle = virt_to_phys(page_address(page) + offset);
-
-	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
-		c6x_dma_sync(handle, size, dir);
-
-	return handle;
-}
-
-static void c6x_dma_unmap_page(struct device *dev, dma_addr_t handle,
-		size_t size, enum dma_data_direction dir, unsigned long attrs)
-{
-	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
-		c6x_dma_sync(handle, size, dir);
-}
-
-static int c6x_dma_map_sg(struct device *dev, struct scatterlist *sglist,
-		int nents, enum dma_data_direction dir, unsigned long attrs)
-{
-	struct scatterlist *sg;
-	int i;
-
-	for_each_sg(sglist, sg, nents, i) {
-		sg->dma_address = sg_phys(sg);
-		if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
-			c6x_dma_sync(sg->dma_address, sg->length, dir);
-	}
-
-	return nents;
-}
-
-static void c6x_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
-		  int nents, enum dma_data_direction dir, unsigned long attrs)
-{
-	struct scatterlist *sg;
-	int i;
-
-	if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
-		return;
-
-	for_each_sg(sglist, sg, nents, i)
-		c6x_dma_sync(sg_dma_address(sg), sg->length, dir);
-}
-
-static void c6x_dma_sync_single_for_cpu(struct device *dev, dma_addr_t handle,
-		size_t size, enum dma_data_direction dir)
-{
-	c6x_dma_sync(handle, size, dir);
-
-}
-
-static void c6x_dma_sync_single_for_device(struct device *dev,
-		dma_addr_t handle, size_t size, enum dma_data_direction dir)
-{
-	c6x_dma_sync(handle, size, dir);
-
-}
-
-static void c6x_dma_sync_sg_for_cpu(struct device *dev,
-		struct scatterlist *sglist, int nents,
-		enum dma_data_direction dir)
-{
-	struct scatterlist *sg;
-	int i;
-
-	for_each_sg(sglist, sg, nents, i)
-		c6x_dma_sync_single_for_cpu(dev, sg_dma_address(sg),
-					sg->length, dir);
-
-}
-
-static void c6x_dma_sync_sg_for_device(struct device *dev,
-		struct scatterlist *sglist, int nents,
-		enum dma_data_direction dir)
-{
-	struct scatterlist *sg;
-	int i;
-
-	for_each_sg(sglist, sg, nents, i)
-		c6x_dma_sync_single_for_device(dev, sg_dma_address(sg),
-					   sg->length, dir);
-
-}
-
-const struct dma_map_ops c6x_dma_ops = {
-	.alloc			= c6x_dma_alloc,
-	.free			= c6x_dma_free,
-	.map_page		= c6x_dma_map_page,
-	.unmap_page		= c6x_dma_unmap_page,
-	.map_sg			= c6x_dma_map_sg,
-	.unmap_sg		= c6x_dma_unmap_sg,
-	.sync_single_for_device	= c6x_dma_sync_single_for_device,
-	.sync_single_for_cpu	= c6x_dma_sync_single_for_cpu,
-	.sync_sg_for_device	= c6x_dma_sync_sg_for_device,
-	.sync_sg_for_cpu	= c6x_dma_sync_sg_for_cpu,
-};
-EXPORT_SYMBOL(c6x_dma_ops);
-
-/* Number of entries preallocated for DMA-API debugging */
-#define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16)
-
-static int __init dma_init(void)
-{
-	dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
-
-	return 0;
-}
-fs_initcall(dma_init);
diff --git a/arch/c6x/kernel/traps.c b/arch/c6x/kernel/traps.c
index 4c1d4b8..5c60aea 100644
--- a/arch/c6x/kernel/traps.c
+++ b/arch/c6x/kernel/traps.c
@@ -244,7 +244,6 @@ static struct exception_info eexcept_table[128] = {
 static void do_trap(struct exception_info *except_info, struct pt_regs *regs)
 {
 	unsigned long addr = instruction_pointer(regs);
-	siginfo_t info;
 
 	if (except_info->code != TRAP_BRKPT)
 		pr_err("TRAP: %s PC[0x%lx] signo[%d] code[%d]\n",
@@ -253,12 +252,8 @@ static void do_trap(struct exception_info *except_info, struct pt_regs *regs)
 
 	die_if_kernel(except_info->kernel_str, regs, addr);
 
-	info.si_signo = except_info->signo;
-	info.si_errno = 0;
-	info.si_code  = except_info->code;
-	info.si_addr  = (void __user *)addr;
-
-	force_sig_info(except_info->signo, &info, current);
+	force_sig_fault(except_info->signo, except_info->code,
+			(void __user *)addr, current);
 }
 
 /*
diff --git a/arch/c6x/mm/dma-coherent.c b/arch/c6x/mm/dma-coherent.c
index 95e38ad..d0a8e0c 100644
--- a/arch/c6x/mm/dma-coherent.c
+++ b/arch/c6x/mm/dma-coherent.c
@@ -19,10 +19,12 @@
 #include <linux/bitops.h>
 #include <linux/module.h>
 #include <linux/interrupt.h>
-#include <linux/dma-mapping.h>
+#include <linux/dma-noncoherent.h>
 #include <linux/memblock.h>
 
+#include <asm/cacheflush.h>
 #include <asm/page.h>
+#include <asm/setup.h>
 
 /*
  * DMA coherent memory management, can be redefined using the memdma=
@@ -73,7 +75,7 @@ static void __free_dma_pages(u32 addr, int order)
  * Allocate DMA coherent memory space and return both the kernel
  * virtual and DMA address for that space.
  */
-void *c6x_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
+void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
 		gfp_t gfp, unsigned long attrs)
 {
 	u32 paddr;
@@ -98,7 +100,7 @@ void *c6x_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
 /*
  * Free DMA coherent memory as defined by the above mapping.
  */
-void c6x_dma_free(struct device *dev, size_t size, void *vaddr,
+void arch_dma_free(struct device *dev, size_t size, void *vaddr,
 		dma_addr_t dma_handle, unsigned long attrs)
 {
 	int order;
@@ -139,3 +141,35 @@ void __init coherent_mem_init(phys_addr_t start, u32 size)
 	dma_bitmap = phys_to_virt(bitmap_phys);
 	memset(dma_bitmap, 0, dma_pages * PAGE_SIZE);
 }
+
+static void c6x_dma_sync(struct device *dev, phys_addr_t paddr, size_t size,
+		enum dma_data_direction dir)
+{
+	BUG_ON(!valid_dma_direction(dir));
+
+	switch (dir) {
+	case DMA_FROM_DEVICE:
+		L2_cache_block_invalidate(paddr, paddr + size);
+		break;
+	case DMA_TO_DEVICE:
+		L2_cache_block_writeback(paddr, paddr + size);
+		break;
+	case DMA_BIDIRECTIONAL:
+		L2_cache_block_writeback_invalidate(paddr, paddr + size);
+		break;
+	default:
+		break;
+	}
+}
+
+void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr,
+		size_t size, enum dma_data_direction dir)
+{
+	return c6x_dma_sync(dev, paddr, size, dir);
+}
+
+void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr,
+		size_t size, enum dma_data_direction dir)
+{
+	return c6x_dma_sync(dev, paddr, size, dir);
+}
diff --git a/arch/h8300/include/asm/pci.h b/arch/h8300/include/asm/pci.h
index 7c9e55d..d4d345a 100644
--- a/arch/h8300/include/asm/pci.h
+++ b/arch/h8300/include/asm/pci.h
@@ -15,6 +15,4 @@ static inline void pcibios_penalize_isa_irq(int irq, int active)
 	/* We don't do dynamic PCI IRQ allocation */
 }
 
-#define PCI_DMA_BUS_IS_PHYS	(1)
-
 #endif /* _ASM_H8300_PCI_H */
diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig
index 76d2f20..37adb20 100644
--- a/arch/hexagon/Kconfig
+++ b/arch/hexagon/Kconfig
@@ -19,6 +19,7 @@ config HEXAGON
 	select GENERIC_IRQ_SHOW
 	select HAVE_ARCH_KGDB
 	select HAVE_ARCH_TRACEHOOK
+	select NEED_SG_DMA_LENGTH
 	select NO_IOPORT_MAP
 	select GENERIC_IOMAP
 	select GENERIC_SMP_IDLE_THREAD
@@ -63,9 +64,6 @@ config GENERIC_CSUM
 config GENERIC_IRQ_PROBE
 	def_bool y
 
-config NEED_SG_DMA_LENGTH
-	def_bool y
-
 config RWSEM_GENERIC_SPINLOCK
 	def_bool n
 
diff --git a/arch/hexagon/kernel/dma.c b/arch/hexagon/kernel/dma.c
index ad8347c..77459df 100644
--- a/arch/hexagon/kernel/dma.c
+++ b/arch/hexagon/kernel/dma.c
@@ -208,7 +208,6 @@ const struct dma_map_ops hexagon_dma_ops = {
 	.sync_single_for_cpu = hexagon_sync_single_for_cpu,
 	.sync_single_for_device = hexagon_sync_single_for_device,
 	.mapping_error	= hexagon_mapping_error,
-	.is_phys	= 1,
 };
 
 void __init hexagon_dma_init(void)
diff --git a/arch/hexagon/kernel/traps.c b/arch/hexagon/kernel/traps.c
index 2942a92..91ee048 100644
--- a/arch/hexagon/kernel/traps.c
+++ b/arch/hexagon/kernel/traps.c
@@ -412,10 +412,6 @@ void do_trap0(struct pt_regs *regs)
 	case TRAP_DEBUG:
 		/* Trap0 0xdb is debug breakpoint */
 		if (user_mode(regs)) {
-			struct siginfo info;
-
-			info.si_signo = SIGTRAP;
-			info.si_errno = 0;
 			/*
 			 * Some architecures add some per-thread state
 			 * to distinguish between breakpoint traps and
@@ -423,9 +419,8 @@ void do_trap0(struct pt_regs *regs)
 			 * set the si_code value appropriately, or we
 			 * may want to use a different trap0 flavor.
 			 */
-			info.si_code = TRAP_BRKPT;
-			info.si_addr = (void __user *) pt_elr(regs);
-			force_sig_info(SIGTRAP, &info, current);
+			force_sig_fault(SIGTRAP, TRAP_BRKPT,
+					(void __user *) pt_elr(regs), current);
 		} else {
 #ifdef CONFIG_KGDB
 			kgdb_handle_exception(pt_cause(regs), SIGTRAP,
diff --git a/arch/hexagon/mm/vm_fault.c b/arch/hexagon/mm/vm_fault.c
index 3eec33c..933bbce 100644
--- a/arch/hexagon/mm/vm_fault.c
+++ b/arch/hexagon/mm/vm_fault.c
@@ -50,7 +50,7 @@ void do_page_fault(unsigned long address, long cause, struct pt_regs *regs)
 {
 	struct vm_area_struct *vma;
 	struct mm_struct *mm = current->mm;
-	siginfo_t info;
+	int si_signo;
 	int si_code = SEGV_MAPERR;
 	int fault;
 	const struct exception_table_entry *fixup;
@@ -140,28 +140,22 @@ good_area:
 	 * unable to fix up the page fault.
 	 */
 	if (fault & VM_FAULT_SIGBUS) {
-		info.si_signo = SIGBUS;
-		info.si_code = BUS_ADRERR;
+		si_signo = SIGBUS;
+		si_code = BUS_ADRERR;
 	}
 	/* Address is not in the memory map */
 	else {
-		info.si_signo = SIGSEGV;
-		info.si_code = SEGV_ACCERR;
+		si_signo = SIGSEGV;
+		si_code  = SEGV_ACCERR;
 	}
-	info.si_errno = 0;
-	info.si_addr = (void __user *)address;
-	force_sig_info(info.si_signo, &info, current);
+	force_sig_fault(si_signo, si_code, (void __user *)address, current);
 	return;
 
 bad_area:
 	up_read(&mm->mmap_sem);
 
 	if (user_mode(regs)) {
-		info.si_signo = SIGSEGV;
-		info.si_errno = 0;
-		info.si_code = si_code;
-		info.si_addr = (void *)address;
-		force_sig_info(info.si_signo, &info, current);
+		force_sig_fault(SIGSEGV, si_code, (void __user *)address, current);
 		return;
 	}
 	/* Kernel-mode fault falls through */
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index bbe12a0..792437d 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -29,7 +29,6 @@ config IA64
 	select HAVE_FUNCTION_TRACER
 	select TTY
 	select HAVE_ARCH_TRACEHOOK
-	select HAVE_DMA_API_DEBUG
 	select HAVE_MEMBLOCK
 	select HAVE_MEMBLOCK_NODE_MAP
 	select HAVE_VIRT_CPU_ACCOUNTING
@@ -54,6 +53,8 @@ config IA64
 	select MODULES_USE_ELF_RELA
 	select ARCH_USE_CMPXCHG_LOCKREF
 	select HAVE_ARCH_AUDITSYSCALL
+	select NEED_DMA_MAP_STATE
+	select NEED_SG_DMA_LENGTH
 	default y
 	help
 	  The Itanium Processor Family is Intel's 64-bit successor to
@@ -78,18 +79,6 @@ config MMU
 	bool
 	default y
 
-config ARCH_DMA_ADDR_T_64BIT
-	def_bool y
-
-config NEED_DMA_MAP_STATE
-	def_bool y
-
-config NEED_SG_DMA_LENGTH
-	def_bool y
-
-config SWIOTLB
-       bool
-
 config STACKTRACE_SUPPORT
 	def_bool y
 
@@ -146,7 +135,6 @@ config IA64_GENERIC
 	bool "generic"
 	select NUMA
 	select ACPI_NUMA
-	select DMA_DIRECT_OPS
 	select SWIOTLB
 	select PCI_MSI
 	help
@@ -167,7 +155,6 @@ config IA64_GENERIC
 
 config IA64_DIG
 	bool "DIG-compliant"
-	select DMA_DIRECT_OPS
 	select SWIOTLB
 
 config IA64_DIG_VTD
@@ -183,7 +170,6 @@ config IA64_HP_ZX1
 
 config IA64_HP_ZX1_SWIOTLB
 	bool "HP-zx1/sx1000 with software I/O TLB"
-	select DMA_DIRECT_OPS
 	select SWIOTLB
 	help
 	  Build a kernel that runs on HP zx1 and sx1000 systems even when they
@@ -207,7 +193,6 @@ config IA64_SGI_UV
 	bool "SGI-UV"
 	select NUMA
 	select ACPI_NUMA
-	select DMA_DIRECT_OPS
 	select SWIOTLB
 	help
 	  Selecting this option will optimize the kernel for use on UV based
@@ -218,7 +203,6 @@ config IA64_SGI_UV
 
 config IA64_HP_SIM
 	bool "Ski-simulator"
-	select DMA_DIRECT_OPS
 	select SWIOTLB
 	depends on !PM
 
@@ -397,7 +381,7 @@ config ARCH_DISCONTIGMEM_ENABLE
 	  Say Y to support efficient handling of discontiguous physical memory,
 	  for architectures which are either NUMA (Non-Uniform Memory Access)
 	  or have huge holes in the physical address space for other reasons.
- 	  See <file:Documentation/vm/numa> for more.
+	  See <file:Documentation/vm/numa.rst> for more.
 
 config ARCH_FLATMEM_ENABLE
 	def_bool y
@@ -613,6 +597,3 @@ source "security/Kconfig"
 source "crypto/Kconfig"
 
 source "lib/Kconfig"
-
-config IOMMU_HELPER
-	def_bool (IA64_HP_ZX1 || IA64_HP_ZX1_SWIOTLB || IA64_GENERIC || SWIOTLB)
diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c
index aec4a33..ee5b652 100644
--- a/arch/ia64/hp/common/sba_iommu.c
+++ b/arch/ia64/hp/common/sba_iommu.c
@@ -1845,9 +1845,6 @@ static void ioc_init(unsigned long hpa, struct ioc *ioc)
 	ioc_resource_init(ioc);
 	ioc_sac_init(ioc);
 
-	if ((long) ~iovp_mask > (long) ia64_max_iommu_merge_mask)
-		ia64_max_iommu_merge_mask = ~iovp_mask;
-
 	printk(KERN_INFO PFX
 		"%s %d.%d HPA 0x%lx IOVA space %dMb at 0x%lx\n",
 		ioc->name, (ioc->rev >> 4) & 0xF, ioc->rev & 0xF,
@@ -1942,19 +1939,6 @@ static const struct seq_operations ioc_seq_ops = {
 	.show  = ioc_show
 };
 
-static int
-ioc_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &ioc_seq_ops);
-}
-
-static const struct file_operations ioc_fops = {
-	.open    = ioc_open,
-	.read    = seq_read,
-	.llseek  = seq_lseek,
-	.release = seq_release
-};
-
 static void __init
 ioc_proc_init(void)
 {
@@ -1964,7 +1948,7 @@ ioc_proc_init(void)
 	if (!dir)
 		return;
 
-	proc_create(ioc_list->name, 0, dir, &ioc_fops);
+	proc_create_seq(ioc_list->name, 0, dir, &ioc_seq_ops);
 }
 #endif
 
diff --git a/arch/ia64/hp/sim/simserial.c b/arch/ia64/hp/sim/simserial.c
index a419ccf..663388a 100644
--- a/arch/ia64/hp/sim/simserial.c
+++ b/arch/ia64/hp/sim/simserial.c
@@ -435,19 +435,6 @@ static int rs_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int rs_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, rs_proc_show, NULL);
-}
-
-static const struct file_operations rs_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= rs_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static const struct tty_operations hp_ops = {
 	.open = rs_open,
 	.close = rs_close,
@@ -462,7 +449,7 @@ static const struct tty_operations hp_ops = {
 	.unthrottle = rs_unthrottle,
 	.send_xchar = rs_send_xchar,
 	.hangup = rs_hangup,
-	.proc_fops = &rs_proc_fops,
+	.proc_show = rs_proc_show,
 };
 
 static const struct tty_port_operations hp_port_ops = {
diff --git a/arch/ia64/include/asm/pci.h b/arch/ia64/include/asm/pci.h
index b1d04e8..780e874 100644
--- a/arch/ia64/include/asm/pci.h
+++ b/arch/ia64/include/asm/pci.h
@@ -30,23 +30,6 @@ struct pci_vector_struct {
 #define PCIBIOS_MIN_IO		0x1000
 #define PCIBIOS_MIN_MEM		0x10000000
 
-/*
- * PCI_DMA_BUS_IS_PHYS should be set to 1 if there is _necessarily_ a direct
- * correspondence between device bus addresses and CPU physical addresses.
- * Platforms with a hardware I/O MMU _must_ turn this off to suppress the
- * bounce buffer handling code in the block and network device layers.
- * Platforms with separate bus address spaces _must_ turn this off and provide
- * a device DMA mapping implementation that takes care of the necessary
- * address translation.
- *
- * For now, the ia64 platforms which may have separate/multiple bus address
- * spaces all have I/O MMUs which support the merging of physically
- * discontiguous buffers, so we can use that as the sole factor to determine
- * the setting of PCI_DMA_BUS_IS_PHYS.
- */
-extern unsigned long ia64_max_iommu_merge_mask;
-#define PCI_DMA_BUS_IS_PHYS	(ia64_max_iommu_merge_mask == ~0UL)
-
 #define HAVE_PCI_MMAP
 #define ARCH_GENERIC_PCI_MMAP_RESOURCE
 #define arch_can_pci_mmap_wc()	1
diff --git a/arch/ia64/include/uapi/asm/siginfo.h b/arch/ia64/include/uapi/asm/siginfo.h
index 5aa454e..52b5af4 100644
--- a/arch/ia64/include/uapi/asm/siginfo.h
+++ b/arch/ia64/include/uapi/asm/siginfo.h
@@ -27,11 +27,4 @@
 #define __ISR_VALID_BIT	0
 #define __ISR_VALID	(1 << __ISR_VALID_BIT)
 
-/*
- * SIGFPE si_codes
- */
-#ifdef __KERNEL__
-#define FPE_FIXME	0	/* Broken dup of SI_USER */
-#endif /* __KERNEL__ */
-
 #endif /* _UAPI_ASM_IA64_SIGINFO_H */
diff --git a/arch/ia64/kernel/brl_emu.c b/arch/ia64/kernel/brl_emu.c
index 9bcc908..a61f6c6 100644
--- a/arch/ia64/kernel/brl_emu.c
+++ b/arch/ia64/kernel/brl_emu.c
@@ -62,6 +62,7 @@ ia64_emulate_brl (struct pt_regs *regs, unsigned long ar_ec)
 	struct illegal_op_return rv;
 	long tmp_taken, unimplemented_address;
 
+	clear_siginfo(&siginfo);
 	rv.fkt = (unsigned long) -1;
 
 	/*
diff --git a/arch/ia64/kernel/dma-mapping.c b/arch/ia64/kernel/dma-mapping.c
index f2d57e6..7a471d8 100644
--- a/arch/ia64/kernel/dma-mapping.c
+++ b/arch/ia64/kernel/dma-mapping.c
@@ -9,16 +9,6 @@ int iommu_detected __read_mostly;
 const struct dma_map_ops *dma_ops;
 EXPORT_SYMBOL(dma_ops);
 
-#define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16)
-
-static int __init dma_init(void)
-{
-	dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
-
-	return 0;
-}
-fs_initcall(dma_init);
-
 const struct dma_map_ops *dma_get_ops(struct device *dev)
 {
 	return dma_ops;
diff --git a/arch/ia64/kernel/palinfo.c b/arch/ia64/kernel/palinfo.c
index b6e5978..f4a9424 100644
--- a/arch/ia64/kernel/palinfo.c
+++ b/arch/ia64/kernel/palinfo.c
@@ -920,18 +920,6 @@ static int proc_palinfo_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int proc_palinfo_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, proc_palinfo_show, PDE_DATA(inode));
-}
-
-static const struct file_operations proc_palinfo_fops = {
-	.open		= proc_palinfo_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int palinfo_add_proc(unsigned int cpu)
 {
 	pal_func_cpu_u_t f;
@@ -948,8 +936,8 @@ static int palinfo_add_proc(unsigned int cpu)
 
 	for (j=0; j < NR_PALINFO_ENTRIES; j++) {
 		f.func_id = j;
-		proc_create_data(palinfo_entries[j].name, 0, cpu_dir,
-				 &proc_palinfo_fops, (void *)f.value);
+		proc_create_single_data(palinfo_entries[j].name, 0, cpu_dir,
+				proc_palinfo_show, (void *)f.value);
 	}
 	return 0;
 }
diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index 8fb280e..3b38c71 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -5708,13 +5708,6 @@ const struct seq_operations pfm_seq_ops = {
  	.show =		pfm_proc_show
 };
 
-static int
-pfm_proc_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &pfm_seq_ops);
-}
-
-
 /*
  * we come here as soon as local_cpu_data->pfm_syst_wide is set. this happens
  * during pfm_enable() hence before pfm_start(). We cannot assume monitoring
@@ -6537,13 +6530,6 @@ found:
 	return 0;
 }
 
-static const struct file_operations pfm_proc_fops = {
-	.open		= pfm_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
 int __init
 pfm_init(void)
 {
@@ -6615,7 +6601,7 @@ pfm_init(void)
 	/*
 	 * create /proc/perfmon (mostly for debugging purposes)
 	 */
-	perfmon_dir = proc_create("perfmon", S_IRUGO, NULL, &pfm_proc_fops);
+	perfmon_dir = proc_create_seq("perfmon", S_IRUGO, NULL, &pfm_seq_ops);
 	if (perfmon_dir == NULL) {
 		printk(KERN_ERR "perfmon: cannot create /proc entry, perfmon disabled\n");
 		pmu_conf = NULL;
diff --git a/arch/ia64/kernel/salinfo.c b/arch/ia64/kernel/salinfo.c
index 52c404b..aba1f46 100644
--- a/arch/ia64/kernel/salinfo.c
+++ b/arch/ia64/kernel/salinfo.c
@@ -54,8 +54,6 @@ MODULE_AUTHOR("Jesse Barnes <jbarnes@sgi.com>");
 MODULE_DESCRIPTION("/proc interface to IA-64 SAL features");
 MODULE_LICENSE("GPL");
 
-static const struct file_operations proc_salinfo_fops;
-
 typedef struct {
 	const char		*name;		/* name of the proc entry */
 	unsigned long           feature;        /* feature bit */
@@ -578,6 +576,17 @@ static int salinfo_cpu_pre_down(unsigned int cpu)
 	return 0;
 }
 
+/*
+ * 'data' contains an integer that corresponds to the feature we're
+ * testing
+ */
+static int proc_salinfo_show(struct seq_file *m, void *v)
+{
+	unsigned long data = (unsigned long)v;
+	seq_puts(m, (sal_platform_features & data) ? "1\n" : "0\n");
+	return 0;
+}
+
 static int __init
 salinfo_init(void)
 {
@@ -593,9 +602,9 @@ salinfo_init(void)
 
 	for (i=0; i < NR_SALINFO_ENTRIES; i++) {
 		/* pass the feature bit in question as misc data */
-		*sdir++ = proc_create_data(salinfo_entries[i].name, 0, salinfo_dir,
-					   &proc_salinfo_fops,
-					   (void *)salinfo_entries[i].feature);
+		*sdir++ = proc_create_single_data(salinfo_entries[i].name, 0,
+				salinfo_dir, proc_salinfo_show,
+				(void *)salinfo_entries[i].feature);
 	}
 
 	for (i = 0; i < ARRAY_SIZE(salinfo_log_name); i++) {
@@ -633,27 +642,4 @@ salinfo_init(void)
 	return 0;
 }
 
-/*
- * 'data' contains an integer that corresponds to the feature we're
- * testing
- */
-static int proc_salinfo_show(struct seq_file *m, void *v)
-{
-	unsigned long data = (unsigned long)v;
-	seq_puts(m, (sal_platform_features & data) ? "1\n" : "0\n");
-	return 0;
-}
-
-static int proc_salinfo_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, proc_salinfo_show, PDE_DATA(inode));
-}
-
-static const struct file_operations proc_salinfo_fops = {
-	.open		= proc_salinfo_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 module_init(salinfo_init);
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
index dee56bc..ad43cbf 100644
--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -124,18 +124,6 @@ unsigned long ia64_i_cache_stride_shift = ~0;
 unsigned long ia64_cache_stride_shift = ~0;
 
 /*
- * The merge_mask variable needs to be set to (max(iommu_page_size(iommu)) - 1).  This
- * mask specifies a mask of address bits that must be 0 in order for two buffers to be
- * mergeable by the I/O MMU (i.e., the end address of the first buffer and the start
- * address of the second buffer must be aligned to (merge_mask+1) in order to be
- * mergeable).  By default, we assume there is no I/O MMU which can merge physically
- * discontiguous buffers, so we set the merge_mask to ~0UL, which corresponds to a iommu
- * page-size of 2^64.
- */
-unsigned long ia64_max_iommu_merge_mask = ~0UL;
-EXPORT_SYMBOL(ia64_max_iommu_merge_mask);
-
-/*
  * We use a special marker for the end of memory and it uses the extra (+1) slot
  */
 struct rsvd_region rsvd_region[IA64_MAX_RSVD_REGIONS + 1] __initdata;
diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c
index 54547c7..d1234a5 100644
--- a/arch/ia64/kernel/signal.c
+++ b/arch/ia64/kernel/signal.c
@@ -153,6 +153,7 @@ ia64_rt_sigreturn (struct sigscratch *scr)
 	return retval;
 
   give_sigsegv:
+	clear_siginfo(&si);
 	si.si_signo = SIGSEGV;
 	si.si_errno = 0;
 	si.si_code = SI_KERNEL;
@@ -236,6 +237,7 @@ force_sigsegv_info (int sig, void __user *addr)
 	unsigned long flags;
 	struct siginfo si;
 
+	clear_siginfo(&si);
 	if (sig == SIGSEGV) {
 		/*
 		 * Acquiring siglock around the sa_handler-update is almost
diff --git a/arch/ia64/kernel/traps.c b/arch/ia64/kernel/traps.c
index 6d4e76a..c6f4932 100644
--- a/arch/ia64/kernel/traps.c
+++ b/arch/ia64/kernel/traps.c
@@ -104,6 +104,7 @@ __kprobes ia64_bad_break (unsigned long break_num, struct pt_regs *regs)
 	int sig, code;
 
 	/* SIGILL, SIGFPE, SIGSEGV, and SIGBUS want these field initialized: */
+	clear_siginfo(&siginfo);
 	siginfo.si_addr = (void __user *) (regs->cr_iip + ia64_psr(regs)->ri);
 	siginfo.si_imm = break_num;
 	siginfo.si_flags = 0;		/* clear __ISR_VALID */
@@ -293,7 +294,6 @@ handle_fpu_swa (int fp_fault, struct pt_regs *regs, unsigned long isr)
 {
 	long exception, bundle[2];
 	unsigned long fault_ip;
-	struct siginfo siginfo;
 
 	fault_ip = regs->cr_iip;
 	if (!fp_fault && (ia64_psr(regs)->ri == 0))
@@ -344,13 +344,16 @@ handle_fpu_swa (int fp_fault, struct pt_regs *regs, unsigned long isr)
 			printk(KERN_ERR "handle_fpu_swa: fp_emulate() returned -1\n");
 			return -1;
 		} else {
+			struct siginfo siginfo;
+
 			/* is next instruction a trap? */
 			if (exception & 2) {
 				ia64_increment_ip(regs);
 			}
+			clear_siginfo(&siginfo);
 			siginfo.si_signo = SIGFPE;
 			siginfo.si_errno = 0;
-			siginfo.si_code = FPE_FIXME;	/* default code */
+			siginfo.si_code = FPE_FLTUNK;	/* default code */
 			siginfo.si_addr = (void __user *) (regs->cr_iip + ia64_psr(regs)->ri);
 			if (isr & 0x11) {
 				siginfo.si_code = FPE_FLTINV;
@@ -372,9 +375,12 @@ handle_fpu_swa (int fp_fault, struct pt_regs *regs, unsigned long isr)
 			return -1;
 		} else if (exception != 0) {
 			/* raise exception */
+			struct siginfo siginfo;
+
+			clear_siginfo(&siginfo);
 			siginfo.si_signo = SIGFPE;
 			siginfo.si_errno = 0;
-			siginfo.si_code = FPE_FIXME;	/* default code */
+			siginfo.si_code = FPE_FLTUNK;	/* default code */
 			siginfo.si_addr = (void __user *) (regs->cr_iip + ia64_psr(regs)->ri);
 			if (isr & 0x880) {
 				siginfo.si_code = FPE_FLTOVF;
@@ -420,7 +426,7 @@ ia64_illegal_op_fault (unsigned long ec, long arg1, long arg2, long arg3,
 	if (die_if_kernel(buf, &regs, 0))
 		return rv;
 
-	memset(&si, 0, sizeof(si));
+	clear_siginfo(&si);
 	si.si_signo = SIGILL;
 	si.si_code = ILL_ILLOPC;
 	si.si_addr = (void __user *) (regs.cr_iip + ia64_psr(&regs)->ri);
@@ -434,7 +440,6 @@ ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa,
 	    long arg7, struct pt_regs regs)
 {
 	unsigned long code, error = isr, iip;
-	struct siginfo siginfo;
 	char buf[128];
 	int result, sig;
 	static const char *reason[] = {
@@ -485,6 +490,7 @@ ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa,
 
 	      case 26: /* NaT Consumption */
 		if (user_mode(&regs)) {
+			struct siginfo siginfo;
 			void __user *addr;
 
 			if (((isr >> 4) & 0xf) == 2) {
@@ -499,6 +505,7 @@ ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa,
 				addr = (void __user *) (regs.cr_iip
 							+ ia64_psr(&regs)->ri);
 			}
+			clear_siginfo(&siginfo);
 			siginfo.si_signo = sig;
 			siginfo.si_code = code;
 			siginfo.si_errno = 0;
@@ -515,6 +522,9 @@ ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa,
 
 	      case 31: /* Unsupported Data Reference */
 		if (user_mode(&regs)) {
+			struct siginfo siginfo;
+
+			clear_siginfo(&siginfo);
 			siginfo.si_signo = SIGILL;
 			siginfo.si_code = ILL_ILLOPN;
 			siginfo.si_errno = 0;
@@ -531,6 +541,10 @@ ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa,
 	      case 29: /* Debug */
 	      case 35: /* Taken Branch Trap */
 	      case 36: /* Single Step Trap */
+	      {
+		struct siginfo siginfo;
+
+		clear_siginfo(&siginfo);
 		if (fsys_mode(current, &regs)) {
 			extern char __kernel_syscall_via_break[];
 			/*
@@ -578,11 +592,15 @@ ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa,
 		siginfo.si_isr   = isr;
 		force_sig_info(SIGTRAP, &siginfo, current);
 		return;
+	      }
 
 	      case 32: /* fp fault */
 	      case 33: /* fp trap */
 		result = handle_fpu_swa((vector == 32) ? 1 : 0, &regs, isr);
 		if ((result < 0) || (current->thread.flags & IA64_THREAD_FPEMU_SIGFPE)) {
+			struct siginfo siginfo;
+
+			clear_siginfo(&siginfo);
 			siginfo.si_signo = SIGFPE;
 			siginfo.si_errno = 0;
 			siginfo.si_code = FPE_FLTINV;
@@ -616,6 +634,9 @@ ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa,
 		} else {
 			/* Unimplemented Instr. Address Trap */
 			if (user_mode(&regs)) {
+				struct siginfo siginfo;
+
+				clear_siginfo(&siginfo);
 				siginfo.si_signo = SIGILL;
 				siginfo.si_code = ILL_BADIADDR;
 				siginfo.si_errno = 0;
diff --git a/arch/ia64/kernel/unaligned.c b/arch/ia64/kernel/unaligned.c
index 72e9b42..e309f98 100644
--- a/arch/ia64/kernel/unaligned.c
+++ b/arch/ia64/kernel/unaligned.c
@@ -1537,6 +1537,7 @@ ia64_handle_unaligned (unsigned long ifa, struct pt_regs *regs)
 		/* NOT_REACHED */
 	}
   force_sigbus:
+	clear_siginfo(&si);
 	si.si_signo = SIGBUS;
 	si.si_errno = 0;
 	si.si_code = BUS_ADRALN;
diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c
index dfdc152..817fa12 100644
--- a/arch/ia64/mm/fault.c
+++ b/arch/ia64/mm/fault.c
@@ -85,7 +85,6 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re
 	int signal = SIGSEGV, code = SEGV_MAPERR;
 	struct vm_area_struct *vma, *prev_vma;
 	struct mm_struct *mm = current->mm;
-	struct siginfo si;
 	unsigned long mask;
 	int fault;
 	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
@@ -249,6 +248,9 @@ retry:
 		return;
 	}
 	if (user_mode(regs)) {
+		struct siginfo si;
+
+		clear_siginfo(&si);
 		si.si_signo = signal;
 		si.si_errno = 0;
 		si.si_code = code;
diff --git a/arch/ia64/sn/kernel/io_common.c b/arch/ia64/sn/kernel/io_common.c
index 11f2275..8479e9a 100644
--- a/arch/ia64/sn/kernel/io_common.c
+++ b/arch/ia64/sn/kernel/io_common.c
@@ -480,11 +480,6 @@ sn_io_early_init(void)
 	tioca_init_provider();
 	tioce_init_provider();
 
-	/*
-	 * This is needed to avoid bounce limit checks in the blk layer
-	 */
-	ia64_max_iommu_merge_mask = ~PAGE_MASK;
-
 	sn_irq_lh_init();
 	INIT_LIST_HEAD(&sn_sysdata_list);
 	sn_init_cpei_timer();
diff --git a/arch/ia64/sn/kernel/sn2/prominfo_proc.c b/arch/ia64/sn/kernel/sn2/prominfo_proc.c
index ec4de2b..e15457b 100644
--- a/arch/ia64/sn/kernel/sn2/prominfo_proc.c
+++ b/arch/ia64/sn/kernel/sn2/prominfo_proc.c
@@ -140,18 +140,6 @@ static int proc_fit_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int proc_fit_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, proc_fit_show, PDE_DATA(inode));
-}
-
-static const struct file_operations proc_fit_fops = {
-	.open		= proc_fit_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int proc_version_show(struct seq_file *m, void *v)
 {
 	unsigned long nasid = (unsigned long)m->private;
@@ -174,18 +162,6 @@ static int proc_version_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int proc_version_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, proc_version_show, PDE_DATA(inode));
-}
-
-static const struct file_operations proc_version_fops = {
-	.open		= proc_version_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 /* module entry points */
 int __init prominfo_init(void);
 void __exit prominfo_exit(void);
@@ -217,10 +193,10 @@ int __init prominfo_init(void)
 		if (!dir)
 			continue;
 		nasid = cnodeid_to_nasid(cnodeid);
-		proc_create_data("fit", 0, dir,
-				 &proc_fit_fops, (void *)nasid);
-		proc_create_data("version", 0, dir,
-				 &proc_version_fops, (void *)nasid);
+		proc_create_single_data("fit", 0, dir, proc_fit_show, 
+				(void *)nasid);
+		proc_create_single_data("version", 0, dir, proc_version_show,
+				(void *)nasid);
 	}
 	return 0;
 }
diff --git a/arch/ia64/sn/kernel/sn2/sn_proc_fs.c b/arch/ia64/sn/kernel/sn2/sn_proc_fs.c
index 29cf8f8..c2a4d84 100644
--- a/arch/ia64/sn/kernel/sn2/sn_proc_fs.c
+++ b/arch/ia64/sn/kernel/sn2/sn_proc_fs.c
@@ -18,33 +18,18 @@ static int partition_id_show(struct seq_file *s, void *p)
 	return 0;
 }
 
-static int partition_id_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, partition_id_show, NULL);
-}
-
 static int system_serial_number_show(struct seq_file *s, void *p)
 {
 	seq_printf(s, "%s\n", sn_system_serial_number());
 	return 0;
 }
 
-static int system_serial_number_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, system_serial_number_show, NULL);
-}
-
 static int licenseID_show(struct seq_file *s, void *p)
 {
 	seq_printf(s, "0x%llx\n", sn_partition_serial_number_val());
 	return 0;
 }
 
-static int licenseID_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, licenseID_show, NULL);
-}
-
 static int coherence_id_show(struct seq_file *s, void *p)
 {
 	seq_printf(s, "%d\n", partition_coherence_id());
@@ -52,43 +37,10 @@ static int coherence_id_show(struct seq_file *s, void *p)
 	return 0;
 }
 
-static int coherence_id_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, coherence_id_show, NULL);
-}
-
 /* /proc/sgi_sn/sn_topology uses seq_file, see sn_hwperf.c */
 extern int sn_topology_open(struct inode *, struct file *);
 extern int sn_topology_release(struct inode *, struct file *);
 
-static const struct file_operations proc_partition_id_fops = {
-	.open		= partition_id_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
-static const struct file_operations proc_system_sn_fops = {
-	.open		= system_serial_number_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
-static const struct file_operations proc_license_id_fops = {
-	.open		= licenseID_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
-static const struct file_operations proc_coherence_id_fops = {
-	.open		= coherence_id_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static const struct file_operations proc_sn_topo_fops = {
 	.open		= sn_topology_open,
 	.read		= seq_read,
@@ -104,13 +56,13 @@ void register_sn_procfs(void)
 	if (!(sgi_proc_dir = proc_mkdir("sgi_sn", NULL)))
 		return;
 
-	proc_create("partition_id", 0444, sgi_proc_dir,
-		    &proc_partition_id_fops);
-	proc_create("system_serial_number", 0444, sgi_proc_dir,
-		    &proc_system_sn_fops);
-	proc_create("licenseID", 0444, sgi_proc_dir, &proc_license_id_fops);
-	proc_create("coherence_id", 0444, sgi_proc_dir,
-		    &proc_coherence_id_fops);
+	proc_create_single("partition_id", 0444, sgi_proc_dir,
+			partition_id_show);
+	proc_create_single("system_serial_number", 0444, sgi_proc_dir,
+			system_serial_number_show);
+	proc_create_single("licenseID", 0444, sgi_proc_dir, licenseID_show);
+	proc_create_single("coherence_id", 0444, sgi_proc_dir,
+			coherence_id_show);
 	proc_create("sn_topology", 0444, sgi_proc_dir, &proc_sn_topo_fops);
 }
 
diff --git a/arch/m68k/68000/timers.c b/arch/m68k/68000/timers.c
index 252455b..71ddb4c 100644
--- a/arch/m68k/68000/timers.c
+++ b/arch/m68k/68000/timers.c
@@ -125,7 +125,9 @@ int m68328_hwclk(int set, struct rtc_time *t)
 {
 	if (!set) {
 		long now = RTCTIME;
-		t->tm_year = t->tm_mon = t->tm_mday = 1;
+		t->tm_year = 1;
+		t->tm_mon = 0;
+		t->tm_mday = 1;
 		t->tm_hour = (now >> 24) % 24;
 		t->tm_min = (now >> 16) % 60;
 		t->tm_sec = now % 60;
diff --git a/arch/m68k/apollo/config.c b/arch/m68k/apollo/config.c
index 0d27706..b2a6bc6 100644
--- a/arch/m68k/apollo/config.c
+++ b/arch/m68k/apollo/config.c
@@ -221,8 +221,10 @@ int dn_dummy_hwclk(int op, struct rtc_time *t) {
     t->tm_hour=rtc->hours;
     t->tm_mday=rtc->day_of_month;
     t->tm_wday=rtc->day_of_week;
-    t->tm_mon=rtc->month;
+    t->tm_mon = rtc->month - 1;
     t->tm_year=rtc->year;
+    if (t->tm_year < 70)
+	t->tm_year += 100;
   } else {
     rtc->second=t->tm_sec;
     rtc->minute=t->tm_min;
@@ -230,8 +232,8 @@ int dn_dummy_hwclk(int op, struct rtc_time *t) {
     rtc->day_of_month=t->tm_mday;
     if(t->tm_wday!=-1)
       rtc->day_of_week=t->tm_wday;
-    rtc->month=t->tm_mon;
-    rtc->year=t->tm_year;
+    rtc->month = t->tm_mon + 1;
+    rtc->year = t->tm_year % 100;
   }
 
   return 0;
diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig
index 37a8e5a..a874e54 100644
--- a/arch/m68k/configs/amiga_defconfig
+++ b/arch/m68k/configs/amiga_defconfig
@@ -98,8 +98,8 @@ CONFIG_NF_CONNTRACK_SANE=m
 CONFIG_NF_CONNTRACK_SIP=m
 CONFIG_NF_CONNTRACK_TFTP=m
 CONFIG_NF_TABLES=m
-CONFIG_NF_TABLES_INET=m
-CONFIG_NF_TABLES_NETDEV=m
+CONFIG_NF_TABLES_INET=y
+CONFIG_NF_TABLES_NETDEV=y
 CONFIG_NFT_EXTHDR=m
 CONFIG_NFT_META=m
 CONFIG_NFT_RT=m
@@ -204,7 +204,7 @@ CONFIG_NF_SOCKET_IPV4=m
 CONFIG_NFT_CHAIN_ROUTE_IPV4=m
 CONFIG_NFT_DUP_IPV4=m
 CONFIG_NFT_FIB_IPV4=m
-CONFIG_NF_TABLES_ARP=m
+CONFIG_NF_TABLES_ARP=y
 CONFIG_NF_FLOW_TABLE_IPV4=m
 CONFIG_NF_LOG_ARP=m
 CONFIG_NFT_CHAIN_NAT_IPV4=m
@@ -233,12 +233,12 @@ CONFIG_IP_NF_ARP_MANGLE=m
 CONFIG_NF_CONNTRACK_IPV6=m
 CONFIG_NF_SOCKET_IPV6=m
 CONFIG_NFT_CHAIN_ROUTE_IPV6=m
-CONFIG_NFT_DUP_IPV6=m
-CONFIG_NFT_FIB_IPV6=m
-CONFIG_NF_FLOW_TABLE_IPV6=m
 CONFIG_NFT_CHAIN_NAT_IPV6=m
 CONFIG_NFT_MASQ_IPV6=m
 CONFIG_NFT_REDIR_IPV6=m
+CONFIG_NFT_DUP_IPV6=m
+CONFIG_NFT_FIB_IPV6=m
+CONFIG_NF_FLOW_TABLE_IPV6=m
 CONFIG_IP6_NF_IPTABLES=m
 CONFIG_IP6_NF_MATCH_AH=m
 CONFIG_IP6_NF_MATCH_EUI64=m
@@ -259,7 +259,7 @@ CONFIG_IP6_NF_RAW=m
 CONFIG_IP6_NF_NAT=m
 CONFIG_IP6_NF_TARGET_MASQUERADE=m
 CONFIG_IP6_NF_TARGET_NPT=m
-CONFIG_NF_TABLES_BRIDGE=m
+CONFIG_NF_TABLES_BRIDGE=y
 CONFIG_NFT_BRIDGE_META=m
 CONFIG_NFT_BRIDGE_REJECT=m
 CONFIG_NF_LOG_BRIDGE=m
@@ -310,7 +310,6 @@ CONFIG_NET_MPLS_GSO=m
 CONFIG_MPLS_ROUTING=m
 CONFIG_MPLS_IPTUNNEL=m
 CONFIG_NET_NSH=m
-CONFIG_NET_L3_MASTER_DEV=y
 CONFIG_AF_KCM=m
 # CONFIG_WIRELESS is not set
 CONFIG_PSAMPLE=m
@@ -414,6 +413,7 @@ CONFIG_ARIADNE=y
 # CONFIG_NET_VENDOR_MARVELL is not set
 # CONFIG_NET_VENDOR_MICREL is not set
 # CONFIG_NET_VENDOR_NETRONOME is not set
+# CONFIG_NET_VENDOR_NI is not set
 CONFIG_HYDRA=y
 CONFIG_APNE=y
 CONFIG_ZORRO8390=y
@@ -485,6 +485,7 @@ CONFIG_RTC_DRV_MSM6242=m
 CONFIG_RTC_DRV_RP5C01=m
 # CONFIG_VIRTIO_MENU is not set
 # CONFIG_IOMMU_SUPPORT is not set
+CONFIG_DAX=m
 CONFIG_HEARTBEAT=y
 CONFIG_PROC_HARDWARE=y
 CONFIG_AMIGA_BUILTIN_SERIAL=y
@@ -621,6 +622,7 @@ CONFIG_CRYPTO_CRYPTD=m
 CONFIG_CRYPTO_MCRYPTD=m
 CONFIG_CRYPTO_TEST=m
 CONFIG_CRYPTO_CHACHA20POLY1305=m
+CONFIG_CRYPTO_CFB=m
 CONFIG_CRYPTO_LRW=m
 CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_KEYWRAP=m
@@ -647,6 +649,8 @@ CONFIG_CRYPTO_KHAZAD=m
 CONFIG_CRYPTO_SALSA20=m
 CONFIG_CRYPTO_SEED=m
 CONFIG_CRYPTO_SERPENT=m
+CONFIG_CRYPTO_SM4=m
+CONFIG_CRYPTO_SPECK=m
 CONFIG_CRYPTO_TEA=m
 CONFIG_CRYPTO_TWOFISH=m
 CONFIG_CRYPTO_LZO=m
diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig
index 6a46626..8ce39e2 100644
--- a/arch/m68k/configs/apollo_defconfig
+++ b/arch/m68k/configs/apollo_defconfig
@@ -96,8 +96,8 @@ CONFIG_NF_CONNTRACK_SANE=m
 CONFIG_NF_CONNTRACK_SIP=m
 CONFIG_NF_CONNTRACK_TFTP=m
 CONFIG_NF_TABLES=m
-CONFIG_NF_TABLES_INET=m
-CONFIG_NF_TABLES_NETDEV=m
+CONFIG_NF_TABLES_INET=y
+CONFIG_NF_TABLES_NETDEV=y
 CONFIG_NFT_EXTHDR=m
 CONFIG_NFT_META=m
 CONFIG_NFT_RT=m
@@ -202,7 +202,7 @@ CONFIG_NF_SOCKET_IPV4=m
 CONFIG_NFT_CHAIN_ROUTE_IPV4=m
 CONFIG_NFT_DUP_IPV4=m
 CONFIG_NFT_FIB_IPV4=m
-CONFIG_NF_TABLES_ARP=m
+CONFIG_NF_TABLES_ARP=y
 CONFIG_NF_FLOW_TABLE_IPV4=m
 CONFIG_NF_LOG_ARP=m
 CONFIG_NFT_CHAIN_NAT_IPV4=m
@@ -231,12 +231,12 @@ CONFIG_IP_NF_ARP_MANGLE=m
 CONFIG_NF_CONNTRACK_IPV6=m
 CONFIG_NF_SOCKET_IPV6=m
 CONFIG_NFT_CHAIN_ROUTE_IPV6=m
-CONFIG_NFT_DUP_IPV6=m
-CONFIG_NFT_FIB_IPV6=m
-CONFIG_NF_FLOW_TABLE_IPV6=m
 CONFIG_NFT_CHAIN_NAT_IPV6=m
 CONFIG_NFT_MASQ_IPV6=m
 CONFIG_NFT_REDIR_IPV6=m
+CONFIG_NFT_DUP_IPV6=m
+CONFIG_NFT_FIB_IPV6=m
+CONFIG_NF_FLOW_TABLE_IPV6=m
 CONFIG_IP6_NF_IPTABLES=m
 CONFIG_IP6_NF_MATCH_AH=m
 CONFIG_IP6_NF_MATCH_EUI64=m
@@ -257,7 +257,7 @@ CONFIG_IP6_NF_RAW=m
 CONFIG_IP6_NF_NAT=m
 CONFIG_IP6_NF_TARGET_MASQUERADE=m
 CONFIG_IP6_NF_TARGET_NPT=m
-CONFIG_NF_TABLES_BRIDGE=m
+CONFIG_NF_TABLES_BRIDGE=y
 CONFIG_NFT_BRIDGE_META=m
 CONFIG_NFT_BRIDGE_REJECT=m
 CONFIG_NF_LOG_BRIDGE=m
@@ -308,7 +308,6 @@ CONFIG_NET_MPLS_GSO=m
 CONFIG_MPLS_ROUTING=m
 CONFIG_MPLS_IPTUNNEL=m
 CONFIG_NET_NSH=m
-CONFIG_NET_L3_MASTER_DEV=y
 CONFIG_AF_KCM=m
 # CONFIG_WIRELESS is not set
 CONFIG_PSAMPLE=m
@@ -392,6 +391,7 @@ CONFIG_VETH=m
 # CONFIG_NET_VENDOR_MICREL is not set
 # CONFIG_NET_VENDOR_NATSEMI is not set
 # CONFIG_NET_VENDOR_NETRONOME is not set
+# CONFIG_NET_VENDOR_NI is not set
 # CONFIG_NET_VENDOR_QUALCOMM is not set
 # CONFIG_NET_VENDOR_RENESAS is not set
 # CONFIG_NET_VENDOR_ROCKER is not set
@@ -446,6 +446,7 @@ CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_GENERIC=m
 # CONFIG_VIRTIO_MENU is not set
 # CONFIG_IOMMU_SUPPORT is not set
+CONFIG_DAX=m
 CONFIG_HEARTBEAT=y
 CONFIG_PROC_HARDWARE=y
 CONFIG_EXT4_FS=y
@@ -580,6 +581,7 @@ CONFIG_CRYPTO_CRYPTD=m
 CONFIG_CRYPTO_MCRYPTD=m
 CONFIG_CRYPTO_TEST=m
 CONFIG_CRYPTO_CHACHA20POLY1305=m
+CONFIG_CRYPTO_CFB=m
 CONFIG_CRYPTO_LRW=m
 CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_KEYWRAP=m
@@ -606,6 +608,8 @@ CONFIG_CRYPTO_KHAZAD=m
 CONFIG_CRYPTO_SALSA20=m
 CONFIG_CRYPTO_SEED=m
 CONFIG_CRYPTO_SERPENT=m
+CONFIG_CRYPTO_SM4=m
+CONFIG_CRYPTO_SPECK=m
 CONFIG_CRYPTO_TEA=m
 CONFIG_CRYPTO_TWOFISH=m
 CONFIG_CRYPTO_LZO=m
diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig
index b0691a7..346c4e7 100644
--- a/arch/m68k/configs/atari_defconfig
+++ b/arch/m68k/configs/atari_defconfig
@@ -96,8 +96,8 @@ CONFIG_NF_CONNTRACK_SANE=m
 CONFIG_NF_CONNTRACK_SIP=m
 CONFIG_NF_CONNTRACK_TFTP=m
 CONFIG_NF_TABLES=m
-CONFIG_NF_TABLES_INET=m
-CONFIG_NF_TABLES_NETDEV=m
+CONFIG_NF_TABLES_INET=y
+CONFIG_NF_TABLES_NETDEV=y
 CONFIG_NFT_EXTHDR=m
 CONFIG_NFT_META=m
 CONFIG_NFT_RT=m
@@ -202,7 +202,7 @@ CONFIG_NF_SOCKET_IPV4=m
 CONFIG_NFT_CHAIN_ROUTE_IPV4=m
 CONFIG_NFT_DUP_IPV4=m
 CONFIG_NFT_FIB_IPV4=m
-CONFIG_NF_TABLES_ARP=m
+CONFIG_NF_TABLES_ARP=y
 CONFIG_NF_FLOW_TABLE_IPV4=m
 CONFIG_NF_LOG_ARP=m
 CONFIG_NFT_CHAIN_NAT_IPV4=m
@@ -231,12 +231,12 @@ CONFIG_IP_NF_ARP_MANGLE=m
 CONFIG_NF_CONNTRACK_IPV6=m
 CONFIG_NF_SOCKET_IPV6=m
 CONFIG_NFT_CHAIN_ROUTE_IPV6=m
-CONFIG_NFT_DUP_IPV6=m
-CONFIG_NFT_FIB_IPV6=m
-CONFIG_NF_FLOW_TABLE_IPV6=m
 CONFIG_NFT_CHAIN_NAT_IPV6=m
 CONFIG_NFT_MASQ_IPV6=m
 CONFIG_NFT_REDIR_IPV6=m
+CONFIG_NFT_DUP_IPV6=m
+CONFIG_NFT_FIB_IPV6=m
+CONFIG_NF_FLOW_TABLE_IPV6=m
 CONFIG_IP6_NF_IPTABLES=m
 CONFIG_IP6_NF_MATCH_AH=m
 CONFIG_IP6_NF_MATCH_EUI64=m
@@ -257,7 +257,7 @@ CONFIG_IP6_NF_RAW=m
 CONFIG_IP6_NF_NAT=m
 CONFIG_IP6_NF_TARGET_MASQUERADE=m
 CONFIG_IP6_NF_TARGET_NPT=m
-CONFIG_NF_TABLES_BRIDGE=m
+CONFIG_NF_TABLES_BRIDGE=y
 CONFIG_NFT_BRIDGE_META=m
 CONFIG_NFT_BRIDGE_REJECT=m
 CONFIG_NF_LOG_BRIDGE=m
@@ -308,7 +308,6 @@ CONFIG_NET_MPLS_GSO=m
 CONFIG_MPLS_ROUTING=m
 CONFIG_MPLS_IPTUNNEL=m
 CONFIG_NET_NSH=m
-CONFIG_NET_L3_MASTER_DEV=y
 CONFIG_AF_KCM=m
 # CONFIG_WIRELESS is not set
 CONFIG_PSAMPLE=m
@@ -401,6 +400,7 @@ CONFIG_ATARILANCE=y
 # CONFIG_NET_VENDOR_MARVELL is not set
 # CONFIG_NET_VENDOR_MICREL is not set
 # CONFIG_NET_VENDOR_NETRONOME is not set
+# CONFIG_NET_VENDOR_NI is not set
 CONFIG_NE2000=y
 # CONFIG_NET_VENDOR_QUALCOMM is not set
 # CONFIG_NET_VENDOR_RENESAS is not set
@@ -461,6 +461,7 @@ CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_GENERIC=m
 # CONFIG_VIRTIO_MENU is not set
 # CONFIG_IOMMU_SUPPORT is not set
+CONFIG_DAX=m
 CONFIG_HEARTBEAT=y
 CONFIG_PROC_HARDWARE=y
 CONFIG_NATFEAT=y
@@ -602,6 +603,7 @@ CONFIG_CRYPTO_CRYPTD=m
 CONFIG_CRYPTO_MCRYPTD=m
 CONFIG_CRYPTO_TEST=m
 CONFIG_CRYPTO_CHACHA20POLY1305=m
+CONFIG_CRYPTO_CFB=m
 CONFIG_CRYPTO_LRW=m
 CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_KEYWRAP=m
@@ -628,6 +630,8 @@ CONFIG_CRYPTO_KHAZAD=m
 CONFIG_CRYPTO_SALSA20=m
 CONFIG_CRYPTO_SEED=m
 CONFIG_CRYPTO_SERPENT=m
+CONFIG_CRYPTO_SM4=m
+CONFIG_CRYPTO_SPECK=m
 CONFIG_CRYPTO_TEA=m
 CONFIG_CRYPTO_TWOFISH=m
 CONFIG_CRYPTO_LZO=m
diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig
index 6f6470f..fca9c7a 100644
--- a/arch/m68k/configs/bvme6000_defconfig
+++ b/arch/m68k/configs/bvme6000_defconfig
@@ -94,8 +94,8 @@ CONFIG_NF_CONNTRACK_SANE=m
 CONFIG_NF_CONNTRACK_SIP=m
 CONFIG_NF_CONNTRACK_TFTP=m
 CONFIG_NF_TABLES=m
-CONFIG_NF_TABLES_INET=m
-CONFIG_NF_TABLES_NETDEV=m
+CONFIG_NF_TABLES_INET=y
+CONFIG_NF_TABLES_NETDEV=y
 CONFIG_NFT_EXTHDR=m
 CONFIG_NFT_META=m
 CONFIG_NFT_RT=m
@@ -200,7 +200,7 @@ CONFIG_NF_SOCKET_IPV4=m
 CONFIG_NFT_CHAIN_ROUTE_IPV4=m
 CONFIG_NFT_DUP_IPV4=m
 CONFIG_NFT_FIB_IPV4=m
-CONFIG_NF_TABLES_ARP=m
+CONFIG_NF_TABLES_ARP=y
 CONFIG_NF_FLOW_TABLE_IPV4=m
 CONFIG_NF_LOG_ARP=m
 CONFIG_NFT_CHAIN_NAT_IPV4=m
@@ -229,12 +229,12 @@ CONFIG_IP_NF_ARP_MANGLE=m
 CONFIG_NF_CONNTRACK_IPV6=m
 CONFIG_NF_SOCKET_IPV6=m
 CONFIG_NFT_CHAIN_ROUTE_IPV6=m
-CONFIG_NFT_DUP_IPV6=m
-CONFIG_NFT_FIB_IPV6=m
-CONFIG_NF_FLOW_TABLE_IPV6=m
 CONFIG_NFT_CHAIN_NAT_IPV6=m
 CONFIG_NFT_MASQ_IPV6=m
 CONFIG_NFT_REDIR_IPV6=m
+CONFIG_NFT_DUP_IPV6=m
+CONFIG_NFT_FIB_IPV6=m
+CONFIG_NF_FLOW_TABLE_IPV6=m
 CONFIG_IP6_NF_IPTABLES=m
 CONFIG_IP6_NF_MATCH_AH=m
 CONFIG_IP6_NF_MATCH_EUI64=m
@@ -255,7 +255,7 @@ CONFIG_IP6_NF_RAW=m
 CONFIG_IP6_NF_NAT=m
 CONFIG_IP6_NF_TARGET_MASQUERADE=m
 CONFIG_IP6_NF_TARGET_NPT=m
-CONFIG_NF_TABLES_BRIDGE=m
+CONFIG_NF_TABLES_BRIDGE=y
 CONFIG_NFT_BRIDGE_META=m
 CONFIG_NFT_BRIDGE_REJECT=m
 CONFIG_NF_LOG_BRIDGE=m
@@ -306,7 +306,6 @@ CONFIG_NET_MPLS_GSO=m
 CONFIG_MPLS_ROUTING=m
 CONFIG_MPLS_IPTUNNEL=m
 CONFIG_NET_NSH=m
-CONFIG_NET_L3_MASTER_DEV=y
 CONFIG_AF_KCM=m
 # CONFIG_WIRELESS is not set
 CONFIG_PSAMPLE=m
@@ -391,6 +390,7 @@ CONFIG_BVME6000_NET=y
 # CONFIG_NET_VENDOR_MICREL is not set
 # CONFIG_NET_VENDOR_NATSEMI is not set
 # CONFIG_NET_VENDOR_NETRONOME is not set
+# CONFIG_NET_VENDOR_NI is not set
 # CONFIG_NET_VENDOR_QUALCOMM is not set
 # CONFIG_NET_VENDOR_RENESAS is not set
 # CONFIG_NET_VENDOR_ROCKER is not set
@@ -439,6 +439,7 @@ CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_GENERIC=m
 # CONFIG_VIRTIO_MENU is not set
 # CONFIG_IOMMU_SUPPORT is not set
+CONFIG_DAX=m
 CONFIG_PROC_HARDWARE=y
 CONFIG_EXT4_FS=y
 CONFIG_REISERFS_FS=m
@@ -572,6 +573,7 @@ CONFIG_CRYPTO_CRYPTD=m
 CONFIG_CRYPTO_MCRYPTD=m
 CONFIG_CRYPTO_TEST=m
 CONFIG_CRYPTO_CHACHA20POLY1305=m
+CONFIG_CRYPTO_CFB=m
 CONFIG_CRYPTO_LRW=m
 CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_KEYWRAP=m
@@ -598,6 +600,8 @@ CONFIG_CRYPTO_KHAZAD=m
 CONFIG_CRYPTO_SALSA20=m
 CONFIG_CRYPTO_SEED=m
 CONFIG_CRYPTO_SERPENT=m
+CONFIG_CRYPTO_SM4=m
+CONFIG_CRYPTO_SPECK=m
 CONFIG_CRYPTO_TEA=m
 CONFIG_CRYPTO_TWOFISH=m
 CONFIG_CRYPTO_LZO=m
diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig
index 31a1a2b5..f9eab17 100644
--- a/arch/m68k/configs/hp300_defconfig
+++ b/arch/m68k/configs/hp300_defconfig
@@ -96,8 +96,8 @@ CONFIG_NF_CONNTRACK_SANE=m
 CONFIG_NF_CONNTRACK_SIP=m
 CONFIG_NF_CONNTRACK_TFTP=m
 CONFIG_NF_TABLES=m
-CONFIG_NF_TABLES_INET=m
-CONFIG_NF_TABLES_NETDEV=m
+CONFIG_NF_TABLES_INET=y
+CONFIG_NF_TABLES_NETDEV=y
 CONFIG_NFT_EXTHDR=m
 CONFIG_NFT_META=m
 CONFIG_NFT_RT=m
@@ -202,7 +202,7 @@ CONFIG_NF_SOCKET_IPV4=m
 CONFIG_NFT_CHAIN_ROUTE_IPV4=m
 CONFIG_NFT_DUP_IPV4=m
 CONFIG_NFT_FIB_IPV4=m
-CONFIG_NF_TABLES_ARP=m
+CONFIG_NF_TABLES_ARP=y
 CONFIG_NF_FLOW_TABLE_IPV4=m
 CONFIG_NF_LOG_ARP=m
 CONFIG_NFT_CHAIN_NAT_IPV4=m
@@ -231,12 +231,12 @@ CONFIG_IP_NF_ARP_MANGLE=m
 CONFIG_NF_CONNTRACK_IPV6=m
 CONFIG_NF_SOCKET_IPV6=m
 CONFIG_NFT_CHAIN_ROUTE_IPV6=m
-CONFIG_NFT_DUP_IPV6=m
-CONFIG_NFT_FIB_IPV6=m
-CONFIG_NF_FLOW_TABLE_IPV6=m
 CONFIG_NFT_CHAIN_NAT_IPV6=m
 CONFIG_NFT_MASQ_IPV6=m
 CONFIG_NFT_REDIR_IPV6=m
+CONFIG_NFT_DUP_IPV6=m
+CONFIG_NFT_FIB_IPV6=m
+CONFIG_NF_FLOW_TABLE_IPV6=m
 CONFIG_IP6_NF_IPTABLES=m
 CONFIG_IP6_NF_MATCH_AH=m
 CONFIG_IP6_NF_MATCH_EUI64=m
@@ -257,7 +257,7 @@ CONFIG_IP6_NF_RAW=m
 CONFIG_IP6_NF_NAT=m
 CONFIG_IP6_NF_TARGET_MASQUERADE=m
 CONFIG_IP6_NF_TARGET_NPT=m
-CONFIG_NF_TABLES_BRIDGE=m
+CONFIG_NF_TABLES_BRIDGE=y
 CONFIG_NFT_BRIDGE_META=m
 CONFIG_NFT_BRIDGE_REJECT=m
 CONFIG_NF_LOG_BRIDGE=m
@@ -308,7 +308,6 @@ CONFIG_NET_MPLS_GSO=m
 CONFIG_MPLS_ROUTING=m
 CONFIG_MPLS_IPTUNNEL=m
 CONFIG_NET_NSH=m
-CONFIG_NET_L3_MASTER_DEV=y
 CONFIG_AF_KCM=m
 # CONFIG_WIRELESS is not set
 CONFIG_PSAMPLE=m
@@ -393,6 +392,7 @@ CONFIG_HPLANCE=y
 # CONFIG_NET_VENDOR_MICREL is not set
 # CONFIG_NET_VENDOR_NATSEMI is not set
 # CONFIG_NET_VENDOR_NETRONOME is not set
+# CONFIG_NET_VENDOR_NI is not set
 # CONFIG_NET_VENDOR_QUALCOMM is not set
 # CONFIG_NET_VENDOR_RENESAS is not set
 # CONFIG_NET_VENDOR_ROCKER is not set
@@ -449,6 +449,7 @@ CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_GENERIC=m
 # CONFIG_VIRTIO_MENU is not set
 # CONFIG_IOMMU_SUPPORT is not set
+CONFIG_DAX=m
 CONFIG_PROC_HARDWARE=y
 CONFIG_EXT4_FS=y
 CONFIG_REISERFS_FS=m
@@ -582,6 +583,7 @@ CONFIG_CRYPTO_CRYPTD=m
 CONFIG_CRYPTO_MCRYPTD=m
 CONFIG_CRYPTO_TEST=m
 CONFIG_CRYPTO_CHACHA20POLY1305=m
+CONFIG_CRYPTO_CFB=m
 CONFIG_CRYPTO_LRW=m
 CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_KEYWRAP=m
@@ -608,6 +610,8 @@ CONFIG_CRYPTO_KHAZAD=m
 CONFIG_CRYPTO_SALSA20=m
 CONFIG_CRYPTO_SEED=m
 CONFIG_CRYPTO_SERPENT=m
+CONFIG_CRYPTO_SM4=m
+CONFIG_CRYPTO_SPECK=m
 CONFIG_CRYPTO_TEA=m
 CONFIG_CRYPTO_TWOFISH=m
 CONFIG_CRYPTO_LZO=m
diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig
index 390d4a8..b52e597 100644
--- a/arch/m68k/configs/mac_defconfig
+++ b/arch/m68k/configs/mac_defconfig
@@ -95,8 +95,8 @@ CONFIG_NF_CONNTRACK_SANE=m
 CONFIG_NF_CONNTRACK_SIP=m
 CONFIG_NF_CONNTRACK_TFTP=m
 CONFIG_NF_TABLES=m
-CONFIG_NF_TABLES_INET=m
-CONFIG_NF_TABLES_NETDEV=m
+CONFIG_NF_TABLES_INET=y
+CONFIG_NF_TABLES_NETDEV=y
 CONFIG_NFT_EXTHDR=m
 CONFIG_NFT_META=m
 CONFIG_NFT_RT=m
@@ -201,7 +201,7 @@ CONFIG_NF_SOCKET_IPV4=m
 CONFIG_NFT_CHAIN_ROUTE_IPV4=m
 CONFIG_NFT_DUP_IPV4=m
 CONFIG_NFT_FIB_IPV4=m
-CONFIG_NF_TABLES_ARP=m
+CONFIG_NF_TABLES_ARP=y
 CONFIG_NF_FLOW_TABLE_IPV4=m
 CONFIG_NF_LOG_ARP=m
 CONFIG_NFT_CHAIN_NAT_IPV4=m
@@ -230,12 +230,12 @@ CONFIG_IP_NF_ARP_MANGLE=m
 CONFIG_NF_CONNTRACK_IPV6=m
 CONFIG_NF_SOCKET_IPV6=m
 CONFIG_NFT_CHAIN_ROUTE_IPV6=m
-CONFIG_NFT_DUP_IPV6=m
-CONFIG_NFT_FIB_IPV6=m
-CONFIG_NF_FLOW_TABLE_IPV6=m
 CONFIG_NFT_CHAIN_NAT_IPV6=m
 CONFIG_NFT_MASQ_IPV6=m
 CONFIG_NFT_REDIR_IPV6=m
+CONFIG_NFT_DUP_IPV6=m
+CONFIG_NFT_FIB_IPV6=m
+CONFIG_NF_FLOW_TABLE_IPV6=m
 CONFIG_IP6_NF_IPTABLES=m
 CONFIG_IP6_NF_MATCH_AH=m
 CONFIG_IP6_NF_MATCH_EUI64=m
@@ -256,7 +256,7 @@ CONFIG_IP6_NF_RAW=m
 CONFIG_IP6_NF_NAT=m
 CONFIG_IP6_NF_TARGET_MASQUERADE=m
 CONFIG_IP6_NF_TARGET_NPT=m
-CONFIG_NF_TABLES_BRIDGE=m
+CONFIG_NF_TABLES_BRIDGE=y
 CONFIG_NFT_BRIDGE_META=m
 CONFIG_NFT_BRIDGE_REJECT=m
 CONFIG_NF_LOG_BRIDGE=m
@@ -310,7 +310,6 @@ CONFIG_NET_MPLS_GSO=m
 CONFIG_MPLS_ROUTING=m
 CONFIG_MPLS_IPTUNNEL=m
 CONFIG_NET_NSH=m
-CONFIG_NET_L3_MASTER_DEV=y
 CONFIG_AF_KCM=m
 # CONFIG_WIRELESS is not set
 CONFIG_PSAMPLE=m
@@ -410,6 +409,7 @@ CONFIG_MAC89x0=y
 # CONFIG_NET_VENDOR_MICREL is not set
 CONFIG_MACSONIC=y
 # CONFIG_NET_VENDOR_NETRONOME is not set
+# CONFIG_NET_VENDOR_NI is not set
 CONFIG_MAC8390=y
 # CONFIG_NET_VENDOR_QUALCOMM is not set
 # CONFIG_NET_VENDOR_RENESAS is not set
@@ -471,6 +471,7 @@ CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_GENERIC=m
 # CONFIG_VIRTIO_MENU is not set
 # CONFIG_IOMMU_SUPPORT is not set
+CONFIG_DAX=m
 CONFIG_PROC_HARDWARE=y
 CONFIG_EXT4_FS=y
 CONFIG_REISERFS_FS=m
@@ -604,6 +605,7 @@ CONFIG_CRYPTO_CRYPTD=m
 CONFIG_CRYPTO_MCRYPTD=m
 CONFIG_CRYPTO_TEST=m
 CONFIG_CRYPTO_CHACHA20POLY1305=m
+CONFIG_CRYPTO_CFB=m
 CONFIG_CRYPTO_LRW=m
 CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_KEYWRAP=m
@@ -630,6 +632,8 @@ CONFIG_CRYPTO_KHAZAD=m
 CONFIG_CRYPTO_SALSA20=m
 CONFIG_CRYPTO_SEED=m
 CONFIG_CRYPTO_SERPENT=m
+CONFIG_CRYPTO_SM4=m
+CONFIG_CRYPTO_SPECK=m
 CONFIG_CRYPTO_TEA=m
 CONFIG_CRYPTO_TWOFISH=m
 CONFIG_CRYPTO_LZO=m
diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig
index 77be97d..2a84eee 100644
--- a/arch/m68k/configs/multi_defconfig
+++ b/arch/m68k/configs/multi_defconfig
@@ -105,8 +105,8 @@ CONFIG_NF_CONNTRACK_SANE=m
 CONFIG_NF_CONNTRACK_SIP=m
 CONFIG_NF_CONNTRACK_TFTP=m
 CONFIG_NF_TABLES=m
-CONFIG_NF_TABLES_INET=m
-CONFIG_NF_TABLES_NETDEV=m
+CONFIG_NF_TABLES_INET=y
+CONFIG_NF_TABLES_NETDEV=y
 CONFIG_NFT_EXTHDR=m
 CONFIG_NFT_META=m
 CONFIG_NFT_RT=m
@@ -211,7 +211,7 @@ CONFIG_NF_SOCKET_IPV4=m
 CONFIG_NFT_CHAIN_ROUTE_IPV4=m
 CONFIG_NFT_DUP_IPV4=m
 CONFIG_NFT_FIB_IPV4=m
-CONFIG_NF_TABLES_ARP=m
+CONFIG_NF_TABLES_ARP=y
 CONFIG_NF_FLOW_TABLE_IPV4=m
 CONFIG_NF_LOG_ARP=m
 CONFIG_NFT_CHAIN_NAT_IPV4=m
@@ -240,12 +240,12 @@ CONFIG_IP_NF_ARP_MANGLE=m
 CONFIG_NF_CONNTRACK_IPV6=m
 CONFIG_NF_SOCKET_IPV6=m
 CONFIG_NFT_CHAIN_ROUTE_IPV6=m
-CONFIG_NFT_DUP_IPV6=m
-CONFIG_NFT_FIB_IPV6=m
-CONFIG_NF_FLOW_TABLE_IPV6=m
 CONFIG_NFT_CHAIN_NAT_IPV6=m
 CONFIG_NFT_MASQ_IPV6=m
 CONFIG_NFT_REDIR_IPV6=m
+CONFIG_NFT_DUP_IPV6=m
+CONFIG_NFT_FIB_IPV6=m
+CONFIG_NF_FLOW_TABLE_IPV6=m
 CONFIG_IP6_NF_IPTABLES=m
 CONFIG_IP6_NF_MATCH_AH=m
 CONFIG_IP6_NF_MATCH_EUI64=m
@@ -266,7 +266,7 @@ CONFIG_IP6_NF_RAW=m
 CONFIG_IP6_NF_NAT=m
 CONFIG_IP6_NF_TARGET_MASQUERADE=m
 CONFIG_IP6_NF_TARGET_NPT=m
-CONFIG_NF_TABLES_BRIDGE=m
+CONFIG_NF_TABLES_BRIDGE=y
 CONFIG_NFT_BRIDGE_META=m
 CONFIG_NFT_BRIDGE_REJECT=m
 CONFIG_NF_LOG_BRIDGE=m
@@ -320,7 +320,6 @@ CONFIG_NET_MPLS_GSO=m
 CONFIG_MPLS_ROUTING=m
 CONFIG_MPLS_IPTUNNEL=m
 CONFIG_NET_NSH=m
-CONFIG_NET_L3_MASTER_DEV=y
 CONFIG_AF_KCM=m
 # CONFIG_WIRELESS is not set
 CONFIG_PSAMPLE=m
@@ -452,6 +451,7 @@ CONFIG_MVME16x_NET=y
 # CONFIG_NET_VENDOR_MICREL is not set
 CONFIG_MACSONIC=y
 # CONFIG_NET_VENDOR_NETRONOME is not set
+# CONFIG_NET_VENDOR_NI is not set
 CONFIG_HYDRA=y
 CONFIG_MAC8390=y
 CONFIG_NE2000=y
@@ -541,6 +541,7 @@ CONFIG_RTC_DRV_RP5C01=m
 CONFIG_RTC_DRV_GENERIC=m
 # CONFIG_VIRTIO_MENU is not set
 # CONFIG_IOMMU_SUPPORT is not set
+CONFIG_DAX=m
 CONFIG_HEARTBEAT=y
 CONFIG_PROC_HARDWARE=y
 CONFIG_NATFEAT=y
@@ -684,6 +685,7 @@ CONFIG_CRYPTO_CRYPTD=m
 CONFIG_CRYPTO_MCRYPTD=m
 CONFIG_CRYPTO_TEST=m
 CONFIG_CRYPTO_CHACHA20POLY1305=m
+CONFIG_CRYPTO_CFB=m
 CONFIG_CRYPTO_LRW=m
 CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_KEYWRAP=m
@@ -710,6 +712,8 @@ CONFIG_CRYPTO_KHAZAD=m
 CONFIG_CRYPTO_SALSA20=m
 CONFIG_CRYPTO_SEED=m
 CONFIG_CRYPTO_SERPENT=m
+CONFIG_CRYPTO_SM4=m
+CONFIG_CRYPTO_SPECK=m
 CONFIG_CRYPTO_TEA=m
 CONFIG_CRYPTO_TWOFISH=m
 CONFIG_CRYPTO_LZO=m
diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig
index 2ca1407..476e699 100644
--- a/arch/m68k/configs/mvme147_defconfig
+++ b/arch/m68k/configs/mvme147_defconfig
@@ -93,8 +93,8 @@ CONFIG_NF_CONNTRACK_SANE=m
 CONFIG_NF_CONNTRACK_SIP=m
 CONFIG_NF_CONNTRACK_TFTP=m
 CONFIG_NF_TABLES=m
-CONFIG_NF_TABLES_INET=m
-CONFIG_NF_TABLES_NETDEV=m
+CONFIG_NF_TABLES_INET=y
+CONFIG_NF_TABLES_NETDEV=y
 CONFIG_NFT_EXTHDR=m
 CONFIG_NFT_META=m
 CONFIG_NFT_RT=m
@@ -199,7 +199,7 @@ CONFIG_NF_SOCKET_IPV4=m
 CONFIG_NFT_CHAIN_ROUTE_IPV4=m
 CONFIG_NFT_DUP_IPV4=m
 CONFIG_NFT_FIB_IPV4=m
-CONFIG_NF_TABLES_ARP=m
+CONFIG_NF_TABLES_ARP=y
 CONFIG_NF_FLOW_TABLE_IPV4=m
 CONFIG_NF_LOG_ARP=m
 CONFIG_NFT_CHAIN_NAT_IPV4=m
@@ -228,12 +228,12 @@ CONFIG_IP_NF_ARP_MANGLE=m
 CONFIG_NF_CONNTRACK_IPV6=m
 CONFIG_NF_SOCKET_IPV6=m
 CONFIG_NFT_CHAIN_ROUTE_IPV6=m
-CONFIG_NFT_DUP_IPV6=m
-CONFIG_NFT_FIB_IPV6=m
-CONFIG_NF_FLOW_TABLE_IPV6=m
 CONFIG_NFT_CHAIN_NAT_IPV6=m
 CONFIG_NFT_MASQ_IPV6=m
 CONFIG_NFT_REDIR_IPV6=m
+CONFIG_NFT_DUP_IPV6=m
+CONFIG_NFT_FIB_IPV6=m
+CONFIG_NF_FLOW_TABLE_IPV6=m
 CONFIG_IP6_NF_IPTABLES=m
 CONFIG_IP6_NF_MATCH_AH=m
 CONFIG_IP6_NF_MATCH_EUI64=m
@@ -254,7 +254,7 @@ CONFIG_IP6_NF_RAW=m
 CONFIG_IP6_NF_NAT=m
 CONFIG_IP6_NF_TARGET_MASQUERADE=m
 CONFIG_IP6_NF_TARGET_NPT=m
-CONFIG_NF_TABLES_BRIDGE=m
+CONFIG_NF_TABLES_BRIDGE=y
 CONFIG_NFT_BRIDGE_META=m
 CONFIG_NFT_BRIDGE_REJECT=m
 CONFIG_NF_LOG_BRIDGE=m
@@ -305,7 +305,6 @@ CONFIG_NET_MPLS_GSO=m
 CONFIG_MPLS_ROUTING=m
 CONFIG_MPLS_IPTUNNEL=m
 CONFIG_NET_NSH=m
-CONFIG_NET_L3_MASTER_DEV=y
 CONFIG_AF_KCM=m
 # CONFIG_WIRELESS is not set
 CONFIG_PSAMPLE=m
@@ -391,6 +390,7 @@ CONFIG_MVME147_NET=y
 # CONFIG_NET_VENDOR_MICREL is not set
 # CONFIG_NET_VENDOR_NATSEMI is not set
 # CONFIG_NET_VENDOR_NETRONOME is not set
+# CONFIG_NET_VENDOR_NI is not set
 # CONFIG_NET_VENDOR_QUALCOMM is not set
 # CONFIG_NET_VENDOR_RENESAS is not set
 # CONFIG_NET_VENDOR_ROCKER is not set
@@ -439,6 +439,7 @@ CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_GENERIC=m
 # CONFIG_VIRTIO_MENU is not set
 # CONFIG_IOMMU_SUPPORT is not set
+CONFIG_DAX=m
 CONFIG_PROC_HARDWARE=y
 CONFIG_EXT4_FS=y
 CONFIG_REISERFS_FS=m
@@ -572,6 +573,7 @@ CONFIG_CRYPTO_CRYPTD=m
 CONFIG_CRYPTO_MCRYPTD=m
 CONFIG_CRYPTO_TEST=m
 CONFIG_CRYPTO_CHACHA20POLY1305=m
+CONFIG_CRYPTO_CFB=m
 CONFIG_CRYPTO_LRW=m
 CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_KEYWRAP=m
@@ -598,6 +600,8 @@ CONFIG_CRYPTO_KHAZAD=m
 CONFIG_CRYPTO_SALSA20=m
 CONFIG_CRYPTO_SEED=m
 CONFIG_CRYPTO_SERPENT=m
+CONFIG_CRYPTO_SM4=m
+CONFIG_CRYPTO_SPECK=m
 CONFIG_CRYPTO_TEA=m
 CONFIG_CRYPTO_TWOFISH=m
 CONFIG_CRYPTO_LZO=m
diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig
index 6a3b4dc..1477cda 100644
--- a/arch/m68k/configs/mvme16x_defconfig
+++ b/arch/m68k/configs/mvme16x_defconfig
@@ -94,8 +94,8 @@ CONFIG_NF_CONNTRACK_SANE=m
 CONFIG_NF_CONNTRACK_SIP=m
 CONFIG_NF_CONNTRACK_TFTP=m
 CONFIG_NF_TABLES=m
-CONFIG_NF_TABLES_INET=m
-CONFIG_NF_TABLES_NETDEV=m
+CONFIG_NF_TABLES_INET=y
+CONFIG_NF_TABLES_NETDEV=y
 CONFIG_NFT_EXTHDR=m
 CONFIG_NFT_META=m
 CONFIG_NFT_RT=m
@@ -200,7 +200,7 @@ CONFIG_NF_SOCKET_IPV4=m
 CONFIG_NFT_CHAIN_ROUTE_IPV4=m
 CONFIG_NFT_DUP_IPV4=m
 CONFIG_NFT_FIB_IPV4=m
-CONFIG_NF_TABLES_ARP=m
+CONFIG_NF_TABLES_ARP=y
 CONFIG_NF_FLOW_TABLE_IPV4=m
 CONFIG_NF_LOG_ARP=m
 CONFIG_NFT_CHAIN_NAT_IPV4=m
@@ -229,12 +229,12 @@ CONFIG_IP_NF_ARP_MANGLE=m
 CONFIG_NF_CONNTRACK_IPV6=m
 CONFIG_NF_SOCKET_IPV6=m
 CONFIG_NFT_CHAIN_ROUTE_IPV6=m
-CONFIG_NFT_DUP_IPV6=m
-CONFIG_NFT_FIB_IPV6=m
-CONFIG_NF_FLOW_TABLE_IPV6=m
 CONFIG_NFT_CHAIN_NAT_IPV6=m
 CONFIG_NFT_MASQ_IPV6=m
 CONFIG_NFT_REDIR_IPV6=m
+CONFIG_NFT_DUP_IPV6=m
+CONFIG_NFT_FIB_IPV6=m
+CONFIG_NF_FLOW_TABLE_IPV6=m
 CONFIG_IP6_NF_IPTABLES=m
 CONFIG_IP6_NF_MATCH_AH=m
 CONFIG_IP6_NF_MATCH_EUI64=m
@@ -255,7 +255,7 @@ CONFIG_IP6_NF_RAW=m
 CONFIG_IP6_NF_NAT=m
 CONFIG_IP6_NF_TARGET_MASQUERADE=m
 CONFIG_IP6_NF_TARGET_NPT=m
-CONFIG_NF_TABLES_BRIDGE=m
+CONFIG_NF_TABLES_BRIDGE=y
 CONFIG_NFT_BRIDGE_META=m
 CONFIG_NFT_BRIDGE_REJECT=m
 CONFIG_NF_LOG_BRIDGE=m
@@ -306,7 +306,6 @@ CONFIG_NET_MPLS_GSO=m
 CONFIG_MPLS_ROUTING=m
 CONFIG_MPLS_IPTUNNEL=m
 CONFIG_NET_NSH=m
-CONFIG_NET_L3_MASTER_DEV=y
 CONFIG_AF_KCM=m
 # CONFIG_WIRELESS is not set
 CONFIG_PSAMPLE=m
@@ -391,6 +390,7 @@ CONFIG_MVME16x_NET=y
 # CONFIG_NET_VENDOR_MICREL is not set
 # CONFIG_NET_VENDOR_NATSEMI is not set
 # CONFIG_NET_VENDOR_NETRONOME is not set
+# CONFIG_NET_VENDOR_NI is not set
 # CONFIG_NET_VENDOR_QUALCOMM is not set
 # CONFIG_NET_VENDOR_RENESAS is not set
 # CONFIG_NET_VENDOR_ROCKER is not set
@@ -439,6 +439,7 @@ CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_GENERIC=m
 # CONFIG_VIRTIO_MENU is not set
 # CONFIG_IOMMU_SUPPORT is not set
+CONFIG_DAX=m
 CONFIG_PROC_HARDWARE=y
 CONFIG_EXT4_FS=y
 CONFIG_REISERFS_FS=m
@@ -572,6 +573,7 @@ CONFIG_CRYPTO_CRYPTD=m
 CONFIG_CRYPTO_MCRYPTD=m
 CONFIG_CRYPTO_TEST=m
 CONFIG_CRYPTO_CHACHA20POLY1305=m
+CONFIG_CRYPTO_CFB=m
 CONFIG_CRYPTO_LRW=m
 CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_KEYWRAP=m
@@ -598,6 +600,8 @@ CONFIG_CRYPTO_KHAZAD=m
 CONFIG_CRYPTO_SALSA20=m
 CONFIG_CRYPTO_SEED=m
 CONFIG_CRYPTO_SERPENT=m
+CONFIG_CRYPTO_SM4=m
+CONFIG_CRYPTO_SPECK=m
 CONFIG_CRYPTO_TEA=m
 CONFIG_CRYPTO_TWOFISH=m
 CONFIG_CRYPTO_LZO=m
diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig
index 2a3e29c..b3a543d 100644
--- a/arch/m68k/configs/q40_defconfig
+++ b/arch/m68k/configs/q40_defconfig
@@ -94,8 +94,8 @@ CONFIG_NF_CONNTRACK_SANE=m
 CONFIG_NF_CONNTRACK_SIP=m
 CONFIG_NF_CONNTRACK_TFTP=m
 CONFIG_NF_TABLES=m
-CONFIG_NF_TABLES_INET=m
-CONFIG_NF_TABLES_NETDEV=m
+CONFIG_NF_TABLES_INET=y
+CONFIG_NF_TABLES_NETDEV=y
 CONFIG_NFT_EXTHDR=m
 CONFIG_NFT_META=m
 CONFIG_NFT_RT=m
@@ -200,7 +200,7 @@ CONFIG_NF_SOCKET_IPV4=m
 CONFIG_NFT_CHAIN_ROUTE_IPV4=m
 CONFIG_NFT_DUP_IPV4=m
 CONFIG_NFT_FIB_IPV4=m
-CONFIG_NF_TABLES_ARP=m
+CONFIG_NF_TABLES_ARP=y
 CONFIG_NF_FLOW_TABLE_IPV4=m
 CONFIG_NF_LOG_ARP=m
 CONFIG_NFT_CHAIN_NAT_IPV4=m
@@ -229,12 +229,12 @@ CONFIG_IP_NF_ARP_MANGLE=m
 CONFIG_NF_CONNTRACK_IPV6=m
 CONFIG_NF_SOCKET_IPV6=m
 CONFIG_NFT_CHAIN_ROUTE_IPV6=m
-CONFIG_NFT_DUP_IPV6=m
-CONFIG_NFT_FIB_IPV6=m
-CONFIG_NF_FLOW_TABLE_IPV6=m
 CONFIG_NFT_CHAIN_NAT_IPV6=m
 CONFIG_NFT_MASQ_IPV6=m
 CONFIG_NFT_REDIR_IPV6=m
+CONFIG_NFT_DUP_IPV6=m
+CONFIG_NFT_FIB_IPV6=m
+CONFIG_NF_FLOW_TABLE_IPV6=m
 CONFIG_IP6_NF_IPTABLES=m
 CONFIG_IP6_NF_MATCH_AH=m
 CONFIG_IP6_NF_MATCH_EUI64=m
@@ -255,7 +255,7 @@ CONFIG_IP6_NF_RAW=m
 CONFIG_IP6_NF_NAT=m
 CONFIG_IP6_NF_TARGET_MASQUERADE=m
 CONFIG_IP6_NF_TARGET_NPT=m
-CONFIG_NF_TABLES_BRIDGE=m
+CONFIG_NF_TABLES_BRIDGE=y
 CONFIG_NFT_BRIDGE_META=m
 CONFIG_NFT_BRIDGE_REJECT=m
 CONFIG_NF_LOG_BRIDGE=m
@@ -306,7 +306,6 @@ CONFIG_NET_MPLS_GSO=m
 CONFIG_MPLS_ROUTING=m
 CONFIG_MPLS_IPTUNNEL=m
 CONFIG_NET_NSH=m
-CONFIG_NET_L3_MASTER_DEV=y
 CONFIG_AF_KCM=m
 # CONFIG_WIRELESS is not set
 CONFIG_PSAMPLE=m
@@ -400,6 +399,7 @@ CONFIG_VETH=m
 # CONFIG_NET_VENDOR_MARVELL is not set
 # CONFIG_NET_VENDOR_MICREL is not set
 # CONFIG_NET_VENDOR_NETRONOME is not set
+# CONFIG_NET_VENDOR_NI is not set
 CONFIG_NE2000=y
 # CONFIG_NET_VENDOR_QUALCOMM is not set
 # CONFIG_NET_VENDOR_RENESAS is not set
@@ -461,6 +461,7 @@ CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_GENERIC=m
 # CONFIG_VIRTIO_MENU is not set
 # CONFIG_IOMMU_SUPPORT is not set
+CONFIG_DAX=m
 CONFIG_HEARTBEAT=y
 CONFIG_PROC_HARDWARE=y
 CONFIG_EXT4_FS=y
@@ -595,6 +596,7 @@ CONFIG_CRYPTO_CRYPTD=m
 CONFIG_CRYPTO_MCRYPTD=m
 CONFIG_CRYPTO_TEST=m
 CONFIG_CRYPTO_CHACHA20POLY1305=m
+CONFIG_CRYPTO_CFB=m
 CONFIG_CRYPTO_LRW=m
 CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_KEYWRAP=m
@@ -621,6 +623,8 @@ CONFIG_CRYPTO_KHAZAD=m
 CONFIG_CRYPTO_SALSA20=m
 CONFIG_CRYPTO_SEED=m
 CONFIG_CRYPTO_SERPENT=m
+CONFIG_CRYPTO_SM4=m
+CONFIG_CRYPTO_SPECK=m
 CONFIG_CRYPTO_TEA=m
 CONFIG_CRYPTO_TWOFISH=m
 CONFIG_CRYPTO_LZO=m
diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig
index cba2494..d543ed5 100644
--- a/arch/m68k/configs/sun3_defconfig
+++ b/arch/m68k/configs/sun3_defconfig
@@ -91,8 +91,8 @@ CONFIG_NF_CONNTRACK_SANE=m
 CONFIG_NF_CONNTRACK_SIP=m
 CONFIG_NF_CONNTRACK_TFTP=m
 CONFIG_NF_TABLES=m
-CONFIG_NF_TABLES_INET=m
-CONFIG_NF_TABLES_NETDEV=m
+CONFIG_NF_TABLES_INET=y
+CONFIG_NF_TABLES_NETDEV=y
 CONFIG_NFT_EXTHDR=m
 CONFIG_NFT_META=m
 CONFIG_NFT_RT=m
@@ -197,7 +197,7 @@ CONFIG_NF_SOCKET_IPV4=m
 CONFIG_NFT_CHAIN_ROUTE_IPV4=m
 CONFIG_NFT_DUP_IPV4=m
 CONFIG_NFT_FIB_IPV4=m
-CONFIG_NF_TABLES_ARP=m
+CONFIG_NF_TABLES_ARP=y
 CONFIG_NF_FLOW_TABLE_IPV4=m
 CONFIG_NF_LOG_ARP=m
 CONFIG_NFT_CHAIN_NAT_IPV4=m
@@ -226,12 +226,12 @@ CONFIG_IP_NF_ARP_MANGLE=m
 CONFIG_NF_CONNTRACK_IPV6=m
 CONFIG_NF_SOCKET_IPV6=m
 CONFIG_NFT_CHAIN_ROUTE_IPV6=m
-CONFIG_NFT_DUP_IPV6=m
-CONFIG_NFT_FIB_IPV6=m
-CONFIG_NF_FLOW_TABLE_IPV6=m
 CONFIG_NFT_CHAIN_NAT_IPV6=m
 CONFIG_NFT_MASQ_IPV6=m
 CONFIG_NFT_REDIR_IPV6=m
+CONFIG_NFT_DUP_IPV6=m
+CONFIG_NFT_FIB_IPV6=m
+CONFIG_NF_FLOW_TABLE_IPV6=m
 CONFIG_IP6_NF_IPTABLES=m
 CONFIG_IP6_NF_MATCH_AH=m
 CONFIG_IP6_NF_MATCH_EUI64=m
@@ -252,7 +252,7 @@ CONFIG_IP6_NF_RAW=m
 CONFIG_IP6_NF_NAT=m
 CONFIG_IP6_NF_TARGET_MASQUERADE=m
 CONFIG_IP6_NF_TARGET_NPT=m
-CONFIG_NF_TABLES_BRIDGE=m
+CONFIG_NF_TABLES_BRIDGE=y
 CONFIG_NFT_BRIDGE_META=m
 CONFIG_NFT_BRIDGE_REJECT=m
 CONFIG_NF_LOG_BRIDGE=m
@@ -303,7 +303,6 @@ CONFIG_NET_MPLS_GSO=m
 CONFIG_MPLS_ROUTING=m
 CONFIG_MPLS_IPTUNNEL=m
 CONFIG_NET_NSH=m
-CONFIG_NET_L3_MASTER_DEV=y
 CONFIG_AF_KCM=m
 # CONFIG_WIRELESS is not set
 CONFIG_PSAMPLE=m
@@ -388,6 +387,7 @@ CONFIG_SUN3_82586=y
 # CONFIG_NET_VENDOR_MICREL is not set
 # CONFIG_NET_VENDOR_NATSEMI is not set
 # CONFIG_NET_VENDOR_NETRONOME is not set
+# CONFIG_NET_VENDOR_NI is not set
 # CONFIG_NET_VENDOR_QUALCOMM is not set
 # CONFIG_NET_VENDOR_RENESAS is not set
 # CONFIG_NET_VENDOR_ROCKER is not set
@@ -441,6 +441,7 @@ CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_GENERIC=m
 # CONFIG_VIRTIO_MENU is not set
 # CONFIG_IOMMU_SUPPORT is not set
+CONFIG_DAX=m
 CONFIG_PROC_HARDWARE=y
 CONFIG_EXT4_FS=y
 CONFIG_REISERFS_FS=m
@@ -573,6 +574,7 @@ CONFIG_CRYPTO_CRYPTD=m
 CONFIG_CRYPTO_MCRYPTD=m
 CONFIG_CRYPTO_TEST=m
 CONFIG_CRYPTO_CHACHA20POLY1305=m
+CONFIG_CRYPTO_CFB=m
 CONFIG_CRYPTO_LRW=m
 CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_KEYWRAP=m
@@ -599,6 +601,8 @@ CONFIG_CRYPTO_KHAZAD=m
 CONFIG_CRYPTO_SALSA20=m
 CONFIG_CRYPTO_SEED=m
 CONFIG_CRYPTO_SERPENT=m
+CONFIG_CRYPTO_SM4=m
+CONFIG_CRYPTO_SPECK=m
 CONFIG_CRYPTO_TEA=m
 CONFIG_CRYPTO_TWOFISH=m
 CONFIG_CRYPTO_LZO=m
diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig
index d911561..a67e542 100644
--- a/arch/m68k/configs/sun3x_defconfig
+++ b/arch/m68k/configs/sun3x_defconfig
@@ -91,8 +91,8 @@ CONFIG_NF_CONNTRACK_SANE=m
 CONFIG_NF_CONNTRACK_SIP=m
 CONFIG_NF_CONNTRACK_TFTP=m
 CONFIG_NF_TABLES=m
-CONFIG_NF_TABLES_INET=m
-CONFIG_NF_TABLES_NETDEV=m
+CONFIG_NF_TABLES_INET=y
+CONFIG_NF_TABLES_NETDEV=y
 CONFIG_NFT_EXTHDR=m
 CONFIG_NFT_META=m
 CONFIG_NFT_RT=m
@@ -197,7 +197,7 @@ CONFIG_NF_SOCKET_IPV4=m
 CONFIG_NFT_CHAIN_ROUTE_IPV4=m
 CONFIG_NFT_DUP_IPV4=m
 CONFIG_NFT_FIB_IPV4=m
-CONFIG_NF_TABLES_ARP=m
+CONFIG_NF_TABLES_ARP=y
 CONFIG_NF_FLOW_TABLE_IPV4=m
 CONFIG_NF_LOG_ARP=m
 CONFIG_NFT_CHAIN_NAT_IPV4=m
@@ -226,12 +226,12 @@ CONFIG_IP_NF_ARP_MANGLE=m
 CONFIG_NF_CONNTRACK_IPV6=m
 CONFIG_NF_SOCKET_IPV6=m
 CONFIG_NFT_CHAIN_ROUTE_IPV6=m
-CONFIG_NFT_DUP_IPV6=m
-CONFIG_NFT_FIB_IPV6=m
-CONFIG_NF_FLOW_TABLE_IPV6=m
 CONFIG_NFT_CHAIN_NAT_IPV6=m
 CONFIG_NFT_MASQ_IPV6=m
 CONFIG_NFT_REDIR_IPV6=m
+CONFIG_NFT_DUP_IPV6=m
+CONFIG_NFT_FIB_IPV6=m
+CONFIG_NF_FLOW_TABLE_IPV6=m
 CONFIG_IP6_NF_IPTABLES=m
 CONFIG_IP6_NF_MATCH_AH=m
 CONFIG_IP6_NF_MATCH_EUI64=m
@@ -252,7 +252,7 @@ CONFIG_IP6_NF_RAW=m
 CONFIG_IP6_NF_NAT=m
 CONFIG_IP6_NF_TARGET_MASQUERADE=m
 CONFIG_IP6_NF_TARGET_NPT=m
-CONFIG_NF_TABLES_BRIDGE=m
+CONFIG_NF_TABLES_BRIDGE=y
 CONFIG_NFT_BRIDGE_META=m
 CONFIG_NFT_BRIDGE_REJECT=m
 CONFIG_NF_LOG_BRIDGE=m
@@ -303,7 +303,6 @@ CONFIG_NET_MPLS_GSO=m
 CONFIG_MPLS_ROUTING=m
 CONFIG_MPLS_IPTUNNEL=m
 CONFIG_NET_NSH=m
-CONFIG_NET_L3_MASTER_DEV=y
 CONFIG_AF_KCM=m
 # CONFIG_WIRELESS is not set
 CONFIG_PSAMPLE=m
@@ -389,6 +388,7 @@ CONFIG_SUN3LANCE=y
 # CONFIG_NET_VENDOR_MICREL is not set
 # CONFIG_NET_VENDOR_NATSEMI is not set
 # CONFIG_NET_VENDOR_NETRONOME is not set
+# CONFIG_NET_VENDOR_NI is not set
 # CONFIG_NET_VENDOR_QUALCOMM is not set
 # CONFIG_NET_VENDOR_RENESAS is not set
 # CONFIG_NET_VENDOR_ROCKER is not set
@@ -441,6 +441,7 @@ CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_GENERIC=m
 # CONFIG_VIRTIO_MENU is not set
 # CONFIG_IOMMU_SUPPORT is not set
+CONFIG_DAX=m
 CONFIG_PROC_HARDWARE=y
 CONFIG_EXT4_FS=y
 CONFIG_REISERFS_FS=m
@@ -574,6 +575,7 @@ CONFIG_CRYPTO_CRYPTD=m
 CONFIG_CRYPTO_MCRYPTD=m
 CONFIG_CRYPTO_TEST=m
 CONFIG_CRYPTO_CHACHA20POLY1305=m
+CONFIG_CRYPTO_CFB=m
 CONFIG_CRYPTO_LRW=m
 CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_KEYWRAP=m
@@ -600,6 +602,8 @@ CONFIG_CRYPTO_KHAZAD=m
 CONFIG_CRYPTO_SALSA20=m
 CONFIG_CRYPTO_SEED=m
 CONFIG_CRYPTO_SERPENT=m
+CONFIG_CRYPTO_SM4=m
+CONFIG_CRYPTO_SPECK=m
 CONFIG_CRYPTO_TEA=m
 CONFIG_CRYPTO_TWOFISH=m
 CONFIG_CRYPTO_LZO=m
diff --git a/arch/m68k/include/asm/delay.h b/arch/m68k/include/asm/delay.h
index 7f47412..751712f 100644
--- a/arch/m68k/include/asm/delay.h
+++ b/arch/m68k/include/asm/delay.h
@@ -49,8 +49,6 @@ extern void __bad_udelay(void);
  * The simpler m68k and ColdFire processors do not have a 32*32->64
  * multiply instruction. So we need to handle them a little differently.
  * We use a bit of shifting and a single 32*32->32 multiply to get close.
- * This is a macro so that the const version can factor out the first
- * multiply and shift.
  */
 #define	HZSCALE		(268435456 / (1000000 / HZ))
 
@@ -115,6 +113,13 @@ static inline void __udelay(unsigned long usecs)
  */
 #define	HZSCALE		(268435456 / (1000000 / HZ))
 
-#define ndelay(n) __delay(DIV_ROUND_UP((n) * ((((HZSCALE) >> 11) * (loops_per_jiffy >> 11)) >> 6), 1000))
+static inline void ndelay(unsigned long nsec)
+{
+	__delay(DIV_ROUND_UP(nsec *
+			     ((((HZSCALE) >> 11) *
+			       (loops_per_jiffy >> 11)) >> 6),
+			     1000));
+}
+#define ndelay(n) ndelay(n)
 
 #endif /* defined(_M68K_DELAY_H) */
diff --git a/arch/m68k/include/asm/pci.h b/arch/m68k/include/asm/pci.h
index ef26fae..5a4bc22 100644
--- a/arch/m68k/include/asm/pci.h
+++ b/arch/m68k/include/asm/pci.h
@@ -4,12 +4,6 @@
 
 #include <asm-generic/pci.h>
 
-/* The PCI address space does equal the physical memory
- * address space.  The networking and block device layers use
- * this boolean for bounce buffer decisions.
- */
-#define PCI_DMA_BUS_IS_PHYS	(1)
-
 #define	pcibios_assign_all_busses()	1
 
 #define	PCIBIOS_MIN_IO		0x00000100
diff --git a/arch/m68k/include/asm/uaccess_mm.h b/arch/m68k/include/asm/uaccess_mm.h
index 75c172e..c4cb889 100644
--- a/arch/m68k/include/asm/uaccess_mm.h
+++ b/arch/m68k/include/asm/uaccess_mm.h
@@ -141,10 +141,12 @@ asm volatile ("\n"					\
 	case 4:								\
 		__get_user_asm(__gu_err, x, ptr, u32, l, r, -EFAULT);	\
 		break;							\
-/*	case 8:	disabled because gcc-4.1 has a broken typeof		\
- 	    {								\
- 		const void *__gu_ptr = (ptr);				\
- 		u64 __gu_val;						\
+	case 8: {							\
+		const void *__gu_ptr = (ptr);				\
+		union {							\
+			u64 l;						\
+			__typeof__(*(ptr)) t;				\
+		} __gu_val;						\
 		asm volatile ("\n"					\
 			"1:	"MOVES".l	(%2)+,%1\n"		\
 			"2:	"MOVES".l	(%2),%R1\n"		\
@@ -162,13 +164,13 @@ asm volatile ("\n"					\
 			"	.long	1b,10b\n"			\
 			"	.long	2b,10b\n"			\
 			"	.previous"				\
-			: "+d" (__gu_err), "=&r" (__gu_val),		\
+			: "+d" (__gu_err), "=&r" (__gu_val.l),		\
 			  "+a" (__gu_ptr)				\
 			: "i" (-EFAULT)					\
 			: "memory");					\
-		(x) = (__force typeof(*(ptr)))__gu_val;			\
+		(x) = __gu_val.t;					\
 		break;							\
-	    }	*/							\
+	}								\
 	default:							\
 		__gu_err = __get_user_bad();				\
 		break;							\
diff --git a/arch/m68k/kernel/dma.c b/arch/m68k/kernel/dma.c
index c01b9b8..463572c 100644
--- a/arch/m68k/kernel/dma.c
+++ b/arch/m68k/kernel/dma.c
@@ -9,6 +9,7 @@
 #include <linux/dma-mapping.h>
 #include <linux/device.h>
 #include <linux/kernel.h>
+#include <linux/platform_device.h>
 #include <linux/scatterlist.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
@@ -165,3 +166,12 @@ const struct dma_map_ops m68k_dma_ops = {
 	.sync_sg_for_device	= m68k_dma_sync_sg_for_device,
 };
 EXPORT_SYMBOL(m68k_dma_ops);
+
+void arch_setup_pdev_archdata(struct platform_device *pdev)
+{
+	if (pdev->dev.coherent_dma_mask == DMA_MASK_NONE &&
+	    pdev->dev.dma_mask == NULL) {
+		pdev->dev.coherent_dma_mask = DMA_BIT_MASK(32);
+		pdev->dev.dma_mask = &pdev->dev.coherent_dma_mask;
+	}
+}
diff --git a/arch/m68k/kernel/setup_mm.c b/arch/m68k/kernel/setup_mm.c
index dd25bfc..f35e3eb 100644
--- a/arch/m68k/kernel/setup_mm.c
+++ b/arch/m68k/kernel/setup_mm.c
@@ -527,21 +527,9 @@ static int hardware_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int hardware_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, hardware_proc_show, NULL);
-}
-
-static const struct file_operations hardware_proc_fops = {
-	.open		= hardware_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int __init proc_hardware_init(void)
 {
-	proc_create("hardware", 0, NULL, &hardware_proc_fops);
+	proc_create_single("hardware", 0, NULL, hardware_proc_show);
 	return 0;
 }
 module_init(proc_hardware_init);
diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c
index f7cd5ec..72850b8 100644
--- a/arch/m68k/kernel/signal.c
+++ b/arch/m68k/kernel/signal.c
@@ -576,41 +576,42 @@ static inline int rt_save_fpu_state(struct ucontext __user *uc, struct pt_regs *
 
 static inline void siginfo_build_tests(void)
 {
-	/* This needs to be tested on m68k as it has a lesser
-	 * alignment requirment than x86 and that can cause surprises.
+	/*
+	 * This needs to be tested on m68k as it has a lesser
+	 * alignment requirement than x86 and that can cause surprises.
 	 */
 
 	/* This is part of the ABI and can never change in size: */
 	BUILD_BUG_ON(sizeof(siginfo_t) != 128);
 
-	/* Ensure the know fields never change in location */
+	/* Ensure the known fields never change in location */
 	BUILD_BUG_ON(offsetof(siginfo_t, si_signo) != 0);
 	BUILD_BUG_ON(offsetof(siginfo_t, si_errno) != 4);
 	BUILD_BUG_ON(offsetof(siginfo_t, si_code)  != 8);
 
 	/* _kill */
-	BUILD_BUG_ON(offsetof(siginfo_t, si_pid) != 0x0C);
+	BUILD_BUG_ON(offsetof(siginfo_t, si_pid) != 0x0c);
 	BUILD_BUG_ON(offsetof(siginfo_t, si_uid) != 0x10);
 
 	/* _timer */
-	BUILD_BUG_ON(offsetof(siginfo_t, si_tid)     != 0x0C);
+	BUILD_BUG_ON(offsetof(siginfo_t, si_tid)     != 0x0c);
 	BUILD_BUG_ON(offsetof(siginfo_t, si_overrun) != 0x10);
 	BUILD_BUG_ON(offsetof(siginfo_t, si_value)   != 0x14);
 
 	/* _rt */
-	BUILD_BUG_ON(offsetof(siginfo_t, si_pid)   != 0x0C);
+	BUILD_BUG_ON(offsetof(siginfo_t, si_pid)   != 0x0c);
 	BUILD_BUG_ON(offsetof(siginfo_t, si_uid)   != 0x10);
 	BUILD_BUG_ON(offsetof(siginfo_t, si_value) != 0x14);
 
 	/* _sigchld */
-	BUILD_BUG_ON(offsetof(siginfo_t, si_pid)    != 0x0C);
+	BUILD_BUG_ON(offsetof(siginfo_t, si_pid)    != 0x0c);
 	BUILD_BUG_ON(offsetof(siginfo_t, si_uid)    != 0x10);
 	BUILD_BUG_ON(offsetof(siginfo_t, si_status) != 0x14);
 	BUILD_BUG_ON(offsetof(siginfo_t, si_utime)  != 0x18);
-	BUILD_BUG_ON(offsetof(siginfo_t, si_stime)  != 0x1C);
+	BUILD_BUG_ON(offsetof(siginfo_t, si_stime)  != 0x1c);
 
 	/* _sigfault */
-	BUILD_BUG_ON(offsetof(siginfo_t, si_addr) != 0x0C);
+	BUILD_BUG_ON(offsetof(siginfo_t, si_addr) != 0x0c);
 
 	/* _sigfault._mcerr */
 	BUILD_BUG_ON(offsetof(siginfo_t, si_addr_lsb) != 0x10);
@@ -623,11 +624,11 @@ static inline void siginfo_build_tests(void)
 	BUILD_BUG_ON(offsetof(siginfo_t, si_pkey) != 0x12);
 
 	/* _sigpoll */
-	BUILD_BUG_ON(offsetof(siginfo_t, si_band)   != 0x0C);
+	BUILD_BUG_ON(offsetof(siginfo_t, si_band)   != 0x0c);
 	BUILD_BUG_ON(offsetof(siginfo_t, si_fd)     != 0x10);
 
 	/* _sigsys */
-	BUILD_BUG_ON(offsetof(siginfo_t, si_call_addr) != 0x0C);
+	BUILD_BUG_ON(offsetof(siginfo_t, si_call_addr) != 0x0c);
 	BUILD_BUG_ON(offsetof(siginfo_t, si_syscall)   != 0x10);
 	BUILD_BUG_ON(offsetof(siginfo_t, si_arch)      != 0x14);
 
diff --git a/arch/m68k/kernel/time.c b/arch/m68k/kernel/time.c
index 97dd4e2..3a8b47f 100644
--- a/arch/m68k/kernel/time.c
+++ b/arch/m68k/kernel/time.c
@@ -71,23 +71,26 @@ static irqreturn_t timer_interrupt(int irq, void *dummy)
 	return IRQ_HANDLED;
 }
 
-void read_persistent_clock(struct timespec *ts)
+#ifdef CONFIG_M68KCLASSIC
+#if !IS_BUILTIN(CONFIG_RTC_DRV_GENERIC)
+void read_persistent_clock64(struct timespec64 *ts)
 {
 	struct rtc_time time;
+
 	ts->tv_sec = 0;
 	ts->tv_nsec = 0;
 
-	if (mach_hwclk) {
-		mach_hwclk(0, &time);
+	if (!mach_hwclk)
+		return;
 
-		if ((time.tm_year += 1900) < 1970)
-			time.tm_year += 100;
-		ts->tv_sec = mktime(time.tm_year, time.tm_mon, time.tm_mday,
-				      time.tm_hour, time.tm_min, time.tm_sec);
-	}
+	mach_hwclk(0, &time);
+
+	ts->tv_sec = mktime64(time.tm_year + 1900, time.tm_mon + 1, time.tm_mday,
+			      time.tm_hour, time.tm_min, time.tm_sec);
 }
+#endif
 
-#if defined(CONFIG_ARCH_USES_GETTIMEOFFSET) && IS_ENABLED(CONFIG_RTC_DRV_GENERIC)
+#if IS_ENABLED(CONFIG_RTC_DRV_GENERIC)
 static int rtc_generic_get_time(struct device *dev, struct rtc_time *tm)
 {
 	mach_hwclk(0, tm);
@@ -145,8 +148,8 @@ static int __init rtc_init(void)
 }
 
 module_init(rtc_init);
-
-#endif /* CONFIG_ARCH_USES_GETTIMEOFFSET */
+#endif /* CONFIG_RTC_DRV_GENERIC */
+#endif /* CONFIG M68KCLASSIC */
 
 void __init time_init(void)
 {
diff --git a/arch/m68k/kernel/traps.c b/arch/m68k/kernel/traps.c
index c1cc4e9..b2fd000 100644
--- a/arch/m68k/kernel/traps.c
+++ b/arch/m68k/kernel/traps.c
@@ -1007,9 +1007,9 @@ void bad_super_trap (struct frame *fp)
 
 asmlinkage void trap_c(struct frame *fp)
 {
-	int sig;
+	int sig, si_code;
+	void __user *addr;
 	int vector = (fp->ptregs.vector >> 2) & 0xff;
-	siginfo_t info;
 
 	if (fp->ptregs.sr & PS_S) {
 		if (vector == VEC_TRACE) {
@@ -1029,21 +1029,21 @@ asmlinkage void trap_c(struct frame *fp)
 	/* send the appropriate signal to the user program */
 	switch (vector) {
 	    case VEC_ADDRERR:
-		info.si_code = BUS_ADRALN;
+		si_code = BUS_ADRALN;
 		sig = SIGBUS;
 		break;
 	    case VEC_ILLEGAL:
 	    case VEC_LINE10:
 	    case VEC_LINE11:
-		info.si_code = ILL_ILLOPC;
+		si_code = ILL_ILLOPC;
 		sig = SIGILL;
 		break;
 	    case VEC_PRIV:
-		info.si_code = ILL_PRVOPC;
+		si_code = ILL_PRVOPC;
 		sig = SIGILL;
 		break;
 	    case VEC_COPROC:
-		info.si_code = ILL_COPROC;
+		si_code = ILL_COPROC;
 		sig = SIGILL;
 		break;
 	    case VEC_TRAP1:
@@ -1060,76 +1060,74 @@ asmlinkage void trap_c(struct frame *fp)
 	    case VEC_TRAP12:
 	    case VEC_TRAP13:
 	    case VEC_TRAP14:
-		info.si_code = ILL_ILLTRP;
+		si_code = ILL_ILLTRP;
 		sig = SIGILL;
 		break;
 	    case VEC_FPBRUC:
 	    case VEC_FPOE:
 	    case VEC_FPNAN:
-		info.si_code = FPE_FLTINV;
+		si_code = FPE_FLTINV;
 		sig = SIGFPE;
 		break;
 	    case VEC_FPIR:
-		info.si_code = FPE_FLTRES;
+		si_code = FPE_FLTRES;
 		sig = SIGFPE;
 		break;
 	    case VEC_FPDIVZ:
-		info.si_code = FPE_FLTDIV;
+		si_code = FPE_FLTDIV;
 		sig = SIGFPE;
 		break;
 	    case VEC_FPUNDER:
-		info.si_code = FPE_FLTUND;
+		si_code = FPE_FLTUND;
 		sig = SIGFPE;
 		break;
 	    case VEC_FPOVER:
-		info.si_code = FPE_FLTOVF;
+		si_code = FPE_FLTOVF;
 		sig = SIGFPE;
 		break;
 	    case VEC_ZERODIV:
-		info.si_code = FPE_INTDIV;
+		si_code = FPE_INTDIV;
 		sig = SIGFPE;
 		break;
 	    case VEC_CHK:
 	    case VEC_TRAP:
-		info.si_code = FPE_INTOVF;
+		si_code = FPE_INTOVF;
 		sig = SIGFPE;
 		break;
 	    case VEC_TRACE:		/* ptrace single step */
-		info.si_code = TRAP_TRACE;
+		si_code = TRAP_TRACE;
 		sig = SIGTRAP;
 		break;
 	    case VEC_TRAP15:		/* breakpoint */
-		info.si_code = TRAP_BRKPT;
+		si_code = TRAP_BRKPT;
 		sig = SIGTRAP;
 		break;
 	    default:
-		info.si_code = ILL_ILLOPC;
+		si_code = ILL_ILLOPC;
 		sig = SIGILL;
 		break;
 	}
-	info.si_signo = sig;
-	info.si_errno = 0;
 	switch (fp->ptregs.format) {
 	    default:
-		info.si_addr = (void *) fp->ptregs.pc;
+		addr = (void __user *) fp->ptregs.pc;
 		break;
 	    case 2:
-		info.si_addr = (void *) fp->un.fmt2.iaddr;
+		addr = (void __user *) fp->un.fmt2.iaddr;
 		break;
 	    case 7:
-		info.si_addr = (void *) fp->un.fmt7.effaddr;
+		addr = (void __user *) fp->un.fmt7.effaddr;
 		break;
 	    case 9:
-		info.si_addr = (void *) fp->un.fmt9.iaddr;
+		addr = (void __user *) fp->un.fmt9.iaddr;
 		break;
 	    case 10:
-		info.si_addr = (void *) fp->un.fmta.daddr;
+		addr = (void __user *) fp->un.fmta.daddr;
 		break;
 	    case 11:
-		info.si_addr = (void *) fp->un.fmtb.daddr;
+		addr = (void __user*) fp->un.fmtb.daddr;
 		break;
 	}
-	force_sig_info (sig, &info, current);
+	force_sig_fault(sig, si_code, addr, current);
 }
 
 void die_if_kernel (char *str, struct pt_regs *fp, int nr)
@@ -1161,12 +1159,6 @@ asmlinkage void fpsp040_die(void)
 #ifdef CONFIG_M68KFPU_EMU
 asmlinkage void fpemu_signal(int signal, int code, void *addr)
 {
-	siginfo_t info;
-
-	info.si_signo = signal;
-	info.si_errno = 0;
-	info.si_code = code;
-	info.si_addr = addr;
-	force_sig_info(signal, &info, current);
+	force_sig_fault(signal, code, addr, current);
 }
 #endif
diff --git a/arch/m68k/mac/config.c b/arch/m68k/mac/config.c
index 0c3275a..e522307 100644
--- a/arch/m68k/mac/config.c
+++ b/arch/m68k/mac/config.c
@@ -1005,7 +1005,7 @@ int __init mac_platform_init(void)
 		struct resource swim_rsrc = {
 			.flags = IORESOURCE_MEM,
 			.start = (resource_size_t)swim_base,
-			.end   = (resource_size_t)swim_base + 0x2000,
+			.end   = (resource_size_t)swim_base + 0x1FFF,
 		};
 
 		platform_device_register_simple("swim", -1, &swim_rsrc, 1);
diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c
index 03253c4..f2ff377 100644
--- a/arch/m68k/mm/fault.c
+++ b/arch/m68k/mm/fault.c
@@ -21,35 +21,32 @@ extern void die_if_kernel(char *, struct pt_regs *, long);
 
 int send_fault_sig(struct pt_regs *regs)
 {
-	siginfo_t siginfo;
+	int signo, si_code;
+	void __user *addr;
 
-	clear_siginfo(&siginfo);
-	siginfo.si_signo = current->thread.signo;
-	siginfo.si_code = current->thread.code;
-	siginfo.si_addr = (void *)current->thread.faddr;
-	pr_debug("send_fault_sig: %p,%d,%d\n", siginfo.si_addr,
-		 siginfo.si_signo, siginfo.si_code);
+	signo = current->thread.signo;
+	si_code = current->thread.code;
+	addr = (void __user *)current->thread.faddr;
+	pr_debug("send_fault_sig: %p,%d,%d\n", addr, signo, si_code);
 
 	if (user_mode(regs)) {
-		force_sig_info(siginfo.si_signo,
-			       &siginfo, current);
+		force_sig_fault(signo, si_code, addr, current);
 	} else {
 		if (fixup_exception(regs))
 			return -1;
 
-		//if (siginfo.si_signo == SIGBUS)
-		//	force_sig_info(siginfo.si_signo,
-		//		       &siginfo, current);
+		//if (signo == SIGBUS)
+		//	force_sig_fault(si_signo, si_code, addr, current);
 
 		/*
 		 * Oops. The kernel tried to access some bad page. We'll have to
 		 * terminate things with extreme prejudice.
 		 */
-		if ((unsigned long)siginfo.si_addr < PAGE_SIZE)
+		if ((unsigned long)addr < PAGE_SIZE)
 			pr_alert("Unable to handle kernel NULL pointer dereference");
 		else
 			pr_alert("Unable to handle kernel access");
-		pr_cont(" at virtual address %p\n", siginfo.si_addr);
+		pr_cont(" at virtual address %p\n", addr);
 		die_if_kernel("Oops", regs, 0 /*error_code*/);
 		do_exit(SIGKILL);
 	}
diff --git a/arch/m68k/mm/kmap.c b/arch/m68k/mm/kmap.c
index c2a3832..3b420f6 100644
--- a/arch/m68k/mm/kmap.c
+++ b/arch/m68k/mm/kmap.c
@@ -89,7 +89,8 @@ static inline void free_io_area(void *addr)
 	for (p = &iolist ; (tmp = *p) ; p = &tmp->next) {
 		if (tmp->addr == addr) {
 			*p = tmp->next;
-			__iounmap(tmp->addr, tmp->size);
+			/* remove gap added in get_io_area() */
+			__iounmap(tmp->addr, tmp->size - IO_SIZE);
 			kfree(tmp);
 			return;
 		}
diff --git a/arch/m68k/mvme147/config.c b/arch/m68k/mvme147/config.c
index 8778612..f8a710f 100644
--- a/arch/m68k/mvme147/config.c
+++ b/arch/m68k/mvme147/config.c
@@ -153,12 +153,14 @@ int mvme147_hwclk(int op, struct rtc_time *t)
 	if (!op) {
 		m147_rtc->ctrl = RTC_READ;
 		t->tm_year = bcd2int (m147_rtc->bcd_year);
-		t->tm_mon  = bcd2int (m147_rtc->bcd_mth);
+		t->tm_mon  = bcd2int(m147_rtc->bcd_mth) - 1;
 		t->tm_mday = bcd2int (m147_rtc->bcd_dom);
 		t->tm_hour = bcd2int (m147_rtc->bcd_hr);
 		t->tm_min  = bcd2int (m147_rtc->bcd_min);
 		t->tm_sec  = bcd2int (m147_rtc->bcd_sec);
 		m147_rtc->ctrl = 0;
+		if (t->tm_year < 70)
+			t->tm_year += 100;
 	}
 	return 0;
 }
diff --git a/arch/m68k/mvme16x/config.c b/arch/m68k/mvme16x/config.c
index 6fa06d4..4ffd9ef 100644
--- a/arch/m68k/mvme16x/config.c
+++ b/arch/m68k/mvme16x/config.c
@@ -400,12 +400,14 @@ int mvme16x_hwclk(int op, struct rtc_time *t)
 	if (!op) {
 		rtc->ctrl = RTC_READ;
 		t->tm_year = bcd2int (rtc->bcd_year);
-		t->tm_mon  = bcd2int (rtc->bcd_mth);
+		t->tm_mon  = bcd2int(rtc->bcd_mth) - 1;
 		t->tm_mday = bcd2int (rtc->bcd_dom);
 		t->tm_hour = bcd2int (rtc->bcd_hr);
 		t->tm_min  = bcd2int (rtc->bcd_min);
 		t->tm_sec  = bcd2int (rtc->bcd_sec);
 		rtc->ctrl = 0;
+		if (t->tm_year < 70)
+			t->tm_year += 100;
 	}
 	return 0;
 }
diff --git a/arch/m68k/sun3/intersil.c b/arch/m68k/sun3/intersil.c
index 2cd0bcb..d911070 100644
--- a/arch/m68k/sun3/intersil.c
+++ b/arch/m68k/sun3/intersil.c
@@ -48,9 +48,9 @@ int sun3_hwclk(int set, struct rtc_time *t)
 		todintersil->hour = t->tm_hour;
 		todintersil->minute = t->tm_min;
 		todintersil->second = t->tm_sec;
-		todintersil->month = t->tm_mon;
+		todintersil->month = t->tm_mon + 1;
 		todintersil->day = t->tm_mday;
-		todintersil->year = t->tm_year - 68;
+		todintersil->year = (t->tm_year - 68) % 100;
 		todintersil->weekday = t->tm_wday;
 	} else {
 		/* read clock */
@@ -58,10 +58,12 @@ int sun3_hwclk(int set, struct rtc_time *t)
 		t->tm_hour = todintersil->hour;
 		t->tm_min = todintersil->minute;
 		t->tm_sec = todintersil->second;
-		t->tm_mon = todintersil->month;
+		t->tm_mon = todintersil->month - 1;
 		t->tm_mday = todintersil->day;
 		t->tm_year = todintersil->year + 68;
 		t->tm_wday = todintersil->weekday;
+		if (t->tm_year < 70)
+			t->tm_year += 100;
 	}
 
 	intersil_clock->cmd_reg = START_VAL;
diff --git a/arch/m68k/sun3x/time.c b/arch/m68k/sun3x/time.c
index 7a2c53d..047e2bc 100644
--- a/arch/m68k/sun3x/time.c
+++ b/arch/m68k/sun3x/time.c
@@ -52,8 +52,8 @@ int sun3x_hwclk(int set, struct rtc_time *t)
 		h->hour = bin2bcd(t->tm_hour);
 		h->wday = bin2bcd(t->tm_wday);
 		h->mday = bin2bcd(t->tm_mday);
-		h->month = bin2bcd(t->tm_mon);
-		h->year = bin2bcd(t->tm_year);
+		h->month = bin2bcd(t->tm_mon + 1);
+		h->year = bin2bcd(t->tm_year % 100);
 		h->csr &= ~C_WRITE;
 	} else {
 		h->csr |= C_READ;
@@ -62,9 +62,11 @@ int sun3x_hwclk(int set, struct rtc_time *t)
 		t->tm_hour = bcd2bin(h->hour);
 		t->tm_wday = bcd2bin(h->wday);
 		t->tm_mday = bcd2bin(h->mday);
-		t->tm_mon = bcd2bin(h->month);
+		t->tm_mon = bcd2bin(h->month) - 1;
 		t->tm_year = bcd2bin(h->year);
 		h->csr &= ~C_READ;
+		if (t->tm_year < 70)
+			t->tm_year += 100;
 	}
 
 	local_irq_restore(flags);
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
index 3817a3e..d147821 100644
--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -19,7 +19,6 @@ config MICROBLAZE
 	select HAVE_ARCH_HASH
 	select HAVE_ARCH_KGDB
 	select HAVE_DEBUG_KMEMLEAK
-	select HAVE_DMA_API_DEBUG
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_FTRACE_MCOUNT_RECORD
 	select HAVE_FUNCTION_GRAPH_TRACER
diff --git a/arch/microblaze/include/asm/pci.h b/arch/microblaze/include/asm/pci.h
index 5de871e..0033786 100644
--- a/arch/microblaze/include/asm/pci.h
+++ b/arch/microblaze/include/asm/pci.h
@@ -62,12 +62,6 @@ extern int pci_mmap_legacy_page_range(struct pci_bus *bus,
 
 #define HAVE_PCI_LEGACY	1
 
-/* The PCI address space does equal the physical memory
- * address space (no IOMMU).  The IDE and SCSI device layers use
- * this boolean for bounce buffer decisions.
- */
-#define PCI_DMA_BUS_IS_PHYS     (1)
-
 extern void pcibios_claim_one_bus(struct pci_bus *b);
 
 extern void pcibios_finish_adding_to_bus(struct pci_bus *bus);
diff --git a/arch/microblaze/kernel/dma.c b/arch/microblaze/kernel/dma.c
index c91e8ce..3145e7d 100644
--- a/arch/microblaze/kernel/dma.c
+++ b/arch/microblaze/kernel/dma.c
@@ -184,14 +184,3 @@ const struct dma_map_ops dma_nommu_ops = {
 	.sync_sg_for_device	= dma_nommu_sync_sg_for_device,
 };
 EXPORT_SYMBOL(dma_nommu_ops);
-
-/* Number of entries preallocated for DMA-API debugging */
-#define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16)
-
-static int __init dma_init(void)
-{
-	dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
-
-	return 0;
-}
-fs_initcall(dma_init);
diff --git a/arch/microblaze/kernel/exceptions.c b/arch/microblaze/kernel/exceptions.c
index e6f338d..eafff21 100644
--- a/arch/microblaze/kernel/exceptions.c
+++ b/arch/microblaze/kernel/exceptions.c
@@ -60,16 +60,10 @@ asmlinkage void sw_exception(struct pt_regs *regs)
 
 void _exception(int signr, struct pt_regs *regs, int code, unsigned long addr)
 {
-	siginfo_t info;
-
 	if (kernel_mode(regs))
 		die("Exception in kernel mode", regs, signr);
 
-	info.si_signo = signr;
-	info.si_errno = 0;
-	info.si_code = code;
-	info.si_addr = (void __user *) addr;
-	force_sig_info(signr, &info, current);
+	force_sig_fault(signr, code, (void __user *)addr, current);
 }
 
 asmlinkage void full_exception(struct pt_regs *regs, unsigned int type,
diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c
index f91b30f..af60744 100644
--- a/arch/microblaze/mm/fault.c
+++ b/arch/microblaze/mm/fault.c
@@ -88,7 +88,6 @@ void do_page_fault(struct pt_regs *regs, unsigned long address,
 {
 	struct vm_area_struct *vma;
 	struct mm_struct *mm = current->mm;
-	siginfo_t info;
 	int code = SEGV_MAPERR;
 	int is_write = error_code & ESR_S;
 	int fault;
@@ -269,11 +268,6 @@ bad_area_nosemaphore:
 	/* User mode accesses cause a SIGSEGV */
 	if (user_mode(regs)) {
 		_exception(SIGSEGV, regs, code, address);
-/*		info.si_signo = SIGSEGV;
-		info.si_errno = 0;
-		info.si_code = code;
-		info.si_addr = (void *) address;
-		force_sig_info(SIGSEGV, &info, current);*/
 		return;
 	}
 
@@ -295,11 +289,7 @@ out_of_memory:
 do_sigbus:
 	up_read(&mm->mmap_sem);
 	if (user_mode(regs)) {
-		info.si_signo = SIGBUS;
-		info.si_errno = 0;
-		info.si_code = BUS_ADRERR;
-		info.si_addr = (void __user *)address;
-		force_sig_info(SIGBUS, &info, current);
+		force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address, current);
 		return;
 	}
 	bad_page_fault(regs, address, SIGBUS);
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 225c95d..7074b22 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -42,7 +42,6 @@ config MIPS
 	select HAVE_C_RECORDMCOUNT
 	select HAVE_DEBUG_KMEMLEAK
 	select HAVE_DEBUG_STACKOVERFLOW
-	select HAVE_DMA_API_DEBUG
 	select HAVE_DMA_CONTIGUOUS
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_EXIT_THREAD
@@ -132,7 +131,7 @@ config MIPS_GENERIC
 
 config MIPS_ALCHEMY
 	bool "Alchemy processor based machines"
-	select ARCH_PHYS_ADDR_T_64BIT
+	select PHYS_ADDR_T_64BIT
 	select CEVT_R4K
 	select CSRC_R4K
 	select IRQ_MIPS_CPU
@@ -890,7 +889,7 @@ config CAVIUM_OCTEON_SOC
 	bool "Cavium Networks Octeon SoC based boards"
 	select CEVT_R4K
 	select ARCH_HAS_PHYS_TO_DMA
-	select ARCH_PHYS_ADDR_T_64BIT
+	select PHYS_ADDR_T_64BIT
 	select DMA_COHERENT
 	select SYS_SUPPORTS_64BIT_KERNEL
 	select SYS_SUPPORTS_BIG_ENDIAN
@@ -912,6 +911,7 @@ config CAVIUM_OCTEON_SOC
 	select MIPS_NR_CPU_NR_MAP_1024
 	select BUILTIN_DTB
 	select MTD_COMPLEX_MAPPINGS
+	select SWIOTLB
 	select SYS_SUPPORTS_RELOCATABLE
 	help
 	  This option supports all of the Octeon reference boards from Cavium
@@ -936,7 +936,7 @@ config NLM_XLR_BOARD
 	select SWAP_IO_SPACE
 	select SYS_SUPPORTS_32BIT_KERNEL
 	select SYS_SUPPORTS_64BIT_KERNEL
-	select ARCH_PHYS_ADDR_T_64BIT
+	select PHYS_ADDR_T_64BIT
 	select SYS_SUPPORTS_BIG_ENDIAN
 	select SYS_SUPPORTS_HIGHMEM
 	select DMA_COHERENT
@@ -962,7 +962,7 @@ config NLM_XLP_BOARD
 	select HW_HAS_PCI
 	select SYS_SUPPORTS_32BIT_KERNEL
 	select SYS_SUPPORTS_64BIT_KERNEL
-	select ARCH_PHYS_ADDR_T_64BIT
+	select PHYS_ADDR_T_64BIT
 	select GPIOLIB
 	select SYS_SUPPORTS_BIG_ENDIAN
 	select SYS_SUPPORTS_LITTLE_ENDIAN
@@ -1101,9 +1101,6 @@ config GPIO_TXX9
 config FW_CFE
 	bool
 
-config ARCH_DMA_ADDR_T_64BIT
-	def_bool (HIGHMEM && ARCH_PHYS_ADDR_T_64BIT) || 64BIT
-
 config ARCH_SUPPORTS_UPROBES
 	bool
 
@@ -1122,9 +1119,6 @@ config DMA_NONCOHERENT
 	bool
 	select NEED_DMA_MAP_STATE
 
-config NEED_DMA_MAP_STATE
-	bool
-
 config SYS_HAS_EARLY_PRINTK
 	bool
 
@@ -1373,6 +1367,7 @@ config CPU_LOONGSON3
 	select MIPS_PGD_C0_CONTEXT
 	select MIPS_L1_CACHE_SHIFT_6
 	select GPIOLIB
+	select SWIOTLB
 	help
 		The Loongson 3 processor implements the MIPS64R2 instruction
 		set with many extensions.
@@ -1770,7 +1765,7 @@ config CPU_MIPS32_R5_XPA
 	depends on SYS_SUPPORTS_HIGHMEM
 	select XPA
 	select HIGHMEM
-	select ARCH_PHYS_ADDR_T_64BIT
+	select PHYS_ADDR_T_64BIT
 	default n
 	help
 	  Choose this option if you want to enable the Extended Physical
@@ -2402,9 +2397,6 @@ config SB1_PASS_2_1_WORKAROUNDS
 	default y
 
 
-config ARCH_PHYS_ADDR_T_64BIT
-       bool
-
 choice
 	prompt "SmartMIPS or microMIPS ASE support"
 
@@ -2556,7 +2548,7 @@ config ARCH_DISCONTIGMEM_ENABLE
 	  Say Y to support efficient handling of discontiguous physical memory,
 	  for architectures which are either NUMA (Non-Uniform Memory Access)
 	  or have huge holes in the physical address space for other reasons.
-	  See <file:Documentation/vm/numa> for more.
+	  See <file:Documentation/vm/numa.rst> for more.
 
 config ARCH_SPARSEMEM_ENABLE
 	bool
diff --git a/arch/mips/cavium-octeon/Kconfig b/arch/mips/cavium-octeon/Kconfig
index b5eee1a..4984e46 100644
--- a/arch/mips/cavium-octeon/Kconfig
+++ b/arch/mips/cavium-octeon/Kconfig
@@ -67,18 +67,6 @@ config CAVIUM_OCTEON_LOCK_L2_MEMCPY
 	help
 	  Lock the kernel's implementation of memcpy() into L2.
 
-config IOMMU_HELPER
-	bool
-
-config NEED_SG_DMA_LENGTH
-	bool
-
-config SWIOTLB
-	def_bool y
-	select DMA_DIRECT_OPS
-	select IOMMU_HELPER
-	select NEED_SG_DMA_LENGTH
-
 config OCTEON_ILM
 	tristate "Module to measure interrupt latency using Octeon CIU Timer"
 	help
diff --git a/arch/mips/include/asm/pci.h b/arch/mips/include/asm/pci.h
index 2339f42..4360998 100644
--- a/arch/mips/include/asm/pci.h
+++ b/arch/mips/include/asm/pci.h
@@ -121,13 +121,6 @@ extern unsigned long PCIBIOS_MIN_MEM;
 #include <linux/string.h>
 #include <asm/io.h>
 
-/*
- * The PCI address space does equal the physical memory address space.
- * The networking and block device layers use this boolean for bounce
- * buffer decisions.
- */
-#define PCI_DMA_BUS_IS_PHYS     (1)
-
 #ifdef CONFIG_PCI_DOMAINS_GENERIC
 static inline int pci_proc_domain(struct pci_bus *bus)
 {
diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c
index 967e9e4..d67fa74 100644
--- a/arch/mips/kernel/traps.c
+++ b/arch/mips/kernel/traps.c
@@ -699,17 +699,11 @@ static int simulate_sync(struct pt_regs *regs, unsigned int opcode)
 asmlinkage void do_ov(struct pt_regs *regs)
 {
 	enum ctx_state prev_state;
-	siginfo_t info;
-
-	clear_siginfo(&info);
-	info.si_signo = SIGFPE;
-	info.si_code = FPE_INTOVF;
-	info.si_addr = (void __user *)regs->cp0_epc;
 
 	prev_state = exception_enter();
 	die_if_kernel("Integer overflow", regs);
 
-	force_sig_info(SIGFPE, &info, current);
+	force_sig_fault(SIGFPE, FPE_INTOVF, (void __user *)regs->cp0_epc, current);
 	exception_exit(prev_state);
 }
 
@@ -722,32 +716,27 @@ asmlinkage void do_ov(struct pt_regs *regs)
 void force_fcr31_sig(unsigned long fcr31, void __user *fault_addr,
 		     struct task_struct *tsk)
 {
-	struct siginfo si;
-
-	clear_siginfo(&si);
-	si.si_addr = fault_addr;
-	si.si_signo = SIGFPE;
+	int si_code = FPE_FLTUNK;
 
 	if (fcr31 & FPU_CSR_INV_X)
-		si.si_code = FPE_FLTINV;
+		si_code = FPE_FLTINV;
 	else if (fcr31 & FPU_CSR_DIV_X)
-		si.si_code = FPE_FLTDIV;
+		si_code = FPE_FLTDIV;
 	else if (fcr31 & FPU_CSR_OVF_X)
-		si.si_code = FPE_FLTOVF;
+		si_code = FPE_FLTOVF;
 	else if (fcr31 & FPU_CSR_UDF_X)
-		si.si_code = FPE_FLTUND;
+		si_code = FPE_FLTUND;
 	else if (fcr31 & FPU_CSR_INE_X)
-		si.si_code = FPE_FLTRES;
+		si_code = FPE_FLTRES;
 
-	force_sig_info(SIGFPE, &si, tsk);
+	force_sig_fault(SIGFPE, si_code, fault_addr, tsk);
 }
 
 int process_fpemu_return(int sig, void __user *fault_addr, unsigned long fcr31)
 {
-	struct siginfo si;
+	int si_code;
 	struct vm_area_struct *vma;
 
-	clear_siginfo(&si);
 	switch (sig) {
 	case 0:
 		return 0;
@@ -757,23 +746,18 @@ int process_fpemu_return(int sig, void __user *fault_addr, unsigned long fcr31)
 		return 1;
 
 	case SIGBUS:
-		si.si_addr = fault_addr;
-		si.si_signo = sig;
-		si.si_code = BUS_ADRERR;
-		force_sig_info(sig, &si, current);
+		force_sig_fault(SIGBUS, BUS_ADRERR, fault_addr, current);
 		return 1;
 
 	case SIGSEGV:
-		si.si_addr = fault_addr;
-		si.si_signo = sig;
 		down_read(&current->mm->mmap_sem);
 		vma = find_vma(current->mm, (unsigned long)fault_addr);
 		if (vma && (vma->vm_start <= (unsigned long)fault_addr))
-			si.si_code = SEGV_ACCERR;
+			si_code = SEGV_ACCERR;
 		else
-			si.si_code = SEGV_MAPERR;
+			si_code = SEGV_MAPERR;
 		up_read(&current->mm->mmap_sem);
-		force_sig_info(sig, &si, current);
+		force_sig_fault(SIGSEGV, si_code, fault_addr, current);
 		return 1;
 
 	default:
@@ -896,10 +880,8 @@ out:
 void do_trap_or_bp(struct pt_regs *regs, unsigned int code, int si_code,
 	const char *str)
 {
-	siginfo_t info;
 	char b[40];
 
-	clear_siginfo(&info);
 #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
 	if (kgdb_ll_trap(DIE_TRAP, str, regs, code, current->thread.trap_nr,
 			 SIGTRAP) == NOTIFY_STOP)
@@ -921,13 +903,9 @@ void do_trap_or_bp(struct pt_regs *regs, unsigned int code, int si_code,
 	case BRK_DIVZERO:
 		scnprintf(b, sizeof(b), "%s instruction in kernel code", str);
 		die_if_kernel(b, regs);
-		if (code == BRK_DIVZERO)
-			info.si_code = FPE_INTDIV;
-		else
-			info.si_code = FPE_INTOVF;
-		info.si_signo = SIGFPE;
-		info.si_addr = (void __user *) regs->cp0_epc;
-		force_sig_info(SIGFPE, &info, current);
+		force_sig_fault(SIGFPE,
+				code == BRK_DIVZERO ? FPE_INTDIV : FPE_INTOVF,
+				(void __user *) regs->cp0_epc, current);
 		break;
 	case BRK_BUG:
 		die_if_kernel("Kernel bug detected", regs);
@@ -952,9 +930,7 @@ void do_trap_or_bp(struct pt_regs *regs, unsigned int code, int si_code,
 		scnprintf(b, sizeof(b), "%s instruction in kernel code", str);
 		die_if_kernel(b, regs);
 		if (si_code) {
-			info.si_signo = SIGTRAP;
-			info.si_code = si_code;
-			force_sig_info(SIGTRAP, &info, current);
+			force_sig_fault(SIGTRAP, si_code, NULL,	current);
 		} else {
 			force_sig(SIGTRAP, current);
 		}
@@ -1506,13 +1482,8 @@ asmlinkage void do_mdmx(struct pt_regs *regs)
  */
 asmlinkage void do_watch(struct pt_regs *regs)
 {
-	siginfo_t info;
 	enum ctx_state prev_state;
 
-	clear_siginfo(&info);
-	info.si_signo = SIGTRAP;
-	info.si_code = TRAP_HWBKPT;
-
 	prev_state = exception_enter();
 	/*
 	 * Clear WP (bit 22) bit of cause register so we don't loop
@@ -1528,7 +1499,7 @@ asmlinkage void do_watch(struct pt_regs *regs)
 	if (test_tsk_thread_flag(current, TIF_LOAD_WATCH)) {
 		mips_read_watch_registers();
 		local_irq_enable();
-		force_sig_info(SIGTRAP, &info, current);
+		force_sig_fault(SIGTRAP, TRAP_HWBKPT, NULL, current);
 	} else {
 		mips_clear_watch_registers();
 		local_irq_enable();
diff --git a/arch/mips/loongson64/Kconfig b/arch/mips/loongson64/Kconfig
index 72af0c1..c79e6a5 100644
--- a/arch/mips/loongson64/Kconfig
+++ b/arch/mips/loongson64/Kconfig
@@ -130,21 +130,6 @@ config LOONGSON_UART_BASE
 	default y
 	depends on EARLY_PRINTK || SERIAL_8250
 
-config IOMMU_HELPER
-	bool
-
-config NEED_SG_DMA_LENGTH
-	bool
-
-config SWIOTLB
-	bool "Soft IOMMU Support for All-Memory DMA"
-	default y
-	depends on CPU_LOONGSON3
-	select DMA_DIRECT_OPS
-	select IOMMU_HELPER
-	select NEED_SG_DMA_LENGTH
-	select NEED_DMA_MAP_STATE
-
 config PHYS48_TO_HT40
 	bool
 	default y if CPU_LOONGSON3
diff --git a/arch/mips/mm/dma-default.c b/arch/mips/mm/dma-default.c
index dcafa43..f9fef00 100644
--- a/arch/mips/mm/dma-default.c
+++ b/arch/mips/mm/dma-default.c
@@ -402,13 +402,3 @@ static const struct dma_map_ops mips_default_dma_map_ops = {
 
 const struct dma_map_ops *mips_dma_map_ops = &mips_default_dma_map_ops;
 EXPORT_SYMBOL(mips_dma_map_ops);
-
-#define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16)
-
-static int __init mips_dma_init(void)
-{
-	dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
-
-	return 0;
-}
-fs_initcall(mips_dma_init);
diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c
index 4f8f5bf..5f71f2b 100644
--- a/arch/mips/mm/fault.c
+++ b/arch/mips/mm/fault.c
@@ -42,7 +42,7 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, unsigned long write,
 	struct task_struct *tsk = current;
 	struct mm_struct *mm = tsk->mm;
 	const int field = sizeof(unsigned long) * 2;
-	siginfo_t info;
+	int si_code;
 	int fault;
 	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
@@ -63,7 +63,7 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, unsigned long write,
 		return;
 #endif
 
-	info.si_code = SEGV_MAPERR;
+	si_code = SEGV_MAPERR;
 
 	/*
 	 * We fault-in kernel-space virtual memory on-demand. The
@@ -112,7 +112,7 @@ retry:
  * we can handle it..
  */
 good_area:
-	info.si_code = SEGV_ACCERR;
+	si_code = SEGV_ACCERR;
 
 	if (write) {
 		if (!(vma->vm_flags & VM_WRITE))
@@ -223,11 +223,7 @@ bad_area_nosemaphore:
 			pr_cont("\n");
 		}
 		current->thread.trap_nr = (regs->cp0_cause >> 2) & 0x1f;
-		info.si_signo = SIGSEGV;
-		info.si_errno = 0;
-		/* info.si_code has been set above */
-		info.si_addr = (void __user *) address;
-		force_sig_info(SIGSEGV, &info, tsk);
+		force_sig_fault(SIGSEGV, si_code, (void __user *)address, tsk);
 		return;
 	}
 
@@ -283,11 +279,7 @@ do_sigbus:
 #endif
 	current->thread.trap_nr = (regs->cp0_cause >> 2) & 0x1f;
 	tsk->thread.cp0_badvaddr = address;
-	info.si_signo = SIGBUS;
-	info.si_errno = 0;
-	info.si_code = BUS_ADRERR;
-	info.si_addr = (void __user *) address;
-	force_sig_info(SIGBUS, &info, tsk);
+	force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address, tsk);
 
 	return;
 #ifndef CONFIG_64BIT
diff --git a/arch/mips/netlogic/Kconfig b/arch/mips/netlogic/Kconfig
index 7fcfc7f..412351c 100644
--- a/arch/mips/netlogic/Kconfig
+++ b/arch/mips/netlogic/Kconfig
@@ -83,10 +83,4 @@ endif
 config NLM_COMMON
 	bool
 
-config IOMMU_HELPER
-	bool
-
-config NEED_SG_DMA_LENGTH
-	bool
-
 endif
diff --git a/arch/mips/pci/ops-pmcmsp.c b/arch/mips/pci/ops-pmcmsp.c
index dd2d9f7..7649372 100644
--- a/arch/mips/pci/ops-pmcmsp.c
+++ b/arch/mips/pci/ops-pmcmsp.c
@@ -83,18 +83,6 @@ static int show_msp_pci_counts(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int msp_pci_rd_cnt_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, show_msp_pci_counts, NULL);
-}
-
-static const struct file_operations msp_pci_rd_cnt_fops = {
-	.open		= msp_pci_rd_cnt_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 /*****************************************************************************
  *
  *  FUNCTION: gen_pci_cfg_wr_show
@@ -160,18 +148,6 @@ static int gen_pci_cfg_wr_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int gen_pci_cfg_wr_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, gen_pci_cfg_wr_show, NULL);
-}
-
-static const struct file_operations gen_pci_cfg_wr_fops = {
-	.open		= gen_pci_cfg_wr_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 /*****************************************************************************
  *
  *  FUNCTION: pci_proc_init
@@ -188,8 +164,8 @@ static const struct file_operations gen_pci_cfg_wr_fops = {
  ****************************************************************************/
 static void pci_proc_init(void)
 {
-	proc_create("pmc_msp_pci_rd_cnt", 0, NULL, &msp_pci_rd_cnt_fops);
-	proc_create("pmc_msp_pci_cfg_wr", 0, NULL, &gen_pci_cfg_wr_fops);
+	proc_create_single("pmc_msp_pci_rd_cnt", 0, NULL, show_msp_pci_counts);
+	proc_create_single("pmc_msp_pci_cfg_wr", 0, NULL, gen_pci_cfg_wr_show);
 }
 #endif /* CONFIG_PROC_FS && PCI_COUNTERS */
 
diff --git a/arch/mips/sibyte/common/bus_watcher.c b/arch/mips/sibyte/common/bus_watcher.c
index a4e55999..4bb85de 100644
--- a/arch/mips/sibyte/common/bus_watcher.c
+++ b/arch/mips/sibyte/common/bus_watcher.c
@@ -142,24 +142,12 @@ static int bw_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int bw_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, bw_proc_show, PDE_DATA(inode));
-}
-
-static const struct file_operations bw_proc_fops = {
-	.open		= bw_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static void create_proc_decoder(struct bw_stats_struct *stats)
 {
 	struct proc_dir_entry *ent;
 
-	ent = proc_create_data("bus_watcher", S_IWUSR | S_IRUGO, NULL,
-			       &bw_proc_fops, stats);
+	ent = proc_create_single_data("bus_watcher", S_IWUSR | S_IRUGO, NULL,
+			bw_proc_show, stats);
 	if (!ent) {
 		printk(KERN_INFO "Unable to initialize bus_watcher /proc entry\n");
 		return;
diff --git a/arch/nds32/Kconfig b/arch/nds32/Kconfig
index b7404f2..6aed9742 100644
--- a/arch/nds32/Kconfig
+++ b/arch/nds32/Kconfig
@@ -5,10 +5,13 @@
 
 config NDS32
         def_bool y
+	select ARCH_HAS_SYNC_DMA_FOR_CPU
+	select ARCH_HAS_SYNC_DMA_FOR_DEVICE
 	select ARCH_WANT_FRAME_POINTERS if FTRACE
 	select CLKSRC_MMIO
 	select CLONE_BACKWARDS
 	select COMMON_CLK
+	select DMA_NONCOHERENT_OPS
 	select GENERIC_ASHLDI3
 	select GENERIC_ASHRDI3
 	select GENERIC_LSHRDI3
diff --git a/arch/nds32/include/asm/Kbuild b/arch/nds32/include/asm/Kbuild
index 142e612..6f5cc29 100644
--- a/arch/nds32/include/asm/Kbuild
+++ b/arch/nds32/include/asm/Kbuild
@@ -13,6 +13,7 @@ generic-y += cputime.h
 generic-y += device.h
 generic-y += div64.h
 generic-y += dma.h
+generic-y += dma-mapping.h
 generic-y += emergency-restart.h
 generic-y += errno.h
 generic-y += exec.h
diff --git a/arch/nds32/include/asm/dma-mapping.h b/arch/nds32/include/asm/dma-mapping.h
deleted file mode 100644
index 2dd47d24..0000000
--- a/arch/nds32/include/asm/dma-mapping.h
+++ /dev/null
@@ -1,14 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-// Copyright (C) 2005-2017 Andes Technology Corporation
-
-#ifndef ASMNDS32_DMA_MAPPING_H
-#define ASMNDS32_DMA_MAPPING_H
-
-extern struct dma_map_ops nds32_dma_ops;
-
-static inline struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
-{
-	return &nds32_dma_ops;
-}
-
-#endif
diff --git a/arch/nds32/kernel/dma.c b/arch/nds32/kernel/dma.c
index d291800..d0dbd4f 100644
--- a/arch/nds32/kernel/dma.c
+++ b/arch/nds32/kernel/dma.c
@@ -3,17 +3,14 @@
 
 #include <linux/types.h>
 #include <linux/mm.h>
-#include <linux/export.h>
 #include <linux/string.h>
-#include <linux/scatterlist.h>
-#include <linux/dma-mapping.h>
+#include <linux/dma-noncoherent.h>
 #include <linux/io.h>
 #include <linux/cache.h>
 #include <linux/highmem.h>
 #include <linux/slab.h>
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
-#include <asm/dma-mapping.h>
 #include <asm/proc-fns.h>
 
 /*
@@ -22,11 +19,6 @@
 static pte_t *consistent_pte;
 static DEFINE_RAW_SPINLOCK(consistent_lock);
 
-enum master_type {
-	FOR_CPU = 0,
-	FOR_DEVICE = 1,
-};
-
 /*
  * VM region handling support.
  *
@@ -124,10 +116,8 @@ out:
 	return c;
 }
 
-/* FIXME: attrs is not used. */
-static void *nds32_dma_alloc_coherent(struct device *dev, size_t size,
-				      dma_addr_t * handle, gfp_t gfp,
-				      unsigned long attrs)
+void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
+		gfp_t gfp, unsigned long attrs)
 {
 	struct page *page;
 	struct arch_vm_region *c;
@@ -232,8 +222,8 @@ no_page:
 	return NULL;
 }
 
-static void nds32_dma_free(struct device *dev, size_t size, void *cpu_addr,
-			   dma_addr_t handle, unsigned long attrs)
+void arch_dma_free(struct device *dev, size_t size, void *cpu_addr,
+		dma_addr_t handle, unsigned long attrs)
 {
 	struct arch_vm_region *c;
 	unsigned long flags, addr;
@@ -333,145 +323,69 @@ static int __init consistent_init(void)
 }
 
 core_initcall(consistent_init);
-static void consistent_sync(void *vaddr, size_t size, int direction, int master_type);
-static dma_addr_t nds32_dma_map_page(struct device *dev, struct page *page,
-				     unsigned long offset, size_t size,
-				     enum dma_data_direction dir,
-				     unsigned long attrs)
-{
-	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
-		consistent_sync((void *)(page_address(page) + offset), size, dir, FOR_DEVICE);
-	return page_to_phys(page) + offset;
-}
-
-static void nds32_dma_unmap_page(struct device *dev, dma_addr_t handle,
-				 size_t size, enum dma_data_direction dir,
-				 unsigned long attrs)
-{
-	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
-		consistent_sync(phys_to_virt(handle), size, dir, FOR_CPU);
-}
-
-/*
- * Make an area consistent for devices.
- */
-static void consistent_sync(void *vaddr, size_t size, int direction, int master_type)
-{
-	unsigned long start = (unsigned long)vaddr;
-	unsigned long end = start + size;
-
-	if (master_type == FOR_CPU) {
-		switch (direction) {
-		case DMA_TO_DEVICE:
-			break;
-		case DMA_FROM_DEVICE:
-		case DMA_BIDIRECTIONAL:
-			cpu_dma_inval_range(start, end);
-			break;
-		default:
-			BUG();
-		}
-	} else {
-		/* FOR_DEVICE */
-		switch (direction) {
-		case DMA_FROM_DEVICE:
-			break;
-		case DMA_TO_DEVICE:
-		case DMA_BIDIRECTIONAL:
-			cpu_dma_wb_range(start, end);
-			break;
-		default:
-			BUG();
-		}
-	}
-}
 
-static int nds32_dma_map_sg(struct device *dev, struct scatterlist *sg,
-			    int nents, enum dma_data_direction dir,
-			    unsigned long attrs)
+static inline void cache_op(phys_addr_t paddr, size_t size,
+		void (*fn)(unsigned long start, unsigned long end))
 {
-	int i;
+	struct page *page = pfn_to_page(paddr >> PAGE_SHIFT);
+	unsigned offset = paddr & ~PAGE_MASK;
+	size_t left = size;
+	unsigned long start;
 
-	for (i = 0; i < nents; i++, sg++) {
-		void *virt;
-		unsigned long pfn;
-		struct page *page = sg_page(sg);
+	do {
+		size_t len = left;
 
-		sg->dma_address = sg_phys(sg);
-		pfn = page_to_pfn(page) + sg->offset / PAGE_SIZE;
-		page = pfn_to_page(pfn);
 		if (PageHighMem(page)) {
-			virt = kmap_atomic(page);
-			consistent_sync(virt, sg->length, dir, FOR_CPU);
-			kunmap_atomic(virt);
+			void *addr;
+
+			if (offset + len > PAGE_SIZE) {
+				if (offset >= PAGE_SIZE) {
+					page += offset >> PAGE_SHIFT;
+					offset &= ~PAGE_MASK;
+				}
+				len = PAGE_SIZE - offset;
+			}
+
+			addr = kmap_atomic(page);
+			start = (unsigned long)(addr + offset);
+			fn(start, start + len);
+			kunmap_atomic(addr);
 		} else {
-			if (sg->offset > PAGE_SIZE)
-				panic("sg->offset:%08x > PAGE_SIZE\n",
-				      sg->offset);
-			virt = page_address(page) + sg->offset;
-			consistent_sync(virt, sg->length, dir, FOR_CPU);
+			start = (unsigned long)phys_to_virt(paddr);
+			fn(start, start + size);
 		}
-	}
-	return nents;
+		offset = 0;
+		page++;
+		left -= len;
+	} while (left);
 }
 
-static void nds32_dma_unmap_sg(struct device *dev, struct scatterlist *sg,
-			       int nhwentries, enum dma_data_direction dir,
-			       unsigned long attrs)
+void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr,
+		size_t size, enum dma_data_direction dir)
 {
-}
-
-static void
-nds32_dma_sync_single_for_cpu(struct device *dev, dma_addr_t handle,
-			      size_t size, enum dma_data_direction dir)
-{
-	consistent_sync((void *)phys_to_virt(handle), size, dir, FOR_CPU);
-}
-
-static void
-nds32_dma_sync_single_for_device(struct device *dev, dma_addr_t handle,
-				 size_t size, enum dma_data_direction dir)
-{
-	consistent_sync((void *)phys_to_virt(handle), size, dir, FOR_DEVICE);
-}
-
-static void
-nds32_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nents,
-			  enum dma_data_direction dir)
-{
-	int i;
-
-	for (i = 0; i < nents; i++, sg++) {
-		char *virt =
-		    page_address((struct page *)sg->page_link) + sg->offset;
-		consistent_sync(virt, sg->length, dir, FOR_CPU);
+	switch (dir) {
+	case DMA_FROM_DEVICE:
+		break;
+	case DMA_TO_DEVICE:
+	case DMA_BIDIRECTIONAL:
+		cache_op(paddr, size, cpu_dma_wb_range);
+		break;
+	default:
+		BUG();
 	}
 }
 
-static void
-nds32_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
-			     int nents, enum dma_data_direction dir)
+void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr,
+		size_t size, enum dma_data_direction dir)
 {
-	int i;
-
-	for (i = 0; i < nents; i++, sg++) {
-		char *virt =
-		    page_address((struct page *)sg->page_link) + sg->offset;
-		consistent_sync(virt, sg->length, dir, FOR_DEVICE);
+	switch (dir) {
+	case DMA_TO_DEVICE:
+		break;
+	case DMA_FROM_DEVICE:
+	case DMA_BIDIRECTIONAL:
+		cache_op(paddr, size, cpu_dma_inval_range);
+		break;
+	default:
+		BUG();
 	}
 }
-
-struct dma_map_ops nds32_dma_ops = {
-	.alloc = nds32_dma_alloc_coherent,
-	.free = nds32_dma_free,
-	.map_page = nds32_dma_map_page,
-	.unmap_page = nds32_dma_unmap_page,
-	.map_sg = nds32_dma_map_sg,
-	.unmap_sg = nds32_dma_unmap_sg,
-	.sync_single_for_device = nds32_dma_sync_single_for_device,
-	.sync_single_for_cpu = nds32_dma_sync_single_for_cpu,
-	.sync_sg_for_cpu = nds32_dma_sync_sg_for_cpu,
-	.sync_sg_for_device = nds32_dma_sync_sg_for_device,
-};
-
-EXPORT_SYMBOL(nds32_dma_ops);
diff --git a/arch/nds32/kernel/traps.c b/arch/nds32/kernel/traps.c
index 6e34eb9..a6205fd 100644
--- a/arch/nds32/kernel/traps.c
+++ b/arch/nds32/kernel/traps.c
@@ -222,19 +222,13 @@ void die_if_kernel(const char *str, struct pt_regs *regs, int err)
 
 int bad_syscall(int n, struct pt_regs *regs)
 {
-	siginfo_t info;
-
 	if (current->personality != PER_LINUX) {
 		send_sig(SIGSEGV, current, 1);
 		return regs->uregs[0];
 	}
 
-	info.si_signo = SIGILL;
-	info.si_errno = 0;
-	info.si_code = ILL_ILLTRP;
-	info.si_addr = (void __user *)instruction_pointer(regs) - 4;
-
-	force_sig_info(SIGILL, &info, current);
+	force_sig_fault(SIGILL, ILL_ILLTRP,
+			(void __user *)instruction_pointer(regs) - 4, current);
 	die_if_kernel("Oops - bad syscall", regs, n);
 	return regs->uregs[0];
 }
@@ -287,16 +281,11 @@ void __init early_trap_init(void)
 void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
 		  int error_code, int si_code)
 {
-	struct siginfo info;
-
 	tsk->thread.trap_no = ENTRY_DEBUG_RELATED;
 	tsk->thread.error_code = error_code;
 
-	memset(&info, 0, sizeof(info));
-	info.si_signo = SIGTRAP;
-	info.si_code = si_code;
-	info.si_addr = (void __user *)instruction_pointer(regs);
-	force_sig_info(SIGTRAP, &info, tsk);
+	force_sig_fault(SIGTRAP, si_code,
+			(void __user *)instruction_pointer(regs), tsk);
 }
 
 void do_debug_trap(unsigned long entry, unsigned long addr,
@@ -318,29 +307,22 @@ void do_debug_trap(unsigned long entry, unsigned long addr,
 
 void unhandled_interruption(struct pt_regs *regs)
 {
-	siginfo_t si;
 	pr_emerg("unhandled_interruption\n");
 	show_regs(regs);
 	if (!user_mode(regs))
 		do_exit(SIGKILL);
-	si.si_signo = SIGKILL;
-	si.si_errno = 0;
-	force_sig_info(SIGKILL, &si, current);
+	force_sig(SIGKILL, current);
 }
 
 void unhandled_exceptions(unsigned long entry, unsigned long addr,
 			  unsigned long type, struct pt_regs *regs)
 {
-	siginfo_t si;
 	pr_emerg("Unhandled Exception: entry: %lx addr:%lx itype:%lx\n", entry,
 		 addr, type);
 	show_regs(regs);
 	if (!user_mode(regs))
 		do_exit(SIGKILL);
-	si.si_signo = SIGKILL;
-	si.si_errno = 0;
-	si.si_addr = (void *)addr;
-	force_sig_info(SIGKILL, &si, current);
+	force_sig(SIGKILL, current);
 }
 
 extern int do_page_fault(unsigned long entry, unsigned long addr,
@@ -363,14 +345,11 @@ void do_dispatch_tlb_misc(unsigned long entry, unsigned long addr,
 
 void do_revinsn(struct pt_regs *regs)
 {
-	siginfo_t si;
 	pr_emerg("Reserved Instruction\n");
 	show_regs(regs);
 	if (!user_mode(regs))
 		do_exit(SIGILL);
-	si.si_signo = SIGILL;
-	si.si_errno = 0;
-	force_sig_info(SIGILL, &si, current);
+	force_sig(SIGILL, current);
 }
 
 #ifdef CONFIG_ALIGNMENT_TRAP
diff --git a/arch/nds32/mm/fault.c b/arch/nds32/mm/fault.c
index 3a246fb..9bdb7c3 100644
--- a/arch/nds32/mm/fault.c
+++ b/arch/nds32/mm/fault.c
@@ -72,7 +72,7 @@ void do_page_fault(unsigned long entry, unsigned long addr,
 	struct task_struct *tsk;
 	struct mm_struct *mm;
 	struct vm_area_struct *vma;
-	siginfo_t info;
+	int si_code;
 	int fault;
 	unsigned int mask = VM_READ | VM_WRITE | VM_EXEC;
 	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
@@ -80,7 +80,7 @@ void do_page_fault(unsigned long entry, unsigned long addr,
 	error_code = error_code & (ITYPE_mskINST | ITYPE_mskETYPE);
 	tsk = current;
 	mm = tsk->mm;
-	info.si_code = SEGV_MAPERR;
+	si_code = SEGV_MAPERR;
 	/*
 	 * We fault-in kernel-space virtual memory on-demand. The
 	 * 'reference' page table is init_mm.pgd.
@@ -161,7 +161,7 @@ retry:
 	 */
 
 good_area:
-	info.si_code = SEGV_ACCERR;
+	si_code = SEGV_ACCERR;
 
 	/* first do some preliminary protection checks */
 	if (entry == ENTRY_PTE_NOT_PRESENT) {
@@ -266,11 +266,7 @@ bad_area_nosemaphore:
 		tsk->thread.address = addr;
 		tsk->thread.error_code = error_code;
 		tsk->thread.trap_no = entry;
-		info.si_signo = SIGSEGV;
-		info.si_errno = 0;
-		/* info.si_code has been set above */
-		info.si_addr = (void *)addr;
-		force_sig_info(SIGSEGV, &info, tsk);
+		force_sig_fault(SIGSEGV, si_code, (void __user *)addr, tsk);
 		return;
 	}
 
@@ -339,11 +335,7 @@ do_sigbus:
 	tsk->thread.address = addr;
 	tsk->thread.error_code = error_code;
 	tsk->thread.trap_no = entry;
-	info.si_signo = SIGBUS;
-	info.si_errno = 0;
-	info.si_code = BUS_ADRERR;
-	info.si_addr = (void *)addr;
-	force_sig_info(SIGBUS, &info, tsk);
+	force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)addr, tsk);
 
 	return;
 
diff --git a/arch/nios2/kernel/traps.c b/arch/nios2/kernel/traps.c
index 8184e7d..3bc3cd2 100644
--- a/arch/nios2/kernel/traps.c
+++ b/arch/nios2/kernel/traps.c
@@ -26,13 +26,7 @@ static DEFINE_SPINLOCK(die_lock);
 
 static void _send_sig(int signo, int code, unsigned long addr)
 {
-	siginfo_t info;
-
-	info.si_signo = signo;
-	info.si_errno = 0;
-	info.si_code = code;
-	info.si_addr = (void __user *) addr;
-	force_sig_info(signo, &info, current);
+	force_sig_fault(signo, code, (void __user *) addr, current);
 }
 
 void die(const char *str, struct pt_regs *regs, long err)
diff --git a/arch/openrisc/kernel/dma.c b/arch/openrisc/kernel/dma.c
index a945f00..ec7fd45 100644
--- a/arch/openrisc/kernel/dma.c
+++ b/arch/openrisc/kernel/dma.c
@@ -247,14 +247,3 @@ const struct dma_map_ops or1k_dma_map_ops = {
 	.sync_single_for_device = or1k_sync_single_for_device,
 };
 EXPORT_SYMBOL(or1k_dma_map_ops);
-
-/* Number of entries preallocated for DMA-API debugging */
-#define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16)
-
-static int __init dma_init(void)
-{
-	dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
-
-	return 0;
-}
-fs_initcall(dma_init);
diff --git a/arch/openrisc/kernel/traps.c b/arch/openrisc/kernel/traps.c
index 113c175fe..fac246e 100644
--- a/arch/openrisc/kernel/traps.c
+++ b/arch/openrisc/kernel/traps.c
@@ -250,27 +250,16 @@ void __init trap_init(void)
 
 asmlinkage void do_trap(struct pt_regs *regs, unsigned long address)
 {
-	siginfo_t info;
-	memset(&info, 0, sizeof(info));
-	info.si_signo = SIGTRAP;
-	info.si_code = TRAP_TRACE;
-	info.si_addr = (void *)address;
-	force_sig_info(SIGTRAP, &info, current);
+	force_sig_fault(SIGTRAP, TRAP_TRACE, (void __user *)address, current);
 
 	regs->pc += 4;
 }
 
 asmlinkage void do_unaligned_access(struct pt_regs *regs, unsigned long address)
 {
-	siginfo_t info;
-
 	if (user_mode(regs)) {
 		/* Send a SIGBUS */
-		info.si_signo = SIGBUS;
-		info.si_errno = 0;
-		info.si_code = BUS_ADRALN;
-		info.si_addr = (void __user *)address;
-		force_sig_info(SIGBUS, &info, current);
+		force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *)address, current);
 	} else {
 		printk("KERNEL: Unaligned Access 0x%.8lx\n", address);
 		show_registers(regs);
@@ -281,15 +270,9 @@ asmlinkage void do_unaligned_access(struct pt_regs *regs, unsigned long address)
 
 asmlinkage void do_bus_fault(struct pt_regs *regs, unsigned long address)
 {
-	siginfo_t info;
-
 	if (user_mode(regs)) {
 		/* Send a SIGBUS */
-		info.si_signo = SIGBUS;
-		info.si_errno = 0;
-		info.si_code = BUS_ADRERR;
-		info.si_addr = (void *)address;
-		force_sig_info(SIGBUS, &info, current);
+		force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address, current);
 	} else {		/* Kernel mode */
 		printk("KERNEL: Bus error (SIGBUS) 0x%.8lx\n", address);
 		show_registers(regs);
@@ -464,7 +447,6 @@ static inline void simulate_swa(struct pt_regs *regs, unsigned long address,
 asmlinkage void do_illegal_instruction(struct pt_regs *regs,
 				       unsigned long address)
 {
-	siginfo_t info;
 	unsigned int op;
 	unsigned int insn = *((unsigned int *)address);
 
@@ -485,11 +467,7 @@ asmlinkage void do_illegal_instruction(struct pt_regs *regs,
 
 	if (user_mode(regs)) {
 		/* Send a SIGILL */
-		info.si_signo = SIGILL;
-		info.si_errno = 0;
-		info.si_code = ILL_ILLOPC;
-		info.si_addr = (void *)address;
-		force_sig_info(SIGBUS, &info, current);
+		force_sig_fault(SIGILL, ILL_ILLOPC, (void __user *)address, current);
 	} else {		/* Kernel mode */
 		printk("KERNEL: Illegal instruction (SIGILL) 0x%.8lx\n",
 		       address);
diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c
index d0021df..9f011d1 100644
--- a/arch/openrisc/mm/fault.c
+++ b/arch/openrisc/mm/fault.c
@@ -52,7 +52,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long address,
 	struct task_struct *tsk;
 	struct mm_struct *mm;
 	struct vm_area_struct *vma;
-	siginfo_t info;
+	int si_code;
 	int fault;
 	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
@@ -97,7 +97,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long address,
 	}
 
 	mm = tsk->mm;
-	info.si_code = SEGV_MAPERR;
+	si_code = SEGV_MAPERR;
 
 	/*
 	 * If we're in an interrupt or have no user
@@ -139,7 +139,7 @@ retry:
 	 */
 
 good_area:
-	info.si_code = SEGV_ACCERR;
+	si_code = SEGV_ACCERR;
 
 	/* first do some preliminary protection checks */
 
@@ -213,11 +213,7 @@ bad_area_nosemaphore:
 	/* User mode accesses just cause a SIGSEGV */
 
 	if (user_mode(regs)) {
-		info.si_signo = SIGSEGV;
-		info.si_errno = 0;
-		/* info.si_code has been set above */
-		info.si_addr = (void *)address;
-		force_sig_info(SIGSEGV, &info, tsk);
+		force_sig_fault(SIGSEGV, si_code, (void __user *)address, tsk);
 		return;
 	}
 
@@ -282,11 +278,7 @@ do_sigbus:
 	 * Send a sigbus, regardless of whether we were in kernel
 	 * or user mode.
 	 */
-	info.si_signo = SIGBUS;
-	info.si_errno = 0;
-	info.si_code = BUS_ADRERR;
-	info.si_addr = (void *)address;
-	force_sig_info(SIGBUS, &info, tsk);
+	force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address, tsk);
 
 	/* Kernel mode? Handle exceptions or die */
 	if (!user_mode(regs))
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index fc5a574..4d8f64d 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -51,6 +51,8 @@ config PARISC
 	select GENERIC_CLOCKEVENTS
 	select ARCH_NO_COHERENT_DMA_MMAP
 	select CPU_NO_EFFICIENT_FFS
+	select NEED_DMA_MAP_STATE
+	select NEED_SG_DMA_LENGTH
 
 	help
 	  The PA-RISC microprocessor is designed by Hewlett-Packard and used
@@ -111,12 +113,6 @@ config PM
 config STACKTRACE_SUPPORT
 	def_bool y
 
-config NEED_DMA_MAP_STATE
-	def_bool y
-
-config NEED_SG_DMA_LENGTH
-	def_bool y
-
 config ISA_DMA_API
 	bool
 
diff --git a/arch/parisc/include/asm/pci.h b/arch/parisc/include/asm/pci.h
index 96b7dee..3328fd1 100644
--- a/arch/parisc/include/asm/pci.h
+++ b/arch/parisc/include/asm/pci.h
@@ -88,29 +88,6 @@ struct pci_hba_data {
 #endif /* !CONFIG_64BIT */
 
 /*
- * If the PCI device's view of memory is the same as the CPU's view of memory,
- * PCI_DMA_BUS_IS_PHYS is true.  The networking and block device layers use
- * this boolean for bounce buffer decisions.
- */
-#ifdef CONFIG_PA20
-/* All PA-2.0 machines have an IOMMU. */
-#define PCI_DMA_BUS_IS_PHYS	0
-#define parisc_has_iommu()	do { } while (0)
-#else
-
-#if defined(CONFIG_IOMMU_CCIO) || defined(CONFIG_IOMMU_SBA)
-extern int parisc_bus_is_phys; 	/* in arch/parisc/kernel/setup.c */
-#define PCI_DMA_BUS_IS_PHYS	parisc_bus_is_phys
-#define parisc_has_iommu()	do { parisc_bus_is_phys = 0; } while (0)
-#else
-#define PCI_DMA_BUS_IS_PHYS	1
-#define parisc_has_iommu()	do { } while (0)
-#endif
-
-#endif	/* !CONFIG_PA20 */
-
-
-/*
 ** Most PCI devices (eg Tulip, NCR720) also export the same registers
 ** to both MMIO and I/O port space.  Due to poor performance of I/O Port
 ** access under HP PCI bus adapters, strongly recommend the use of MMIO
diff --git a/arch/parisc/kernel/pci-dma.c b/arch/parisc/kernel/pci-dma.c
index 91bc0ca..6df07ce 100644
--- a/arch/parisc/kernel/pci-dma.c
+++ b/arch/parisc/kernel/pci-dma.c
@@ -367,19 +367,6 @@ static int proc_pcxl_dma_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int proc_pcxl_dma_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, proc_pcxl_dma_show, NULL);
-}
-
-static const struct file_operations proc_pcxl_dma_ops = {
-	.owner		= THIS_MODULE,
-	.open		= proc_pcxl_dma_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int __init
 pcxl_dma_init(void)
 {
@@ -397,8 +384,8 @@ pcxl_dma_init(void)
 			"pcxl_dma_init: Unable to create gsc /proc dir entry\n");
 	else {
 		struct proc_dir_entry* ent;
-		ent = proc_create("pcxl_dma", 0, proc_gsc_root,
-				  &proc_pcxl_dma_ops);
+		ent = proc_create_single("pcxl_dma", 0, proc_gsc_root,
+				proc_pcxl_dma_show);
 		if (!ent)
 			printk(KERN_WARNING
 				"pci-dma.c: Unable to create pcxl_dma /proc entry.\n");
diff --git a/arch/parisc/kernel/pdc_chassis.c b/arch/parisc/kernel/pdc_chassis.c
index 3e04242..28e0748 100644
--- a/arch/parisc/kernel/pdc_chassis.c
+++ b/arch/parisc/kernel/pdc_chassis.c
@@ -266,18 +266,6 @@ static int pdc_chassis_warn_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int pdc_chassis_warn_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, pdc_chassis_warn_show, NULL);
-}
-
-static const struct file_operations pdc_chassis_warn_fops = {
-	.open		= pdc_chassis_warn_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int __init pdc_chassis_create_procfs(void)
 {
 	unsigned long test;
@@ -292,7 +280,7 @@ static int __init pdc_chassis_create_procfs(void)
 
 	printk(KERN_INFO "Enabling PDC chassis warnings support v%s\n",
 			PDC_CHASSIS_VER);
-	proc_create("chassis", 0400, NULL, &pdc_chassis_warn_fops);
+	proc_create_single("chassis", 0400, NULL, pdc_chassis_warn_show);
 	return 0;
 }
 
diff --git a/arch/parisc/kernel/ptrace.c b/arch/parisc/kernel/ptrace.c
index 1a2be6e..7aa1d4d 100644
--- a/arch/parisc/kernel/ptrace.c
+++ b/arch/parisc/kernel/ptrace.c
@@ -76,8 +76,6 @@ void user_enable_single_step(struct task_struct *task)
 	set_tsk_thread_flag(task, TIF_SINGLESTEP);
 
 	if (pa_psw(task)->n) {
-		struct siginfo si;
-
 		/* Nullified, just crank over the queue. */
 		task_regs(task)->iaoq[0] = task_regs(task)->iaoq[1];
 		task_regs(task)->iasq[0] = task_regs(task)->iasq[1];
@@ -90,11 +88,9 @@ void user_enable_single_step(struct task_struct *task)
 		ptrace_disable(task);
 		/* Don't wake up the task, but let the
 		   parent know something happened. */
-		si.si_code = TRAP_TRACE;
-		si.si_addr = (void __user *) (task_regs(task)->iaoq[0] & ~3);
-		si.si_signo = SIGTRAP;
-		si.si_errno = 0;
-		force_sig_info(SIGTRAP, &si, task);
+		force_sig_fault(SIGTRAP, TRAP_TRACE,
+				(void __user *) (task_regs(task)->iaoq[0] & ~3),
+				task);
 		/* notify_parent(task, SIGCHLD); */
 		return;
 	}
diff --git a/arch/parisc/kernel/setup.c b/arch/parisc/kernel/setup.c
index 0e9675f..8d3a7b8 100644
--- a/arch/parisc/kernel/setup.c
+++ b/arch/parisc/kernel/setup.c
@@ -58,11 +58,6 @@ struct proc_dir_entry * proc_runway_root __read_mostly = NULL;
 struct proc_dir_entry * proc_gsc_root __read_mostly = NULL;
 struct proc_dir_entry * proc_mckinley_root __read_mostly = NULL;
 
-#if !defined(CONFIG_PA20) && (defined(CONFIG_IOMMU_CCIO) || defined(CONFIG_IOMMU_SBA))
-int parisc_bus_is_phys __read_mostly = 1;	/* Assume no IOMMU is present */
-EXPORT_SYMBOL(parisc_bus_is_phys);
-#endif
-
 void __init setup_cmdline(char **cmdline_p)
 {
 	extern unsigned int boot_args[];
diff --git a/arch/parisc/kernel/traps.c b/arch/parisc/kernel/traps.c
index 71d3127..4309ad3 100644
--- a/arch/parisc/kernel/traps.c
+++ b/arch/parisc/kernel/traps.c
@@ -297,13 +297,8 @@ void die_if_kernel(char *str, struct pt_regs *regs, long err)
 #define GDB_BREAK_INSN 0x10004
 static void handle_gdb_break(struct pt_regs *regs, int wot)
 {
-	struct siginfo si;
-
-	si.si_signo = SIGTRAP;
-	si.si_errno = 0;
-	si.si_code = wot;
-	si.si_addr = (void __user *) (regs->iaoq[0] & ~3);
-	force_sig_info(SIGTRAP, &si, current);
+	force_sig_fault(SIGTRAP, wot,
+			(void __user *) (regs->iaoq[0] & ~3), current);
 }
 
 static void handle_break(struct pt_regs *regs)
@@ -487,7 +482,7 @@ void notrace handle_interruption(int code, struct pt_regs *regs)
 {
 	unsigned long fault_address = 0;
 	unsigned long fault_space = 0;
-	struct siginfo si;
+	int si_code;
 
 	if (code == 1)
 	    pdc_console_restart();  /* switch back to pdc if HPMC */
@@ -571,7 +566,7 @@ void notrace handle_interruption(int code, struct pt_regs *regs)
 	case  8:
 		/* Illegal instruction trap */
 		die_if_kernel("Illegal instruction", regs, code);
-		si.si_code = ILL_ILLOPC;
+		si_code = ILL_ILLOPC;
 		goto give_sigill;
 
 	case  9:
@@ -582,7 +577,7 @@ void notrace handle_interruption(int code, struct pt_regs *regs)
 	case 10:
 		/* Privileged operation trap */
 		die_if_kernel("Privileged operation", regs, code);
-		si.si_code = ILL_PRVOPC;
+		si_code = ILL_PRVOPC;
 		goto give_sigill;
 
 	case 11:
@@ -605,20 +600,16 @@ void notrace handle_interruption(int code, struct pt_regs *regs)
 		}
 
 		die_if_kernel("Privileged register usage", regs, code);
-		si.si_code = ILL_PRVREG;
+		si_code = ILL_PRVREG;
 	give_sigill:
-		si.si_signo = SIGILL;
-		si.si_errno = 0;
-		si.si_addr = (void __user *) regs->iaoq[0];
-		force_sig_info(SIGILL, &si, current);
+		force_sig_fault(SIGILL, si_code,
+				(void __user *) regs->iaoq[0], current);
 		return;
 
 	case 12:
 		/* Overflow Trap, let the userland signal handler do the cleanup */
-		si.si_signo = SIGFPE;
-		si.si_code = FPE_INTOVF;
-		si.si_addr = (void __user *) regs->iaoq[0];
-		force_sig_info(SIGFPE, &si, current);
+		force_sig_fault(SIGFPE, FPE_INTOVF,
+				(void __user *) regs->iaoq[0], current);
 		return;
 		
 	case 13:
@@ -626,13 +617,11 @@ void notrace handle_interruption(int code, struct pt_regs *regs)
 		   The condition succeeds in an instruction which traps
 		   on condition  */
 		if(user_mode(regs)){
-			si.si_signo = SIGFPE;
 			/* Let userspace app figure it out from the insn pointed
 			 * to by si_addr.
 			 */
-			si.si_code = FPE_CONDTRAP;
-			si.si_addr = (void __user *) regs->iaoq[0];
-			force_sig_info(SIGFPE, &si, current);
+			force_sig_fault(SIGFPE, FPE_CONDTRAP,
+					(void __user *) regs->iaoq[0], current);
 			return;
 		} 
 		/* The kernel doesn't want to handle condition codes */
@@ -741,14 +730,10 @@ void notrace handle_interruption(int code, struct pt_regs *regs)
 			return;
 
 		die_if_kernel("Protection id trap", regs, code);
-		si.si_code = SEGV_MAPERR;
-		si.si_signo = SIGSEGV;
-		si.si_errno = 0;
-		if (code == 7)
-		    si.si_addr = (void __user *) regs->iaoq[0];
-		else
-		    si.si_addr = (void __user *) regs->ior;
-		force_sig_info(SIGSEGV, &si, current);
+		force_sig_fault(SIGSEGV, SEGV_MAPERR,
+				(code == 7)?
+				((void __user *) regs->iaoq[0]) :
+				((void __user *) regs->ior), current);
 		return;
 
 	case 28: 
@@ -762,11 +747,8 @@ void notrace handle_interruption(int code, struct pt_regs *regs)
 				"handle_interruption() pid=%d command='%s'\n",
 				task_pid_nr(current), current->comm);
 			/* SIGBUS, for lack of a better one. */
-			si.si_signo = SIGBUS;
-			si.si_code = BUS_OBJERR;
-			si.si_errno = 0;
-			si.si_addr = (void __user *) regs->ior;
-			force_sig_info(SIGBUS, &si, current);
+			force_sig_fault(SIGBUS, BUS_OBJERR,
+					(void __user *)regs->ior, current);
 			return;
 		}
 		pdc_chassis_send_status(PDC_CHASSIS_DIRECT_PANIC);
@@ -781,11 +763,8 @@ void notrace handle_interruption(int code, struct pt_regs *regs)
 				"User fault %d on space 0x%08lx, pid=%d command='%s'\n",
 				code, fault_space,
 				task_pid_nr(current), current->comm);
-		si.si_signo = SIGSEGV;
-		si.si_errno = 0;
-		si.si_code = SEGV_MAPERR;
-		si.si_addr = (void __user *) regs->ior;
-		force_sig_info(SIGSEGV, &si, current);
+		force_sig_fault(SIGSEGV, SEGV_MAPERR,
+				(void __user *)regs->ior, current);
 		return;
 	    }
 	}
diff --git a/arch/parisc/kernel/unaligned.c b/arch/parisc/kernel/unaligned.c
index e36f7b7..932bfc0 100644
--- a/arch/parisc/kernel/unaligned.c
+++ b/arch/parisc/kernel/unaligned.c
@@ -452,7 +452,6 @@ void handle_unaligned(struct pt_regs *regs)
 	unsigned long newbase = R1(regs->iir)?regs->gr[R1(regs->iir)]:0;
 	int modify = 0;
 	int ret = ERR_NOTHANDLED;
-	struct siginfo si;
 	register int flop=0;	/* true if this is a flop */
 
 	__inc_irq_stat(irq_unaligned_count);
@@ -690,21 +689,15 @@ void handle_unaligned(struct pt_regs *regs)
 
 		if (ret == ERR_PAGEFAULT)
 		{
-			si.si_signo = SIGSEGV;
-			si.si_errno = 0;
-			si.si_code = SEGV_MAPERR;
-			si.si_addr = (void __user *)regs->ior;
-			force_sig_info(SIGSEGV, &si, current);
+			force_sig_fault(SIGSEGV, SEGV_MAPERR,
+					(void __user *)regs->ior, current);
 		}
 		else
 		{
 force_sigbus:
 			/* couldn't handle it ... */
-			si.si_signo = SIGBUS;
-			si.si_errno = 0;
-			si.si_code = BUS_ADRALN;
-			si.si_addr = (void __user *)regs->ior;
-			force_sig_info(SIGBUS, &si, current);
+			force_sig_fault(SIGBUS, BUS_ADRALN,
+					(void __user *)regs->ior, current);
 		}
 		
 		return;
diff --git a/arch/parisc/math-emu/driver.c b/arch/parisc/math-emu/driver.c
index 2fb59d2..0590e05 100644
--- a/arch/parisc/math-emu/driver.c
+++ b/arch/parisc/math-emu/driver.c
@@ -81,7 +81,6 @@ int
 handle_fpe(struct pt_regs *regs)
 {
 	extern void printbinary(unsigned long x, int nbits);
-	struct siginfo si;
 	unsigned int orig_sw, sw;
 	int signalcode;
 	/* need an intermediate copy of float regs because FPU emulation
@@ -117,11 +116,8 @@ handle_fpe(struct pt_regs *regs)
 
 	memcpy(regs->fr, frcopy, sizeof regs->fr);
 	if (signalcode != 0) {
-	    si.si_signo = signalcode >> 24;
-	    si.si_errno = 0;
-	    si.si_code = signalcode & 0xffffff;
-	    si.si_addr = (void __user *) regs->iaoq[0];
-	    force_sig_info(si.si_signo, &si, current);
+	    force_sig_fault(signalcode >> 24, signalcode & 0xffffff,
+			    (void __user *) regs->iaoq[0], current);
 	    return -1;
 	}
 
diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c
index e247edb..a801179 100644
--- a/arch/parisc/mm/fault.c
+++ b/arch/parisc/mm/fault.c
@@ -353,23 +353,22 @@ bad_area:
 	up_read(&mm->mmap_sem);
 
 	if (user_mode(regs)) {
-		struct siginfo si;
-		unsigned int lsb = 0;
+		int signo, si_code;
 
 		switch (code) {
 		case 15:	/* Data TLB miss fault/Data page fault */
 			/* send SIGSEGV when outside of vma */
 			if (!vma ||
 			    address < vma->vm_start || address >= vma->vm_end) {
-				si.si_signo = SIGSEGV;
-				si.si_code = SEGV_MAPERR;
+				signo = SIGSEGV;
+				si_code = SEGV_MAPERR;
 				break;
 			}
 
 			/* send SIGSEGV for wrong permissions */
 			if ((vma->vm_flags & acc_type) != acc_type) {
-				si.si_signo = SIGSEGV;
-				si.si_code = SEGV_ACCERR;
+				signo = SIGSEGV;
+				si_code = SEGV_ACCERR;
 				break;
 			}
 
@@ -377,43 +376,40 @@ bad_area:
 			/* fall through */
 		case 17:	/* NA data TLB miss / page fault */
 		case 18:	/* Unaligned access - PCXS only */
-			si.si_signo = SIGBUS;
-			si.si_code = (code == 18) ? BUS_ADRALN : BUS_ADRERR;
+			signo = SIGBUS;
+			si_code = (code == 18) ? BUS_ADRALN : BUS_ADRERR;
 			break;
 		case 16:	/* Non-access instruction TLB miss fault */
 		case 26:	/* PCXL: Data memory access rights trap */
 		default:
-			si.si_signo = SIGSEGV;
-			si.si_code = (code == 26) ? SEGV_ACCERR : SEGV_MAPERR;
+			signo = SIGSEGV;
+			si_code = (code == 26) ? SEGV_ACCERR : SEGV_MAPERR;
 			break;
 		}
-
 #ifdef CONFIG_MEMORY_FAILURE
 		if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
+			unsigned int lsb = 0;
 			printk(KERN_ERR
 	"MCE: Killing %s:%d due to hardware memory corruption fault at %08lx\n",
 			tsk->comm, tsk->pid, address);
-			si.si_signo = SIGBUS;
-			si.si_code = BUS_MCEERR_AR;
+			/*
+			 * Either small page or large page may be poisoned.
+			 * In other words, VM_FAULT_HWPOISON_LARGE and
+			 * VM_FAULT_HWPOISON are mutually exclusive.
+			 */
+			if (fault & VM_FAULT_HWPOISON_LARGE)
+				lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
+			else if (fault & VM_FAULT_HWPOISON)
+				lsb = PAGE_SHIFT;
+
+			force_sig_mceerr(BUS_MCEERR_AR, (void __user *) address,
+					 lsb, current);
+			return;
 		}
 #endif
+		show_signal_msg(regs, code, address, tsk, vma);
 
-		/*
-		 * Either small page or large page may be poisoned.
-		 * In other words, VM_FAULT_HWPOISON_LARGE and
-		 * VM_FAULT_HWPOISON are mutually exclusive.
-		 */
-		if (fault & VM_FAULT_HWPOISON_LARGE)
-			lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
-		else if (fault & VM_FAULT_HWPOISON)
-			lsb = PAGE_SHIFT;
-		else
-			show_signal_msg(regs, code, address, tsk, vma);
-		si.si_addr_lsb = lsb;
-
-		si.si_errno = 0;
-		si.si_addr = (void __user *) address;
-		force_sig_info(si.si_signo, &si, current);
+		force_sig_fault(signo, si_code, (void __user *) address, current);
 		return;
 	}
 
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index c32a181..f674006 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -13,12 +13,6 @@ config 64BIT
 	bool
 	default y if PPC64
 
-config ARCH_PHYS_ADDR_T_64BIT
-       def_bool PPC64 || PHYS_64BIT
-
-config ARCH_DMA_ADDR_T_64BIT
-	def_bool ARCH_PHYS_ADDR_T_64BIT
-
 config MMU
 	bool
 	default y
@@ -187,7 +181,6 @@ config PPC
 	select HAVE_CONTEXT_TRACKING		if PPC64
 	select HAVE_DEBUG_KMEMLEAK
 	select HAVE_DEBUG_STACKOVERFLOW
-	select HAVE_DMA_API_DEBUG
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_DYNAMIC_FTRACE_WITH_REGS	if MPROFILE_KERNEL
 	select HAVE_EBPF_JIT			if PPC64
@@ -223,9 +216,11 @@ config PPC
 	select HAVE_SYSCALL_TRACEPOINTS
 	select HAVE_VIRT_CPU_ACCOUNTING
 	select HAVE_IRQ_TIME_ACCOUNTING
+	select IOMMU_HELPER			if PPC64
 	select IRQ_DOMAIN
 	select IRQ_FORCED_THREADING
 	select MODULES_USE_ELF_RELA
+	select NEED_SG_DMA_LENGTH
 	select NO_BOOTMEM
 	select OF
 	select OF_EARLY_FLATTREE
@@ -478,19 +473,6 @@ config MPROFILE_KERNEL
 	depends on PPC64 && CPU_LITTLE_ENDIAN
 	def_bool !DISABLE_MPROFILE_KERNEL
 
-config IOMMU_HELPER
-	def_bool PPC64
-
-config SWIOTLB
-	bool "SWIOTLB support"
-	default n
-	select IOMMU_HELPER
-	---help---
-	  Support for IO bounce buffering for systems without an IOMMU.
-	  This allows us to DMA to the full physical address space on
-	  platforms where the size of a physical address is larger
-	  than the bus address.  Not all platforms support this.
-
 config HOTPLUG_CPU
 	bool "Support for enabling/disabling CPUs"
 	depends on SMP && (PPC_PSERIES || \
@@ -883,7 +865,7 @@ config PPC_MEM_KEYS
 	  page-based protections, but without requiring modification of the
 	  page tables when an application changes protection domains.
 
-	  For details, see Documentation/vm/protection-keys.txt
+	  For details, see Documentation/vm/protection-keys.rst
 
 	  If unsure, say y.
 
@@ -913,9 +895,6 @@ config ZONE_DMA
 config NEED_DMA_MAP_STATE
 	def_bool (PPC64 || NOT_COHERENT_CACHE)
 
-config NEED_SG_DMA_LENGTH
-	def_bool y
-
 config GENERIC_ISA_DMA
 	bool
 	depends on ISA_DMA_API
diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h
index 401c62a..2af9ded 100644
--- a/arch/powerpc/include/asm/pci.h
+++ b/arch/powerpc/include/asm/pci.h
@@ -92,24 +92,6 @@ extern int pci_mmap_legacy_page_range(struct pci_bus *bus,
 
 #define HAVE_PCI_LEGACY	1
 
-#ifdef CONFIG_PPC64
-
-/* The PCI address space does not equal the physical memory address
- * space (we have an IOMMU).  The IDE and SCSI device layers use
- * this boolean for bounce buffer decisions.
- */
-#define PCI_DMA_BUS_IS_PHYS	(0)
-
-#else /* 32-bit */
-
-/* The PCI address space does equal the physical memory
- * address space (no IOMMU).  The IDE and SCSI device layers use
- * this boolean for bounce buffer decisions.
- */
-#define PCI_DMA_BUS_IS_PHYS     (1)
-
-#endif /* CONFIG_PPC64 */
-
 extern void pcibios_claim_one_bus(struct pci_bus *b);
 
 extern void pcibios_finish_adding_to_bus(struct pci_bus *bus);
diff --git a/arch/powerpc/include/uapi/asm/siginfo.h b/arch/powerpc/include/uapi/asm/siginfo.h
index 9f14245..1d51d9b 100644
--- a/arch/powerpc/include/uapi/asm/siginfo.h
+++ b/arch/powerpc/include/uapi/asm/siginfo.h
@@ -15,19 +15,4 @@
 
 #include <asm-generic/siginfo.h>
 
-/*
- * SIGFPE si_codes
- */
-#ifdef __KERNEL__
-#define FPE_FIXME	0	/* Broken dup of SI_USER */
-#endif /* __KERNEL__ */
-
-/*
- * SIGTRAP si_codes
- */
-#ifdef __KERNEL__
-#define TRAP_FIXME	0	/* Broken dup of SI_USER */
-#endif /* __KERNEL__ */
-
-
 #endif	/* _ASM_POWERPC_SIGINFO_H */
diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c
index da20569..138157de 100644
--- a/arch/powerpc/kernel/dma.c
+++ b/arch/powerpc/kernel/dma.c
@@ -309,8 +309,6 @@ int dma_set_coherent_mask(struct device *dev, u64 mask)
 }
 EXPORT_SYMBOL(dma_set_coherent_mask);
 
-#define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16)
-
 int dma_set_mask(struct device *dev, u64 dma_mask)
 {
 	if (ppc_md.dma_set_mask)
@@ -361,7 +359,6 @@ EXPORT_SYMBOL_GPL(dma_get_required_mask);
 
 static int __init dma_init(void)
 {
-	dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
 #ifdef CONFIG_PCI
 	dma_debug_add_bus(&pci_bus_type);
 #endif
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index bc640e4..90bb39b 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -1775,18 +1775,6 @@ static int proc_eeh_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int proc_eeh_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, proc_eeh_show, NULL);
-}
-
-static const struct file_operations proc_eeh_operations = {
-	.open      = proc_eeh_open,
-	.read      = seq_read,
-	.llseek    = seq_lseek,
-	.release   = single_release,
-};
-
 #ifdef CONFIG_DEBUG_FS
 static int eeh_enable_dbgfs_set(void *data, u64 val)
 {
@@ -1828,7 +1816,7 @@ DEFINE_SIMPLE_ATTRIBUTE(eeh_freeze_dbgfs_ops, eeh_freeze_dbgfs_get,
 static int __init eeh_init_proc(void)
 {
 	if (machine_is(pseries) || machine_is(powernv)) {
-		proc_create("powerpc/eeh", 0, NULL, &proc_eeh_operations);
+		proc_create_single("powerpc/eeh", 0, NULL, proc_eeh_show);
 #ifdef CONFIG_DEBUG_FS
 		debugfs_create_file("eeh_enable", 0600,
                                     powerpc_debugfs_root, NULL,
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 1237f13..26ea979 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -632,6 +632,7 @@ void do_break (struct pt_regs *regs, unsigned long address,
 	hw_breakpoint_disable();
 
 	/* Deliver the signal to userspace */
+	clear_siginfo(&info);
 	info.si_signo = SIGTRAP;
 	info.si_errno = 0;
 	info.si_code = TRAP_HWBKPT;
diff --git a/arch/powerpc/kernel/rtas-proc.c b/arch/powerpc/kernel/rtas-proc.c
index fb070d8..d49063d 100644
--- a/arch/powerpc/kernel/rtas-proc.c
+++ b/arch/powerpc/kernel/rtas-proc.c
@@ -154,18 +154,6 @@ static ssize_t ppc_rtas_tone_volume_write(struct file *file,
 static int ppc_rtas_tone_volume_show(struct seq_file *m, void *v);
 static int ppc_rtas_rmo_buf_show(struct seq_file *m, void *v);
 
-static int sensors_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, ppc_rtas_sensors_show, NULL);
-}
-
-static const struct file_operations ppc_rtas_sensors_operations = {
-	.open		= sensors_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int poweron_open(struct inode *inode, struct file *file)
 {
 	return single_open(file, ppc_rtas_poweron_show, NULL);
@@ -231,18 +219,6 @@ static const struct file_operations ppc_rtas_tone_volume_operations = {
 	.release	= single_release,
 };
 
-static int rmo_buf_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, ppc_rtas_rmo_buf_show, NULL);
-}
-
-static const struct file_operations ppc_rtas_rmo_buf_ops = {
-	.open		= rmo_buf_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int ppc_rtas_find_all_sensors(void);
 static void ppc_rtas_process_sensor(struct seq_file *m,
 	struct individual_sensor *s, int state, int error, const char *loc);
@@ -267,14 +243,14 @@ static int __init proc_rtas_init(void)
 		    &ppc_rtas_clock_operations);
 	proc_create("powerpc/rtas/poweron", 0644, NULL,
 		    &ppc_rtas_poweron_operations);
-	proc_create("powerpc/rtas/sensors", 0444, NULL,
-		    &ppc_rtas_sensors_operations);
+	proc_create_single("powerpc/rtas/sensors", 0444, NULL,
+			ppc_rtas_sensors_show);
 	proc_create("powerpc/rtas/frequency", 0644, NULL,
 		    &ppc_rtas_tone_freq_operations);
 	proc_create("powerpc/rtas/volume", 0644, NULL,
 		    &ppc_rtas_tone_volume_operations);
-	proc_create("powerpc/rtas/rmo_buffer", 0400, NULL,
-		    &ppc_rtas_rmo_buf_ops);
+	proc_create_single("powerpc/rtas/rmo_buffer", 0400, NULL,
+			ppc_rtas_rmo_buf_show);
 	return 0;
 }
 
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 0904492..0e17dcb 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -296,7 +296,6 @@ NOKPROBE_SYMBOL(die);
 void user_single_step_siginfo(struct task_struct *tsk,
 				struct pt_regs *regs, siginfo_t *info)
 {
-	memset(info, 0, sizeof(*info));
 	info->si_signo = SIGTRAP;
 	info->si_code = TRAP_TRACE;
 	info->si_addr = (void __user *)regs->nip;
@@ -334,7 +333,7 @@ void _exception_pkey(int signr, struct pt_regs *regs, int code,
 	 */
 	thread_pkey_regs_save(&current->thread);
 
-	memset(&info, 0, sizeof(info));
+	clear_siginfo(&info);
 	info.si_signo = signr;
 	info.si_code = code;
 	info.si_addr = (void __user *) addr;
@@ -970,7 +969,7 @@ void unknown_exception(struct pt_regs *regs)
 	printk("Bad trap at PC: %lx, SR: %lx, vector=%lx\n",
 	       regs->nip, regs->msr, regs->trap);
 
-	_exception(SIGTRAP, regs, TRAP_FIXME, 0);
+	_exception(SIGTRAP, regs, TRAP_UNK, 0);
 
 	exception_exit(prev_state);
 }
@@ -992,7 +991,7 @@ bail:
 
 void RunModeException(struct pt_regs *regs)
 {
-	_exception(SIGTRAP, regs, TRAP_FIXME, 0);
+	_exception(SIGTRAP, regs, TRAP_UNK, 0);
 }
 
 void single_step_exception(struct pt_regs *regs)
@@ -1032,7 +1031,7 @@ static void emulate_single_step(struct pt_regs *regs)
 
 static inline int __parse_fpscr(unsigned long fpscr)
 {
-	int ret = FPE_FIXME;
+	int ret = FPE_FLTUNK;
 
 	/* Invalid operation */
 	if ((fpscr & FPSCR_VE) && (fpscr & FPSCR_VX))
@@ -1973,7 +1972,7 @@ void SPEFloatingPointException(struct pt_regs *regs)
 	extern int do_spe_mathemu(struct pt_regs *regs);
 	unsigned long spefscr;
 	int fpexc_mode;
-	int code = FPE_FIXME;
+	int code = FPE_FLTUNK;
 	int err;
 
 	flush_spe_to_thread(current);
@@ -2042,7 +2041,7 @@ void SPEFloatingPointRoundException(struct pt_regs *regs)
 		printk(KERN_ERR "unrecognized spe instruction "
 		       "in %s at %lx\n", current->comm, regs->nip);
 	} else {
-		_exception(SIGFPE, regs, FPE_FIXME, regs->nip);
+		_exception(SIGFPE, regs, FPE_FLTUNK, regs->nip);
 		return;
 	}
 }
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index c01d627..ef268d5 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -168,6 +168,7 @@ static int do_sigbus(struct pt_regs *regs, unsigned long address,
 		return SIGBUS;
 
 	current->thread.trap_nr = BUS_ADRERR;
+	clear_siginfo(&info);
 	info.si_signo = SIGBUS;
 	info.si_errno = 0;
 	info.si_code = BUS_ADRERR;
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 67d3125..84b58ab 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -222,6 +222,7 @@ config PTE_64BIT
 config PHYS_64BIT
 	bool 'Large physical address support' if E500 || PPC_86xx
 	depends on (44x || E500 || PPC_86xx) && !PPC_83xx && !PPC_82xx
+	select PHYS_ADDR_T_64BIT
 	---help---
 	  This option enables kernel support for larger than 32-bit physical
 	  addresses.  This feature may not be available on all cores.
diff --git a/arch/powerpc/platforms/cell/spufs/fault.c b/arch/powerpc/platforms/cell/spufs/fault.c
index 870c0a8..1e002e9 100644
--- a/arch/powerpc/platforms/cell/spufs/fault.c
+++ b/arch/powerpc/platforms/cell/spufs/fault.c
@@ -44,7 +44,7 @@ static void spufs_handle_event(struct spu_context *ctx,
 		return;
 	}
 
-	memset(&info, 0, sizeof(info));
+	clear_siginfo(&info);
 
 	switch (type) {
 	case SPE_EVENT_INVALID_DMA:
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index ccc4215..c9ef3c5 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -1095,18 +1095,6 @@ static int show_spu_loadavg(struct seq_file *s, void *private)
 		atomic_read(&nr_spu_contexts),
 		idr_get_cursor(&task_active_pid_ns(current)->idr) - 1);
 	return 0;
-}
-
-static int spu_loadavg_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, show_spu_loadavg, NULL);
-}
-
-static const struct file_operations spu_loadavg_fops = {
-	.open		= spu_loadavg_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
 };
 
 int __init spu_sched_init(void)
@@ -1135,7 +1123,7 @@ int __init spu_sched_init(void)
 
 	mod_timer(&spuloadavg_timer, 0);
 
-	entry = proc_create("spu_loadavg", 0, NULL, &spu_loadavg_fops);
+	entry = proc_create_single("spu_loadavg", 0, NULL, show_spu_loadavg);
 	if (!entry)
 		goto out_stop_kthread;
 
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index cd4fd85..274bc06 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -3,8 +3,16 @@
 # see Documentation/kbuild/kconfig-language.txt.
 #
 
+config 64BIT
+	bool
+
+config 32BIT
+	bool
+
 config RISCV
 	def_bool y
+	# even on 32-bit, physical (and DMA) addresses are > 32-bits
+	select PHYS_ADDR_T_64BIT
 	select OF
 	select OF_EARLY_FLATTREE
 	select OF_IRQ
@@ -22,7 +30,6 @@ config RISCV
 	select GENERIC_ATOMIC64 if !64BIT || !RISCV_ISA_A
 	select HAVE_MEMBLOCK
 	select HAVE_MEMBLOCK_NODE_MAP
-	select HAVE_DMA_API_DEBUG
 	select HAVE_DMA_CONTIGUOUS
 	select HAVE_GENERIC_DMA_COHERENT
 	select IRQ_DOMAIN
@@ -39,16 +46,9 @@ config RISCV
 config MMU
 	def_bool y
 
-# even on 32-bit, physical (and DMA) addresses are > 32-bits
-config ARCH_PHYS_ADDR_T_64BIT
-	def_bool y
-
 config ZONE_DMA32
 	bool
-	default y
-
-config ARCH_DMA_ADDR_T_64BIT
-	def_bool y
+	default y if 64BIT
 
 config PAGE_OFFSET
 	hex
@@ -101,7 +101,6 @@ choice
 
 config ARCH_RV32I
 	bool "RV32I"
-	select CPU_SUPPORTS_32BIT_KERNEL
 	select 32BIT
 	select GENERIC_ASHLDI3
 	select GENERIC_ASHRDI3
@@ -109,13 +108,13 @@ config ARCH_RV32I
 
 config ARCH_RV64I
 	bool "RV64I"
-	select CPU_SUPPORTS_64BIT_KERNEL
 	select 64BIT
 	select HAVE_FUNCTION_TRACER
 	select HAVE_FUNCTION_GRAPH_TRACER
 	select HAVE_FTRACE_MCOUNT_RECORD
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_DYNAMIC_FTRACE_WITH_REGS
+	select SWIOTLB
 
 endchoice
 
@@ -171,11 +170,6 @@ config NR_CPUS
 	depends on SMP
 	default "8"
 
-config CPU_SUPPORTS_32BIT_KERNEL
-	bool
-config CPU_SUPPORTS_64BIT_KERNEL
-	bool
-
 choice
 	prompt "CPU Tuning"
 	default TUNE_GENERIC
@@ -202,24 +196,6 @@ endmenu
 
 menu "Kernel type"
 
-choice
-	prompt "Kernel code model"
-	default 64BIT
-
-config 32BIT
-	bool "32-bit kernel"
-	depends on CPU_SUPPORTS_32BIT_KERNEL
-	help
-	  Select this option to build a 32-bit kernel.
-
-config 64BIT
-	bool "64-bit kernel"
-	depends on CPU_SUPPORTS_64BIT_KERNEL
-	help
-	  Select this option to build a 64-bit kernel.
-
-endchoice
-
 source "mm/Kconfig"
 
 source "kernel/Kconfig.preempt"
diff --git a/arch/riscv/include/asm/dma-mapping.h b/arch/riscv/include/asm/dma-mapping.h
new file mode 100644
index 0000000..8facc1c
--- /dev/null
+++ b/arch/riscv/include/asm/dma-mapping.h
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef _RISCV_ASM_DMA_MAPPING_H
+#define _RISCV_ASM_DMA_MAPPING_H 1
+
+#ifdef CONFIG_SWIOTLB
+#include <linux/swiotlb.h>
+static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
+{
+	return &swiotlb_dma_ops;
+}
+#else
+#include <asm-generic/dma-mapping.h>
+#endif /* CONFIG_SWIOTLB */
+
+#endif /* _RISCV_ASM_DMA_MAPPING_H */
diff --git a/arch/riscv/include/asm/pci.h b/arch/riscv/include/asm/pci.h
index 0f2fc9e..b3638c5 100644
--- a/arch/riscv/include/asm/pci.h
+++ b/arch/riscv/include/asm/pci.h
@@ -26,9 +26,6 @@
 /* RISC-V shim does not initialize PCI bus */
 #define pcibios_assign_all_busses() 1
 
-/* We do not have an IOMMU */
-#define PCI_DMA_BUS_IS_PHYS 1
-
 extern int isa_dma_bridge_buggy;
 
 #ifdef CONFIG_PCI
diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c
index c11f40c..ee44a48 100644
--- a/arch/riscv/kernel/setup.c
+++ b/arch/riscv/kernel/setup.c
@@ -29,6 +29,7 @@
 #include <linux/of_fdt.h>
 #include <linux/of_platform.h>
 #include <linux/sched/task.h>
+#include <linux/swiotlb.h>
 
 #include <asm/setup.h>
 #include <asm/sections.h>
@@ -206,6 +207,7 @@ void __init setup_arch(char **cmdline_p)
 	setup_bootmem();
 	paging_init();
 	unflatten_device_tree();
+	swiotlb_init(1);
 
 #ifdef CONFIG_SMP
 	setup_smp();
diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c
index 93132cb..b99d9dd 100644
--- a/arch/riscv/kernel/traps.c
+++ b/arch/riscv/kernel/traps.c
@@ -63,18 +63,6 @@ void die(struct pt_regs *regs, const char *str)
 		do_exit(SIGSEGV);
 }
 
-static inline void do_trap_siginfo(int signo, int code,
-	unsigned long addr, struct task_struct *tsk)
-{
-	siginfo_t info;
-
-	info.si_signo = signo;
-	info.si_errno = 0;
-	info.si_code = code;
-	info.si_addr = (void __user *)addr;
-	force_sig_info(signo, &info, tsk);
-}
-
 void do_trap(struct pt_regs *regs, int signo, int code,
 	unsigned long addr, struct task_struct *tsk)
 {
@@ -87,7 +75,7 @@ void do_trap(struct pt_regs *regs, int signo, int code,
 		show_regs(regs);
 	}
 
-	do_trap_siginfo(signo, code, addr, tsk);
+	force_sig_fault(signo, code, (void __user *)addr, tsk);
 }
 
 static void do_trap_error(struct pt_regs *regs, int signo, int code,
@@ -149,7 +137,7 @@ asmlinkage void do_trap_break(struct pt_regs *regs)
 	}
 #endif /* CONFIG_GENERIC_BUG */
 
-	do_trap_siginfo(SIGTRAP, TRAP_BRKPT, regs->sepc, current);
+	force_sig_fault(SIGTRAP, TRAP_BRKPT, (void __user *)(regs->sepc), current);
 	regs->sepc += 0x4;
 }
 
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 199ac3e..6a64287 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -35,9 +35,6 @@ config GENERIC_BUG
 config GENERIC_BUG_RELATIVE_POINTERS
 	def_bool y
 
-config ARCH_DMA_ADDR_T_64BIT
-	def_bool y
-
 config GENERIC_LOCKBREAK
 	def_bool y if SMP && PREEMPT
 
@@ -133,7 +130,6 @@ config S390
 	select HAVE_CMPXCHG_LOCAL
 	select HAVE_COPY_THREAD_TLS
 	select HAVE_DEBUG_KMEMLEAK
-	select HAVE_DMA_API_DEBUG
 	select HAVE_DMA_CONTIGUOUS
 	select DMA_DIRECT_OPS
 	select HAVE_DYNAMIC_FTRACE
@@ -709,7 +705,11 @@ config QDIO
 menuconfig PCI
 	bool "PCI support"
 	select PCI_MSI
+	select IOMMU_HELPER
 	select IOMMU_SUPPORT
+	select NEED_DMA_MAP_STATE
+	select NEED_SG_DMA_LENGTH
+
 	help
 	  Enable PCI support.
 
@@ -733,15 +733,6 @@ config PCI_DOMAINS
 config HAS_IOMEM
 	def_bool PCI
 
-config IOMMU_HELPER
-	def_bool PCI
-
-config NEED_SG_DMA_LENGTH
-	def_bool PCI
-
-config NEED_DMA_MAP_STATE
-	def_bool PCI
-
 config CHSC_SCH
 	def_tristate m
 	prompt "Support for CHSC subchannels"
diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h
index 12fe359..94f8db4 100644
--- a/arch/s390/include/asm/pci.h
+++ b/arch/s390/include/asm/pci.h
@@ -2,8 +2,6 @@
 #ifndef __ASM_S390_PCI_H
 #define __ASM_S390_PCI_H
 
-/* must be set before including asm-generic/pci.h */
-#define PCI_DMA_BUS_IS_PHYS (0)
 /* must be set before including pci_clp.h */
 #define PCI_BAR_COUNT	6
 
diff --git a/arch/s390/kernel/sysinfo.c b/arch/s390/kernel/sysinfo.c
index fc7e04c..54f5496 100644
--- a/arch/s390/kernel/sysinfo.c
+++ b/arch/s390/kernel/sysinfo.c
@@ -294,21 +294,9 @@ static int sysinfo_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int sysinfo_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, sysinfo_show, NULL);
-}
-
-static const struct file_operations sysinfo_fops = {
-	.open		= sysinfo_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int __init sysinfo_create_proc(void)
 {
-	proc_create("sysinfo", 0444, NULL, &sysinfo_fops);
+	proc_create_single("sysinfo", 0444, NULL, sysinfo_show);
 	return 0;
 }
 device_initcall(sysinfo_create_proc);
@@ -386,18 +374,6 @@ static const struct seq_operations service_level_seq_ops = {
 	.show		= service_level_show
 };
 
-static int service_level_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &service_level_seq_ops);
-}
-
-static const struct file_operations service_level_ops = {
-	.open		= service_level_open,
-	.read		= seq_read,
-	.llseek 	= seq_lseek,
-	.release	= seq_release
-};
-
 static void service_level_vm_print(struct seq_file *m,
 				   struct service_level *slr)
 {
@@ -420,7 +396,7 @@ static struct service_level service_level_vm = {
 
 static __init int create_proc_service_level(void)
 {
-	proc_create("service_levels", 0, NULL, &service_level_ops);
+	proc_create_seq("service_levels", 0, NULL, &service_level_seq_ops);
 	if (MACHINE_IS_VM)
 		register_service_level(&service_level_vm);
 	return 0;
diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c
index a5297a2..8003b38 100644
--- a/arch/s390/kernel/traps.c
+++ b/arch/s390/kernel/traps.c
@@ -44,14 +44,8 @@ int is_valid_bugaddr(unsigned long addr)
 
 void do_report_trap(struct pt_regs *regs, int si_signo, int si_code, char *str)
 {
-	siginfo_t info;
-
 	if (user_mode(regs)) {
-		info.si_signo = si_signo;
-		info.si_errno = 0;
-		info.si_code = si_code;
-		info.si_addr = get_trap_ip(regs);
-		force_sig_info(si_signo, &info, current);
+		force_sig_fault(si_signo, si_code, get_trap_ip(regs), current);
 		report_user_fault(regs, si_signo, 0);
         } else {
                 const struct exception_table_entry *fixup;
@@ -80,18 +74,12 @@ NOKPROBE_SYMBOL(do_trap);
 
 void do_per_trap(struct pt_regs *regs)
 {
-	siginfo_t info;
-
 	if (notify_die(DIE_SSTEP, "sstep", regs, 0, 0, SIGTRAP) == NOTIFY_STOP)
 		return;
 	if (!current->ptrace)
 		return;
-	info.si_signo = SIGTRAP;
-	info.si_errno = 0;
-	info.si_code = TRAP_HWBKPT;
-	info.si_addr =
-		(void __force __user *) current->thread.per_event.address;
-	force_sig_info(SIGTRAP, &info, current);
+	force_sig_fault(SIGTRAP, TRAP_HWBKPT,
+		(void __force __user *) current->thread.per_event.address, current);
 }
 NOKPROBE_SYMBOL(do_per_trap);
 
@@ -165,7 +153,6 @@ void translation_exception(struct pt_regs *regs)
 
 void illegal_op(struct pt_regs *regs)
 {
-	siginfo_t info;
         __u8 opcode[6];
 	__u16 __user *location;
 	int is_uprobe_insn = 0;
@@ -177,13 +164,9 @@ void illegal_op(struct pt_regs *regs)
 		if (get_user(*((__u16 *) opcode), (__u16 __user *) location))
 			return;
 		if (*((__u16 *) opcode) == S390_BREAKPOINT_U16) {
-			if (current->ptrace) {
-				info.si_signo = SIGTRAP;
-				info.si_errno = 0;
-				info.si_code = TRAP_BRKPT;
-				info.si_addr = location;
-				force_sig_info(SIGTRAP, &info, current);
-			} else
+			if (current->ptrace)
+				force_sig_fault(SIGTRAP, TRAP_BRKPT, location, current);
+			else
 				signal = SIGILL;
 #ifdef CONFIG_UPROBES
 		} else if (*((__u16 *) opcode) == UPROBE_SWBP_INSN) {
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 93faeca..e074480 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -265,14 +265,10 @@ void report_user_fault(struct pt_regs *regs, long signr, int is_mm_fault)
  */
 static noinline void do_sigsegv(struct pt_regs *regs, int si_code)
 {
-	struct siginfo si;
-
 	report_user_fault(regs, SIGSEGV, 1);
-	si.si_signo = SIGSEGV;
-	si.si_errno = 0;
-	si.si_code = si_code;
-	si.si_addr = (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK);
-	force_sig_info(SIGSEGV, &si, current);
+	force_sig_fault(SIGSEGV, si_code,
+			(void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK),
+			current);
 }
 
 static noinline void do_no_context(struct pt_regs *regs)
@@ -316,18 +312,13 @@ static noinline void do_low_address(struct pt_regs *regs)
 
 static noinline void do_sigbus(struct pt_regs *regs)
 {
-	struct task_struct *tsk = current;
-	struct siginfo si;
-
 	/*
 	 * Send a sigbus, regardless of whether we were in kernel
 	 * or user mode.
 	 */
-	si.si_signo = SIGBUS;
-	si.si_errno = 0;
-	si.si_code = BUS_ADRERR;
-	si.si_addr = (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK);
-	force_sig_info(SIGBUS, &si, tsk);
+	force_sig_fault(SIGBUS, BUS_ADRERR,
+			(void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK),
+			current);
 }
 
 static noinline int signal_return(struct pt_regs *regs)
diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c
index 2d15d84..d387a0f 100644
--- a/arch/s390/pci/pci_dma.c
+++ b/arch/s390/pci/pci_dma.c
@@ -668,15 +668,6 @@ void zpci_dma_exit(void)
 	kmem_cache_destroy(dma_region_table_cache);
 }
 
-#define PREALLOC_DMA_DEBUG_ENTRIES	(1 << 16)
-
-static int __init dma_debug_do_init(void)
-{
-	dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
-	return 0;
-}
-fs_initcall(dma_debug_do_init);
-
 const struct dma_map_ops s390_pci_dma_ops = {
 	.alloc		= s390_dma_alloc,
 	.free		= s390_dma_free,
@@ -685,8 +676,6 @@ const struct dma_map_ops s390_pci_dma_ops = {
 	.map_page	= s390_dma_map_pages,
 	.unmap_page	= s390_dma_unmap_pages,
 	.mapping_error	= s390_mapping_error,
-	/* if we support direct DMA this must be conditional */
-	.is_phys	= 0,
 	/* dma_supported is unconditionally true without a callback */
 };
 EXPORT_SYMBOL_GPL(s390_pci_dma_ops);
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 1851eae..a97538b 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -14,7 +14,6 @@ config SUPERH
 	select HAVE_OPROFILE
 	select HAVE_GENERIC_DMA_COHERENT
 	select HAVE_ARCH_TRACEHOOK
-	select HAVE_DMA_API_DEBUG
 	select HAVE_PERF_EVENTS
 	select HAVE_DEBUG_BUGVERBOSE
 	select ARCH_HAVE_CUSTOM_GPIO_H
@@ -51,6 +50,9 @@ config SUPERH
 	select HAVE_ARCH_AUDITSYSCALL
 	select HAVE_FUTEX_CMPXCHG if FUTEX
 	select HAVE_NMI
+	select NEED_DMA_MAP_STATE
+	select NEED_SG_DMA_LENGTH
+
 	help
 	  The SuperH is a RISC processor targeted for use in embedded systems
 	  and consumer electronics; it was also used in the Sega Dreamcast
@@ -161,12 +163,6 @@ config DMA_COHERENT
 config DMA_NONCOHERENT
 	def_bool !DMA_COHERENT
 
-config NEED_DMA_MAP_STATE
-	def_bool DMA_NONCOHERENT
-
-config NEED_SG_DMA_LENGTH
-	def_bool y
-
 config PGTABLE_LEVELS
 	default 3 if X2TLB
 	default 2
diff --git a/arch/sh/drivers/dma/dma-api.c b/arch/sh/drivers/dma/dma-api.c
index c0eec08..b05be59 100644
--- a/arch/sh/drivers/dma/dma-api.c
+++ b/arch/sh/drivers/dma/dma-api.c
@@ -339,18 +339,6 @@ static int dma_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int dma_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, dma_proc_show, NULL);
-}
-
-static const struct file_operations dma_proc_fops = {
-	.open		= dma_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 int register_dmac(struct dma_info *info)
 {
 	unsigned int total_channels, i;
@@ -423,7 +411,7 @@ EXPORT_SYMBOL(unregister_dmac);
 static int __init dma_api_init(void)
 {
 	printk(KERN_NOTICE "DMA: Registering DMA API.\n");
-	return proc_create("dma", 0, NULL, &dma_proc_fops) ? 0 : -ENOMEM;
+	return proc_create_single("dma", 0, NULL, dma_proc_show) ? 0 : -ENOMEM;
 }
 subsys_initcall(dma_api_init);
 
diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h
index 0033f0d..10a36b1 100644
--- a/arch/sh/include/asm/pci.h
+++ b/arch/sh/include/asm/pci.h
@@ -71,12 +71,6 @@ extern unsigned long PCIBIOS_MIN_IO, PCIBIOS_MIN_MEM;
  * SuperH has everything mapped statically like x86.
  */
 
-/* The PCI address space does equal the physical memory
- * address space.  The networking and block device layers use
- * this boolean for bounce buffer decisions.
- */
-#define PCI_DMA_BUS_IS_PHYS	(dma_ops->is_phys)
-
 #ifdef CONFIG_PCI
 /*
  * None of the SH PCI controllers support MWI, it is always treated as a
diff --git a/arch/sh/kernel/dma-nommu.c b/arch/sh/kernel/dma-nommu.c
index 178457d..3e3a32f 100644
--- a/arch/sh/kernel/dma-nommu.c
+++ b/arch/sh/kernel/dma-nommu.c
@@ -78,7 +78,6 @@ const struct dma_map_ops nommu_dma_ops = {
 	.sync_single_for_device	= nommu_sync_single_for_device,
 	.sync_sg_for_device	= nommu_sync_sg_for_device,
 #endif
-	.is_phys		= 1,
 };
 
 void __init no_iommu_init(void)
diff --git a/arch/sh/kernel/hw_breakpoint.c b/arch/sh/kernel/hw_breakpoint.c
index afe9657..8648ed0 100644
--- a/arch/sh/kernel/hw_breakpoint.c
+++ b/arch/sh/kernel/hw_breakpoint.c
@@ -347,13 +347,8 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args)
 
 		/* Deliver the signal to userspace */
 		if (!arch_check_bp_in_kernelspace(bp)) {
-			siginfo_t info;
-
-			info.si_signo = args->signr;
-			info.si_errno = notifier_to_errno(rc);
-			info.si_code = TRAP_HWBKPT;
-
-			force_sig_info(args->signr, &info, current);
+			force_sig_fault(SIGTRAP, TRAP_HWBKPT,
+					(void __user *)NULL, current);
 		}
 
 		rcu_read_unlock();
diff --git a/arch/sh/kernel/traps_32.c b/arch/sh/kernel/traps_32.c
index b3770bb..60709ad 100644
--- a/arch/sh/kernel/traps_32.c
+++ b/arch/sh/kernel/traps_32.c
@@ -477,7 +477,6 @@ asmlinkage void do_address_error(struct pt_regs *regs,
 {
 	unsigned long error_code = 0;
 	mm_segment_t oldfs;
-	siginfo_t info;
 	insn_size_t instruction;
 	int tmp;
 
@@ -537,11 +536,7 @@ uspace_segv:
 		       "access (PC %lx PR %lx)\n", current->comm, regs->pc,
 		       regs->pr);
 
-		info.si_signo = SIGBUS;
-		info.si_errno = 0;
-		info.si_code = si_code;
-		info.si_addr = (void __user *)address;
-		force_sig_info(SIGBUS, &info, current);
+		force_sig_fault(SIGBUS, si_code, (void __user *)address, current);
 	} else {
 		inc_unaligned_kernel_access();
 
@@ -598,19 +593,20 @@ int is_dsp_inst(struct pt_regs *regs)
 #ifdef CONFIG_CPU_SH2A
 asmlinkage void do_divide_error(unsigned long r4)
 {
-	siginfo_t info;
+	int code;
 
 	switch (r4) {
 	case TRAP_DIVZERO_ERROR:
-		info.si_code = FPE_INTDIV;
+		code = FPE_INTDIV;
 		break;
 	case TRAP_DIVOVF_ERROR:
-		info.si_code = FPE_INTOVF;
+		code = FPE_INTOVF;
 		break;
+	default:
+		/* Let gcc know unhandled cases don't make it past here */
+		return;
 	}
-
-	info.si_signo = SIGFPE;
-	force_sig_info(info.si_signo, &info, current);
+	force_sig_fault(SIGFPE, code, NULL, current);
 }
 #endif
 
diff --git a/arch/sh/math-emu/math.c b/arch/sh/math-emu/math.c
index c86f436..a0fa8fc 100644
--- a/arch/sh/math-emu/math.c
+++ b/arch/sh/math-emu/math.c
@@ -507,7 +507,6 @@ static int ieee_fpe_handler(struct pt_regs *regs)
 	unsigned short insn = *(unsigned short *)regs->pc;
 	unsigned short finsn;
 	unsigned long nextpc;
-	siginfo_t info;
 	int nib[4] = {
 		(insn >> 12) & 0xf,
 		(insn >> 8) & 0xf,
@@ -560,11 +559,8 @@ static int ieee_fpe_handler(struct pt_regs *regs)
 				~(FPSCR_CAUSE_MASK | FPSCR_FLAG_MASK);
 			task_thread_info(tsk)->status |= TS_USEDFPU;
 		} else {
-			info.si_signo = SIGFPE;
-			info.si_errno = 0;
-			info.si_code = FPE_FLTINV;
-			info.si_addr = (void __user *)regs->pc;
-			force_sig_info(SIGFPE, &info, tsk);
+			force_sig_fault(SIGFPE, FPE_FLTINV,
+					(void __user *)regs->pc, tsk);
 		}
 
 		regs->pc = nextpc;
diff --git a/arch/sh/mm/consistent.c b/arch/sh/mm/consistent.c
index f1b4469..fceb2ad 100644
--- a/arch/sh/mm/consistent.c
+++ b/arch/sh/mm/consistent.c
@@ -20,18 +20,9 @@
 #include <asm/cacheflush.h>
 #include <asm/addrspace.h>
 
-#define PREALLOC_DMA_DEBUG_ENTRIES	4096
-
 const struct dma_map_ops *dma_ops;
 EXPORT_SYMBOL(dma_ops);
 
-static int __init dma_init(void)
-{
-	dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
-	return 0;
-}
-fs_initcall(dma_init);
-
 void *dma_generic_alloc_coherent(struct device *dev, size_t size,
 				 dma_addr_t *dma_handle, gfp_t gfp,
 				 unsigned long attrs)
diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c
index 6fd1bf7..b8e7bb8 100644
--- a/arch/sh/mm/fault.c
+++ b/arch/sh/mm/fault.c
@@ -42,14 +42,7 @@ static void
 force_sig_info_fault(int si_signo, int si_code, unsigned long address,
 		     struct task_struct *tsk)
 {
-	siginfo_t info;
-
-	info.si_signo	= si_signo;
-	info.si_errno	= 0;
-	info.si_code	= si_code;
-	info.si_addr	= (void __user *)address;
-
-	force_sig_info(si_signo, &info, tsk);
+	force_sig_fault(si_signo, si_code, (void __user *)address, tsk);
 }
 
 /*
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 8767e45..435dbc0 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -25,7 +25,6 @@ config SPARC
 	select RTC_CLASS
 	select RTC_DRV_M48T59
 	select RTC_SYSTOHC
-	select HAVE_DMA_API_DEBUG
 	select HAVE_ARCH_JUMP_LABEL if SPARC64
 	select GENERIC_IRQ_SHOW
 	select ARCH_WANT_IPC_PARSE_VERSION
@@ -44,6 +43,8 @@ config SPARC
 	select ARCH_HAS_SG_CHAIN
 	select CPU_NO_EFFICIENT_FFS
 	select LOCKDEP_SMALL if LOCKDEP
+	select NEED_DMA_MAP_STATE
+	select NEED_SG_DMA_LENGTH
 
 config SPARC32
 	def_bool !64BIT
@@ -67,6 +68,7 @@ config SPARC64
 	select HAVE_SYSCALL_TRACEPOINTS
 	select HAVE_CONTEXT_TRACKING
 	select HAVE_DEBUG_KMEMLEAK
+	select IOMMU_HELPER
 	select SPARSE_IRQ
 	select RTC_DRV_CMOS
 	select RTC_DRV_BQ4802
@@ -102,14 +104,6 @@ config ARCH_ATU
 	bool
 	default y if SPARC64
 
-config ARCH_DMA_ADDR_T_64BIT
-	bool
-	default y if ARCH_ATU
-
-config IOMMU_HELPER
-	bool
-	default y if SPARC64
-
 config STACKTRACE_SUPPORT
 	bool
 	default y if SPARC64
@@ -146,12 +140,6 @@ config ZONE_DMA
 	bool
 	default y if SPARC32
 
-config NEED_DMA_MAP_STATE
-	def_bool y
-
-config NEED_SG_DMA_LENGTH
-	def_bool y
-
 config GENERIC_ISA_DMA
 	bool
 	default y if SPARC32
diff --git a/include/linux/iommu-common.h b/arch/sparc/include/asm/iommu-common.h
index 802c90c..802c90c 100644
--- a/include/linux/iommu-common.h
+++ b/arch/sparc/include/asm/iommu-common.h
diff --git a/arch/sparc/include/asm/iommu_64.h b/arch/sparc/include/asm/iommu_64.h
index 9ed6b54..0ef6ded 100644
--- a/arch/sparc/include/asm/iommu_64.h
+++ b/arch/sparc/include/asm/iommu_64.h
@@ -17,7 +17,7 @@
 #define IOPTE_WRITE   0x0000000000000002UL
 
 #define IOMMU_NUM_CTXS	4096
-#include <linux/iommu-common.h>
+#include <asm/iommu-common.h>
 
 struct iommu_arena {
 	unsigned long	*map;
diff --git a/arch/sparc/include/asm/pci_32.h b/arch/sparc/include/asm/pci_32.h
index 98917e4..cfc0ee9 100644
--- a/arch/sparc/include/asm/pci_32.h
+++ b/arch/sparc/include/asm/pci_32.h
@@ -17,10 +17,6 @@
 
 #define PCI_IRQ_NONE		0xffffffff
 
-/* Dynamic DMA mapping stuff.
- */
-#define PCI_DMA_BUS_IS_PHYS	(0)
-
 #endif /* __KERNEL__ */
 
 #ifndef CONFIG_LEON_PCI
diff --git a/arch/sparc/include/asm/pci_64.h b/arch/sparc/include/asm/pci_64.h
index 671274e..fac7781 100644
--- a/arch/sparc/include/asm/pci_64.h
+++ b/arch/sparc/include/asm/pci_64.h
@@ -17,12 +17,6 @@
 
 #define PCI_IRQ_NONE		0xffffffff
 
-/* The PCI address space does not equal the physical memory
- * address space.  The networking and block device layers use
- * this boolean for bounce buffer decisions.
- */
-#define PCI_DMA_BUS_IS_PHYS	(0)
-
 /* PCI IOMMU mapping bypass support. */
 
 /* PCI 64-bit addressing works for all slots on all controller
diff --git a/arch/sparc/include/uapi/asm/jsflash.h b/arch/sparc/include/uapi/asm/jsflash.h
deleted file mode 100644
index 68c98a5..0000000
--- a/arch/sparc/include/uapi/asm/jsflash.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * jsflash.h: OS Flash SIMM support for JavaStations.
- *
- * Copyright (C) 1999  Pete Zaitcev
- */
-
-#ifndef _SPARC_JSFLASH_H
-#define _SPARC_JSFLASH_H
-
-#ifndef _SPARC_TYPES_H
-#include <linux/types.h>
-#endif
-
-/*
- * Semantics of the offset is a full address.
- * Hardcode it or get it from probe ioctl.
- *
- * We use full bus address, so that we would be
- * automatically compatible with possible future systems.
- */
-
-#define JSFLASH_IDENT   (('F'<<8)|54)
-struct jsflash_ident_arg {
-	__u64 off;                /* 0x20000000 is included */
-	__u32 size;
-	char name[32];		/* With trailing zero */
-};
-
-#define JSFLASH_ERASE   (('F'<<8)|55)
-/* Put 0 as argument, may be flags or sector number... */
-
-#define JSFLASH_PROGRAM (('F'<<8)|56)
-struct jsflash_program_arg {
-	__u64 data;		/* char* for sparc and sparc64 */
-	__u64 off;
-	__u32 size;
-};
-
-#endif /* _SPARC_JSFLASH_H */
diff --git a/arch/sparc/include/uapi/asm/siginfo.h b/arch/sparc/include/uapi/asm/siginfo.h
index 896ce44..e704955 100644
--- a/arch/sparc/include/uapi/asm/siginfo.h
+++ b/arch/sparc/include/uapi/asm/siginfo.h
@@ -18,13 +18,6 @@
 #define SI_NOINFO	32767		/* no information in siginfo_t */
 
 /*
- * SIGFPE si_codes
- */
-#ifdef __KERNEL__
-#define FPE_FIXME	0	/* Broken dup of SI_USER */
-#endif /* __KERNEL__ */
-
-/*
  * SIGEMT si_codes
  */
 #define EMT_TAGOVF	1	/* tag overflow */
diff --git a/arch/sparc/kernel/Makefile b/arch/sparc/kernel/Makefile
index 76cb577..cf86408 100644
--- a/arch/sparc/kernel/Makefile
+++ b/arch/sparc/kernel/Makefile
@@ -59,7 +59,7 @@ obj-$(CONFIG_SPARC32)   += leon_pmc.o
 
 obj-$(CONFIG_SPARC64)   += reboot.o
 obj-$(CONFIG_SPARC64)   += sysfs.o
-obj-$(CONFIG_SPARC64)   += iommu.o
+obj-$(CONFIG_SPARC64)   += iommu.o iommu-common.o
 obj-$(CONFIG_SPARC64)   += central.o
 obj-$(CONFIG_SPARC64)   += starfire.o
 obj-$(CONFIG_SPARC64)   += power.o
@@ -74,8 +74,6 @@ obj-$(CONFIG_SPARC64)	+= pcr.o
 obj-$(CONFIG_SPARC64)	+= nmi.o
 obj-$(CONFIG_SPARC64_SMP) += cpumap.o
 
-obj-y                     += dma.o
-
 obj-$(CONFIG_PCIC_PCI)    += pcic.o
 obj-$(CONFIG_LEON_PCI)    += leon_pci.o
 obj-$(CONFIG_SPARC_GRPCI2)+= leon_pci_grpci2.o
diff --git a/arch/sparc/kernel/dma.c b/arch/sparc/kernel/dma.c
deleted file mode 100644
index f73e759..0000000
--- a/arch/sparc/kernel/dma.c
+++ /dev/null
@@ -1,13 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/kernel.h>
-#include <linux/dma-mapping.h>
-#include <linux/dma-debug.h>
-
-#define PREALLOC_DMA_DEBUG_ENTRIES       (1 << 15)
-
-static int __init dma_init(void)
-{
-	dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
-	return 0;
-}
-fs_initcall(dma_init);
diff --git a/lib/iommu-common.c b/arch/sparc/kernel/iommu-common.c
index 55b00de..59cb166 100644
--- a/lib/iommu-common.c
+++ b/arch/sparc/kernel/iommu-common.c
@@ -8,9 +8,9 @@
 #include <linux/bitmap.h>
 #include <linux/bug.h>
 #include <linux/iommu-helper.h>
-#include <linux/iommu-common.h>
 #include <linux/dma-mapping.h>
 #include <linux/hash.h>
+#include <asm/iommu-common.h>
 
 static unsigned long iommu_large_alloc = 15;
 
@@ -93,7 +93,6 @@ void iommu_tbl_pool_init(struct iommu_map_table *iommu,
 	p->hint = p->start;
 	p->end = num_entries;
 }
-EXPORT_SYMBOL(iommu_tbl_pool_init);
 
 unsigned long iommu_tbl_range_alloc(struct device *dev,
 				struct iommu_map_table *iommu,
@@ -224,7 +223,6 @@ bail:
 
 	return n;
 }
-EXPORT_SYMBOL(iommu_tbl_range_alloc);
 
 static struct iommu_pool *get_pool(struct iommu_map_table *tbl,
 				   unsigned long entry)
@@ -264,4 +262,3 @@ void iommu_tbl_range_free(struct iommu_map_table *iommu, u64 dma_addr,
 	bitmap_clear(iommu->map, entry, npages);
 	spin_unlock_irqrestore(&(pool->lock), flags);
 }
-EXPORT_SYMBOL(iommu_tbl_range_free);
diff --git a/arch/sparc/kernel/iommu.c b/arch/sparc/kernel/iommu.c
index b08dc34..40d008b 100644
--- a/arch/sparc/kernel/iommu.c
+++ b/arch/sparc/kernel/iommu.c
@@ -14,7 +14,7 @@
 #include <linux/errno.h>
 #include <linux/iommu-helper.h>
 #include <linux/bitmap.h>
-#include <linux/iommu-common.h>
+#include <asm/iommu-common.h>
 
 #ifdef CONFIG_PCI
 #include <linux/pci.h>
diff --git a/arch/sparc/kernel/ioport.c b/arch/sparc/kernel/ioport.c
index 3bcef9c..cca9134 100644
--- a/arch/sparc/kernel/ioport.c
+++ b/arch/sparc/kernel/ioport.c
@@ -678,25 +678,14 @@ static int sparc_io_proc_show(struct seq_file *m, void *v)
 
 	return 0;
 }
-
-static int sparc_io_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, sparc_io_proc_show, PDE_DATA(inode));
-}
-
-static const struct file_operations sparc_io_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= sparc_io_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
 #endif /* CONFIG_PROC_FS */
 
 static void register_proc_sparc_ioport(void)
 {
 #ifdef CONFIG_PROC_FS
-	proc_create_data("io_map", 0, NULL, &sparc_io_proc_fops, &sparc_iomap);
-	proc_create_data("dvma_map", 0, NULL, &sparc_io_proc_fops, &_sparc_dvma);
+	proc_create_single_data("io_map", 0, NULL, sparc_io_proc_show,
+			&sparc_iomap);
+	proc_create_single_data("dvma_map", 0, NULL, sparc_io_proc_show,
+			&_sparc_dvma);
 #endif
 }
diff --git a/arch/sparc/kernel/ldc.c b/arch/sparc/kernel/ldc.c
index 86b625f..c0fa3ef 100644
--- a/arch/sparc/kernel/ldc.c
+++ b/arch/sparc/kernel/ldc.c
@@ -16,7 +16,7 @@
 #include <linux/list.h>
 #include <linux/init.h>
 #include <linux/bitmap.h>
-#include <linux/iommu-common.h>
+#include <asm/iommu-common.h>
 
 #include <asm/hypervisor.h>
 #include <asm/iommu.h>
diff --git a/arch/sparc/kernel/pci_sun4v.c b/arch/sparc/kernel/pci_sun4v.c
index 2493672..565d9ac 100644
--- a/arch/sparc/kernel/pci_sun4v.c
+++ b/arch/sparc/kernel/pci_sun4v.c
@@ -16,7 +16,7 @@
 #include <linux/export.h>
 #include <linux/log2.h>
 #include <linux/of_device.h>
-#include <linux/iommu-common.h>
+#include <asm/iommu-common.h>
 
 #include <asm/iommu.h>
 #include <asm/irq.h>
diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c
index 454a8af..6c08608 100644
--- a/arch/sparc/kernel/process_64.c
+++ b/arch/sparc/kernel/process_64.c
@@ -518,14 +518,7 @@ void synchronize_user_stack(void)
 
 static void stack_unaligned(unsigned long sp)
 {
-	siginfo_t info;
-
-	info.si_signo = SIGBUS;
-	info.si_errno = 0;
-	info.si_code = BUS_ADRALN;
-	info.si_addr = (void __user *) sp;
-	info.si_trapno = 0;
-	force_sig_info(SIGBUS, &info, current);
+	force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *) sp, 0, current);
 }
 
 void fault_in_user_windows(void)
diff --git a/arch/sparc/kernel/sys_sparc_32.c b/arch/sparc/kernel/sys_sparc_32.c
index e8c3cb6..7f3d9c5 100644
--- a/arch/sparc/kernel/sys_sparc_32.c
+++ b/arch/sparc/kernel/sys_sparc_32.c
@@ -147,17 +147,11 @@ SYSCALL_DEFINE0(nis_syscall)
 asmlinkage void
 sparc_breakpoint (struct pt_regs *regs)
 {
-	siginfo_t info;
 
 #ifdef DEBUG_SPARC_BREAKPOINT
         printk ("TRAP: Entering kernel PC=%x, nPC=%x\n", regs->pc, regs->npc);
 #endif
-	info.si_signo = SIGTRAP;
-	info.si_errno = 0;
-	info.si_code = TRAP_BRKPT;
-	info.si_addr = (void __user *)regs->pc;
-	info.si_trapno = 0;
-	force_sig_info(SIGTRAP, &info, current);
+	force_sig_fault(SIGTRAP, TRAP_BRKPT, (void __user *)regs->pc, 0, current);
 
 #ifdef DEBUG_SPARC_BREAKPOINT
 	printk ("TRAP: Returning to space: PC=%x nPC=%x\n", regs->pc, regs->npc);
diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c
index 9ef8de6..7e49bbc9 100644
--- a/arch/sparc/kernel/sys_sparc_64.c
+++ b/arch/sparc/kernel/sys_sparc_64.c
@@ -502,7 +502,6 @@ SYSCALL_DEFINE0(nis_syscall)
 asmlinkage void sparc_breakpoint(struct pt_regs *regs)
 {
 	enum ctx_state prev_state = exception_enter();
-	siginfo_t info;
 
 	if (test_thread_flag(TIF_32BIT)) {
 		regs->tpc &= 0xffffffff;
@@ -511,12 +510,7 @@ asmlinkage void sparc_breakpoint(struct pt_regs *regs)
 #ifdef DEBUG_SPARC_BREAKPOINT
         printk ("TRAP: Entering kernel PC=%lx, nPC=%lx\n", regs->tpc, regs->tnpc);
 #endif
-	info.si_signo = SIGTRAP;
-	info.si_errno = 0;
-	info.si_code = TRAP_BRKPT;
-	info.si_addr = (void __user *)regs->tpc;
-	info.si_trapno = 0;
-	force_sig_info(SIGTRAP, &info, current);
+	force_sig_fault(SIGTRAP, TRAP_BRKPT, (void __user *)regs->tpc, 0, current);
 #ifdef DEBUG_SPARC_BREAKPOINT
 	printk ("TRAP: Returning to space: PC=%lx nPC=%lx\n", regs->tpc, regs->tnpc);
 #endif
diff --git a/arch/sparc/kernel/traps_32.c b/arch/sparc/kernel/traps_32.c
index b1ed763..bcdfc61 100644
--- a/arch/sparc/kernel/traps_32.c
+++ b/arch/sparc/kernel/traps_32.c
@@ -93,8 +93,6 @@ void __noreturn die_if_kernel(char *str, struct pt_regs *regs)
 
 void do_hw_interrupt(struct pt_regs *regs, unsigned long type)
 {
-	siginfo_t info;
-
 	if(type < 0x80) {
 		/* Sun OS's puke from bad traps, Linux survives! */
 		printk("Unimplemented Sparc TRAP, type = %02lx\n", type);
@@ -104,19 +102,13 @@ void do_hw_interrupt(struct pt_regs *regs, unsigned long type)
 	if(regs->psr & PSR_PS)
 		die_if_kernel("Kernel bad trap", regs);
 
-	info.si_signo = SIGILL;
-	info.si_errno = 0;
-	info.si_code = ILL_ILLTRP;
-	info.si_addr = (void __user *)regs->pc;
-	info.si_trapno = type - 0x80;
-	force_sig_info(SIGILL, &info, current);
+	force_sig_fault(SIGILL, ILL_ILLTRP,
+			(void __user *)regs->pc, type - 0x80, current);
 }
 
 void do_illegal_instruction(struct pt_regs *regs, unsigned long pc, unsigned long npc,
 			    unsigned long psr)
 {
-	siginfo_t info;
-
 	if(psr & PSR_PS)
 		die_if_kernel("Kernel illegal instruction", regs);
 #ifdef TRAP_DEBUG
@@ -124,27 +116,15 @@ void do_illegal_instruction(struct pt_regs *regs, unsigned long pc, unsigned lon
 	       regs->pc, *(unsigned long *)regs->pc);
 #endif
 
-	info.si_signo = SIGILL;
-	info.si_errno = 0;
-	info.si_code = ILL_ILLOPC;
-	info.si_addr = (void __user *)pc;
-	info.si_trapno = 0;
-	send_sig_info(SIGILL, &info, current);
+	send_sig_fault(SIGILL, ILL_ILLOPC, (void __user *)pc, 0, current);
 }
 
 void do_priv_instruction(struct pt_regs *regs, unsigned long pc, unsigned long npc,
 			 unsigned long psr)
 {
-	siginfo_t info;
-
 	if(psr & PSR_PS)
 		die_if_kernel("Penguin instruction from Penguin mode??!?!", regs);
-	info.si_signo = SIGILL;
-	info.si_errno = 0;
-	info.si_code = ILL_PRVOPC;
-	info.si_addr = (void __user *)pc;
-	info.si_trapno = 0;
-	send_sig_info(SIGILL, &info, current);
+	send_sig_fault(SIGILL, ILL_PRVOPC, (void __user *)pc, 0, current);
 }
 
 /* XXX User may want to be allowed to do this. XXX */
@@ -152,8 +132,6 @@ void do_priv_instruction(struct pt_regs *regs, unsigned long pc, unsigned long n
 void do_memaccess_unaligned(struct pt_regs *regs, unsigned long pc, unsigned long npc,
 			    unsigned long psr)
 {
-	siginfo_t info;
-
 	if(regs->psr & PSR_PS) {
 		printk("KERNEL MNA at pc %08lx npc %08lx called by %08lx\n", pc, npc,
 		       regs->u_regs[UREG_RETPC]);
@@ -165,12 +143,9 @@ void do_memaccess_unaligned(struct pt_regs *regs, unsigned long pc, unsigned lon
 	instruction_dump ((unsigned long *) regs->pc);
 	printk ("do_MNA!\n");
 #endif
-	info.si_signo = SIGBUS;
-	info.si_errno = 0;
-	info.si_code = BUS_ADRALN;
-	info.si_addr = /* FIXME: Should dig out mna address */ (void *)0;
-	info.si_trapno = 0;
-	send_sig_info(SIGBUS, &info, current);
+	send_sig_fault(SIGBUS, BUS_ADRALN,
+		       /* FIXME: Should dig out mna address */ (void *)0,
+		       0, current);
 }
 
 static unsigned long init_fsr = 0x0UL;
@@ -226,9 +201,9 @@ void do_fpe_trap(struct pt_regs *regs, unsigned long pc, unsigned long npc,
 		 unsigned long psr)
 {
 	static int calls;
-	siginfo_t info;
 	unsigned long fsr;
 	int ret = 0;
+	int code;
 #ifndef CONFIG_SMP
 	struct task_struct *fpt = last_task_used_math;
 #else
@@ -303,24 +278,20 @@ void do_fpe_trap(struct pt_regs *regs, unsigned long pc, unsigned long npc,
 	}
 
 	fsr = fpt->thread.fsr;
-	info.si_signo = SIGFPE;
-	info.si_errno = 0;
-	info.si_addr = (void __user *)pc;
-	info.si_trapno = 0;
-	info.si_code = FPE_FIXME;
+	code = FPE_FLTUNK;
 	if ((fsr & 0x1c000) == (1 << 14)) {
 		if (fsr & 0x10)
-			info.si_code = FPE_FLTINV;
+			code = FPE_FLTINV;
 		else if (fsr & 0x08)
-			info.si_code = FPE_FLTOVF;
+			code = FPE_FLTOVF;
 		else if (fsr & 0x04)
-			info.si_code = FPE_FLTUND;
+			code = FPE_FLTUND;
 		else if (fsr & 0x02)
-			info.si_code = FPE_FLTDIV;
+			code = FPE_FLTDIV;
 		else if (fsr & 0x01)
-			info.si_code = FPE_FLTRES;
+			code = FPE_FLTRES;
 	}
-	send_sig_info(SIGFPE, &info, fpt);
+	send_sig_fault(SIGFPE, code, (void __user *)pc, 0, fpt);
 #ifndef CONFIG_SMP
 	last_task_used_math = NULL;
 #endif
@@ -332,16 +303,9 @@ void do_fpe_trap(struct pt_regs *regs, unsigned long pc, unsigned long npc,
 void handle_tag_overflow(struct pt_regs *regs, unsigned long pc, unsigned long npc,
 			 unsigned long psr)
 {
-	siginfo_t info;
-
 	if(psr & PSR_PS)
 		die_if_kernel("Penguin overflow trap from kernel mode", regs);
-	info.si_signo = SIGEMT;
-	info.si_errno = 0;
-	info.si_code = EMT_TAGOVF;
-	info.si_addr = (void __user *)pc;
-	info.si_trapno = 0;
-	send_sig_info(SIGEMT, &info, current);
+	send_sig_fault(SIGEMT, EMT_TAGOVF, (void __user *)pc, 0, current);
 }
 
 void handle_watchpoint(struct pt_regs *regs, unsigned long pc, unsigned long npc,
@@ -359,61 +323,33 @@ void handle_watchpoint(struct pt_regs *regs, unsigned long pc, unsigned long npc
 void handle_reg_access(struct pt_regs *regs, unsigned long pc, unsigned long npc,
 		       unsigned long psr)
 {
-	siginfo_t info;
-
 #ifdef TRAP_DEBUG
 	printk("Register Access Exception at PC %08lx NPC %08lx PSR %08lx\n",
 	       pc, npc, psr);
 #endif
-	info.si_signo = SIGBUS;
-	info.si_errno = 0;
-	info.si_code = BUS_OBJERR;
-	info.si_addr = (void __user *)pc;
-	info.si_trapno = 0;
-	force_sig_info(SIGBUS, &info, current);
+	force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)pc, 0, current);
 }
 
 void handle_cp_disabled(struct pt_regs *regs, unsigned long pc, unsigned long npc,
 			unsigned long psr)
 {
-	siginfo_t info;
-
-	info.si_signo = SIGILL;
-	info.si_errno = 0;
-	info.si_code = ILL_COPROC;
-	info.si_addr = (void __user *)pc;
-	info.si_trapno = 0;
-	send_sig_info(SIGILL, &info, current);
+	send_sig_fault(SIGILL, ILL_COPROC, (void __user *)pc, 0, current);
 }
 
 void handle_cp_exception(struct pt_regs *regs, unsigned long pc, unsigned long npc,
 			 unsigned long psr)
 {
-	siginfo_t info;
-
 #ifdef TRAP_DEBUG
 	printk("Co-Processor Exception at PC %08lx NPC %08lx PSR %08lx\n",
 	       pc, npc, psr);
 #endif
-	info.si_signo = SIGILL;
-	info.si_errno = 0;
-	info.si_code = ILL_COPROC;
-	info.si_addr = (void __user *)pc;
-	info.si_trapno = 0;
-	send_sig_info(SIGILL, &info, current);
+	send_sig_fault(SIGILL, ILL_COPROC, (void __user *)pc, 0, current);
 }
 
 void handle_hw_divzero(struct pt_regs *regs, unsigned long pc, unsigned long npc,
 		       unsigned long psr)
 {
-	siginfo_t info;
-
-	info.si_signo = SIGFPE;
-	info.si_errno = 0;
-	info.si_code = FPE_INTDIV;
-	info.si_addr = (void __user *)pc;
-	info.si_trapno = 0;
-	send_sig_info(SIGFPE, &info, current);
+	send_sig_fault(SIGFPE, FPE_INTDIV, (void __user *)pc, 0, current);
 }
 
 #ifdef CONFIG_DEBUG_BUGVERBOSE
diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c
index 462a21a..aa624ed 100644
--- a/arch/sparc/kernel/traps_64.c
+++ b/arch/sparc/kernel/traps_64.c
@@ -87,7 +87,6 @@ static void dump_tl1_traplog(struct tl1_traplog *p)
 void bad_trap(struct pt_regs *regs, long lvl)
 {
 	char buffer[36];
-	siginfo_t info;
 
 	if (notify_die(DIE_TRAP, "bad trap", regs,
 		       0, lvl, SIGTRAP) == NOTIFY_STOP)
@@ -107,12 +106,8 @@ void bad_trap(struct pt_regs *regs, long lvl)
 		regs->tpc &= 0xffffffff;
 		regs->tnpc &= 0xffffffff;
 	}
-	info.si_signo = SIGILL;
-	info.si_errno = 0;
-	info.si_code = ILL_ILLTRP;
-	info.si_addr = (void __user *)regs->tpc;
-	info.si_trapno = lvl;
-	force_sig_info(SIGILL, &info, current);
+	force_sig_fault(SIGILL, ILL_ILLTRP,
+			(void __user *)regs->tpc, lvl, current);
 }
 
 void bad_trap_tl1(struct pt_regs *regs, long lvl)
@@ -191,7 +186,6 @@ EXPORT_SYMBOL_GPL(unregister_dimm_printer);
 void spitfire_insn_access_exception(struct pt_regs *regs, unsigned long sfsr, unsigned long sfar)
 {
 	enum ctx_state prev_state = exception_enter();
-	siginfo_t info;
 
 	if (notify_die(DIE_TRAP, "instruction access exception", regs,
 		       0, 0x8, SIGTRAP) == NOTIFY_STOP)
@@ -206,12 +200,8 @@ void spitfire_insn_access_exception(struct pt_regs *regs, unsigned long sfsr, un
 		regs->tpc &= 0xffffffff;
 		regs->tnpc &= 0xffffffff;
 	}
-	info.si_signo = SIGSEGV;
-	info.si_errno = 0;
-	info.si_code = SEGV_MAPERR;
-	info.si_addr = (void __user *)regs->tpc;
-	info.si_trapno = 0;
-	force_sig_info(SIGSEGV, &info, current);
+	force_sig_fault(SIGSEGV, SEGV_MAPERR,
+			(void __user *)regs->tpc, 0, current);
 out:
 	exception_exit(prev_state);
 }
@@ -230,7 +220,6 @@ void sun4v_insn_access_exception(struct pt_regs *regs, unsigned long addr, unsig
 {
 	unsigned short type = (type_ctx >> 16);
 	unsigned short ctx  = (type_ctx & 0xffff);
-	siginfo_t info;
 
 	if (notify_die(DIE_TRAP, "instruction access exception", regs,
 		       0, 0x8, SIGTRAP) == NOTIFY_STOP)
@@ -247,12 +236,7 @@ void sun4v_insn_access_exception(struct pt_regs *regs, unsigned long addr, unsig
 		regs->tpc &= 0xffffffff;
 		regs->tnpc &= 0xffffffff;
 	}
-	info.si_signo = SIGSEGV;
-	info.si_errno = 0;
-	info.si_code = SEGV_MAPERR;
-	info.si_addr = (void __user *) addr;
-	info.si_trapno = 0;
-	force_sig_info(SIGSEGV, &info, current);
+	force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *) addr, 0, current);
 }
 
 void sun4v_insn_access_exception_tl1(struct pt_regs *regs, unsigned long addr, unsigned long type_ctx)
@@ -307,7 +291,6 @@ bool is_no_fault_exception(struct pt_regs *regs)
 void spitfire_data_access_exception(struct pt_regs *regs, unsigned long sfsr, unsigned long sfar)
 {
 	enum ctx_state prev_state = exception_enter();
-	siginfo_t info;
 
 	if (notify_die(DIE_TRAP, "data access exception", regs,
 		       0, 0x30, SIGTRAP) == NOTIFY_STOP)
@@ -338,12 +321,7 @@ void spitfire_data_access_exception(struct pt_regs *regs, unsigned long sfsr, un
 	if (is_no_fault_exception(regs))
 		return;
 
-	info.si_signo = SIGSEGV;
-	info.si_errno = 0;
-	info.si_code = SEGV_MAPERR;
-	info.si_addr = (void __user *)sfar;
-	info.si_trapno = 0;
-	force_sig_info(SIGSEGV, &info, current);
+	force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *)sfar, 0, current);
 out:
 	exception_exit(prev_state);
 }
@@ -559,8 +537,6 @@ static void spitfire_cee_log(unsigned long afsr, unsigned long afar, unsigned lo
 
 static void spitfire_ue_log(unsigned long afsr, unsigned long afar, unsigned long udbh, unsigned long udbl, unsigned long tt, int tl1, struct pt_regs *regs)
 {
-	siginfo_t info;
-
 	printk(KERN_WARNING "CPU[%d]: Uncorrectable Error AFSR[%lx] "
 	       "AFAR[%lx] UDBL[%lx] UDBH[%ld] TT[%lx] TL>1[%d]\n",
 	       smp_processor_id(), afsr, afar, udbl, udbh, tt, tl1);
@@ -595,12 +571,7 @@ static void spitfire_ue_log(unsigned long afsr, unsigned long afar, unsigned lon
 		regs->tpc &= 0xffffffff;
 		regs->tnpc &= 0xffffffff;
 	}
-	info.si_signo = SIGBUS;
-	info.si_errno = 0;
-	info.si_code = BUS_OBJERR;
-	info.si_addr = (void *)0;
-	info.si_trapno = 0;
-	force_sig_info(SIGBUS, &info, current);
+	force_sig_fault(SIGBUS, BUS_OBJERR, (void *)0, 0, current);
 }
 
 void spitfire_access_error(struct pt_regs *regs, unsigned long status_encoded, unsigned long afar)
@@ -2190,7 +2161,6 @@ bool sun4v_nonresum_error_user_handled(struct pt_regs *regs,
 
 	if (attrs & SUN4V_ERR_ATTRS_MEMORY) {
 		unsigned long addr = ent->err_raddr;
-		siginfo_t info;
 
 		if (addr == ~(u64)0) {
 			/* This seems highly unlikely to ever occur */
@@ -2211,21 +2181,13 @@ bool sun4v_nonresum_error_user_handled(struct pt_regs *regs,
 				addr += PAGE_SIZE;
 			}
 		}
-		info.si_signo = SIGKILL;
-		info.si_errno = 0;
-		info.si_trapno = 0;
-		force_sig_info(info.si_signo, &info, current);
+		force_sig(SIGKILL, current);
 
 		return true;
 	}
 	if (attrs & SUN4V_ERR_ATTRS_PIO) {
-		siginfo_t info;
-
-		info.si_signo = SIGBUS;
-		info.si_code = BUS_ADRERR;
-		info.si_addr = (void __user *)sun4v_get_vaddr(regs);
-		force_sig_info(info.si_signo, &info, current);
-
+		force_sig_fault(SIGBUS, BUS_ADRERR,
+				(void __user *)sun4v_get_vaddr(regs), 0, current);
 		return true;
 	}
 
@@ -2362,30 +2324,27 @@ static void do_fpe_common(struct pt_regs *regs)
 		regs->tnpc += 4;
 	} else {
 		unsigned long fsr = current_thread_info()->xfsr[0];
-		siginfo_t info;
+		int code;
 
 		if (test_thread_flag(TIF_32BIT)) {
 			regs->tpc &= 0xffffffff;
 			regs->tnpc &= 0xffffffff;
 		}
-		info.si_signo = SIGFPE;
-		info.si_errno = 0;
-		info.si_addr = (void __user *)regs->tpc;
-		info.si_trapno = 0;
-		info.si_code = FPE_FIXME;
+		code = FPE_FLTUNK;
 		if ((fsr & 0x1c000) == (1 << 14)) {
 			if (fsr & 0x10)
-				info.si_code = FPE_FLTINV;
+				code = FPE_FLTINV;
 			else if (fsr & 0x08)
-				info.si_code = FPE_FLTOVF;
+				code = FPE_FLTOVF;
 			else if (fsr & 0x04)
-				info.si_code = FPE_FLTUND;
+				code = FPE_FLTUND;
 			else if (fsr & 0x02)
-				info.si_code = FPE_FLTDIV;
+				code = FPE_FLTDIV;
 			else if (fsr & 0x01)
-				info.si_code = FPE_FLTRES;
+				code = FPE_FLTRES;
 		}
-		force_sig_info(SIGFPE, &info, current);
+		force_sig_fault(SIGFPE, code,
+				(void __user *)regs->tpc, 0, current);
 	}
 }
 
@@ -2428,7 +2387,6 @@ out:
 void do_tof(struct pt_regs *regs)
 {
 	enum ctx_state prev_state = exception_enter();
-	siginfo_t info;
 
 	if (notify_die(DIE_TRAP, "tagged arithmetic overflow", regs,
 		       0, 0x26, SIGEMT) == NOTIFY_STOP)
@@ -2440,12 +2398,8 @@ void do_tof(struct pt_regs *regs)
 		regs->tpc &= 0xffffffff;
 		regs->tnpc &= 0xffffffff;
 	}
-	info.si_signo = SIGEMT;
-	info.si_errno = 0;
-	info.si_code = EMT_TAGOVF;
-	info.si_addr = (void __user *)regs->tpc;
-	info.si_trapno = 0;
-	force_sig_info(SIGEMT, &info, current);
+	force_sig_fault(SIGEMT, EMT_TAGOVF,
+			(void __user *)regs->tpc, 0, current);
 out:
 	exception_exit(prev_state);
 }
@@ -2453,7 +2407,6 @@ out:
 void do_div0(struct pt_regs *regs)
 {
 	enum ctx_state prev_state = exception_enter();
-	siginfo_t info;
 
 	if (notify_die(DIE_TRAP, "integer division by zero", regs,
 		       0, 0x28, SIGFPE) == NOTIFY_STOP)
@@ -2465,12 +2418,8 @@ void do_div0(struct pt_regs *regs)
 		regs->tpc &= 0xffffffff;
 		regs->tnpc &= 0xffffffff;
 	}
-	info.si_signo = SIGFPE;
-	info.si_errno = 0;
-	info.si_code = FPE_INTDIV;
-	info.si_addr = (void __user *)regs->tpc;
-	info.si_trapno = 0;
-	force_sig_info(SIGFPE, &info, current);
+	force_sig_fault(SIGFPE, FPE_INTDIV,
+			(void __user *)regs->tpc, 0, current);
 out:
 	exception_exit(prev_state);
 }
@@ -2632,7 +2581,6 @@ void do_illegal_instruction(struct pt_regs *regs)
 	unsigned long pc = regs->tpc;
 	unsigned long tstate = regs->tstate;
 	u32 insn;
-	siginfo_t info;
 
 	if (notify_die(DIE_TRAP, "illegal instruction", regs,
 		       0, 0x10, SIGILL) == NOTIFY_STOP)
@@ -2666,12 +2614,7 @@ void do_illegal_instruction(struct pt_regs *regs)
 			}
 		}
 	}
-	info.si_signo = SIGILL;
-	info.si_errno = 0;
-	info.si_code = ILL_ILLOPC;
-	info.si_addr = (void __user *)pc;
-	info.si_trapno = 0;
-	force_sig_info(SIGILL, &info, current);
+	force_sig_fault(SIGILL, ILL_ILLOPC, (void __user *)pc, 0, current);
 out:
 	exception_exit(prev_state);
 }
@@ -2679,7 +2622,6 @@ out:
 void mem_address_unaligned(struct pt_regs *regs, unsigned long sfar, unsigned long sfsr)
 {
 	enum ctx_state prev_state = exception_enter();
-	siginfo_t info;
 
 	if (notify_die(DIE_TRAP, "memory address unaligned", regs,
 		       0, 0x34, SIGSEGV) == NOTIFY_STOP)
@@ -2692,20 +2634,13 @@ void mem_address_unaligned(struct pt_regs *regs, unsigned long sfar, unsigned lo
 	if (is_no_fault_exception(regs))
 		return;
 
-	info.si_signo = SIGBUS;
-	info.si_errno = 0;
-	info.si_code = BUS_ADRALN;
-	info.si_addr = (void __user *)sfar;
-	info.si_trapno = 0;
-	force_sig_info(SIGBUS, &info, current);
+	force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *)sfar, 0, current);
 out:
 	exception_exit(prev_state);
 }
 
 void sun4v_do_mna(struct pt_regs *regs, unsigned long addr, unsigned long type_ctx)
 {
-	siginfo_t info;
-
 	if (notify_die(DIE_TRAP, "memory address unaligned", regs,
 		       0, 0x34, SIGSEGV) == NOTIFY_STOP)
 		return;
@@ -2717,12 +2652,7 @@ void sun4v_do_mna(struct pt_regs *regs, unsigned long addr, unsigned long type_c
 	if (is_no_fault_exception(regs))
 		return;
 
-	info.si_signo = SIGBUS;
-	info.si_errno = 0;
-	info.si_code = BUS_ADRALN;
-	info.si_addr = (void __user *) addr;
-	info.si_trapno = 0;
-	force_sig_info(SIGBUS, &info, current);
+	force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *) addr, 0, current);
 }
 
 /* sun4v_mem_corrupt_detect_precise() - Handle precise exception on an ADI
@@ -2775,7 +2705,6 @@ void sun4v_mem_corrupt_detect_precise(struct pt_regs *regs, unsigned long addr,
 void do_privop(struct pt_regs *regs)
 {
 	enum ctx_state prev_state = exception_enter();
-	siginfo_t info;
 
 	if (notify_die(DIE_TRAP, "privileged operation", regs,
 		       0, 0x11, SIGILL) == NOTIFY_STOP)
@@ -2785,12 +2714,8 @@ void do_privop(struct pt_regs *regs)
 		regs->tpc &= 0xffffffff;
 		regs->tnpc &= 0xffffffff;
 	}
-	info.si_signo = SIGILL;
-	info.si_errno = 0;
-	info.si_code = ILL_PRVOPC;
-	info.si_addr = (void __user *)regs->tpc;
-	info.si_trapno = 0;
-	force_sig_info(SIGILL, &info, current);
+	force_sig_fault(SIGILL, ILL_PRVOPC,
+			(void __user *)regs->tpc, 0, current);
 out:
 	exception_exit(prev_state);
 }
diff --git a/arch/sparc/kernel/unaligned_32.c b/arch/sparc/kernel/unaligned_32.c
index 7642d7e..64ac8c0 100644
--- a/arch/sparc/kernel/unaligned_32.c
+++ b/arch/sparc/kernel/unaligned_32.c
@@ -311,14 +311,9 @@ static inline int ok_for_user(struct pt_regs *regs, unsigned int insn,
 
 static void user_mna_trap_fault(struct pt_regs *regs, unsigned int insn)
 {
-	siginfo_t info;
-
-	info.si_signo = SIGBUS;
-	info.si_errno = 0;
-	info.si_code = BUS_ADRALN;
-	info.si_addr = (void __user *)safe_compute_effective_address(regs, insn);
-	info.si_trapno = 0;
-	send_sig_info(SIGBUS, &info, current);
+	send_sig_fault(SIGBUS, BUS_ADRALN,
+		       (void __user *)safe_compute_effective_address(regs, insn),
+		       0, current);
 }
 
 asmlinkage void user_unaligned_trap(struct pt_regs *regs, unsigned int insn)
diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c
index a8103a8..9f75b64 100644
--- a/arch/sparc/mm/fault_32.c
+++ b/arch/sparc/mm/fault_32.c
@@ -127,19 +127,11 @@ show_signal_msg(struct pt_regs *regs, int sig, int code,
 static void __do_fault_siginfo(int code, int sig, struct pt_regs *regs,
 			       unsigned long addr)
 {
-	siginfo_t info;
-
-	info.si_signo = sig;
-	info.si_code = code;
-	info.si_errno = 0;
-	info.si_addr = (void __user *) addr;
-	info.si_trapno = 0;
-
 	if (unlikely(show_unhandled_signals))
-		show_signal_msg(regs, sig, info.si_code,
+		show_signal_msg(regs, sig, code,
 				addr, current);
 
-	force_sig_info (sig, &info, current);
+	force_sig_fault(sig, code, (void __user *) addr, 0, current);
 }
 
 static unsigned long compute_si_addr(struct pt_regs *regs, int text_fault)
diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c
index 41363f4..63166fc 100644
--- a/arch/sparc/mm/fault_64.c
+++ b/arch/sparc/mm/fault_64.c
@@ -170,11 +170,7 @@ static void do_fault_siginfo(int code, int sig, struct pt_regs *regs,
 			     int fault_code)
 {
 	unsigned long addr;
-	siginfo_t info;
 
-	info.si_code = code;
-	info.si_signo = sig;
-	info.si_errno = 0;
 	if (fault_code & FAULT_CODE_ITLB) {
 		addr = regs->tpc;
 	} else {
@@ -187,13 +183,11 @@ static void do_fault_siginfo(int code, int sig, struct pt_regs *regs,
 		else
 			addr = fault_addr;
 	}
-	info.si_addr = (void __user *) addr;
-	info.si_trapno = 0;
 
 	if (unlikely(show_unhandled_signals))
 		show_signal_msg(regs, sig, code, addr, current);
 
-	force_sig_info(sig, &info, current);
+	force_sig_fault(sig, code, (void __user *) addr, 0, current);
 }
 
 static unsigned int get_fault_insn(struct pt_regs *regs, unsigned int insn)
diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index d4e8c49..dcf5ea2 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -208,19 +208,6 @@ static int fake_ide_media_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int fake_ide_media_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, fake_ide_media_proc_show, NULL);
-}
-
-static const struct file_operations fake_ide_media_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= fake_ide_media_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static void make_ide_entries(const char *dev_name)
 {
 	struct proc_dir_entry *dir, *ent;
@@ -231,7 +218,8 @@ static void make_ide_entries(const char *dev_name)
 	dir = proc_mkdir(dev_name, proc_ide);
 	if(!dir) return;
 
-	ent = proc_create("media", S_IRUGO, dir, &fake_ide_media_proc_fops);
+	ent = proc_create_single("media", S_IRUGO, dir,
+			fake_ide_media_proc_show);
 	if(!ent) return;
 	snprintf(name, sizeof(name), "ide0/%s", dev_name);
 	proc_symlink(dev_name, proc_ide_root, name);
diff --git a/arch/um/kernel/ptrace.c b/arch/um/kernel/ptrace.c
index bc2a516..1a1d88a 100644
--- a/arch/um/kernel/ptrace.c
+++ b/arch/um/kernel/ptrace.c
@@ -115,17 +115,10 @@ long arch_ptrace(struct task_struct *child, long request,
 static void send_sigtrap(struct task_struct *tsk, struct uml_pt_regs *regs,
 		  int error_code)
 {
-	struct siginfo info;
-
-	memset(&info, 0, sizeof(info));
-	info.si_signo = SIGTRAP;
-	info.si_code = TRAP_BRKPT;
-
-	/* User-mode eip? */
-	info.si_addr = UPT_IS_USER(regs) ? (void __user *) UPT_IP(regs) : NULL;
-
 	/* Send us the fake SIGTRAP */
-	force_sig_info(SIGTRAP, &info, tsk);
+	force_sig_fault(SIGTRAP, TRAP_BRKPT,
+			/* User-mode eip? */
+			UPT_IS_USER(regs) ? (void __user *) UPT_IP(regs) : NULL, tsk);
 }
 
 /*
diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c
index b2b02df..ec9a42c 100644
--- a/arch/um/kernel/trap.c
+++ b/arch/um/kernel/trap.c
@@ -162,13 +162,9 @@ static void show_segv_info(struct uml_pt_regs *regs)
 
 static void bad_segv(struct faultinfo fi, unsigned long ip)
 {
-	struct siginfo si;
-
-	si.si_signo = SIGSEGV;
-	si.si_code = SEGV_ACCERR;
-	si.si_addr = (void __user *) FAULT_ADDRESS(fi);
 	current->thread.arch.faultinfo = fi;
-	force_sig_info(SIGSEGV, &si, current);
+	force_sig_fault(SIGSEGV, SEGV_ACCERR, (void __user *) FAULT_ADDRESS(fi),
+			current);
 }
 
 void fatal_sigsegv(void)
@@ -214,8 +210,8 @@ void segv_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs)
 unsigned long segv(struct faultinfo fi, unsigned long ip, int is_user,
 		   struct uml_pt_regs *regs)
 {
-	struct siginfo si;
 	jmp_buf *catcher;
+	int si_code;
 	int err;
 	int is_write = FAULT_WRITE(fi);
 	unsigned long address = FAULT_ADDRESS(fi);
@@ -239,7 +235,7 @@ unsigned long segv(struct faultinfo fi, unsigned long ip, int is_user,
 
 	if (SEGV_IS_FIXABLE(&fi))
 		err = handle_page_fault(address, ip, is_write, is_user,
-					&si.si_code);
+					&si_code);
 	else {
 		err = -EFAULT;
 		/*
@@ -271,18 +267,14 @@ unsigned long segv(struct faultinfo fi, unsigned long ip, int is_user,
 	show_segv_info(regs);
 
 	if (err == -EACCES) {
-		si.si_signo = SIGBUS;
-		si.si_errno = 0;
-		si.si_code = BUS_ADRERR;
-		si.si_addr = (void __user *)address;
 		current->thread.arch.faultinfo = fi;
-		force_sig_info(SIGBUS, &si, current);
+		force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address,
+				current);
 	} else {
 		BUG_ON(err != -EFAULT);
-		si.si_signo = SIGSEGV;
-		si.si_addr = (void __user *) address;
 		current->thread.arch.faultinfo = fi;
-		force_sig_info(SIGSEGV, &si, current);
+		force_sig_fault(SIGSEGV, si_code, (void __user *) address,
+				current);
 	}
 
 out:
@@ -294,9 +286,7 @@ out:
 
 void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs)
 {
-	struct faultinfo *fi;
-	struct siginfo clean_si;
-
+	int code, err;
 	if (!UPT_IS_USER(regs)) {
 		if (sig == SIGBUS)
 			printk(KERN_ERR "Bus error - the host /dev/shm or /tmp "
@@ -306,29 +296,21 @@ void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs)
 
 	arch_examine_signal(sig, regs);
 
-	clear_siginfo(&clean_si);
-	clean_si.si_signo = si->si_signo;
-	clean_si.si_errno = si->si_errno;
-	clean_si.si_code = si->si_code;
-	switch (sig) {
-	case SIGILL:
-	case SIGFPE:
-	case SIGSEGV:
-	case SIGBUS:
-	case SIGTRAP:
-		fi = UPT_FAULTINFO(regs);
-		clean_si.si_addr = (void __user *) FAULT_ADDRESS(*fi);
+	/* Is the signal layout for the signal known?
+	 * Signal data must be scrubbed to prevent information leaks.
+	 */
+	code = si->si_code;
+	err = si->si_errno;
+	if ((err == 0) && (siginfo_layout(sig, code) == SIL_FAULT)) {
+		struct faultinfo *fi = UPT_FAULTINFO(regs);
 		current->thread.arch.faultinfo = *fi;
-#ifdef __ARCH_SI_TRAPNO
-		clean_si.si_trapno = si->si_trapno;
-#endif
-		break;
-	default:
-		printk(KERN_ERR "Attempted to relay unknown signal %d (si_code = %d)\n",
-			sig, si->si_code);
+		force_sig_fault(sig, code, (void __user *)FAULT_ADDRESS(*fi),
+				current);
+	} else {
+		printk(KERN_ERR "Attempted to relay unknown signal %d (si_code = %d) with errno %d\n",
+		       sig, code, err);
+		force_sig(sig, current);
 	}
-
-	force_sig_info(sig, &clean_si, current);
 }
 
 void bus_handler(int sig, struct siginfo *si, struct uml_pt_regs *regs)
diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig
index 462e59a..03f991e 100644
--- a/arch/unicore32/Kconfig
+++ b/arch/unicore32/Kconfig
@@ -19,6 +19,8 @@ config UNICORE32
 	select ARCH_WANT_FRAME_POINTERS
 	select GENERIC_IOMAP
 	select MODULES_USE_ELF_REL
+	select NEED_DMA_MAP_STATE
+	select SWIOTLB
 	help
 	  UniCore-32 is 32-bit Instruction Set Architecture,
 	  including a series of low-power-consumption RISC chip
@@ -61,9 +63,6 @@ config ARCH_MAY_HAVE_PC_FDC
 config ZONE_DMA
 	def_bool y
 
-config NEED_DMA_MAP_STATE
-       def_bool y
-
 source "init/Kconfig"
 
 source "kernel/Kconfig.freezer"
diff --git a/arch/unicore32/kernel/fpu-ucf64.c b/arch/unicore32/kernel/fpu-ucf64.c
index 12c8c95..8594b16 100644
--- a/arch/unicore32/kernel/fpu-ucf64.c
+++ b/arch/unicore32/kernel/fpu-ucf64.c
@@ -52,14 +52,14 @@
  * Raise a SIGFPE for the current process.
  * sicode describes the signal being raised.
  */
-void ucf64_raise_sigfpe(unsigned int sicode, struct pt_regs *regs)
+void ucf64_raise_sigfpe(struct pt_regs *regs)
 {
 	siginfo_t info;
 
-	memset(&info, 0, sizeof(info));
+	clear_siginfo(&info);
 
 	info.si_signo = SIGFPE;
-	info.si_code = sicode;
+	info.si_code = FPE_FLTUNK;
 	info.si_addr = (void __user *)(instruction_pointer(regs) - 4);
 
 	/*
@@ -94,7 +94,7 @@ void ucf64_exchandler(u32 inst, u32 fpexc, struct pt_regs *regs)
 		pr_debug("UniCore-F64 FPSCR 0x%08x INST 0x%08x\n",
 				cff(FPSCR), inst);
 
-		ucf64_raise_sigfpe(0, regs);
+		ucf64_raise_sigfpe(regs);
 		return;
 	}
 
diff --git a/arch/unicore32/mm/Kconfig b/arch/unicore32/mm/Kconfig
index e9154a5..82759b6 100644
--- a/arch/unicore32/mm/Kconfig
+++ b/arch/unicore32/mm/Kconfig
@@ -39,14 +39,3 @@ config CPU_TLB_SINGLE_ENTRY_DISABLE
 	default y
 	help
 	  Say Y here to disable the TLB single entry operations.
-
-config SWIOTLB
-	def_bool y
-	select DMA_DIRECT_OPS
-
-config IOMMU_HELPER
-	def_bool SWIOTLB
-
-config NEED_SG_DMA_LENGTH
-	def_bool SWIOTLB
-
diff --git a/arch/unicore32/mm/fault.c b/arch/unicore32/mm/fault.c
index bbefcc4..3814734 100644
--- a/arch/unicore32/mm/fault.c
+++ b/arch/unicore32/mm/fault.c
@@ -125,6 +125,7 @@ static void __do_user_fault(struct task_struct *tsk, unsigned long addr,
 	tsk->thread.address = addr;
 	tsk->thread.error_code = fsr;
 	tsk->thread.trap_no = 14;
+	clear_siginfo(&si);
 	si.si_signo = sig;
 	si.si_errno = 0;
 	si.si_code = code;
@@ -472,6 +473,7 @@ asmlinkage void do_DataAbort(unsigned long addr, unsigned int fsr,
 	printk(KERN_ALERT "Unhandled fault: %s (0x%03x) at 0x%08lx\n",
 	       inf->name, fsr, addr);
 
+	clear_siginfo(&info);
 	info.si_signo = inf->sig;
 	info.si_errno = 0;
 	info.si_code = inf->code;
@@ -491,6 +493,7 @@ asmlinkage void do_PrefetchAbort(unsigned long addr,
 	printk(KERN_ALERT "Unhandled prefetch abort: %s (0x%03x) at 0x%08lx\n",
 	       inf->name, ifsr, addr);
 
+	clear_siginfo(&info);
 	info.si_signo = inf->sig;
 	info.si_errno = 0;
 	info.si_code = inf->code;
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index c07f492..1fe24b6 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -28,6 +28,8 @@ config X86_64
 	select ARCH_USE_CMPXCHG_LOCKREF
 	select HAVE_ARCH_SOFT_DIRTY
 	select MODULES_USE_ELF_RELA
+	select NEED_DMA_MAP_STATE
+	select SWIOTLB
 	select X86_DEV_DMA_OPS
 	select ARCH_HAS_SYSCALL_WRAPPER
 
@@ -60,6 +62,7 @@ config X86
 	select ARCH_HAS_PMEM_API		if X86_64
 	select ARCH_HAS_REFCOUNT
 	select ARCH_HAS_UACCESS_FLUSHCACHE	if X86_64
+	select ARCH_HAS_UACCESS_MCSAFE		if X86_64
 	select ARCH_HAS_SET_MEMORY
 	select ARCH_HAS_SG_CHAIN
 	select ARCH_HAS_STRICT_KERNEL_RWX
@@ -134,7 +137,6 @@ config X86
 	select HAVE_C_RECORDMCOUNT
 	select HAVE_DEBUG_KMEMLEAK
 	select HAVE_DEBUG_STACKOVERFLOW
-	select HAVE_DMA_API_DEBUG
 	select HAVE_DMA_CONTIGUOUS
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_DYNAMIC_FTRACE_WITH_REGS
@@ -184,6 +186,7 @@ config X86
 	select HAVE_UNSTABLE_SCHED_CLOCK
 	select HAVE_USER_RETURN_NOTIFIER
 	select IRQ_FORCED_THREADING
+	select NEED_SG_DMA_LENGTH
 	select PCI_LOCKLESS_CONFIG
 	select PERF_EVENTS
 	select RTC_LIB
@@ -236,13 +239,6 @@ config ARCH_MMAP_RND_COMPAT_BITS_MAX
 config SBUS
 	bool
 
-config NEED_DMA_MAP_STATE
-	def_bool y
-	depends on X86_64 || INTEL_IOMMU || DMA_API_DEBUG || SWIOTLB
-
-config NEED_SG_DMA_LENGTH
-	def_bool y
-
 config GENERIC_ISA_DMA
 	def_bool y
 	depends on ISA_DMA_API
@@ -875,6 +871,7 @@ config DMI
 
 config GART_IOMMU
 	bool "Old AMD GART IOMMU support"
+	select IOMMU_HELPER
 	select SWIOTLB
 	depends on X86_64 && PCI && AMD_NB
 	---help---
@@ -896,6 +893,7 @@ config GART_IOMMU
 
 config CALGARY_IOMMU
 	bool "IBM Calgary IOMMU support"
+	select IOMMU_HELPER
 	select SWIOTLB
 	depends on X86_64 && PCI
 	---help---
@@ -923,20 +921,6 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT
 	  Calgary anyway, pass 'iommu=calgary' on the kernel command line.
 	  If unsure, say Y.
 
-# need this always selected by IOMMU for the VIA workaround
-config SWIOTLB
-	def_bool y if X86_64
-	---help---
-	  Support for software bounce buffers used on x86-64 systems
-	  which don't have a hardware IOMMU. Using this PCI devices
-	  which can only access 32-bits of memory can be used on systems
-	  with more than 3 GB of memory.
-	  If unsure, say Y.
-
-config IOMMU_HELPER
-	def_bool y
-	depends on CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU
-
 config MAXSMP
 	bool "Enable Maximum number of SMP Processors and NUMA Nodes"
 	depends on X86_64 && SMP && DEBUG_KERNEL
@@ -1458,6 +1442,7 @@ config HIGHMEM
 config X86_PAE
 	bool "PAE (Physical Address Extension) Support"
 	depends on X86_32 && !HIGHMEM4G
+	select PHYS_ADDR_T_64BIT
 	select SWIOTLB
 	---help---
 	  PAE is required for NX support, and furthermore enables
@@ -1485,14 +1470,6 @@ config X86_5LEVEL
 
 	  Say N if unsure.
 
-config ARCH_PHYS_ADDR_T_64BIT
-	def_bool y
-	depends on X86_64 || X86_PAE
-
-config ARCH_DMA_ADDR_T_64BIT
-	def_bool y
-	depends on X86_64 || HIGHMEM64G
-
 config X86_DIRECT_GBPAGES
 	def_bool y
 	depends on X86_64 && !DEBUG_PAGEALLOC
diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c
index 0cb3257..af6cda0 100644
--- a/arch/x86/boot/compressed/cmdline.c
+++ b/arch/x86/boot/compressed/cmdline.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 #include "misc.h"
 
-#if CONFIG_EARLY_PRINTK || CONFIG_RANDOMIZE_BASE
+#if CONFIG_EARLY_PRINTK || CONFIG_RANDOMIZE_BASE || CONFIG_X86_5LEVEL
 
 static unsigned long fs;
 static inline void set_fs(unsigned long seg)
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index 09f36c0..a8a8642 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -109,23 +109,34 @@ void efi_char16_printk(efi_system_table_t *table, efi_char16_t *str)
 }
 
 static efi_status_t
-__setup_efi_pci32(efi_pci_io_protocol_32 *pci, struct pci_setup_rom **__rom)
+__setup_efi_pci(efi_pci_io_protocol_t *pci, struct pci_setup_rom **__rom)
 {
 	struct pci_setup_rom *rom = NULL;
 	efi_status_t status;
 	unsigned long size;
-	uint64_t attributes;
+	uint64_t attributes, romsize;
+	void *romimage;
 
-	status = efi_early->call(pci->attributes, pci,
-				 EfiPciIoAttributeOperationGet, 0, 0,
-				 &attributes);
+	status = efi_call_proto(efi_pci_io_protocol, attributes, pci,
+				EfiPciIoAttributeOperationGet, 0, 0,
+				&attributes);
 	if (status != EFI_SUCCESS)
 		return status;
 
-	if (!pci->romimage || !pci->romsize)
+	/*
+	 * Some firmware images contain EFI function pointers at the place where the
+	 * romimage and romsize fields are supposed to be. Typically the EFI
+	 * code is mapped at high addresses, translating to an unrealistically
+	 * large romsize. The UEFI spec limits the size of option ROMs to 16
+	 * MiB so we reject any ROMs over 16 MiB in size to catch this.
+	 */
+	romimage = (void *)(unsigned long)efi_table_attr(efi_pci_io_protocol,
+							 romimage, pci);
+	romsize = efi_table_attr(efi_pci_io_protocol, romsize, pci);
+	if (!romimage || !romsize || romsize > SZ_16M)
 		return EFI_INVALID_PARAMETER;
 
-	size = pci->romsize + sizeof(*rom);
+	size = romsize + sizeof(*rom);
 
 	status = efi_call_early(allocate_pool, EFI_LOADER_DATA, size, &rom);
 	if (status != EFI_SUCCESS) {
@@ -141,30 +152,32 @@ __setup_efi_pci32(efi_pci_io_protocol_32 *pci, struct pci_setup_rom **__rom)
 	rom->pcilen = pci->romsize;
 	*__rom = rom;
 
-	status = efi_early->call(pci->pci.read, pci, EfiPciIoWidthUint16,
-				 PCI_VENDOR_ID, 1, &(rom->vendor));
+	status = efi_call_proto(efi_pci_io_protocol, pci.read, pci,
+				EfiPciIoWidthUint16, PCI_VENDOR_ID, 1,
+				&rom->vendor);
 
 	if (status != EFI_SUCCESS) {
 		efi_printk(sys_table, "Failed to read rom->vendor\n");
 		goto free_struct;
 	}
 
-	status = efi_early->call(pci->pci.read, pci, EfiPciIoWidthUint16,
-				 PCI_DEVICE_ID, 1, &(rom->devid));
+	status = efi_call_proto(efi_pci_io_protocol, pci.read, pci,
+				EfiPciIoWidthUint16, PCI_DEVICE_ID, 1,
+				&rom->devid);
 
 	if (status != EFI_SUCCESS) {
 		efi_printk(sys_table, "Failed to read rom->devid\n");
 		goto free_struct;
 	}
 
-	status = efi_early->call(pci->get_location, pci, &(rom->segment),
-				 &(rom->bus), &(rom->device), &(rom->function));
+	status = efi_call_proto(efi_pci_io_protocol, get_location, pci,
+				&rom->segment, &rom->bus, &rom->device,
+				&rom->function);
 
 	if (status != EFI_SUCCESS)
 		goto free_struct;
 
-	memcpy(rom->romdata, (void *)(unsigned long)pci->romimage,
-	       pci->romsize);
+	memcpy(rom->romdata, romimage, romsize);
 	return status;
 
 free_struct:
@@ -176,7 +189,7 @@ static void
 setup_efi_pci32(struct boot_params *params, void **pci_handle,
 		unsigned long size)
 {
-	efi_pci_io_protocol_32 *pci = NULL;
+	efi_pci_io_protocol_t *pci = NULL;
 	efi_guid_t pci_proto = EFI_PCI_IO_PROTOCOL_GUID;
 	u32 *handles = (u32 *)(unsigned long)pci_handle;
 	efi_status_t status;
@@ -203,7 +216,7 @@ setup_efi_pci32(struct boot_params *params, void **pci_handle,
 		if (!pci)
 			continue;
 
-		status = __setup_efi_pci32(pci, &rom);
+		status = __setup_efi_pci(pci, &rom);
 		if (status != EFI_SUCCESS)
 			continue;
 
@@ -217,74 +230,11 @@ setup_efi_pci32(struct boot_params *params, void **pci_handle,
 	}
 }
 
-static efi_status_t
-__setup_efi_pci64(efi_pci_io_protocol_64 *pci, struct pci_setup_rom **__rom)
-{
-	struct pci_setup_rom *rom;
-	efi_status_t status;
-	unsigned long size;
-	uint64_t attributes;
-
-	status = efi_early->call(pci->attributes, pci,
-				 EfiPciIoAttributeOperationGet, 0,
-				 &attributes);
-	if (status != EFI_SUCCESS)
-		return status;
-
-	if (!pci->romimage || !pci->romsize)
-		return EFI_INVALID_PARAMETER;
-
-	size = pci->romsize + sizeof(*rom);
-
-	status = efi_call_early(allocate_pool, EFI_LOADER_DATA, size, &rom);
-	if (status != EFI_SUCCESS) {
-		efi_printk(sys_table, "Failed to alloc mem for rom\n");
-		return status;
-	}
-
-	rom->data.type = SETUP_PCI;
-	rom->data.len = size - sizeof(struct setup_data);
-	rom->data.next = 0;
-	rom->pcilen = pci->romsize;
-	*__rom = rom;
-
-	status = efi_early->call(pci->pci.read, pci, EfiPciIoWidthUint16,
-				 PCI_VENDOR_ID, 1, &(rom->vendor));
-
-	if (status != EFI_SUCCESS) {
-		efi_printk(sys_table, "Failed to read rom->vendor\n");
-		goto free_struct;
-	}
-
-	status = efi_early->call(pci->pci.read, pci, EfiPciIoWidthUint16,
-				 PCI_DEVICE_ID, 1, &(rom->devid));
-
-	if (status != EFI_SUCCESS) {
-		efi_printk(sys_table, "Failed to read rom->devid\n");
-		goto free_struct;
-	}
-
-	status = efi_early->call(pci->get_location, pci, &(rom->segment),
-				 &(rom->bus), &(rom->device), &(rom->function));
-
-	if (status != EFI_SUCCESS)
-		goto free_struct;
-
-	memcpy(rom->romdata, (void *)(unsigned long)pci->romimage,
-	       pci->romsize);
-	return status;
-
-free_struct:
-	efi_call_early(free_pool, rom);
-	return status;
-
-}
-
 static void
 setup_efi_pci64(struct boot_params *params, void **pci_handle,
 		unsigned long size)
 {
-	efi_pci_io_protocol_64 *pci = NULL;
+	efi_pci_io_protocol_t *pci = NULL;
 	efi_guid_t pci_proto = EFI_PCI_IO_PROTOCOL_GUID;
 	u64 *handles = (u64 *)(unsigned long)pci_handle;
 	efi_status_t status;
@@ -311,7 +261,7 @@ setup_efi_pci64(struct boot_params *params, void **pci_handle,
 		if (!pci)
 			continue;
 
-		status = __setup_efi_pci64(pci, &rom);
+		status = __setup_efi_pci(pci, &rom);
 		if (status != EFI_SUCCESS)
 			continue;
 
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 8169e8b..6403789 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -365,6 +365,7 @@ ENTRY(startup_64)
 	 * this function call.
 	 */
 	pushq	%rsi
+	movq	%rsi, %rdi		/* real mode address */
 	call	paging_prepare
 	popq	%rsi
 
diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index a0a50b9..b87a758 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -47,7 +47,7 @@
 #include <linux/decompress/mm.h>
 
 #ifdef CONFIG_X86_5LEVEL
-unsigned int pgtable_l5_enabled __ro_after_init;
+unsigned int __pgtable_l5_enabled;
 unsigned int pgdir_shift __ro_after_init = 39;
 unsigned int ptrs_per_p4d __ro_after_init = 1;
 #endif
@@ -734,7 +734,7 @@ void choose_random_location(unsigned long input,
 
 #ifdef CONFIG_X86_5LEVEL
 	if (__read_cr4() & X86_CR4_LA57) {
-		pgtable_l5_enabled = 1;
+		__pgtable_l5_enabled = 1;
 		pgdir_shift = 48;
 		ptrs_per_p4d = 512;
 	}
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 9e11be4..a423bdb 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -12,10 +12,8 @@
 #undef CONFIG_PARAVIRT_SPINLOCKS
 #undef CONFIG_KASAN
 
-#ifdef CONFIG_X86_5LEVEL
-/* cpu_feature_enabled() cannot be used that early */
-#define pgtable_l5_enabled __pgtable_l5_enabled
-#endif
+/* cpu_feature_enabled() cannot be used this early */
+#define USE_EARLY_PGTABLE_L5
 
 #include <linux/linkage.h>
 #include <linux/screen_info.h>
diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c
index a362fa0..8c51075 100644
--- a/arch/x86/boot/compressed/pgtable_64.c
+++ b/arch/x86/boot/compressed/pgtable_64.c
@@ -31,16 +31,23 @@ static char trampoline_save[TRAMPOLINE_32BIT_SIZE];
  */
 unsigned long *trampoline_32bit __section(.data);
 
-struct paging_config paging_prepare(void)
+extern struct boot_params *boot_params;
+int cmdline_find_option_bool(const char *option);
+
+struct paging_config paging_prepare(void *rmode)
 {
 	struct paging_config paging_config = {};
 	unsigned long bios_start, ebda_start;
 
+	/* Initialize boot_params. Required for cmdline_find_option_bool(). */
+	boot_params = rmode;
+
 	/*
 	 * Check if LA57 is desired and supported.
 	 *
-	 * There are two parts to the check:
+	 * There are several parts to the check:
 	 *   - if the kernel supports 5-level paging: CONFIG_X86_5LEVEL=y
+	 *   - if user asked to disable 5-level paging: no5lvl in cmdline
 	 *   - if the machine supports 5-level paging:
 	 *     + CPUID leaf 7 is supported
 	 *     + the leaf has the feature bit set
@@ -48,6 +55,7 @@ struct paging_config paging_prepare(void)
 	 * That's substitute for boot_cpu_has() in early boot code.
 	 */
 	if (IS_ENABLED(CONFIG_X86_5LEVEL) &&
+			!cmdline_find_option_bool("no5lvl") &&
 			native_cpuid_eax(0) >= 7 &&
 			(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)))) {
 		paging_config.l5_required = 1;
@@ -130,7 +138,7 @@ void cleanup_trampoline(void *pgtable)
 {
 	void *trampoline_pgtable;
 
-	trampoline_pgtable = trampoline_32bit + TRAMPOLINE_32BIT_PGTABLE_OFFSET;
+	trampoline_pgtable = trampoline_32bit + TRAMPOLINE_32BIT_PGTABLE_OFFSET / sizeof(unsigned long);
 
 	/*
 	 * Move the top level page table out of trampoline memory,
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index d6b27da..14a2f99 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -396,3 +396,4 @@
 382	i386	pkey_free		sys_pkey_free			__ia32_sys_pkey_free
 383	i386	statx			sys_statx			__ia32_sys_statx
 384	i386	arch_prctl		sys_arch_prctl			__ia32_compat_sys_arch_prctl
+385	i386	io_pgetevents		sys_io_pgetevents		__ia32_compat_sys_io_pgetevents
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 4dfe426..cd36232 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -341,6 +341,7 @@
 330	common	pkey_alloc		__x64_sys_pkey_alloc
 331	common	pkey_free		__x64_sys_pkey_free
 332	common	statx			__x64_sys_statx
+333	common	io_pgetevents		__x64_sys_io_pgetevents
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index d998a48..261802b 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -44,14 +44,14 @@ obj-y += $(vdso_img_objs)
 targets += $(vdso_img_cfiles)
 targets += $(vdso_img_sodbg) $(vdso_img-y:%=vdso%.so)
 
-export CPPFLAGS_vdso.lds += -P -C
+CPPFLAGS_vdso.lds += -P -C
 
 VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \
 			-Wl,--no-undefined \
 			-Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 \
 			$(DISABLE_LTO)
 
-$(obj)/vdso64.so.dbg: $(src)/vdso.lds $(vobjs) FORCE
+$(obj)/vdso64.so.dbg: $(obj)/vdso.lds $(vobjs) FORCE
 	$(call if_changed,vdso)
 
 HOST_EXTRACFLAGS += -I$(srctree)/tools/include -I$(srctree)/include/uapi -I$(srctree)/arch/$(SUBARCH)/include/uapi
@@ -100,11 +100,8 @@ VDSO_LDFLAGS_vdsox32.lds = -Wl,-m,elf32_x86_64 \
 			   -Wl,-z,max-page-size=4096 \
 			   -Wl,-z,common-page-size=4096
 
-# 64-bit objects to re-brand as x32
-vobjs64-for-x32 := $(filter-out $(vobjs-nox32),$(vobjs-y))
-
 # x32-rebranded versions
-vobjx32s-y := $(vobjs64-for-x32:.o=-x32.o)
+vobjx32s-y := $(vobjs-y:.o=-x32.o)
 
 # same thing, but in the output directory
 vobjx32s := $(foreach F,$(vobjx32s-y),$(obj)/$F)
@@ -122,7 +119,7 @@ $(obj)/%.so: OBJCOPYFLAGS := -S
 $(obj)/%.so: $(obj)/%.so.dbg
 	$(call if_changed,objcopy)
 
-$(obj)/vdsox32.so.dbg: $(src)/vdsox32.lds $(vobjx32s) FORCE
+$(obj)/vdsox32.so.dbg: $(obj)/vdsox32.lds $(vobjx32s) FORCE
 	$(call if_changed,vdso)
 
 CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds)
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index 70b7845..7782cdb 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -107,7 +107,7 @@ static bool write_ok_or_segv(unsigned long ptr, size_t size)
 		thread->cr2		= ptr;
 		thread->trap_nr		= X86_TRAP_PF;
 
-		memset(&info, 0, sizeof(info));
+		clear_siginfo(&info);
 		info.si_signo		= SIGSEGV;
 		info.si_errno		= 0;
 		info.si_code		= SEGV_MAPERR;
diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c
index 786fd87..4b98101 100644
--- a/arch/x86/events/amd/ibs.c
+++ b/arch/x86/events/amd/ibs.c
@@ -889,7 +889,7 @@ static void force_ibs_eilvt_setup(void)
 	if (!ibs_eilvt_valid())
 		goto out;
 
-	pr_info("IBS: LVT offset %d assigned\n", offset);
+	pr_info("LVT offset %d assigned\n", offset);
 
 	return;
 out:
diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c
index f5cbbba..981ba5e 100644
--- a/arch/x86/events/amd/uncore.c
+++ b/arch/x86/events/amd/uncore.c
@@ -19,6 +19,7 @@
 #include <asm/cpufeature.h>
 #include <asm/perf_event.h>
 #include <asm/msr.h>
+#include <asm/smp.h>
 
 #define NUM_COUNTERS_NB		4
 #define NUM_COUNTERS_L2		4
@@ -399,26 +400,8 @@ static int amd_uncore_cpu_starting(unsigned int cpu)
 	}
 
 	if (amd_uncore_llc) {
-		unsigned int apicid = cpu_data(cpu).apicid;
-		unsigned int nshared, subleaf, prev_eax = 0;
-
 		uncore = *per_cpu_ptr(amd_uncore_llc, cpu);
-		/*
-		 * Iterate over Cache Topology Definition leaves until no
-		 * more cache descriptions are available.
-		 */
-		for (subleaf = 0; subleaf < 5; subleaf++) {
-			cpuid_count(0x8000001d, subleaf, &eax, &ebx, &ecx, &edx);
-
-			/* EAX[0:4] gives type of cache */
-			if (!(eax & 0x1f))
-				break;
-
-			prev_eax = eax;
-		}
-		nshared = ((prev_eax >> 14) & 0xfff) + 1;
-
-		uncore->id = apicid - (apicid % nshared);
+		uncore->id = per_cpu(cpu_llc_id, cpu);
 
 		uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_llc);
 		*per_cpu_ptr(amd_uncore_llc, cpu) = uncore;
diff --git a/arch/x86/include/asm/cacheinfo.h b/arch/x86/include/asm/cacheinfo.h
new file mode 100644
index 0000000..e958e28
--- /dev/null
+++ b/arch/x86/include/asm/cacheinfo.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_CACHEINFO_H
+#define _ASM_X86_CACHEINFO_H
+
+void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id);
+
+#endif /* _ASM_X86_CACHEINFO_H */
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 89ce4bf..ce4d176 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -30,10 +30,7 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 	return dma_ops;
 }
 
-int arch_dma_supported(struct device *dev, u64 mask);
-#define arch_dma_supported arch_dma_supported
-
-bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp);
+bool arch_dma_alloc_attrs(struct device **dev);
 #define arch_dma_alloc_attrs arch_dma_alloc_attrs
 
 #endif
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index f6e5b93..6de6484 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -94,10 +94,10 @@ build_mmio_write(__writel, "l", unsigned int, "r", )
 
 #ifdef CONFIG_X86_64
 
-build_mmio_read(readq, "q", unsigned long, "=r", :"memory")
-build_mmio_read(__readq, "q", unsigned long, "=r", )
-build_mmio_write(writeq, "q", unsigned long, "r", :"memory")
-build_mmio_write(__writeq, "q", unsigned long, "r", )
+build_mmio_read(readq, "q", u64, "=r", :"memory")
+build_mmio_read(__readq, "q", u64, "=r", )
+build_mmio_write(writeq, "q", u64, "r", :"memory")
+build_mmio_write(__writeq, "q", u64, "r", )
 
 #define readq_relaxed(a)	__readq(a)
 #define writeq_relaxed(v, a)	__writeq(v, a)
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 2c5a966..6afac38 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -53,7 +53,7 @@
 #define __PHYSICAL_MASK_SHIFT	52
 
 #ifdef CONFIG_X86_5LEVEL
-#define __VIRTUAL_MASK_SHIFT	(pgtable_l5_enabled ? 56 : 47)
+#define __VIRTUAL_MASK_SHIFT	(pgtable_l5_enabled() ? 56 : 47)
 #else
 #define __VIRTUAL_MASK_SHIFT	47
 #endif
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 9be2bf1..d49bbf4 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -574,14 +574,14 @@ static inline void __set_pgd(pgd_t *pgdp, pgd_t pgd)
 }
 
 #define set_pgd(pgdp, pgdval) do {					\
-	if (pgtable_l5_enabled)						\
+	if (pgtable_l5_enabled())						\
 		__set_pgd(pgdp, pgdval);				\
 	else								\
 		set_p4d((p4d_t *)(pgdp), (p4d_t) { (pgdval).pgd });	\
 } while (0)
 
 #define pgd_clear(pgdp) do {						\
-	if (pgtable_l5_enabled)						\
+	if (pgtable_l5_enabled())						\
 		set_pgd(pgdp, __pgd(0));				\
 } while (0)
 
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index d32175e..6629636 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -117,9 +117,6 @@ void native_restore_msi_irqs(struct pci_dev *dev);
 #define native_setup_msi_irqs		NULL
 #define native_teardown_msi_irq		NULL
 #endif
-
-#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
-
 #endif  /* __KERNEL__ */
 
 #ifdef CONFIG_X86_64
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index 263c142..ada6410 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -167,7 +167,7 @@ static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
 #if CONFIG_PGTABLE_LEVELS > 4
 static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d)
 {
-	if (!pgtable_l5_enabled)
+	if (!pgtable_l5_enabled())
 		return;
 	paravirt_alloc_p4d(mm, __pa(p4d) >> PAGE_SHIFT);
 	set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(p4d)));
@@ -193,7 +193,7 @@ extern void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d);
 static inline void __p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d,
 				  unsigned long address)
 {
-	if (pgtable_l5_enabled)
+	if (pgtable_l5_enabled())
 		___p4d_free_tlb(tlb, p4d);
 }
 
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index f1633de..99ecde2 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -65,7 +65,7 @@ extern pmdval_t early_pmd_flags;
 
 #ifndef __PAGETABLE_P4D_FOLDED
 #define set_pgd(pgdp, pgd)		native_set_pgd(pgdp, pgd)
-#define pgd_clear(pgd)			(pgtable_l5_enabled ? native_pgd_clear(pgd) : 0)
+#define pgd_clear(pgd)			(pgtable_l5_enabled() ? native_pgd_clear(pgd) : 0)
 #endif
 
 #ifndef set_p4d
@@ -881,7 +881,7 @@ static inline unsigned long p4d_index(unsigned long address)
 #if CONFIG_PGTABLE_LEVELS > 4
 static inline int pgd_present(pgd_t pgd)
 {
-	if (!pgtable_l5_enabled)
+	if (!pgtable_l5_enabled())
 		return 1;
 	return pgd_flags(pgd) & _PAGE_PRESENT;
 }
@@ -898,9 +898,9 @@ static inline unsigned long pgd_page_vaddr(pgd_t pgd)
 #define pgd_page(pgd)	pfn_to_page(pgd_pfn(pgd))
 
 /* to find an entry in a page-table-directory. */
-static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
+static __always_inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
 {
-	if (!pgtable_l5_enabled)
+	if (!pgtable_l5_enabled())
 		return (p4d_t *)pgd;
 	return (p4d_t *)pgd_page_vaddr(*pgd) + p4d_index(address);
 }
@@ -909,7 +909,7 @@ static inline int pgd_bad(pgd_t pgd)
 {
 	unsigned long ignore_flags = _PAGE_USER;
 
-	if (!pgtable_l5_enabled)
+	if (!pgtable_l5_enabled())
 		return 0;
 
 	if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
@@ -920,7 +920,7 @@ static inline int pgd_bad(pgd_t pgd)
 
 static inline int pgd_none(pgd_t pgd)
 {
-	if (!pgtable_l5_enabled)
+	if (!pgtable_l5_enabled())
 		return 0;
 	/*
 	 * There is no need to do a workaround for the KNL stray
diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h
index e3225e8..d9a001a 100644
--- a/arch/x86/include/asm/pgtable_32_types.h
+++ b/arch/x86/include/asm/pgtable_32_types.h
@@ -15,7 +15,7 @@
 # include <asm/pgtable-2level_types.h>
 #endif
 
-#define pgtable_l5_enabled 0
+#define pgtable_l5_enabled() 0
 
 #define PGDIR_SIZE	(1UL << PGDIR_SHIFT)
 #define PGDIR_MASK	(~(PGDIR_SIZE - 1))
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 877bc27..3c5385f 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -220,7 +220,7 @@ static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
 {
 	pgd_t pgd;
 
-	if (pgtable_l5_enabled || !IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) {
+	if (pgtable_l5_enabled() || !IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) {
 		*p4dp = p4d;
 		return;
 	}
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index adb4755..054765a 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -22,12 +22,23 @@ typedef struct { pteval_t pte; } pte_t;
 
 #ifdef CONFIG_X86_5LEVEL
 extern unsigned int __pgtable_l5_enabled;
-#ifndef pgtable_l5_enabled
-#define pgtable_l5_enabled cpu_feature_enabled(X86_FEATURE_LA57)
-#endif
+
+#ifdef USE_EARLY_PGTABLE_L5
+/*
+ * cpu_feature_enabled() is not available in early boot code.
+ * Use variable instead.
+ */
+static inline bool pgtable_l5_enabled(void)
+{
+	return __pgtable_l5_enabled;
+}
 #else
-#define pgtable_l5_enabled 0
-#endif
+#define pgtable_l5_enabled() cpu_feature_enabled(X86_FEATURE_LA57)
+#endif /* USE_EARLY_PGTABLE_L5 */
+
+#else
+#define pgtable_l5_enabled() 0
+#endif /* CONFIG_X86_5LEVEL */
 
 extern unsigned int pgdir_shift;
 extern unsigned int ptrs_per_p4d;
@@ -102,7 +113,7 @@ extern unsigned int ptrs_per_p4d;
 
 #define LDT_PGD_ENTRY_L4	-3UL
 #define LDT_PGD_ENTRY_L5	-112UL
-#define LDT_PGD_ENTRY		(pgtable_l5_enabled ? LDT_PGD_ENTRY_L5 : LDT_PGD_ENTRY_L4)
+#define LDT_PGD_ENTRY		(pgtable_l5_enabled() ? LDT_PGD_ENTRY_L5 : LDT_PGD_ENTRY_L4)
 #define LDT_BASE_ADDR		(LDT_PGD_ENTRY << PGDIR_SHIFT)
 
 #define __VMALLOC_BASE_L4	0xffffc90000000000UL
@@ -116,7 +127,7 @@ extern unsigned int ptrs_per_p4d;
 
 #ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT
 # define VMALLOC_START		vmalloc_base
-# define VMALLOC_SIZE_TB	(pgtable_l5_enabled ? VMALLOC_SIZE_TB_L5 : VMALLOC_SIZE_TB_L4)
+# define VMALLOC_SIZE_TB	(pgtable_l5_enabled() ? VMALLOC_SIZE_TB_L5 : VMALLOC_SIZE_TB_L4)
 # define VMEMMAP_START		vmemmap_base
 #else
 # define VMALLOC_START		__VMALLOC_BASE_L4
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 21a1149..e28add6 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -186,15 +186,6 @@ extern void identify_boot_cpu(void);
 extern void identify_secondary_cpu(struct cpuinfo_x86 *);
 extern void print_cpu_info(struct cpuinfo_x86 *);
 void print_cpu_msr(struct cpuinfo_x86 *);
-extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
-extern u32 get_scattered_cpuid_leaf(unsigned int level,
-				    unsigned int sub_leaf,
-				    enum cpuid_regs_idx reg);
-extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
-extern void init_amd_cacheinfo(struct cpuinfo_x86 *c);
-
-extern void detect_extended_topology(struct cpuinfo_x86 *c);
-extern void detect_ht(struct cpuinfo_x86 *c);
 
 #ifdef CONFIG_X86_32
 extern int have_cpuid_p(void);
diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h
index 5e16b5d..3e70bed 100644
--- a/arch/x86/include/asm/qspinlock.h
+++ b/arch/x86/include/asm/qspinlock.h
@@ -7,6 +7,14 @@
 #include <asm-generic/qspinlock_types.h>
 #include <asm/paravirt.h>
 
+#define _Q_PENDING_LOOPS	(1 << 9)
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+extern void __pv_init_lock_hash(void);
+extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+extern void __raw_callee_save___pv_queued_spin_unlock(struct qspinlock *lock);
+
 #define	queued_spin_unlock queued_spin_unlock
 /**
  * queued_spin_unlock - release a queued spinlock
@@ -16,15 +24,9 @@
  */
 static inline void native_queued_spin_unlock(struct qspinlock *lock)
 {
-	smp_store_release((u8 *)lock, 0);
+	smp_store_release(&lock->locked, 0);
 }
 
-#ifdef CONFIG_PARAVIRT_SPINLOCKS
-extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
-extern void __pv_init_lock_hash(void);
-extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
-extern void __raw_callee_save___pv_queued_spin_unlock(struct qspinlock *lock);
-
 static inline void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
 {
 	pv_queued_spin_lock_slowpath(lock, val);
@@ -40,11 +42,6 @@ static inline bool vcpu_is_preempted(long cpu)
 {
 	return pv_vcpu_is_preempted(cpu);
 }
-#else
-static inline void queued_spin_unlock(struct qspinlock *lock)
-{
-	native_queued_spin_unlock(lock);
-}
 #endif
 
 #ifdef CONFIG_PARAVIRT
diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h
index 923307e..9ef5ee0 100644
--- a/arch/x86/include/asm/qspinlock_paravirt.h
+++ b/arch/x86/include/asm/qspinlock_paravirt.h
@@ -22,8 +22,7 @@ PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath);
  *
  * void __pv_queued_spin_unlock(struct qspinlock *lock)
  * {
- *	struct __qspinlock *l = (void *)lock;
- *	u8 lockval = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
+ *	u8 lockval = cmpxchg(&lock->locked, _Q_LOCKED_VAL, 0);
  *
  *	if (likely(lockval == _Q_LOCKED_VAL))
  *		return;
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index f75bff8..547c4fe 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -171,7 +171,6 @@ static inline int wbinvd_on_all_cpus(void)
 	wbinvd();
 	return 0;
 }
-#define smp_num_siblings	1
 #endif /* CONFIG_SMP */
 
 extern unsigned disabled_cpus;
diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h
index 4617a2b..1992187 100644
--- a/arch/x86/include/asm/sparsemem.h
+++ b/arch/x86/include/asm/sparsemem.h
@@ -27,8 +27,8 @@
 # endif
 #else /* CONFIG_X86_32 */
 # define SECTION_SIZE_BITS	27 /* matt - 128 is convenient right now */
-# define MAX_PHYSADDR_BITS	(pgtable_l5_enabled ? 52 : 44)
-# define MAX_PHYSMEM_BITS	(pgtable_l5_enabled ? 52 : 46)
+# define MAX_PHYSADDR_BITS	(pgtable_l5_enabled() ? 52 : 44)
+# define MAX_PHYSMEM_BITS	(pgtable_l5_enabled() ? 52 : 46)
 #endif
 
 #endif /* CONFIG_SPARSEMEM */
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index 133d942..b6dc698 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -111,4 +111,6 @@ static inline unsigned long caller_frame_pointer(void)
 	return (unsigned long)frame;
 }
 
+void show_opcodes(u8 *rip, const char *loglvl);
+void show_ip(struct pt_regs *regs, const char *loglvl);
 #endif /* _ASM_X86_STACKTRACE_H */
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 533f74c..d33f92b 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -116,7 +116,8 @@ int strcmp(const char *cs, const char *ct);
 #endif
 
 #define __HAVE_ARCH_MEMCPY_MCSAFE 1
-__must_check int memcpy_mcsafe_unrolled(void *dst, const void *src, size_t cnt);
+__must_check unsigned long __memcpy_mcsafe(void *dst, const void *src,
+		size_t cnt);
 DECLARE_STATIC_KEY_FALSE(mcsafe_key);
 
 /**
@@ -131,14 +132,15 @@ DECLARE_STATIC_KEY_FALSE(mcsafe_key);
  * actually do machine check recovery. Everyone else can just
  * use memcpy().
  *
- * Return 0 for success, -EFAULT for fail
+ * Return 0 for success, or number of bytes not copied if there was an
+ * exception.
  */
-static __always_inline __must_check int
+static __always_inline __must_check unsigned long
 memcpy_mcsafe(void *dst, const void *src, size_t cnt)
 {
 #ifdef CONFIG_X86_MCE
 	if (static_branch_unlikely(&mcsafe_key))
-		return memcpy_mcsafe_unrolled(dst, src, cnt);
+		return __memcpy_mcsafe(dst, src, cnt);
 	else
 #endif
 		memcpy(dst, src, cnt);
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 62546b3..62acb61 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -47,6 +47,17 @@ copy_user_generic(void *to, const void *from, unsigned len)
 }
 
 static __always_inline __must_check unsigned long
+copy_to_user_mcsafe(void *to, const void *from, unsigned len)
+{
+	unsigned long ret;
+
+	__uaccess_begin();
+	ret = memcpy_mcsafe(to, from, len);
+	__uaccess_end();
+	return ret;
+}
+
+static __always_inline __must_check unsigned long
 raw_copy_from_user(void *dst, const void __user *src, unsigned long size)
 {
 	int ret = 0;
@@ -194,4 +205,7 @@ __copy_from_user_flushcache(void *dst, const void __user *src, unsigned size)
 unsigned long
 copy_user_handle_tail(char *to, char *from, unsigned len);
 
+unsigned long
+mcsafe_handle_tail(char *to, char *from, unsigned len);
+
 #endif /* _ASM_X86_UACCESS_64_H */
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index dfcbe69..5d0de79 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -1715,19 +1715,6 @@ static int proc_apm_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int proc_apm_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, proc_apm_show, NULL);
-}
-
-static const struct file_operations apm_file_ops = {
-	.owner		= THIS_MODULE,
-	.open		= proc_apm_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int apm(void *unused)
 {
 	unsigned short	bx;
@@ -2360,7 +2347,7 @@ static int __init apm_init(void)
 	set_desc_base(&gdt[APM_DS >> 3],
 		 (unsigned long)__va((unsigned long)apm_info.bios.dseg << 4));
 
-	proc_create("apm", 0, NULL, &apm_file_ops);
+	proc_create_single("apm", 0, NULL, proc_apm_show);
 
 	kapmd_task = kthread_create(apm, NULL, "kapmd");
 	if (IS_ERR(kapmd_task)) {
@@ -2446,7 +2433,7 @@ MODULE_PARM_DESC(idle_threshold,
 	"System idle percentage above which to make APM BIOS idle calls");
 module_param(idle_period, int, 0444);
 MODULE_PARM_DESC(idle_period,
-	"Period (in sec/100) over which to caculate the idle percentage");
+	"Period (in sec/100) over which to calculate the idle percentage");
 module_param(smp, bool, 0444);
 MODULE_PARM_DESC(smp,
 	"Set this to enable APM use on an SMP platform. Use with caution on older systems");
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index a66229f..7a40196 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -17,7 +17,7 @@ KCOV_INSTRUMENT_perf_event.o := n
 nostackp := $(call cc-option, -fno-stack-protector)
 CFLAGS_common.o		:= $(nostackp)
 
-obj-y			:= intel_cacheinfo.o scattered.o topology.o
+obj-y			:= cacheinfo.o scattered.o topology.o
 obj-y			+= common.o
 obj-y			+= rdrand.o
 obj-y			+= match.o
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 1b18be3..082d787 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -9,6 +9,7 @@
 #include <linux/random.h>
 #include <asm/processor.h>
 #include <asm/apic.h>
+#include <asm/cacheinfo.h>
 #include <asm/cpu.h>
 #include <asm/spec-ctrl.h>
 #include <asm/smp.h>
@@ -298,7 +299,6 @@ static int nearby_node(int apicid)
 }
 #endif
 
-#ifdef CONFIG_SMP
 /*
  * Fix up cpu_core_id for pre-F17h systems to be in the
  * [0 .. cores_per_node - 1] range. Not really needed but
@@ -328,6 +328,7 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
 
 	/* get information required for multi-node processors */
 	if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
+		int err;
 		u32 eax, ebx, ecx, edx;
 
 		cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
@@ -346,21 +347,15 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
 		}
 
 		/*
-		 * We may have multiple LLCs if L3 caches exist, so check if we
-		 * have an L3 cache by looking at the L3 cache CPUID leaf.
+		 * In case leaf B is available, use it to derive
+		 * topology information.
 		 */
-		if (cpuid_edx(0x80000006)) {
-			if (c->x86 == 0x17) {
-				/*
-				 * LLC is at the core complex level.
-				 * Core complex id is ApicId[3].
-				 */
-				per_cpu(cpu_llc_id, cpu) = c->apicid >> 3;
-			} else {
-				/* LLC is at the node level. */
-				per_cpu(cpu_llc_id, cpu) = node_id;
-			}
-		}
+		err = detect_extended_topology(c);
+		if (!err)
+			c->x86_coreid_bits = get_count_order(c->x86_max_cores);
+
+		cacheinfo_amd_init_llc_id(c, cpu, node_id);
+
 	} else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) {
 		u64 value;
 
@@ -376,7 +371,6 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
 		legacy_fixup_core_id(c);
 	}
 }
-#endif
 
 /*
  * On a AMD dual core setup the lower bits of the APIC id distinguish the cores.
@@ -384,7 +378,6 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
  */
 static void amd_detect_cmp(struct cpuinfo_x86 *c)
 {
-#ifdef CONFIG_SMP
 	unsigned bits;
 	int cpu = smp_processor_id();
 
@@ -395,17 +388,11 @@ static void amd_detect_cmp(struct cpuinfo_x86 *c)
 	c->phys_proc_id = c->initial_apicid >> bits;
 	/* use socket ID also for last level cache */
 	per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
-	amd_get_topology(c);
-#endif
 }
 
 u16 amd_get_nb_id(int cpu)
 {
-	u16 id = 0;
-#ifdef CONFIG_SMP
-	id = per_cpu(cpu_llc_id, cpu);
-#endif
-	return id;
+	return per_cpu(cpu_llc_id, cpu);
 }
 EXPORT_SYMBOL_GPL(amd_get_nb_id);
 
@@ -864,6 +851,7 @@ static void init_amd(struct cpuinfo_x86 *c)
 	/* Multi core CPU? */
 	if (c->extended_cpuid_level >= 0x80000008) {
 		amd_detect_cmp(c);
+		amd_get_topology(c);
 		srat_detect_node(c);
 	}
 
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c
index 54d04d5..38354c6 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/cacheinfo.c
@@ -20,6 +20,8 @@
 #include <asm/amd_nb.h>
 #include <asm/smp.h>
 
+#include "cpu.h"
+
 #define LVL_1_INST	1
 #define LVL_1_DATA	2
 #define LVL_2		3
@@ -637,6 +639,45 @@ static int find_num_cache_leaves(struct cpuinfo_x86 *c)
 	return i;
 }
 
+void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id)
+{
+	/*
+	 * We may have multiple LLCs if L3 caches exist, so check if we
+	 * have an L3 cache by looking at the L3 cache CPUID leaf.
+	 */
+	if (!cpuid_edx(0x80000006))
+		return;
+
+	if (c->x86 < 0x17) {
+		/* LLC is at the node level. */
+		per_cpu(cpu_llc_id, cpu) = node_id;
+	} else if (c->x86 == 0x17 &&
+		   c->x86_model >= 0 && c->x86_model <= 0x1F) {
+		/*
+		 * LLC is at the core complex level.
+		 * Core complex ID is ApicId[3] for these processors.
+		 */
+		per_cpu(cpu_llc_id, cpu) = c->apicid >> 3;
+	} else {
+		/*
+		 * LLC ID is calculated from the number of threads sharing the
+		 * cache.
+		 * */
+		u32 eax, ebx, ecx, edx, num_sharing_cache = 0;
+		u32 llc_index = find_num_cache_leaves(c) - 1;
+
+		cpuid_count(0x8000001d, llc_index, &eax, &ebx, &ecx, &edx);
+		if (eax)
+			num_sharing_cache = ((eax >> 14) & 0xfff) + 1;
+
+		if (num_sharing_cache) {
+			int bits = get_count_order(num_sharing_cache) - 1;
+
+			per_cpu(cpu_llc_id, cpu) = c->apicid >> bits;
+		}
+	}
+}
+
 void init_amd_cacheinfo(struct cpuinfo_x86 *c)
 {
 
@@ -650,7 +691,7 @@ void init_amd_cacheinfo(struct cpuinfo_x86 *c)
 	}
 }
 
-unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c)
+void init_intel_cacheinfo(struct cpuinfo_x86 *c)
 {
 	/* Cache sizes */
 	unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0;
@@ -802,7 +843,8 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c)
 
 	c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d));
 
-	return l2;
+	if (!l2)
+		cpu_detect_cache_sizes(c);
 }
 
 static int __cache_amd_cpumap_setup(unsigned int cpu, int index,
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index e5ec0f1..14433ff 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -18,6 +18,13 @@
 #define RNG_ENABLED	(1 << 3)
 #define RNG_ENABLE	(1 << 6)	/* MSR_VIA_RNG */
 
+#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW	0x00200000
+#define X86_VMX_FEATURE_PROC_CTLS_VNMI		0x00400000
+#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS	0x80000000
+#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC	0x00000001
+#define X86_VMX_FEATURE_PROC_CTLS2_EPT		0x00000002
+#define X86_VMX_FEATURE_PROC_CTLS2_VPID		0x00000020
+
 static void init_c3(struct cpuinfo_x86 *c)
 {
 	u32  lo, hi;
@@ -112,6 +119,31 @@ static void early_init_centaur(struct cpuinfo_x86 *c)
 	}
 }
 
+static void centaur_detect_vmx_virtcap(struct cpuinfo_x86 *c)
+{
+	u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2;
+
+	rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high);
+	msr_ctl = vmx_msr_high | vmx_msr_low;
+
+	if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)
+		set_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
+	if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI)
+		set_cpu_cap(c, X86_FEATURE_VNMI);
+	if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) {
+		rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
+		      vmx_msr_low, vmx_msr_high);
+		msr_ctl2 = vmx_msr_high | vmx_msr_low;
+		if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) &&
+		    (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW))
+			set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
+		if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT)
+			set_cpu_cap(c, X86_FEATURE_EPT);
+		if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID)
+			set_cpu_cap(c, X86_FEATURE_VPID);
+	}
+}
+
 static void init_centaur(struct cpuinfo_x86 *c)
 {
 #ifdef CONFIG_X86_32
@@ -128,6 +160,24 @@ static void init_centaur(struct cpuinfo_x86 *c)
 	clear_cpu_cap(c, 0*32+31);
 #endif
 	early_init_centaur(c);
+	init_intel_cacheinfo(c);
+	detect_num_cpu_cores(c);
+#ifdef CONFIG_X86_32
+	detect_ht(c);
+#endif
+
+	if (c->cpuid_level > 9) {
+		unsigned int eax = cpuid_eax(10);
+
+		/*
+		 * Check for version and the number of counters
+		 * Version(eax[7:0]) can't be 0;
+		 * Counters(eax[15:8]) should be greater than 1;
+		 */
+		if ((eax & 0xff) && (((eax >> 8) & 0xff) > 1))
+			set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
+	}
+
 	switch (c->x86) {
 #ifdef CONFIG_X86_32
 	case 5:
@@ -199,6 +249,9 @@ static void init_centaur(struct cpuinfo_x86 *c)
 #ifdef CONFIG_X86_64
 	set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
 #endif
+
+	if (cpu_has(c, X86_FEATURE_VMX))
+		centaur_detect_vmx_virtcap(c);
 }
 
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 38276f5..95c8e50 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -66,6 +66,13 @@ cpumask_var_t cpu_callin_mask;
 /* representing cpus for which sibling maps can be computed */
 cpumask_var_t cpu_sibling_setup_mask;
 
+/* Number of siblings per CPU package */
+int smp_num_siblings = 1;
+EXPORT_SYMBOL(smp_num_siblings);
+
+/* Last level cache ID of each logical CPU */
+DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id) = BAD_APICID;
+
 /* correctly size the local cpu masks */
 void __init setup_cpu_local_masks(void)
 {
@@ -577,6 +584,19 @@ static void get_model_name(struct cpuinfo_x86 *c)
 	*(s + 1) = '\0';
 }
 
+void detect_num_cpu_cores(struct cpuinfo_x86 *c)
+{
+	unsigned int eax, ebx, ecx, edx;
+
+	c->x86_max_cores = 1;
+	if (!IS_ENABLED(CONFIG_SMP) || c->cpuid_level < 4)
+		return;
+
+	cpuid_count(4, 0, &eax, &ebx, &ecx, &edx);
+	if (eax & 0x1f)
+		c->x86_max_cores = (eax >> 26) + 1;
+}
+
 void cpu_detect_cache_sizes(struct cpuinfo_x86 *c)
 {
 	unsigned int n, dummy, ebx, ecx, edx, l2size;
@@ -1044,6 +1064,21 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
 	 */
 	setup_clear_cpu_cap(X86_FEATURE_PCID);
 #endif
+
+	/*
+	 * Later in the boot process pgtable_l5_enabled() relies on
+	 * cpu_feature_enabled(X86_FEATURE_LA57). If 5-level paging is not
+	 * enabled by this point we need to clear the feature bit to avoid
+	 * false-positives at the later stage.
+	 *
+	 * pgtable_l5_enabled() can be false here for several reasons:
+	 *  - 5-level paging is disabled compile-time;
+	 *  - it's 32-bit kernel;
+	 *  - machine doesn't support 5-level paging;
+	 *  - user specified 'no5lvl' in kernel command line.
+	 */
+	if (!pgtable_l5_enabled())
+		setup_clear_cpu_cap(X86_FEATURE_LA57);
 }
 
 void __init early_cpu_init(void)
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 37672d2..38216f6 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -47,6 +47,16 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[],
 
 extern void get_cpu_cap(struct cpuinfo_x86 *c);
 extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
+extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
+extern u32 get_scattered_cpuid_leaf(unsigned int level,
+				    unsigned int sub_leaf,
+				    enum cpuid_regs_idx reg);
+extern void init_intel_cacheinfo(struct cpuinfo_x86 *c);
+extern void init_amd_cacheinfo(struct cpuinfo_x86 *c);
+
+extern void detect_num_cpu_cores(struct cpuinfo_x86 *c);
+extern int detect_extended_topology(struct cpuinfo_x86 *c);
+extern void detect_ht(struct cpuinfo_x86 *c);
 
 unsigned int aperfmperf_get_khz(int cpu);
 
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 577e7f7..eb75564 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -456,24 +456,6 @@ static void srat_detect_node(struct cpuinfo_x86 *c)
 #endif
 }
 
-/*
- * find out the number of processor cores on the die
- */
-static int intel_num_cpu_cores(struct cpuinfo_x86 *c)
-{
-	unsigned int eax, ebx, ecx, edx;
-
-	if (!IS_ENABLED(CONFIG_SMP) || c->cpuid_level < 4)
-		return 1;
-
-	/* Intel has a non-standard dependency on %ecx for this CPUID level. */
-	cpuid_count(4, 0, &eax, &ebx, &ecx, &edx);
-	if (eax & 0x1f)
-		return (eax >> 26) + 1;
-	else
-		return 1;
-}
-
 static void detect_vmx_virtcap(struct cpuinfo_x86 *c)
 {
 	/* Intel VMX MSR indicated features */
@@ -656,8 +638,6 @@ static void init_intel_misc_features(struct cpuinfo_x86 *c)
 
 static void init_intel(struct cpuinfo_x86 *c)
 {
-	unsigned int l2 = 0;
-
 	early_init_intel(c);
 
 	intel_workarounds(c);
@@ -674,19 +654,13 @@ static void init_intel(struct cpuinfo_x86 *c)
 		 * let's use the legacy cpuid vector 0x1 and 0x4 for topology
 		 * detection.
 		 */
-		c->x86_max_cores = intel_num_cpu_cores(c);
+		detect_num_cpu_cores(c);
 #ifdef CONFIG_X86_32
 		detect_ht(c);
 #endif
 	}
 
-	l2 = init_intel_cacheinfo(c);
-
-	/* Detect legacy cache sizes if init_intel_cacheinfo did not */
-	if (l2 == 0) {
-		cpu_detect_cache_sizes(c);
-		l2 = c->x86_cache_size;
-	}
+	init_intel_cacheinfo(c);
 
 	if (c->cpuid_level > 9) {
 		unsigned eax = cpuid_eax(10);
@@ -699,7 +673,8 @@ static void init_intel(struct cpuinfo_x86 *c)
 		set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
 
 	if (boot_cpu_has(X86_FEATURE_DS)) {
-		unsigned int l1;
+		unsigned int l1, l2;
+
 		rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
 		if (!(l1 & (1<<11)))
 			set_cpu_cap(c, X86_FEATURE_BTS);
@@ -727,6 +702,7 @@ static void init_intel(struct cpuinfo_x86 *c)
 	 * Dixon is NOT a Celeron.
 	 */
 	if (c->x86 == 6) {
+		unsigned int l2 = c->x86_cache_size;
 		char *p = NULL;
 
 		switch (c->x86_model) {
diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile
index ad9e5ed..2ad9107 100644
--- a/arch/x86/kernel/cpu/mtrr/Makefile
+++ b/arch/x86/kernel/cpu/mtrr/Makefile
@@ -1,3 +1,3 @@
-obj-y		:= main.o if.o generic.o cleanup.o
+obj-y		:= mtrr.o if.o generic.o cleanup.o
 obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o
 
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/mtrr.c
index 7468de4..9a19c80 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.c
@@ -46,6 +46,7 @@
 #include <linux/pci.h>
 #include <linux/smp.h>
 #include <linux/syscore_ops.h>
+#include <linux/rcupdate.h>
 
 #include <asm/cpufeature.h>
 #include <asm/e820/api.h>
@@ -100,7 +101,7 @@ static int have_wrcomb(void)
 		if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS &&
 		    dev->device == PCI_DEVICE_ID_SERVERWORKS_LE &&
 		    dev->revision <= 5) {
-			pr_info("mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n");
+			pr_info("Serverworks LE rev < 6 detected. Write-combining disabled.\n");
 			pci_dev_put(dev);
 			return 0;
 		}
@@ -110,7 +111,7 @@ static int have_wrcomb(void)
 		 */
 		if (dev->vendor == PCI_VENDOR_ID_INTEL &&
 		    dev->device == PCI_DEVICE_ID_INTEL_82451NX) {
-			pr_info("mtrr: Intel 450NX MMC detected. Write-combining disabled.\n");
+			pr_info("Intel 450NX MMC detected. Write-combining disabled.\n");
 			pci_dev_put(dev);
 			return 0;
 		}
@@ -312,24 +313,24 @@ int mtrr_add_page(unsigned long base, unsigned long size,
 		return error;
 
 	if (type >= MTRR_NUM_TYPES) {
-		pr_warn("mtrr: type: %u invalid\n", type);
+		pr_warn("type: %u invalid\n", type);
 		return -EINVAL;
 	}
 
 	/* If the type is WC, check that this processor supports it */
 	if ((type == MTRR_TYPE_WRCOMB) && !have_wrcomb()) {
-		pr_warn("mtrr: your processor doesn't support write-combining\n");
+		pr_warn("your processor doesn't support write-combining\n");
 		return -ENOSYS;
 	}
 
 	if (!size) {
-		pr_warn("mtrr: zero sized request\n");
+		pr_warn("zero sized request\n");
 		return -EINVAL;
 	}
 
 	if ((base | (base + size - 1)) >>
 	    (boot_cpu_data.x86_phys_bits - PAGE_SHIFT)) {
-		pr_warn("mtrr: base or size exceeds the MTRR width\n");
+		pr_warn("base or size exceeds the MTRR width\n");
 		return -EINVAL;
 	}
 
@@ -360,8 +361,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
 				} else if (types_compatible(type, ltype))
 					continue;
 			}
-			pr_warn("mtrr: 0x%lx000,0x%lx000 overlaps existing"
-				" 0x%lx000,0x%lx000\n", base, size, lbase,
+			pr_warn("0x%lx000,0x%lx000 overlaps existing 0x%lx000,0x%lx000\n", base, size, lbase,
 				lsize);
 			goto out;
 		}
@@ -369,7 +369,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
 		if (ltype != type) {
 			if (types_compatible(type, ltype))
 				continue;
-			pr_warn("mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n",
+			pr_warn("type mismatch for %lx000,%lx000 old: %s new: %s\n",
 				base, size, mtrr_attrib_to_str(ltype),
 				mtrr_attrib_to_str(type));
 			goto out;
@@ -395,7 +395,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
 			}
 		}
 	} else {
-		pr_info("mtrr: no more MTRRs available\n");
+		pr_info("no more MTRRs available\n");
 	}
 	error = i;
  out:
@@ -407,8 +407,8 @@ int mtrr_add_page(unsigned long base, unsigned long size,
 static int mtrr_check(unsigned long base, unsigned long size)
 {
 	if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) {
-		pr_warn("mtrr: size and base must be multiples of 4 kiB\n");
-		pr_debug("mtrr: size: 0x%lx  base: 0x%lx\n", size, base);
+		pr_warn("size and base must be multiples of 4 kiB\n");
+		pr_debug("size: 0x%lx  base: 0x%lx\n", size, base);
 		dump_stack();
 		return -1;
 	}
@@ -499,22 +499,22 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
 			}
 		}
 		if (reg < 0) {
-			pr_debug("mtrr: no MTRR for %lx000,%lx000 found\n",
+			pr_debug("no MTRR for %lx000,%lx000 found\n",
 				 base, size);
 			goto out;
 		}
 	}
 	if (reg >= max) {
-		pr_warn("mtrr: register: %d too big\n", reg);
+		pr_warn("register: %d too big\n", reg);
 		goto out;
 	}
 	mtrr_if->get(reg, &lbase, &lsize, &ltype);
 	if (lsize < 1) {
-		pr_warn("mtrr: MTRR %d not used\n", reg);
+		pr_warn("MTRR %d not used\n", reg);
 		goto out;
 	}
 	if (mtrr_usage_table[reg] < 1) {
-		pr_warn("mtrr: reg: %d has count=0\n", reg);
+		pr_warn("reg: %d has count=0\n", reg);
 		goto out;
 	}
 	if (--mtrr_usage_table[reg] < 1)
@@ -775,7 +775,7 @@ void __init mtrr_bp_init(void)
 	}
 
 	if (!mtrr_enabled()) {
-		pr_info("MTRR: Disabled\n");
+		pr_info("Disabled\n");
 
 		/*
 		 * PAT initialization relies on MTRR's rendezvous handler.
@@ -793,6 +793,9 @@ void mtrr_ap_init(void)
 
 	if (!use_intel() || mtrr_aps_delayed_init)
 		return;
+
+	rcu_cpu_starting(smp_processor_id());
+
 	/*
 	 * Ideally we should hold mtrr_mutex here to avoid mtrr entries
 	 * changed, but this routine will be called in cpu boot time,
diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c
index b099024..81c0afb 100644
--- a/arch/x86/kernel/cpu/topology.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -27,7 +27,7 @@
  * exists, use it for populating initial_apicid and cpu topology
  * detection.
  */
-void detect_extended_topology(struct cpuinfo_x86 *c)
+int detect_extended_topology(struct cpuinfo_x86 *c)
 {
 #ifdef CONFIG_SMP
 	unsigned int eax, ebx, ecx, edx, sub_index;
@@ -36,7 +36,7 @@ void detect_extended_topology(struct cpuinfo_x86 *c)
 	static bool printed;
 
 	if (c->cpuid_level < 0xb)
-		return;
+		return -1;
 
 	cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
 
@@ -44,7 +44,7 @@ void detect_extended_topology(struct cpuinfo_x86 *c)
 	 * check if the cpuid leaf 0xb is actually implemented.
 	 */
 	if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE))
-		return;
+		return -1;
 
 	set_cpu_cap(c, X86_FEATURE_XTOPOLOGY);
 
@@ -95,6 +95,6 @@ void detect_extended_topology(struct cpuinfo_x86 *c)
 			       c->cpu_core_id);
 		printed = 1;
 	}
-	return;
 #endif
+	return 0;
 }
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 18fa9d7..666a284 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -22,11 +22,14 @@
 #include <asm/stacktrace.h>
 #include <asm/unwind.h>
 
+#define OPCODE_BUFSIZE 64
+
 int panic_on_unrecovered_nmi;
 int panic_on_io_nmi;
-static unsigned int code_bytes = 64;
 static int die_counter;
 
+static struct pt_regs exec_summary_regs;
+
 bool in_task_stack(unsigned long *stack, struct task_struct *task,
 		   struct stack_info *info)
 {
@@ -69,9 +72,62 @@ static void printk_stack_address(unsigned long address, int reliable,
 	printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address);
 }
 
+/*
+ * There are a couple of reasons for the 2/3rd prologue, courtesy of Linus:
+ *
+ * In case where we don't have the exact kernel image (which, if we did, we can
+ * simply disassemble and navigate to the RIP), the purpose of the bigger
+ * prologue is to have more context and to be able to correlate the code from
+ * the different toolchains better.
+ *
+ * In addition, it helps in recreating the register allocation of the failing
+ * kernel and thus make sense of the register dump.
+ *
+ * What is more, the additional complication of a variable length insn arch like
+ * x86 warrants having longer byte sequence before rIP so that the disassembler
+ * can "sync" up properly and find instruction boundaries when decoding the
+ * opcode bytes.
+ *
+ * Thus, the 2/3rds prologue and 64 byte OPCODE_BUFSIZE is just a random
+ * guesstimate in attempt to achieve all of the above.
+ */
+void show_opcodes(u8 *rip, const char *loglvl)
+{
+	unsigned int code_prologue = OPCODE_BUFSIZE * 2 / 3;
+	u8 opcodes[OPCODE_BUFSIZE];
+	u8 *ip;
+	int i;
+
+	printk("%sCode: ", loglvl);
+
+	ip = (u8 *)rip - code_prologue;
+	if (probe_kernel_read(opcodes, ip, OPCODE_BUFSIZE)) {
+		pr_cont("Bad RIP value.\n");
+		return;
+	}
+
+	for (i = 0; i < OPCODE_BUFSIZE; i++, ip++) {
+		if (ip == rip)
+			pr_cont("<%02x> ", opcodes[i]);
+		else
+			pr_cont("%02x ", opcodes[i]);
+	}
+	pr_cont("\n");
+}
+
+void show_ip(struct pt_regs *regs, const char *loglvl)
+{
+#ifdef CONFIG_X86_32
+	printk("%sEIP: %pS\n", loglvl, (void *)regs->ip);
+#else
+	printk("%sRIP: %04x:%pS\n", loglvl, (int)regs->cs, (void *)regs->ip);
+#endif
+	show_opcodes((u8 *)regs->ip, loglvl);
+}
+
 void show_iret_regs(struct pt_regs *regs)
 {
-	printk(KERN_DEFAULT "RIP: %04x:%pS\n", (int)regs->cs, (void *)regs->ip);
+	show_ip(regs, KERN_DEFAULT);
 	printk(KERN_DEFAULT "RSP: %04x:%016lx EFLAGS: %08lx", (int)regs->ss,
 		regs->sp, regs->flags);
 }
@@ -267,7 +323,6 @@ unsigned long oops_begin(void)
 	bust_spinlocks(1);
 	return flags;
 }
-EXPORT_SYMBOL_GPL(oops_begin);
 NOKPROBE_SYMBOL(oops_begin);
 
 void __noreturn rewind_stack_do_exit(int signr);
@@ -287,6 +342,9 @@ void oops_end(unsigned long flags, struct pt_regs *regs, int signr)
 	raw_local_irq_restore(flags);
 	oops_exit();
 
+	/* Executive summary in case the oops scrolled away */
+	__show_regs(&exec_summary_regs, true);
+
 	if (!signr)
 		return;
 	if (in_interrupt())
@@ -305,10 +363,10 @@ NOKPROBE_SYMBOL(oops_end);
 
 int __die(const char *str, struct pt_regs *regs, long err)
 {
-#ifdef CONFIG_X86_32
-	unsigned short ss;
-	unsigned long sp;
-#endif
+	/* Save the regs of the first oops for the executive summary later. */
+	if (!die_counter)
+		exec_summary_regs = *regs;
+
 	printk(KERN_DEFAULT
 	       "%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter,
 	       IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT"         : "",
@@ -318,26 +376,13 @@ int __die(const char *str, struct pt_regs *regs, long err)
 	       IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION) ?
 	       (boot_cpu_has(X86_FEATURE_PTI) ? " PTI" : " NOPTI") : "");
 
+	show_regs(regs);
+	print_modules();
+
 	if (notify_die(DIE_OOPS, str, regs, err,
 			current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP)
 		return 1;
 
-	print_modules();
-	show_regs(regs);
-#ifdef CONFIG_X86_32
-	if (user_mode(regs)) {
-		sp = regs->sp;
-		ss = regs->ss;
-	} else {
-		sp = kernel_stack_pointer(regs);
-		savesegment(ss, ss);
-	}
-	printk(KERN_EMERG "EIP: %pS SS:ESP: %04x:%08lx\n",
-	       (void *)regs->ip, ss, sp);
-#else
-	/* Executive summary in case the oops scrolled away */
-	printk(KERN_ALERT "RIP: %pS RSP: %016lx\n", (void *)regs->ip, regs->sp);
-#endif
 	return 0;
 }
 NOKPROBE_SYMBOL(__die);
@@ -356,30 +401,9 @@ void die(const char *str, struct pt_regs *regs, long err)
 	oops_end(flags, regs, sig);
 }
 
-static int __init code_bytes_setup(char *s)
-{
-	ssize_t ret;
-	unsigned long val;
-
-	if (!s)
-		return -EINVAL;
-
-	ret = kstrtoul(s, 0, &val);
-	if (ret)
-		return ret;
-
-	code_bytes = val;
-	if (code_bytes > 8192)
-		code_bytes = 8192;
-
-	return 1;
-}
-__setup("code_bytes=", code_bytes_setup);
-
 void show_regs(struct pt_regs *regs)
 {
 	bool all = true;
-	int i;
 
 	show_regs_print_info(KERN_DEFAULT);
 
@@ -389,36 +413,8 @@ void show_regs(struct pt_regs *regs)
 	__show_regs(regs, all);
 
 	/*
-	 * When in-kernel, we also print out the stack and code at the
-	 * time of the fault..
+	 * When in-kernel, we also print out the stack at the time of the fault..
 	 */
-	if (!user_mode(regs)) {
-		unsigned int code_prologue = code_bytes * 43 / 64;
-		unsigned int code_len = code_bytes;
-		unsigned char c;
-		u8 *ip;
-
+	if (!user_mode(regs))
 		show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT);
-
-		printk(KERN_DEFAULT "Code: ");
-
-		ip = (u8 *)regs->ip - code_prologue;
-		if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
-			/* try starting at IP */
-			ip = (u8 *)regs->ip;
-			code_len = code_len - code_prologue + 1;
-		}
-		for (i = 0; i < code_len; i++, ip++) {
-			if (ip < (u8 *)PAGE_OFFSET ||
-					probe_kernel_address(ip, c)) {
-				pr_cont(" Bad RIP value.");
-				break;
-			}
-			if (ip == (u8 *)regs->ip)
-				pr_cont("<%02x> ", c);
-			else
-				pr_cont("%02x ", c);
-		}
-	}
-	pr_cont("\n");
 }
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 6a2cb14..d1f25c8 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -155,7 +155,8 @@ static void __init __e820__range_add(struct e820_table *table, u64 start, u64 si
 	int x = table->nr_entries;
 
 	if (x >= ARRAY_SIZE(table->entries)) {
-		pr_err("e820: too many entries; ignoring [mem %#010llx-%#010llx]\n", start, start + size - 1);
+		pr_err("too many entries; ignoring [mem %#010llx-%#010llx]\n",
+		       start, start + size - 1);
 		return;
 	}
 
@@ -190,9 +191,10 @@ void __init e820__print_table(char *who)
 	int i;
 
 	for (i = 0; i < e820_table->nr_entries; i++) {
-		pr_info("%s: [mem %#018Lx-%#018Lx] ", who,
-		       e820_table->entries[i].addr,
-		       e820_table->entries[i].addr + e820_table->entries[i].size - 1);
+		pr_info("%s: [mem %#018Lx-%#018Lx] ",
+			who,
+			e820_table->entries[i].addr,
+			e820_table->entries[i].addr + e820_table->entries[i].size - 1);
 
 		e820_print_type(e820_table->entries[i].type);
 		pr_cont("\n");
@@ -574,7 +576,7 @@ void __init e820__update_table_print(void)
 	if (e820__update_table(e820_table))
 		return;
 
-	pr_info("e820: modified physical RAM map:\n");
+	pr_info("modified physical RAM map:\n");
 	e820__print_table("modified");
 }
 
@@ -636,9 +638,8 @@ __init void e820__setup_pci_gap(void)
 	if (!found) {
 #ifdef CONFIG_X86_64
 		gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024;
-		pr_err(
-			"e820: Cannot find an available gap in the 32-bit address range\n"
-			"e820: PCI devices with unassigned 32-bit BARs may not work!\n");
+		pr_err("Cannot find an available gap in the 32-bit address range\n");
+		pr_err("PCI devices with unassigned 32-bit BARs may not work!\n");
 #else
 		gapstart = 0x10000000;
 #endif
@@ -649,7 +650,8 @@ __init void e820__setup_pci_gap(void)
 	 */
 	pci_mem_start = gapstart;
 
-	pr_info("e820: [mem %#010lx-%#010lx] available for PCI devices\n", gapstart, gapstart + gapsize - 1);
+	pr_info("[mem %#010lx-%#010lx] available for PCI devices\n",
+		gapstart, gapstart + gapsize - 1);
 }
 
 /*
@@ -711,7 +713,7 @@ void __init e820__memory_setup_extended(u64 phys_addr, u32 data_len)
 	memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware));
 
 	early_memunmap(sdata, data_len);
-	pr_info("e820: extended physical RAM map:\n");
+	pr_info("extended physical RAM map:\n");
 	e820__print_table("extended");
 }
 
@@ -780,7 +782,7 @@ u64 __init e820__memblock_alloc_reserved(u64 size, u64 align)
 	addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
 	if (addr) {
 		e820__range_update_kexec(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED);
-		pr_info("e820: update e820_table_kexec for e820__memblock_alloc_reserved()\n");
+		pr_info("update e820_table_kexec for e820__memblock_alloc_reserved()\n");
 		e820__update_table_kexec();
 	}
 
@@ -830,8 +832,8 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, enum e820_type
 	if (last_pfn > max_arch_pfn)
 		last_pfn = max_arch_pfn;
 
-	pr_info("e820: last_pfn = %#lx max_arch_pfn = %#lx\n",
-			 last_pfn, max_arch_pfn);
+	pr_info("last_pfn = %#lx max_arch_pfn = %#lx\n",
+		last_pfn, max_arch_pfn);
 	return last_pfn;
 }
 
@@ -1005,7 +1007,7 @@ void __init e820__finish_early_params(void)
 		if (e820__update_table(e820_table) < 0)
 			early_panic("Invalid user supplied memory map");
 
-		pr_info("e820: user-defined physical RAM map:\n");
+		pr_info("user-defined physical RAM map:\n");
 		e820__print_table("user");
 	}
 }
@@ -1238,7 +1240,7 @@ void __init e820__memory_setup(void)
 	memcpy(e820_table_kexec, e820_table, sizeof(*e820_table_kexec));
 	memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware));
 
-	pr_info("e820: BIOS-provided physical RAM map:\n");
+	pr_info("BIOS-provided physical RAM map:\n");
 	e820__print_table(who);
 }
 
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index bae0d32..da5d8ac 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -28,8 +28,6 @@
 #include <asm/irq_remapping.h>
 #include <asm/early_ioremap.h>
 
-#define dev_err(msg)  pr_err("pci 0000:%02x:%02x.%d: %s", bus, slot, func, msg)
-
 static void __init fix_hypertransport_config(int num, int slot, int func)
 {
 	u32 htcfg;
@@ -617,7 +615,8 @@ static void __init apple_airport_reset(int bus, int slot, int func)
 
 		pmcsr = read_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL);
 		if ((pmcsr & PCI_PM_CTRL_STATE_MASK) != PCI_D0) {
-			dev_err("Cannot power up Apple AirPort card\n");
+			pr_err("pci 0000:%02x:%02x.%d: Cannot power up Apple AirPort card\n",
+			       bus, slot, func);
 			return;
 		}
 	}
@@ -628,7 +627,8 @@ static void __init apple_airport_reset(int bus, int slot, int func)
 
 	mmio = early_ioremap(addr, BCM4331_MMIO_SIZE);
 	if (!mmio) {
-		dev_err("Cannot iomap Apple AirPort card\n");
+		pr_err("pci 0000:%02x:%02x.%d: Cannot iomap Apple AirPort card\n",
+		       bus, slot, func);
 		return;
 	}
 
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 2d29e47..a21d6ac 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -6,6 +6,10 @@
  */
 
 #define DISABLE_BRANCH_PROFILING
+
+/* cpu_feature_enabled() cannot be used this early */
+#define USE_EARLY_PGTABLE_L5
+
 #include <linux/init.h>
 #include <linux/linkage.h>
 #include <linux/types.h>
@@ -32,11 +36,6 @@
 #include <asm/microcode.h>
 #include <asm/kasan.h>
 
-#ifdef CONFIG_X86_5LEVEL
-#undef pgtable_l5_enabled
-#define pgtable_l5_enabled __pgtable_l5_enabled
-#endif
-
 /*
  * Manage page tables very early on.
  */
@@ -45,8 +44,7 @@ static unsigned int __initdata next_early_pgt;
 pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);
 
 #ifdef CONFIG_X86_5LEVEL
-unsigned int __pgtable_l5_enabled __ro_after_init;
-EXPORT_SYMBOL(__pgtable_l5_enabled);
+unsigned int __pgtable_l5_enabled __initdata;
 unsigned int pgdir_shift __ro_after_init = 39;
 EXPORT_SYMBOL(pgdir_shift);
 unsigned int ptrs_per_p4d __ro_after_init = 1;
@@ -82,13 +80,14 @@ static unsigned int __head *fixup_int(void *ptr, unsigned long physaddr)
 
 static bool __head check_la57_support(unsigned long physaddr)
 {
-	if (native_cpuid_eax(0) < 7)
-		return false;
-
-	if (!(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31))))
+	/*
+	 * 5-level paging is detected and enabled at kernel decomression
+	 * stage. Only check if it has been enabled there.
+	 */
+	if (!(native_read_cr4() & X86_CR4_LA57))
 		return false;
 
-	*fixup_int(&pgtable_l5_enabled, physaddr) = 1;
+	*fixup_int(&__pgtable_l5_enabled, physaddr) = 1;
 	*fixup_int(&pgdir_shift, physaddr) = 48;
 	*fixup_int(&ptrs_per_p4d, physaddr) = 512;
 	*fixup_long(&page_offset_base, physaddr) = __PAGE_OFFSET_BASE_L5;
@@ -281,7 +280,7 @@ again:
 	 * critical -- __PAGE_OFFSET would point us back into the dynamic
 	 * range and we might end up looping forever...
 	 */
-	if (!pgtable_l5_enabled)
+	if (!pgtable_l5_enabled())
 		p4d_p = pgd_p;
 	else if (pgd)
 		p4d_p = (p4dval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 8ce4212..b6be34e 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -975,8 +975,7 @@ int __init hpet_enable(void)
 	cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY);
 	hpet_writel(cfg, HPET_CFG);
 	if (cfg)
-		pr_warn("HPET: Unrecognized bits %#x set in global cfg\n",
-			cfg);
+		pr_warn("Unrecognized bits %#x set in global cfg\n", cfg);
 
 	for (i = 0; i <= last; ++i) {
 		cfg = hpet_readl(HPET_Tn_CFG(i));
@@ -988,7 +987,7 @@ int __init hpet_enable(void)
 			 | HPET_TN_64BIT_CAP | HPET_TN_32BIT | HPET_TN_ROUTE
 			 | HPET_TN_FSB | HPET_TN_FSB_CAP);
 		if (cfg)
-			pr_warn("HPET: Unrecognized bits %#x set in cfg#%u\n",
+			pr_warn("Unrecognized bits %#x set in cfg#%u\n",
 				cfg, i);
 	}
 	hpet_print_config();
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 6010449..4c8acdf 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -354,7 +354,8 @@ void arch_crash_save_vmcoreinfo(void)
 {
 	VMCOREINFO_NUMBER(phys_base);
 	VMCOREINFO_SYMBOL(init_top_pgt);
-	VMCOREINFO_NUMBER(pgtable_l5_enabled);
+	vmcoreinfo_append_str("NUMBER(pgtable_l5_enabled)=%d\n",
+			pgtable_l5_enabled());
 
 #ifdef CONFIG_NUMA
 	VMCOREINFO_SYMBOL(node_data);
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 77625b6..ab5d9dd 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -15,13 +15,11 @@
 #include <asm/x86_init.h>
 #include <asm/iommu_table.h>
 
-static int forbid_dac __read_mostly;
+static bool disable_dac_quirk __read_mostly;
 
 const struct dma_map_ops *dma_ops = &dma_direct_ops;
 EXPORT_SYMBOL(dma_ops);
 
-static int iommu_sac_force __read_mostly;
-
 #ifdef CONFIG_IOMMU_DEBUG
 int panic_on_overflow __read_mostly = 1;
 int force_iommu __read_mostly = 1;
@@ -55,9 +53,6 @@ struct device x86_dma_fallback_dev = {
 };
 EXPORT_SYMBOL(x86_dma_fallback_dev);
 
-/* Number of entries preallocated for DMA-API debugging */
-#define PREALLOC_DMA_DEBUG_ENTRIES       65536
-
 void __init pci_iommu_alloc(void)
 {
 	struct iommu_table_entry *p;
@@ -76,7 +71,7 @@ void __init pci_iommu_alloc(void)
 	}
 }
 
-bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp)
+bool arch_dma_alloc_attrs(struct device **dev)
 {
 	if (!*dev)
 		*dev = &x86_dma_fallback_dev;
@@ -125,13 +120,13 @@ static __init int iommu_setup(char *p)
 		if (!strncmp(p, "nomerge", 7))
 			iommu_merge = 0;
 		if (!strncmp(p, "forcesac", 8))
-			iommu_sac_force = 1;
+			pr_warn("forcesac option ignored.\n");
 		if (!strncmp(p, "allowdac", 8))
-			forbid_dac = 0;
+			pr_warn("allowdac option ignored.\n");
 		if (!strncmp(p, "nodac", 5))
-			forbid_dac = 1;
+			pr_warn("nodac option ignored.\n");
 		if (!strncmp(p, "usedac", 6)) {
-			forbid_dac = -1;
+			disable_dac_quirk = true;
 			return 1;
 		}
 #ifdef CONFIG_SWIOTLB
@@ -156,40 +151,9 @@ static __init int iommu_setup(char *p)
 }
 early_param("iommu", iommu_setup);
 
-int arch_dma_supported(struct device *dev, u64 mask)
-{
-#ifdef CONFIG_PCI
-	if (mask > 0xffffffff && forbid_dac > 0) {
-		dev_info(dev, "PCI: Disallowing DAC for device\n");
-		return 0;
-	}
-#endif
-
-	/* Tell the device to use SAC when IOMMU force is on.  This
-	   allows the driver to use cheaper accesses in some cases.
-
-	   Problem with this is that if we overflow the IOMMU area and
-	   return DAC as fallback address the device may not handle it
-	   correctly.
-
-	   As a special case some controllers have a 39bit address
-	   mode that is as efficient as 32bit (aic79xx). Don't force
-	   SAC for these.  Assume all masks <= 40 bits are of this
-	   type. Normally this doesn't make any difference, but gives
-	   more gentle handling of IOMMU overflow. */
-	if (iommu_sac_force && (mask >= DMA_BIT_MASK(40))) {
-		dev_info(dev, "Force SAC with mask %Lx\n", mask);
-		return 0;
-	}
-
-	return 1;
-}
-EXPORT_SYMBOL(arch_dma_supported);
-
 static int __init pci_iommu_init(void)
 {
 	struct iommu_table_entry *p;
-	dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
 
 #ifdef CONFIG_PCI
 	dma_debug_add_bus(&pci_bus_type);
@@ -209,11 +173,17 @@ rootfs_initcall(pci_iommu_init);
 #ifdef CONFIG_PCI
 /* Many VIA bridges seem to corrupt data for DAC. Disable it here */
 
+static int via_no_dac_cb(struct pci_dev *pdev, void *data)
+{
+	pdev->dev.dma_32bit_limit = true;
+	return 0;
+}
+
 static void via_no_dac(struct pci_dev *dev)
 {
-	if (forbid_dac == 0) {
+	if (!disable_dac_quirk) {
 		dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n");
-		forbid_dac = 1;
+		pci_walk_bus(dev->subordinate, via_no_dac_cb, NULL);
 	}
 }
 DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID,
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 5224c60..0ae659d 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -76,16 +76,14 @@ void __show_regs(struct pt_regs *regs, int all)
 		savesegment(gs, gs);
 	}
 
-	printk(KERN_DEFAULT "EIP: %pS\n", (void *)regs->ip);
-	printk(KERN_DEFAULT "EFLAGS: %08lx CPU: %d\n", regs->flags,
-		raw_smp_processor_id());
+	show_ip(regs, KERN_DEFAULT);
 
 	printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
 		regs->ax, regs->bx, regs->cx, regs->dx);
 	printk(KERN_DEFAULT "ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n",
 		regs->si, regs->di, regs->bp, sp);
-	printk(KERN_DEFAULT " DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n",
-	       (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss);
+	printk(KERN_DEFAULT "DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x EFLAGS: %08lx\n",
+	       (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss, regs->flags);
 
 	if (!all)
 		return;
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index ed5c4cd..e2ee403 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1377,7 +1377,6 @@ static void fill_sigtrap_info(struct task_struct *tsk,
 	tsk->thread.trap_nr = X86_TRAP_DB;
 	tsk->thread.error_code = error_code;
 
-	memset(info, 0, sizeof(*info));
 	info->si_signo = SIGTRAP;
 	info->si_code = si_code;
 	info->si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL;
@@ -1395,6 +1394,7 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
 {
 	struct siginfo info;
 
+	clear_siginfo(&info);
 	fill_sigtrap_info(tsk, regs, error_code, si_code, &info);
 	/* Send us the fake SIGTRAP */
 	force_sig_info(SIGTRAP, &info, tsk);
diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c
index 14c057f..9ccbf05 100644
--- a/arch/x86/kernel/signal_compat.c
+++ b/arch/x86/kernel/signal_compat.c
@@ -29,7 +29,7 @@ static inline void signal_compat_build_tests(void)
 	BUILD_BUG_ON(NSIGFPE  != 15);
 	BUILD_BUG_ON(NSIGSEGV != 7);
 	BUILD_BUG_ON(NSIGBUS  != 5);
-	BUILD_BUG_ON(NSIGTRAP != 4);
+	BUILD_BUG_ON(NSIGTRAP != 5);
 	BUILD_BUG_ON(NSIGCHLD != 6);
 	BUILD_BUG_ON(NSIGSYS  != 1);
 
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 9dd324a..c2f7d1d 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -81,13 +81,6 @@
 #include <asm/cpu_device_id.h>
 #include <asm/spec-ctrl.h>
 
-/* Number of siblings per CPU package */
-int smp_num_siblings = 1;
-EXPORT_SYMBOL(smp_num_siblings);
-
-/* Last level cache ID of each logical CPU */
-DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id) = BAD_APICID;
-
 /* representing HT siblings of each logical CPU */
 DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
 EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 03f3d76..a535dd6 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -299,6 +299,7 @@ static void do_error_trap(struct pt_regs *regs, long error_code, char *str,
 	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) !=
 			NOTIFY_STOP) {
 		cond_local_irq_enable(regs);
+		clear_siginfo(&info);
 		do_trap(trapnr, signr, str, regs, error_code,
 			fill_trap_info(regs, signr, trapnr, &info));
 	}
@@ -854,6 +855,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
 
 	task->thread.trap_nr	= trapnr;
 	task->thread.error_code = error_code;
+	clear_siginfo(&info);
 	info.si_signo		= SIGFPE;
 	info.si_errno		= 0;
 	info.si_addr		= (void __user *)uprobe_get_trap_addr(regs);
@@ -929,6 +931,7 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
 	RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
 	local_irq_enable();
 
+	clear_siginfo(&info);
 	info.si_signo = SIGILL;
 	info.si_errno = 0;
 	info.si_code = ILL_BADSTK;
diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c
index f44ce0f..ff20b35 100644
--- a/arch/x86/kernel/umip.c
+++ b/arch/x86/kernel/umip.c
@@ -278,6 +278,7 @@ static void force_sig_info_umip_fault(void __user *addr, struct pt_regs *regs)
 	tsk->thread.error_code	= X86_PF_USER | X86_PF_WRITE;
 	tsk->thread.trap_nr	= X86_TRAP_PF;
 
+	clear_siginfo(&info);
 	info.si_signo	= SIGSEGV;
 	info.si_errno	= 0;
 	info.si_code	= SEGV_MAPERR;
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index c84bb53..58d8d80 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -1083,8 +1083,8 @@ arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs
 		return orig_ret_vaddr;
 
 	if (nleft != rasize) {
-		pr_err("uprobe: return address clobbered: pid=%d, %%sp=%#lx, "
-			"%%ip=%#lx\n", current->pid, regs->sp, regs->ip);
+		pr_err("return address clobbered: pid=%d, %%sp=%#lx, %%ip=%#lx\n",
+		       current->pid, regs->sp, regs->ip);
 
 		force_sig_info(SIGSEGV, SEND_SIG_FORCED, current);
 	}
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 795f3a8..5e1458f 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -117,11 +117,11 @@ SECTIONS
 
 #ifdef CONFIG_X86_64
 		. = ALIGN(PAGE_SIZE);
-		VMLINUX_SYMBOL(__entry_trampoline_start) = .;
+		__entry_trampoline_start = .;
 		_entry_trampoline = .;
 		*(.entry_trampoline)
 		. = ALIGN(PAGE_SIZE);
-		VMLINUX_SYMBOL(__entry_trampoline_end) = .;
+		__entry_trampoline_end = .;
 		ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big");
 #endif
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 8494dba..d634f033 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3007,6 +3007,7 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *
 {
 	siginfo_t info;
 
+	clear_siginfo(&info);
 	info.si_signo	= SIGBUS;
 	info.si_errno	= 0;
 	info.si_code	= BUS_MCEERR_AR;
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 9a53a06..c3b527a 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -184,11 +184,11 @@ ENDPROC(memcpy_orig)
 
 #ifndef CONFIG_UML
 /*
- * memcpy_mcsafe_unrolled - memory copy with machine check exception handling
+ * __memcpy_mcsafe - memory copy with machine check exception handling
  * Note that we only catch machine checks when reading the source addresses.
  * Writes to target are posted and don't generate machine checks.
  */
-ENTRY(memcpy_mcsafe_unrolled)
+ENTRY(__memcpy_mcsafe)
 	cmpl $8, %edx
 	/* Less than 8 bytes? Go to byte copy loop */
 	jb .L_no_whole_words
@@ -204,58 +204,29 @@ ENTRY(memcpy_mcsafe_unrolled)
 	subl $8, %ecx
 	negl %ecx
 	subl %ecx, %edx
-.L_copy_leading_bytes:
+.L_read_leading_bytes:
 	movb (%rsi), %al
+.L_write_leading_bytes:
 	movb %al, (%rdi)
 	incq %rsi
 	incq %rdi
 	decl %ecx
-	jnz .L_copy_leading_bytes
+	jnz .L_read_leading_bytes
 
 .L_8byte_aligned:
-	/* Figure out how many whole cache lines (64-bytes) to copy */
-	movl %edx, %ecx
-	andl $63, %edx
-	shrl $6, %ecx
-	jz .L_no_whole_cache_lines
-
-	/* Loop copying whole cache lines */
-.L_cache_w0: movq (%rsi), %r8
-.L_cache_w1: movq 1*8(%rsi), %r9
-.L_cache_w2: movq 2*8(%rsi), %r10
-.L_cache_w3: movq 3*8(%rsi), %r11
-	movq %r8, (%rdi)
-	movq %r9, 1*8(%rdi)
-	movq %r10, 2*8(%rdi)
-	movq %r11, 3*8(%rdi)
-.L_cache_w4: movq 4*8(%rsi), %r8
-.L_cache_w5: movq 5*8(%rsi), %r9
-.L_cache_w6: movq 6*8(%rsi), %r10
-.L_cache_w7: movq 7*8(%rsi), %r11
-	movq %r8, 4*8(%rdi)
-	movq %r9, 5*8(%rdi)
-	movq %r10, 6*8(%rdi)
-	movq %r11, 7*8(%rdi)
-	leaq 64(%rsi), %rsi
-	leaq 64(%rdi), %rdi
-	decl %ecx
-	jnz .L_cache_w0
-
-	/* Are there any trailing 8-byte words? */
-.L_no_whole_cache_lines:
 	movl %edx, %ecx
 	andl $7, %edx
 	shrl $3, %ecx
 	jz .L_no_whole_words
 
-	/* Copy trailing words */
-.L_copy_trailing_words:
+.L_read_words:
 	movq (%rsi), %r8
-	mov %r8, (%rdi)
-	leaq 8(%rsi), %rsi
-	leaq 8(%rdi), %rdi
+.L_write_words:
+	movq %r8, (%rdi)
+	addq $8, %rsi
+	addq $8, %rdi
 	decl %ecx
-	jnz .L_copy_trailing_words
+	jnz .L_read_words
 
 	/* Any trailing bytes? */
 .L_no_whole_words:
@@ -264,38 +235,53 @@ ENTRY(memcpy_mcsafe_unrolled)
 
 	/* Copy trailing bytes */
 	movl %edx, %ecx
-.L_copy_trailing_bytes:
+.L_read_trailing_bytes:
 	movb (%rsi), %al
+.L_write_trailing_bytes:
 	movb %al, (%rdi)
 	incq %rsi
 	incq %rdi
 	decl %ecx
-	jnz .L_copy_trailing_bytes
+	jnz .L_read_trailing_bytes
 
 	/* Copy successful. Return zero */
 .L_done_memcpy_trap:
 	xorq %rax, %rax
 	ret
-ENDPROC(memcpy_mcsafe_unrolled)
-EXPORT_SYMBOL_GPL(memcpy_mcsafe_unrolled)
+ENDPROC(__memcpy_mcsafe)
+EXPORT_SYMBOL_GPL(__memcpy_mcsafe)
 
 	.section .fixup, "ax"
-	/* Return -EFAULT for any failure */
-.L_memcpy_mcsafe_fail:
-	mov	$-EFAULT, %rax
+	/*
+	 * Return number of bytes not copied for any failure. Note that
+	 * there is no "tail" handling since the source buffer is 8-byte
+	 * aligned and poison is cacheline aligned.
+	 */
+.E_read_words:
+	shll	$3, %ecx
+.E_leading_bytes:
+	addl	%edx, %ecx
+.E_trailing_bytes:
+	mov	%ecx, %eax
 	ret
 
+	/*
+	 * For write fault handling, given the destination is unaligned,
+	 * we handle faults on multi-byte writes with a byte-by-byte
+	 * copy up to the write-protected page.
+	 */
+.E_write_words:
+	shll	$3, %ecx
+	addl	%edx, %ecx
+	movl	%ecx, %edx
+	jmp mcsafe_handle_tail
+
 	.previous
 
-	_ASM_EXTABLE_FAULT(.L_copy_leading_bytes, .L_memcpy_mcsafe_fail)
-	_ASM_EXTABLE_FAULT(.L_cache_w0, .L_memcpy_mcsafe_fail)
-	_ASM_EXTABLE_FAULT(.L_cache_w1, .L_memcpy_mcsafe_fail)
-	_ASM_EXTABLE_FAULT(.L_cache_w2, .L_memcpy_mcsafe_fail)
-	_ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail)
-	_ASM_EXTABLE_FAULT(.L_cache_w4, .L_memcpy_mcsafe_fail)
-	_ASM_EXTABLE_FAULT(.L_cache_w5, .L_memcpy_mcsafe_fail)
-	_ASM_EXTABLE_FAULT(.L_cache_w6, .L_memcpy_mcsafe_fail)
-	_ASM_EXTABLE_FAULT(.L_cache_w7, .L_memcpy_mcsafe_fail)
-	_ASM_EXTABLE_FAULT(.L_copy_trailing_words, .L_memcpy_mcsafe_fail)
-	_ASM_EXTABLE_FAULT(.L_copy_trailing_bytes, .L_memcpy_mcsafe_fail)
+	_ASM_EXTABLE_FAULT(.L_read_leading_bytes, .E_leading_bytes)
+	_ASM_EXTABLE_FAULT(.L_read_words, .E_read_words)
+	_ASM_EXTABLE_FAULT(.L_read_trailing_bytes, .E_trailing_bytes)
+	_ASM_EXTABLE(.L_write_leading_bytes, .E_leading_bytes)
+	_ASM_EXTABLE(.L_write_words, .E_write_words)
+	_ASM_EXTABLE(.L_write_trailing_bytes, .E_trailing_bytes)
 #endif
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index 75d3776..9c5606d 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -23,13 +23,13 @@ unsigned long __clear_user(void __user *addr, unsigned long size)
 	asm volatile(
 		"	testq  %[size8],%[size8]\n"
 		"	jz     4f\n"
-		"0:	movq %[zero],(%[dst])\n"
-		"	addq   %[eight],%[dst]\n"
+		"0:	movq $0,(%[dst])\n"
+		"	addq   $8,%[dst]\n"
 		"	decl %%ecx ; jnz   0b\n"
 		"4:	movq  %[size1],%%rcx\n"
 		"	testl %%ecx,%%ecx\n"
 		"	jz     2f\n"
-		"1:	movb   %b[zero],(%[dst])\n"
+		"1:	movb   $0,(%[dst])\n"
 		"	incq   %[dst]\n"
 		"	decl %%ecx ; jnz  1b\n"
 		"2:\n"
@@ -40,8 +40,7 @@ unsigned long __clear_user(void __user *addr, unsigned long size)
 		_ASM_EXTABLE(0b,3b)
 		_ASM_EXTABLE(1b,2b)
 		: [size8] "=&c"(size), [dst] "=&D" (__d0)
-		: [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr),
-		  [zero] "r" (0UL), [eight] "r" (8UL));
+		: [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr));
 	clac();
 	return size;
 }
@@ -75,6 +74,27 @@ copy_user_handle_tail(char *to, char *from, unsigned len)
 	return len;
 }
 
+/*
+ * Similar to copy_user_handle_tail, probe for the write fault point,
+ * but reuse __memcpy_mcsafe in case a new read error is encountered.
+ * clac() is handled in _copy_to_iter_mcsafe().
+ */
+__visible unsigned long
+mcsafe_handle_tail(char *to, char *from, unsigned len)
+{
+	for (; len; --len, to++, from++) {
+		/*
+		 * Call the assembly routine back directly since
+		 * memcpy_mcsafe() may silently fallback to memcpy.
+		 */
+		unsigned long rem = __memcpy_mcsafe(to, from, 1);
+
+		if (rem)
+			break;
+	}
+	return len;
+}
+
 #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
 /**
  * clean_cache_range - write back a cache range with CLWB
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index cc7ff59..2f3c919 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -360,7 +360,7 @@ static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st,
 				void *pt)
 {
 	if (__pa(pt) == __pa(kasan_zero_pmd) ||
-	    (pgtable_l5_enabled && __pa(pt) == __pa(kasan_zero_p4d)) ||
+	    (pgtable_l5_enabled() && __pa(pt) == __pa(kasan_zero_p4d)) ||
 	    __pa(pt) == __pa(kasan_zero_pud)) {
 		pgprotval_t prot = pte_flags(kasan_zero_pte[0]);
 		note_page(m, st, __pgprot(prot), 0, 5);
@@ -476,8 +476,8 @@ static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
 	}
 }
 
-#define pgd_large(a) (pgtable_l5_enabled ? pgd_large(a) : p4d_large(__p4d(pgd_val(a))))
-#define pgd_none(a)  (pgtable_l5_enabled ? pgd_none(a) : p4d_none(__p4d(pgd_val(a))))
+#define pgd_large(a) (pgtable_l5_enabled() ? pgd_large(a) : p4d_large(__p4d(pgd_val(a))))
+#define pgd_none(a)  (pgtable_l5_enabled() ? pgd_none(a) : p4d_none(__p4d(pgd_val(a))))
 
 static inline bool is_hypervisor_range(int idx)
 {
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 73bd8c9..9a84a0d 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -209,6 +209,7 @@ force_sig_info_fault(int si_signo, int si_code, unsigned long address,
 	unsigned lsb = 0;
 	siginfo_t info;
 
+	clear_siginfo(&info);
 	info.si_signo	= si_signo;
 	info.si_errno	= 0;
 	info.si_code	= si_code;
@@ -439,7 +440,7 @@ static noinline int vmalloc_fault(unsigned long address)
 	if (pgd_none(*pgd_k))
 		return -1;
 
-	if (pgtable_l5_enabled) {
+	if (pgtable_l5_enabled()) {
 		if (pgd_none(*pgd)) {
 			set_pgd(pgd, *pgd_k);
 			arch_flush_lazy_mmu_mode();
@@ -454,7 +455,7 @@ static noinline int vmalloc_fault(unsigned long address)
 	if (p4d_none(*p4d_k))
 		return -1;
 
-	if (p4d_none(*p4d) && !pgtable_l5_enabled) {
+	if (p4d_none(*p4d) && !pgtable_l5_enabled()) {
 		set_p4d(p4d, *p4d_k);
 		arch_flush_lazy_mmu_mode();
 	} else {
@@ -828,6 +829,8 @@ static inline void
 show_signal_msg(struct pt_regs *regs, unsigned long error_code,
 		unsigned long address, struct task_struct *tsk)
 {
+	const char *loglvl = task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG;
+
 	if (!unhandled_signal(tsk, SIGSEGV))
 		return;
 
@@ -835,13 +838,14 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
 		return;
 
 	printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx",
-		task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
-		tsk->comm, task_pid_nr(tsk), address,
+		loglvl, tsk->comm, task_pid_nr(tsk), address,
 		(void *)regs->ip, (void *)regs->sp, error_code);
 
 	print_vma_addr(KERN_CONT " in ", regs->ip);
 
 	printk(KERN_CONT "\n");
+
+	show_opcodes((u8 *)regs->ip, loglvl);
 }
 
 static void
diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c
index a2f0c7e..fe7a125 100644
--- a/arch/x86/mm/ident_map.c
+++ b/arch/x86/mm/ident_map.c
@@ -123,7 +123,7 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
 		result = ident_p4d_init(info, p4d, addr, next);
 		if (result)
 			return result;
-		if (pgtable_l5_enabled) {
+		if (pgtable_l5_enabled()) {
 			set_pgd(pgd, __pgd(__pa(p4d) | info->kernpg_flag));
 		} else {
 			/*
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 0a40060..17383f9 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -180,7 +180,7 @@ static void sync_global_pgds_l4(unsigned long start, unsigned long end)
  */
 void sync_global_pgds(unsigned long start, unsigned long end)
 {
-	if (pgtable_l5_enabled)
+	if (pgtable_l5_enabled())
 		sync_global_pgds_l5(start, end);
 	else
 		sync_global_pgds_l4(start, end);
@@ -643,7 +643,7 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
 	unsigned long vaddr = (unsigned long)__va(paddr);
 	int i = p4d_index(vaddr);
 
-	if (!pgtable_l5_enabled)
+	if (!pgtable_l5_enabled())
 		return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end, page_size_mask);
 
 	for (; i < PTRS_PER_P4D; i++, paddr = paddr_next) {
@@ -723,7 +723,7 @@ kernel_physical_mapping_init(unsigned long paddr_start,
 					   page_size_mask);
 
 		spin_lock(&init_mm.page_table_lock);
-		if (pgtable_l5_enabled)
+		if (pgtable_l5_enabled())
 			pgd_populate(&init_mm, pgd, p4d);
 		else
 			p4d_populate(&init_mm, p4d_offset(pgd, vaddr), (pud_t *) p4d);
@@ -1100,7 +1100,7 @@ remove_p4d_table(p4d_t *p4d_start, unsigned long addr, unsigned long end,
 		 * 5-level case we should free them. This code will have to change
 		 * to adapt for boot-time switching between 4 and 5 level page tables.
 		 */
-		if (pgtable_l5_enabled)
+		if (pgtable_l5_enabled())
 			free_pud_table(pud_base, p4d);
 	}
 
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 980dbeb..e3e7752 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -2,10 +2,8 @@
 #define DISABLE_BRANCH_PROFILING
 #define pr_fmt(fmt) "kasan: " fmt
 
-#ifdef CONFIG_X86_5LEVEL
-/* Too early to use cpu_feature_enabled() */
-#define pgtable_l5_enabled __pgtable_l5_enabled
-#endif
+/* cpu_feature_enabled() cannot be used this early */
+#define USE_EARLY_PGTABLE_L5
 
 #include <linux/bootmem.h>
 #include <linux/kasan.h>
@@ -182,7 +180,7 @@ static void __init clear_pgds(unsigned long start,
 		 * With folded p4d, pgd_clear() is nop, use p4d_clear()
 		 * instead.
 		 */
-		if (pgtable_l5_enabled)
+		if (pgtable_l5_enabled())
 			pgd_clear(pgd);
 		else
 			p4d_clear(p4d_offset(pgd, start));
@@ -197,7 +195,7 @@ static inline p4d_t *early_p4d_offset(pgd_t *pgd, unsigned long addr)
 {
 	unsigned long p4d;
 
-	if (!pgtable_l5_enabled)
+	if (!pgtable_l5_enabled())
 		return (p4d_t *)pgd;
 
 	p4d = __pa_nodebug(pgd_val(*pgd)) & PTE_PFN_MASK;
@@ -284,7 +282,7 @@ void __init kasan_early_init(void)
 	for (i = 0; i < PTRS_PER_PUD; i++)
 		kasan_zero_pud[i] = __pud(pud_val);
 
-	for (i = 0; pgtable_l5_enabled && i < PTRS_PER_P4D; i++)
+	for (i = 0; pgtable_l5_enabled() && i < PTRS_PER_P4D; i++)
 		kasan_zero_p4d[i] = __p4d(p4d_val);
 
 	kasan_map_early_shadow(early_top_pgt);
@@ -315,7 +313,7 @@ void __init kasan_init(void)
 	 * bunch of things like kernel code, modules, EFI mapping, etc.
 	 * We need to take extra steps to not overwrite them.
 	 */
-	if (pgtable_l5_enabled) {
+	if (pgtable_l5_enabled()) {
 		void *ptr;
 
 		ptr = (void *)pgd_page_vaddr(*pgd_offset_k(KASAN_SHADOW_END));
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index 615cc03..61db77b 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -78,7 +78,7 @@ void __init kernel_randomize_memory(void)
 	struct rnd_state rand_state;
 	unsigned long remain_entropy;
 
-	vaddr_start = pgtable_l5_enabled ? __PAGE_OFFSET_BASE_L5 : __PAGE_OFFSET_BASE_L4;
+	vaddr_start = pgtable_l5_enabled() ? __PAGE_OFFSET_BASE_L5 : __PAGE_OFFSET_BASE_L4;
 	vaddr = vaddr_start;
 
 	/*
@@ -124,7 +124,7 @@ void __init kernel_randomize_memory(void)
 		 */
 		entropy = remain_entropy / (ARRAY_SIZE(kaslr_regions) - i);
 		prandom_bytes_state(&rand_state, &rand, sizeof(rand));
-		if (pgtable_l5_enabled)
+		if (pgtable_l5_enabled())
 			entropy = (rand % (entropy + 1)) & P4D_MASK;
 		else
 			entropy = (rand % (entropy + 1)) & PUD_MASK;
@@ -136,7 +136,7 @@ void __init kernel_randomize_memory(void)
 		 * randomization alignment.
 		 */
 		vaddr += get_padding(&kaslr_regions[i]);
-		if (pgtable_l5_enabled)
+		if (pgtable_l5_enabled())
 			vaddr = round_up(vaddr + 1, P4D_SIZE);
 		else
 			vaddr = round_up(vaddr + 1, PUD_SIZE);
@@ -212,7 +212,7 @@ void __meminit init_trampoline(void)
 		return;
 	}
 
-	if (pgtable_l5_enabled)
+	if (pgtable_l5_enabled())
 		init_trampoline_p4d();
 	else
 		init_trampoline_pud();
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 25504d5..fa15085 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -136,13 +136,13 @@ static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
 
 	/* whine about and ignore invalid blks */
 	if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
-		pr_warning("NUMA: Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
-			   nid, start, end - 1);
+		pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
+			nid, start, end - 1);
 		return 0;
 	}
 
 	if (mi->nr_blks >= NR_NODE_MEMBLKS) {
-		pr_err("NUMA: too many memblk ranges\n");
+		pr_err("too many memblk ranges\n");
 		return -EINVAL;
 	}
 
@@ -267,14 +267,14 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
 			 */
 			if (bi->end > bj->start && bi->start < bj->end) {
 				if (bi->nid != bj->nid) {
-					pr_err("NUMA: node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n",
+					pr_err("node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n",
 					       bi->nid, bi->start, bi->end - 1,
 					       bj->nid, bj->start, bj->end - 1);
 					return -EINVAL;
 				}
-				pr_warning("NUMA: Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n",
-					   bi->nid, bi->start, bi->end - 1,
-					   bj->start, bj->end - 1);
+				pr_warn("Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n",
+					bi->nid, bi->start, bi->end - 1,
+					bj->start, bj->end - 1);
 			}
 
 			/*
@@ -364,7 +364,7 @@ static int __init numa_alloc_distance(void)
 	phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
 				      size, PAGE_SIZE);
 	if (!phys) {
-		pr_warning("NUMA: Warning: can't allocate distance table!\n");
+		pr_warn("Warning: can't allocate distance table!\n");
 		/* don't retry until explicitly reset */
 		numa_distance = (void *)1LU;
 		return -ENOMEM;
@@ -410,14 +410,14 @@ void __init numa_set_distance(int from, int to, int distance)
 
 	if (from >= numa_distance_cnt || to >= numa_distance_cnt ||
 			from < 0 || to < 0) {
-		pr_warn_once("NUMA: Warning: node ids are out of bound, from=%d to=%d distance=%d\n",
-			    from, to, distance);
+		pr_warn_once("Warning: node ids are out of bound, from=%d to=%d distance=%d\n",
+			     from, to, distance);
 		return;
 	}
 
 	if ((u8)distance != distance ||
 	    (from == to && distance != LOCAL_DISTANCE)) {
-		pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
+		pr_warn_once("Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
 			     from, to, distance);
 		return;
 	}
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index e055d1a..6eb1f34 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -157,7 +157,7 @@ static void sync_current_stack_to_mm(struct mm_struct *mm)
 	unsigned long sp = current_stack_pointer;
 	pgd_t *pgd = pgd_offset(mm, sp);
 
-	if (pgtable_l5_enabled) {
+	if (pgtable_l5_enabled()) {
 		if (unlikely(pgd_none(*pgd))) {
 			pgd_t *pgd_ref = pgd_offset_k(sp);
 
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 263c845..d765ace 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1,4 +1,5 @@
-/* bpf_jit_comp.c : BPF JIT compiler
+/*
+ * bpf_jit_comp.c: BPF JIT compiler
  *
  * Copyright (C) 2011-2013 Eric Dumazet (eric.dumazet@gmail.com)
  * Internal BPF Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
@@ -17,7 +18,7 @@
 #include <asm/nospec-branch.h>
 
 /*
- * assembly code in arch/x86/net/bpf_jit.S
+ * Assembly code in arch/x86/net/bpf_jit.S
  */
 extern u8 sk_load_word[], sk_load_half[], sk_load_byte[];
 extern u8 sk_load_word_positive_offset[], sk_load_half_positive_offset[];
@@ -45,14 +46,15 @@ static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
 #define EMIT2(b1, b2)		EMIT((b1) + ((b2) << 8), 2)
 #define EMIT3(b1, b2, b3)	EMIT((b1) + ((b2) << 8) + ((b3) << 16), 3)
 #define EMIT4(b1, b2, b3, b4)   EMIT((b1) + ((b2) << 8) + ((b3) << 16) + ((b4) << 24), 4)
+
 #define EMIT1_off32(b1, off) \
-	do {EMIT1(b1); EMIT(off, 4); } while (0)
+	do { EMIT1(b1); EMIT(off, 4); } while (0)
 #define EMIT2_off32(b1, b2, off) \
-	do {EMIT2(b1, b2); EMIT(off, 4); } while (0)
+	do { EMIT2(b1, b2); EMIT(off, 4); } while (0)
 #define EMIT3_off32(b1, b2, b3, off) \
-	do {EMIT3(b1, b2, b3); EMIT(off, 4); } while (0)
+	do { EMIT3(b1, b2, b3); EMIT(off, 4); } while (0)
 #define EMIT4_off32(b1, b2, b3, b4, off) \
-	do {EMIT4(b1, b2, b3, b4); EMIT(off, 4); } while (0)
+	do { EMIT4(b1, b2, b3, b4); EMIT(off, 4); } while (0)
 
 static bool is_imm8(int value)
 {
@@ -70,9 +72,10 @@ static bool is_uimm32(u64 value)
 }
 
 /* mov dst, src */
-#define EMIT_mov(DST, SRC) \
-	do {if (DST != SRC) \
-		EMIT3(add_2mod(0x48, DST, SRC), 0x89, add_2reg(0xC0, DST, SRC)); \
+#define EMIT_mov(DST, SRC)								 \
+	do {										 \
+		if (DST != SRC)								 \
+			EMIT3(add_2mod(0x48, DST, SRC), 0x89, add_2reg(0xC0, DST, SRC)); \
 	} while (0)
 
 static int bpf_size_to_x86_bytes(int bpf_size)
@@ -89,7 +92,8 @@ static int bpf_size_to_x86_bytes(int bpf_size)
 		return 0;
 }
 
-/* list of x86 cond jumps opcodes (. + s8)
+/*
+ * List of x86 cond jumps opcodes (. + s8)
  * Add 0x10 (and an extra 0x0f) to generate far jumps (. + s32)
  */
 #define X86_JB  0x72
@@ -106,35 +110,37 @@ static int bpf_size_to_x86_bytes(int bpf_size)
 #define CHOOSE_LOAD_FUNC(K, func) \
 	((int)K < 0 ? ((int)K >= SKF_LL_OFF ? func##_negative_offset : func) : func##_positive_offset)
 
-/* pick a register outside of BPF range for JIT internal work */
+/* Pick a register outside of BPF range for JIT internal work */
 #define AUX_REG (MAX_BPF_JIT_REG + 1)
 
-/* The following table maps BPF registers to x64 registers.
+/*
+ * The following table maps BPF registers to x86-64 registers.
  *
- * x64 register r12 is unused, since if used as base address
+ * x86-64 register R12 is unused, since if used as base address
  * register in load/store instructions, it always needs an
  * extra byte of encoding and is callee saved.
  *
- *  r9 caches skb->len - skb->data_len
- * r10 caches skb->data, and used for blinding (if enabled)
+ * R9  caches skb->len - skb->data_len
+ * R10 caches skb->data, and used for blinding (if enabled)
  */
 static const int reg2hex[] = {
-	[BPF_REG_0] = 0,  /* rax */
-	[BPF_REG_1] = 7,  /* rdi */
-	[BPF_REG_2] = 6,  /* rsi */
-	[BPF_REG_3] = 2,  /* rdx */
-	[BPF_REG_4] = 1,  /* rcx */
-	[BPF_REG_5] = 0,  /* r8 */
-	[BPF_REG_6] = 3,  /* rbx callee saved */
-	[BPF_REG_7] = 5,  /* r13 callee saved */
-	[BPF_REG_8] = 6,  /* r14 callee saved */
-	[BPF_REG_9] = 7,  /* r15 callee saved */
-	[BPF_REG_FP] = 5, /* rbp readonly */
-	[BPF_REG_AX] = 2, /* r10 temp register */
-	[AUX_REG] = 3,    /* r11 temp register */
+	[BPF_REG_0] = 0,  /* RAX */
+	[BPF_REG_1] = 7,  /* RDI */
+	[BPF_REG_2] = 6,  /* RSI */
+	[BPF_REG_3] = 2,  /* RDX */
+	[BPF_REG_4] = 1,  /* RCX */
+	[BPF_REG_5] = 0,  /* R8  */
+	[BPF_REG_6] = 3,  /* RBX callee saved */
+	[BPF_REG_7] = 5,  /* R13 callee saved */
+	[BPF_REG_8] = 6,  /* R14 callee saved */
+	[BPF_REG_9] = 7,  /* R15 callee saved */
+	[BPF_REG_FP] = 5, /* RBP readonly */
+	[BPF_REG_AX] = 2, /* R10 temp register */
+	[AUX_REG] = 3,    /* R11 temp register */
 };
 
-/* is_ereg() == true if BPF register 'reg' maps to x64 r8..r15
+/*
+ * is_ereg() == true if BPF register 'reg' maps to x86-64 r8..r15
  * which need extra byte of encoding.
  * rax,rcx,...,rbp have simpler encoding
  */
@@ -153,7 +159,7 @@ static bool is_axreg(u32 reg)
 	return reg == BPF_REG_0;
 }
 
-/* add modifiers if 'reg' maps to x64 registers r8..r15 */
+/* Add modifiers if 'reg' maps to x86-64 registers R8..R15 */
 static u8 add_1mod(u8 byte, u32 reg)
 {
 	if (is_ereg(reg))
@@ -170,13 +176,13 @@ static u8 add_2mod(u8 byte, u32 r1, u32 r2)
 	return byte;
 }
 
-/* encode 'dst_reg' register into x64 opcode 'byte' */
+/* Encode 'dst_reg' register into x86-64 opcode 'byte' */
 static u8 add_1reg(u8 byte, u32 dst_reg)
 {
 	return byte + reg2hex[dst_reg];
 }
 
-/* encode 'dst_reg' and 'src_reg' registers into x64 opcode 'byte' */
+/* Encode 'dst_reg' and 'src_reg' registers into x86-64 opcode 'byte' */
 static u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg)
 {
 	return byte + reg2hex[dst_reg] + (reg2hex[src_reg] << 3);
@@ -184,27 +190,28 @@ static u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg)
 
 static void jit_fill_hole(void *area, unsigned int size)
 {
-	/* fill whole space with int3 instructions */
+	/* Fill whole space with INT3 instructions */
 	memset(area, 0xcc, size);
 }
 
 struct jit_context {
-	int cleanup_addr; /* epilogue code offset */
+	int cleanup_addr; /* Epilogue code offset */
 	bool seen_ld_abs;
 	bool seen_ax_reg;
 };
 
-/* maximum number of bytes emitted while JITing one eBPF insn */
+/* Maximum number of bytes emitted while JITing one eBPF insn */
 #define BPF_MAX_INSN_SIZE	128
 #define BPF_INSN_SAFETY		64
 
 #define AUX_STACK_SPACE \
-	(32 /* space for rbx, r13, r14, r15 */ + \
-	 8 /* space for skb_copy_bits() buffer */)
+	(32 /* Space for RBX, R13, R14, R15 */ + \
+	  8 /* Space for skb_copy_bits() buffer */)
 
 #define PROLOGUE_SIZE 37
 
-/* emit x64 prologue code for BPF program and check it's size.
+/*
+ * Emit x86-64 prologue code for BPF program and check its size.
  * bpf_tail_call helper will skip it while jumping into another program
  */
 static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
@@ -212,8 +219,11 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
 	u8 *prog = *pprog;
 	int cnt = 0;
 
-	EMIT1(0x55); /* push rbp */
-	EMIT3(0x48, 0x89, 0xE5); /* mov rbp,rsp */
+	/* push rbp */
+	EMIT1(0x55);
+
+	/* mov rbp,rsp */
+	EMIT3(0x48, 0x89, 0xE5);
 
 	/* sub rsp, rounded_stack_depth + AUX_STACK_SPACE */
 	EMIT3_off32(0x48, 0x81, 0xEC,
@@ -222,14 +232,15 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
 	/* sub rbp, AUX_STACK_SPACE */
 	EMIT4(0x48, 0x83, 0xED, AUX_STACK_SPACE);
 
-	/* all classic BPF filters use R6(rbx) save it */
+	/* All classic BPF filters use R6(rbx) save it */
 
 	/* mov qword ptr [rbp+0],rbx */
 	EMIT4(0x48, 0x89, 0x5D, 0);
 
-	/* bpf_convert_filter() maps classic BPF register X to R7 and uses R8
-	 * as temporary, so all tcpdump filters need to spill/fill R7(r13) and
-	 * R8(r14). R9(r15) spill could be made conditional, but there is only
+	/*
+	 * bpf_convert_filter() maps classic BPF register X to R7 and uses R8
+	 * as temporary, so all tcpdump filters need to spill/fill R7(R13) and
+	 * R8(R14). R9(R15) spill could be made conditional, but there is only
 	 * one 'bpf_error' return path out of helper functions inside bpf_jit.S
 	 * The overhead of extra spill is negligible for any filter other
 	 * than synthetic ones. Therefore not worth adding complexity.
@@ -243,9 +254,10 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
 	EMIT4(0x4C, 0x89, 0x7D, 24);
 
 	if (!ebpf_from_cbpf) {
-		/* Clear the tail call counter (tail_call_cnt): for eBPF tail
+		/*
+		 * Clear the tail call counter (tail_call_cnt): for eBPF tail
 		 * calls we need to reset the counter to 0. It's done in two
-		 * instructions, resetting rax register to 0, and moving it
+		 * instructions, resetting RAX register to 0, and moving it
 		 * to the counter location.
 		 */
 
@@ -260,7 +272,9 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
 	*pprog = prog;
 }
 
-/* generate the following code:
+/*
+ * Generate the following code:
+ *
  * ... bpf_tail_call(void *ctx, struct bpf_array *array, u64 index) ...
  *   if (index >= array->map.max_entries)
  *     goto out;
@@ -278,23 +292,26 @@ static void emit_bpf_tail_call(u8 **pprog)
 	int label1, label2, label3;
 	int cnt = 0;
 
-	/* rdi - pointer to ctx
+	/*
+	 * rdi - pointer to ctx
 	 * rsi - pointer to bpf_array
 	 * rdx - index in bpf_array
 	 */
 
-	/* if (index >= array->map.max_entries)
-	 *   goto out;
+	/*
+	 * if (index >= array->map.max_entries)
+	 *	goto out;
 	 */
 	EMIT2(0x89, 0xD2);                        /* mov edx, edx */
 	EMIT3(0x39, 0x56,                         /* cmp dword ptr [rsi + 16], edx */
 	      offsetof(struct bpf_array, map.max_entries));
-#define OFFSET1 (41 + RETPOLINE_RAX_BPF_JIT_SIZE) /* number of bytes to jump */
+#define OFFSET1 (41 + RETPOLINE_RAX_BPF_JIT_SIZE) /* Number of bytes to jump */
 	EMIT2(X86_JBE, OFFSET1);                  /* jbe out */
 	label1 = cnt;
 
-	/* if (tail_call_cnt > MAX_TAIL_CALL_CNT)
-	 *   goto out;
+	/*
+	 * if (tail_call_cnt > MAX_TAIL_CALL_CNT)
+	 *	goto out;
 	 */
 	EMIT2_off32(0x8B, 0x85, 36);              /* mov eax, dword ptr [rbp + 36] */
 	EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT);     /* cmp eax, MAX_TAIL_CALL_CNT */
@@ -308,8 +325,9 @@ static void emit_bpf_tail_call(u8 **pprog)
 	EMIT4_off32(0x48, 0x8B, 0x84, 0xD6,       /* mov rax, [rsi + rdx * 8 + offsetof(...)] */
 		    offsetof(struct bpf_array, ptrs));
 
-	/* if (prog == NULL)
-	 *   goto out;
+	/*
+	 * if (prog == NULL)
+	 *	goto out;
 	 */
 	EMIT3(0x48, 0x85, 0xC0);		  /* test rax,rax */
 #define OFFSET3 (8 + RETPOLINE_RAX_BPF_JIT_SIZE)
@@ -321,7 +339,8 @@ static void emit_bpf_tail_call(u8 **pprog)
 	      offsetof(struct bpf_prog, bpf_func));
 	EMIT4(0x48, 0x83, 0xC0, PROLOGUE_SIZE);   /* add rax, prologue_size */
 
-	/* now we're ready to jump into next BPF program
+	/*
+	 * Wow we're ready to jump into next BPF program
 	 * rdi == ctx (1st arg)
 	 * rax == prog->bpf_func + prologue_size
 	 */
@@ -340,7 +359,8 @@ static void emit_load_skb_data_hlen(u8 **pprog)
 	u8 *prog = *pprog;
 	int cnt = 0;
 
-	/* r9d = skb->len - skb->data_len (headlen)
+	/*
+	 * r9d = skb->len - skb->data_len (headlen)
 	 * r10 = skb->data
 	 */
 	/* mov %r9d, off32(%rdi) */
@@ -361,7 +381,8 @@ static void emit_mov_imm32(u8 **pprog, bool sign_propagate,
 	u8 b1, b2, b3;
 	int cnt = 0;
 
-	/* optimization: if imm32 is positive, use 'mov %eax, imm32'
+	/*
+	 * Optimization: if imm32 is positive, use 'mov %eax, imm32'
 	 * (which zero-extends imm32) to save 2 bytes.
 	 */
 	if (sign_propagate && (s32)imm32 < 0) {
@@ -373,7 +394,8 @@ static void emit_mov_imm32(u8 **pprog, bool sign_propagate,
 		goto done;
 	}
 
-	/* optimization: if imm32 is zero, use 'xor %eax, %eax'
+	/*
+	 * Optimization: if imm32 is zero, use 'xor %eax, %eax'
 	 * to save 3 bytes.
 	 */
 	if (imm32 == 0) {
@@ -400,7 +422,8 @@ static void emit_mov_imm64(u8 **pprog, u32 dst_reg,
 	int cnt = 0;
 
 	if (is_uimm32(((u64)imm32_hi << 32) | (u32)imm32_lo)) {
-		/* For emitting plain u32, where sign bit must not be
+		/*
+		 * For emitting plain u32, where sign bit must not be
 		 * propagated LLVM tends to load imm64 over mov32
 		 * directly, so save couple of bytes by just doing
 		 * 'mov %eax, imm32' instead.
@@ -525,7 +548,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 			else if (is_ereg(dst_reg))
 				EMIT1(add_1mod(0x40, dst_reg));
 
-			/* b3 holds 'normal' opcode, b2 short form only valid
+			/*
+			 * b3 holds 'normal' opcode, b2 short form only valid
 			 * in case dst is eax/rax.
 			 */
 			switch (BPF_OP(insn->code)) {
@@ -593,7 +617,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 			/* mov rax, dst_reg */
 			EMIT_mov(BPF_REG_0, dst_reg);
 
-			/* xor edx, edx
+			/*
+			 * xor edx, edx
 			 * equivalent to 'xor rdx, rdx', but one byte less
 			 */
 			EMIT2(0x31, 0xd2);
@@ -655,7 +680,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 			}
 			break;
 		}
-			/* shifts */
+			/* Shifts */
 		case BPF_ALU | BPF_LSH | BPF_K:
 		case BPF_ALU | BPF_RSH | BPF_K:
 		case BPF_ALU | BPF_ARSH | BPF_K:
@@ -686,7 +711,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 		case BPF_ALU64 | BPF_RSH | BPF_X:
 		case BPF_ALU64 | BPF_ARSH | BPF_X:
 
-			/* check for bad case when dst_reg == rcx */
+			/* Check for bad case when dst_reg == rcx */
 			if (dst_reg == BPF_REG_4) {
 				/* mov r11, dst_reg */
 				EMIT_mov(AUX_REG, dst_reg);
@@ -724,13 +749,13 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 		case BPF_ALU | BPF_END | BPF_FROM_BE:
 			switch (imm32) {
 			case 16:
-				/* emit 'ror %ax, 8' to swap lower 2 bytes */
+				/* Emit 'ror %ax, 8' to swap lower 2 bytes */
 				EMIT1(0x66);
 				if (is_ereg(dst_reg))
 					EMIT1(0x41);
 				EMIT3(0xC1, add_1reg(0xC8, dst_reg), 8);
 
-				/* emit 'movzwl eax, ax' */
+				/* Emit 'movzwl eax, ax' */
 				if (is_ereg(dst_reg))
 					EMIT3(0x45, 0x0F, 0xB7);
 				else
@@ -738,7 +763,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 				EMIT1(add_2reg(0xC0, dst_reg, dst_reg));
 				break;
 			case 32:
-				/* emit 'bswap eax' to swap lower 4 bytes */
+				/* Emit 'bswap eax' to swap lower 4 bytes */
 				if (is_ereg(dst_reg))
 					EMIT2(0x41, 0x0F);
 				else
@@ -746,7 +771,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 				EMIT1(add_1reg(0xC8, dst_reg));
 				break;
 			case 64:
-				/* emit 'bswap rax' to swap 8 bytes */
+				/* Emit 'bswap rax' to swap 8 bytes */
 				EMIT3(add_1mod(0x48, dst_reg), 0x0F,
 				      add_1reg(0xC8, dst_reg));
 				break;
@@ -756,7 +781,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 		case BPF_ALU | BPF_END | BPF_FROM_LE:
 			switch (imm32) {
 			case 16:
-				/* emit 'movzwl eax, ax' to zero extend 16-bit
+				/*
+				 * Emit 'movzwl eax, ax' to zero extend 16-bit
 				 * into 64 bit
 				 */
 				if (is_ereg(dst_reg))
@@ -766,7 +792,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 				EMIT1(add_2reg(0xC0, dst_reg, dst_reg));
 				break;
 			case 32:
-				/* emit 'mov eax, eax' to clear upper 32-bits */
+				/* Emit 'mov eax, eax' to clear upper 32-bits */
 				if (is_ereg(dst_reg))
 					EMIT1(0x45);
 				EMIT2(0x89, add_2reg(0xC0, dst_reg, dst_reg));
@@ -809,9 +835,9 @@ st:			if (is_imm8(insn->off))
 
 			/* STX: *(u8*)(dst_reg + off) = src_reg */
 		case BPF_STX | BPF_MEM | BPF_B:
-			/* emit 'mov byte ptr [rax + off], al' */
+			/* Emit 'mov byte ptr [rax + off], al' */
 			if (is_ereg(dst_reg) || is_ereg(src_reg) ||
-			    /* have to add extra byte for x86 SIL, DIL regs */
+			    /* We have to add extra byte for x86 SIL, DIL regs */
 			    src_reg == BPF_REG_1 || src_reg == BPF_REG_2)
 				EMIT2(add_2mod(0x40, dst_reg, src_reg), 0x88);
 			else
@@ -840,25 +866,26 @@ stx:			if (is_imm8(insn->off))
 
 			/* LDX: dst_reg = *(u8*)(src_reg + off) */
 		case BPF_LDX | BPF_MEM | BPF_B:
-			/* emit 'movzx rax, byte ptr [rax + off]' */
+			/* Emit 'movzx rax, byte ptr [rax + off]' */
 			EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB6);
 			goto ldx;
 		case BPF_LDX | BPF_MEM | BPF_H:
-			/* emit 'movzx rax, word ptr [rax + off]' */
+			/* Emit 'movzx rax, word ptr [rax + off]' */
 			EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB7);
 			goto ldx;
 		case BPF_LDX | BPF_MEM | BPF_W:
-			/* emit 'mov eax, dword ptr [rax+0x14]' */
+			/* Emit 'mov eax, dword ptr [rax+0x14]' */
 			if (is_ereg(dst_reg) || is_ereg(src_reg))
 				EMIT2(add_2mod(0x40, src_reg, dst_reg), 0x8B);
 			else
 				EMIT1(0x8B);
 			goto ldx;
 		case BPF_LDX | BPF_MEM | BPF_DW:
-			/* emit 'mov rax, qword ptr [rax+0x14]' */
+			/* Emit 'mov rax, qword ptr [rax+0x14]' */
 			EMIT2(add_2mod(0x48, src_reg, dst_reg), 0x8B);
-ldx:			/* if insn->off == 0 we can save one extra byte, but
-			 * special case of x86 r13 which always needs an offset
+ldx:			/*
+			 * If insn->off == 0 we can save one extra byte, but
+			 * special case of x86 R13 which always needs an offset
 			 * is not worth the hassle
 			 */
 			if (is_imm8(insn->off))
@@ -870,7 +897,7 @@ ldx:			/* if insn->off == 0 we can save one extra byte, but
 
 			/* STX XADD: lock *(u32*)(dst_reg + off) += src_reg */
 		case BPF_STX | BPF_XADD | BPF_W:
-			/* emit 'lock add dword ptr [rax + off], eax' */
+			/* Emit 'lock add dword ptr [rax + off], eax' */
 			if (is_ereg(dst_reg) || is_ereg(src_reg))
 				EMIT3(0xF0, add_2mod(0x40, dst_reg, src_reg), 0x01);
 			else
@@ -897,14 +924,15 @@ xadd:			if (is_imm8(insn->off))
 				} else {
 					EMIT2(0x41, 0x52); /* push %r10 */
 					EMIT2(0x41, 0x51); /* push %r9 */
-					/* need to adjust jmp offset, since
+					/*
+					 * We need to adjust jmp offset, since
 					 * pop %r9, pop %r10 take 4 bytes after call insn
 					 */
 					jmp_offset += 4;
 				}
 			}
 			if (!imm32 || !is_simm32(jmp_offset)) {
-				pr_err("unsupported bpf func %d addr %p image %p\n",
+				pr_err("unsupported BPF func %d addr %p image %p\n",
 				       imm32, func, image);
 				return -EINVAL;
 			}
@@ -970,7 +998,7 @@ xadd:			if (is_imm8(insn->off))
 			else
 				EMIT2_off32(0x81, add_1reg(0xF8, dst_reg), imm32);
 
-emit_cond_jmp:		/* convert BPF opcode to x86 */
+emit_cond_jmp:		/* Convert BPF opcode to x86 */
 			switch (BPF_OP(insn->code)) {
 			case BPF_JEQ:
 				jmp_cond = X86_JE;
@@ -996,22 +1024,22 @@ emit_cond_jmp:		/* convert BPF opcode to x86 */
 				jmp_cond = X86_JBE;
 				break;
 			case BPF_JSGT:
-				/* signed '>', GT in x86 */
+				/* Signed '>', GT in x86 */
 				jmp_cond = X86_JG;
 				break;
 			case BPF_JSLT:
-				/* signed '<', LT in x86 */
+				/* Signed '<', LT in x86 */
 				jmp_cond = X86_JL;
 				break;
 			case BPF_JSGE:
-				/* signed '>=', GE in x86 */
+				/* Signed '>=', GE in x86 */
 				jmp_cond = X86_JGE;
 				break;
 			case BPF_JSLE:
-				/* signed '<=', LE in x86 */
+				/* Signed '<=', LE in x86 */
 				jmp_cond = X86_JLE;
 				break;
-			default: /* to silence gcc warning */
+			default: /* to silence GCC warning */
 				return -EFAULT;
 			}
 			jmp_offset = addrs[i + insn->off] - addrs[i];
@@ -1039,7 +1067,7 @@ emit_cond_jmp:		/* convert BPF opcode to x86 */
 				jmp_offset = addrs[i + insn->off] - addrs[i];
 
 			if (!jmp_offset)
-				/* optimize out nop jumps */
+				/* Optimize out nop jumps */
 				break;
 emit_jmp:
 			if (is_imm8(jmp_offset)) {
@@ -1061,7 +1089,7 @@ common_load:
 			ctx->seen_ld_abs = seen_ld_abs = true;
 			jmp_offset = func - (image + addrs[i]);
 			if (!func || !is_simm32(jmp_offset)) {
-				pr_err("unsupported bpf func %d addr %p image %p\n",
+				pr_err("unsupported BPF func %d addr %p image %p\n",
 				       imm32, func, image);
 				return -EINVAL;
 			}
@@ -1080,7 +1108,8 @@ common_load:
 						EMIT2_off32(0x81, 0xC6, imm32);
 				}
 			}
-			/* skb pointer is in R6 (%rbx), it will be copied into
+			/*
+			 * skb pointer is in R6 (%rbx), it will be copied into
 			 * %rdi if skb_copy_bits() call is necessary.
 			 * sk_load_* helpers also use %r10 and %r9d.
 			 * See bpf_jit.S
@@ -1111,7 +1140,7 @@ common_load:
 				goto emit_jmp;
 			}
 			seen_exit = true;
-			/* update cleanup_addr */
+			/* Update cleanup_addr */
 			ctx->cleanup_addr = proglen;
 			/* mov rbx, qword ptr [rbp+0] */
 			EMIT4(0x48, 0x8B, 0x5D, 0);
@@ -1129,10 +1158,11 @@ common_load:
 			break;
 
 		default:
-			/* By design x64 JIT should support all BPF instructions
+			/*
+			 * By design x86-64 JIT should support all BPF instructions.
 			 * This error will be seen if new instruction was added
-			 * to interpreter, but not to JIT
-			 * or if there is junk in bpf_prog
+			 * to the interpreter, but not to the JIT, or if there is
+			 * junk in bpf_prog.
 			 */
 			pr_err("bpf_jit: unknown opcode %02x\n", insn->code);
 			return -EINVAL;
@@ -1184,7 +1214,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 		return orig_prog;
 
 	tmp = bpf_jit_blind_constants(prog);
-	/* If blinding was requested and we failed during blinding,
+	/*
+	 * If blinding was requested and we failed during blinding,
 	 * we must fall back to the interpreter.
 	 */
 	if (IS_ERR(tmp))
@@ -1218,8 +1249,9 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 		goto out_addrs;
 	}
 
-	/* Before first pass, make a rough estimation of addrs[]
-	 * each bpf instruction is translated to less than 64 bytes
+	/*
+	 * Before first pass, make a rough estimation of addrs[]
+	 * each BPF instruction is translated to less than 64 bytes
 	 */
 	for (proglen = 0, i = 0; i < prog->len; i++) {
 		proglen += 64;
@@ -1228,10 +1260,11 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 	ctx.cleanup_addr = proglen;
 skip_init_addrs:
 
-	/* JITed image shrinks with every pass and the loop iterates
-	 * until the image stops shrinking. Very large bpf programs
+	/*
+	 * JITed image shrinks with every pass and the loop iterates
+	 * until the image stops shrinking. Very large BPF programs
 	 * may converge on the last pass. In such case do one more
-	 * pass to emit the final image
+	 * pass to emit the final image.
 	 */
 	for (pass = 0; pass < 20 || image; pass++) {
 		proglen = do_jit(prog, addrs, image, oldproglen, &ctx);
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index bed7e7f..e01f7ce 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -225,7 +225,7 @@ int __init efi_alloc_page_tables(void)
 
 	pud = pud_alloc(&init_mm, p4d, EFI_VA_END);
 	if (!pud) {
-		if (pgtable_l5_enabled)
+		if (pgtable_l5_enabled())
 			free_page((unsigned long) pgd_page_vaddr(*pgd));
 		free_pages((unsigned long)efi_pgd, PGD_ALLOCATION_ORDER);
 		return -ENOMEM;
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index ccf4a49..67ccf64 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -72,7 +72,7 @@ static int set_up_temporary_text_mapping(pgd_t *pgd)
 	 * tables used by the image kernel.
 	 */
 
-	if (pgtable_l5_enabled) {
+	if (pgtable_l5_enabled()) {
 		p4d = (p4d_t *)get_safe_page(GFP_ATOMIC);
 		if (!p4d)
 			return -ENOMEM;
diff --git a/arch/x86/um/vdso/Makefile b/arch/x86/um/vdso/Makefile
index 1000335..b2d6967 100644
--- a/arch/x86/um/vdso/Makefile
+++ b/arch/x86/um/vdso/Makefile
@@ -23,14 +23,14 @@ $(obj)/vdso.o: $(obj)/vdso.so
 
 targets += vdso.so vdso.so.dbg vdso.lds $(vobjs-y)
 
-export CPPFLAGS_vdso.lds += -P -C
+CPPFLAGS_vdso.lds += -P -C
 
 VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \
        -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096
 
 $(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so
 
-$(obj)/vdso.so.dbg: $(src)/vdso.lds $(vobjs) FORCE
+$(obj)/vdso.so.dbg: $(obj)/vdso.lds $(vobjs) FORCE
 	$(call if_changed,vdso)
 
 $(obj)/%.so: OBJCOPYFLAGS := -S
diff --git a/arch/x86/xen/efi.c b/arch/x86/xen/efi.c
index a18703b..1804b27 100644
--- a/arch/x86/xen/efi.c
+++ b/arch/x86/xen/efi.c
@@ -115,6 +115,61 @@ static efi_system_table_t __init *xen_efi_probe(void)
 	return &efi_systab_xen;
 }
 
+/*
+ * Determine whether we're in secure boot mode.
+ *
+ * Please keep the logic in sync with
+ * drivers/firmware/efi/libstub/secureboot.c:efi_get_secureboot().
+ */
+static enum efi_secureboot_mode xen_efi_get_secureboot(void)
+{
+	static efi_guid_t efi_variable_guid = EFI_GLOBAL_VARIABLE_GUID;
+	static efi_guid_t shim_guid = EFI_SHIM_LOCK_GUID;
+	efi_status_t status;
+	u8 moksbstate, secboot, setupmode;
+	unsigned long size;
+
+	size = sizeof(secboot);
+	status = efi.get_variable(L"SecureBoot", &efi_variable_guid,
+				  NULL, &size, &secboot);
+
+	if (status == EFI_NOT_FOUND)
+		return efi_secureboot_mode_disabled;
+
+	if (status != EFI_SUCCESS)
+		goto out_efi_err;
+
+	size = sizeof(setupmode);
+	status = efi.get_variable(L"SetupMode", &efi_variable_guid,
+				  NULL, &size, &setupmode);
+
+	if (status != EFI_SUCCESS)
+		goto out_efi_err;
+
+	if (secboot == 0 || setupmode == 1)
+		return efi_secureboot_mode_disabled;
+
+	/* See if a user has put the shim into insecure mode. */
+	size = sizeof(moksbstate);
+	status = efi.get_variable(L"MokSBStateRT", &shim_guid,
+				  NULL, &size, &moksbstate);
+
+	/* If it fails, we don't care why. Default to secure. */
+	if (status != EFI_SUCCESS)
+		goto secure_boot_enabled;
+
+	if (moksbstate == 1)
+		return efi_secureboot_mode_disabled;
+
+ secure_boot_enabled:
+	pr_info("UEFI Secure Boot is enabled.\n");
+	return efi_secureboot_mode_enabled;
+
+ out_efi_err:
+	pr_err("Could not determine UEFI Secure Boot status.\n");
+	return efi_secureboot_mode_unknown;
+}
+
 void __init xen_efi_init(void)
 {
 	efi_system_table_t *efi_systab_xen;
@@ -129,6 +184,8 @@ void __init xen_efi_init(void)
 	boot_params.efi_info.efi_systab = (__u32)__pa(efi_systab_xen);
 	boot_params.efi_info.efi_systab_hi = (__u32)(__pa(efi_systab_xen) >> 32);
 
+	boot_params.secure_boot = xen_efi_get_secureboot();
+
 	set_bit(EFI_BOOT, &efi.flags);
 	set_bit(EFI_PARAVIRT, &efi.flags);
 	set_bit(EFI_64BIT, &efi.flags);
diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig
index c921e8b..17df3322 100644
--- a/arch/xtensa/Kconfig
+++ b/arch/xtensa/Kconfig
@@ -19,7 +19,6 @@ config XTENSA
 	select HAVE_ARCH_KASAN if MMU
 	select HAVE_CC_STACKPROTECTOR
 	select HAVE_DEBUG_KMEMLEAK
-	select HAVE_DMA_API_DEBUG
 	select HAVE_DMA_CONTIGUOUS
 	select HAVE_EXIT_THREAD
 	select HAVE_FUNCTION_TRACER
diff --git a/arch/xtensa/include/asm/pci.h b/arch/xtensa/include/asm/pci.h
index d5a8215..6ddf0a3 100644
--- a/arch/xtensa/include/asm/pci.h
+++ b/arch/xtensa/include/asm/pci.h
@@ -42,8 +42,6 @@ extern struct pci_controller* pcibios_alloc_controller(void);
  * decisions.
  */
 
-#define PCI_DMA_BUS_IS_PHYS	(1)
-
 /* Tell PCI code what kind of PCI resource mappings we support */
 #define HAVE_PCI_MMAP			1
 #define ARCH_GENERIC_PCI_MMAP_RESOURCE	1
diff --git a/arch/xtensa/kernel/pci-dma.c b/arch/xtensa/kernel/pci-dma.c
index 732631c..392b4a8 100644
--- a/arch/xtensa/kernel/pci-dma.c
+++ b/arch/xtensa/kernel/pci-dma.c
@@ -261,12 +261,3 @@ const struct dma_map_ops xtensa_dma_map_ops = {
 	.mapping_error = xtensa_dma_mapping_error,
 };
 EXPORT_SYMBOL(xtensa_dma_map_ops);
-
-#define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16)
-
-static int __init xtensa_dma_init(void)
-{
-	dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
-	return 0;
-}
-fs_initcall(xtensa_dma_init);
diff --git a/arch/xtensa/kernel/traps.c b/arch/xtensa/kernel/traps.c
index 32c5207..86507fa 100644
--- a/arch/xtensa/kernel/traps.c
+++ b/arch/xtensa/kernel/traps.c
@@ -323,8 +323,6 @@ do_illegal_instruction(struct pt_regs *regs)
 void
 do_unaligned_user (struct pt_regs *regs)
 {
-	siginfo_t info;
-
 	__die_if_kernel("Unhandled unaligned exception in kernel",
 			regs, SIGKILL);
 
@@ -334,12 +332,7 @@ do_unaligned_user (struct pt_regs *regs)
 			    "(pid = %d, pc = %#010lx)\n",
 			    regs->excvaddr, current->comm,
 			    task_pid_nr(current), regs->pc);
-	info.si_signo = SIGBUS;
-	info.si_errno = 0;
-	info.si_code = BUS_ADRALN;
-	info.si_addr = (void *) regs->excvaddr;
-	force_sig_info(SIGSEGV, &info, current);
-
+	force_sig_fault(SIGBUS, BUS_ADRALN, (void *) regs->excvaddr, current);
 }
 #endif
 
diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c
index 8b9b6f4..c111a83 100644
--- a/arch/xtensa/mm/fault.c
+++ b/arch/xtensa/mm/fault.c
@@ -39,13 +39,13 @@ void do_page_fault(struct pt_regs *regs)
 	struct mm_struct *mm = current->mm;
 	unsigned int exccause = regs->exccause;
 	unsigned int address = regs->excvaddr;
-	siginfo_t info;
+	int code;
 
 	int is_write, is_exec;
 	int fault;
 	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
-	info.si_code = SEGV_MAPERR;
+	code = SEGV_MAPERR;
 
 	/* We fault-in kernel-space virtual memory on-demand. The
 	 * 'reference' page table is init_mm.pgd.
@@ -91,7 +91,7 @@ retry:
 	 */
 
 good_area:
-	info.si_code = SEGV_ACCERR;
+	code = SEGV_ACCERR;
 
 	if (is_write) {
 		if (!(vma->vm_flags & VM_WRITE))
@@ -157,11 +157,7 @@ bad_area:
 	if (user_mode(regs)) {
 		current->thread.bad_vaddr = address;
 		current->thread.error_code = is_write;
-		info.si_signo = SIGSEGV;
-		info.si_errno = 0;
-		/* info.si_code has been set above */
-		info.si_addr = (void *) address;
-		force_sig_info(SIGSEGV, &info, current);
+		force_sig_fault(SIGSEGV, code, (void *) address, current);
 		return;
 	}
 	bad_page_fault(regs, address, SIGSEGV);
@@ -186,11 +182,7 @@ do_sigbus:
 	 * or user mode.
 	 */
 	current->thread.bad_vaddr = address;
-	info.si_code = SIGBUS;
-	info.si_errno = 0;
-	info.si_code = BUS_ADRERR;
-	info.si_addr = (void *) address;
-	force_sig_info(SIGBUS, &info, current);
+	force_sig_fault(SIGBUS, BUS_ADRERR, (void *) address, current);
 
 	/* Kernel mode? Handle exceptions or die */
 	if (!user_mode(regs))
diff --git a/arch/xtensa/platforms/iss/console.c b/arch/xtensa/platforms/iss/console.c
index 92f567f..af81a62 100644
--- a/arch/xtensa/platforms/iss/console.c
+++ b/arch/xtensa/platforms/iss/console.c
@@ -153,19 +153,6 @@ static int rs_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int rs_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, rs_proc_show, NULL);
-}
-
-static const struct file_operations rs_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= rs_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static const struct tty_operations serial_ops = {
 	.open = rs_open,
 	.close = rs_close,
@@ -176,7 +163,7 @@ static const struct tty_operations serial_ops = {
 	.chars_in_buffer = rs_chars_in_buffer,
 	.hangup = rs_hangup,
 	.wait_until_sent = rs_wait_until_sent,
-	.proc_fops = &rs_proc_fops,
+	.proc_show = rs_proc_show,
 };
 
 int __init rs_init(void)
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index d819dc7..a9e8633 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -55,13 +55,13 @@ BFQG_FLAG_FNS(empty)
 /* This should be called with the scheduler lock held. */
 static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
 {
-	unsigned long long now;
+	u64 now;
 
 	if (!bfqg_stats_waiting(stats))
 		return;
 
-	now = sched_clock();
-	if (time_after64(now, stats->start_group_wait_time))
+	now = ktime_get_ns();
+	if (now > stats->start_group_wait_time)
 		blkg_stat_add(&stats->group_wait_time,
 			      now - stats->start_group_wait_time);
 	bfqg_stats_clear_waiting(stats);
@@ -77,20 +77,20 @@ static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
 		return;
 	if (bfqg == curr_bfqg)
 		return;
-	stats->start_group_wait_time = sched_clock();
+	stats->start_group_wait_time = ktime_get_ns();
 	bfqg_stats_mark_waiting(stats);
 }
 
 /* This should be called with the scheduler lock held. */
 static void bfqg_stats_end_empty_time(struct bfqg_stats *stats)
 {
-	unsigned long long now;
+	u64 now;
 
 	if (!bfqg_stats_empty(stats))
 		return;
 
-	now = sched_clock();
-	if (time_after64(now, stats->start_empty_time))
+	now = ktime_get_ns();
+	if (now > stats->start_empty_time)
 		blkg_stat_add(&stats->empty_time,
 			      now - stats->start_empty_time);
 	bfqg_stats_clear_empty(stats);
@@ -116,7 +116,7 @@ void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg)
 	if (bfqg_stats_empty(stats))
 		return;
 
-	stats->start_empty_time = sched_clock();
+	stats->start_empty_time = ktime_get_ns();
 	bfqg_stats_mark_empty(stats);
 }
 
@@ -125,9 +125,9 @@ void bfqg_stats_update_idle_time(struct bfq_group *bfqg)
 	struct bfqg_stats *stats = &bfqg->stats;
 
 	if (bfqg_stats_idling(stats)) {
-		unsigned long long now = sched_clock();
+		u64 now = ktime_get_ns();
 
-		if (time_after64(now, stats->start_idle_time))
+		if (now > stats->start_idle_time)
 			blkg_stat_add(&stats->idle_time,
 				      now - stats->start_idle_time);
 		bfqg_stats_clear_idling(stats);
@@ -138,7 +138,7 @@ void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg)
 {
 	struct bfqg_stats *stats = &bfqg->stats;
 
-	stats->start_idle_time = sched_clock();
+	stats->start_idle_time = ktime_get_ns();
 	bfqg_stats_mark_idling(stats);
 }
 
@@ -171,18 +171,18 @@ void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op)
 	blkg_rwstat_add(&bfqg->stats.merged, op, 1);
 }
 
-void bfqg_stats_update_completion(struct bfq_group *bfqg, uint64_t start_time,
-				  uint64_t io_start_time, unsigned int op)
+void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns,
+				  u64 io_start_time_ns, unsigned int op)
 {
 	struct bfqg_stats *stats = &bfqg->stats;
-	unsigned long long now = sched_clock();
+	u64 now = ktime_get_ns();
 
-	if (time_after64(now, io_start_time))
+	if (now > io_start_time_ns)
 		blkg_rwstat_add(&stats->service_time, op,
-				now - io_start_time);
-	if (time_after64(io_start_time, start_time))
+				now - io_start_time_ns);
+	if (io_start_time_ns > start_time_ns)
 		blkg_rwstat_add(&stats->wait_time, op,
-				io_start_time - start_time);
+				io_start_time_ns - start_time_ns);
 }
 
 #else /* CONFIG_BFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
@@ -191,8 +191,8 @@ void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
 			      unsigned int op) { }
 void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { }
 void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { }
-void bfqg_stats_update_completion(struct bfq_group *bfqg, uint64_t start_time,
-				  uint64_t io_start_time, unsigned int op) { }
+void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns,
+				  u64 io_start_time_ns, unsigned int op) { }
 void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { }
 void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { }
 void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { }
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 771ae97..495b9dd 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -49,9 +49,39 @@
  *
  * In particular, to provide these low-latency guarantees, BFQ
  * explicitly privileges the I/O of two classes of time-sensitive
- * applications: interactive and soft real-time. This feature enables
- * BFQ to provide applications in these classes with a very low
- * latency. Finally, BFQ also features additional heuristics for
+ * applications: interactive and soft real-time. In more detail, BFQ
+ * behaves this way if the low_latency parameter is set (default
+ * configuration). This feature enables BFQ to provide applications in
+ * these classes with a very low latency.
+ *
+ * To implement this feature, BFQ constantly tries to detect whether
+ * the I/O requests in a bfq_queue come from an interactive or a soft
+ * real-time application. For brevity, in these cases, the queue is
+ * said to be interactive or soft real-time. In both cases, BFQ
+ * privileges the service of the queue, over that of non-interactive
+ * and non-soft-real-time queues. This privileging is performed,
+ * mainly, by raising the weight of the queue. So, for brevity, we
+ * call just weight-raising periods the time periods during which a
+ * queue is privileged, because deemed interactive or soft real-time.
+ *
+ * The detection of soft real-time queues/applications is described in
+ * detail in the comments on the function
+ * bfq_bfqq_softrt_next_start. On the other hand, the detection of an
+ * interactive queue works as follows: a queue is deemed interactive
+ * if it is constantly non empty only for a limited time interval,
+ * after which it does become empty. The queue may be deemed
+ * interactive again (for a limited time), if it restarts being
+ * constantly non empty, provided that this happens only after the
+ * queue has remained empty for a given minimum idle time.
+ *
+ * By default, BFQ computes automatically the above maximum time
+ * interval, i.e., the time interval after which a constantly
+ * non-empty queue stops being deemed interactive. Since a queue is
+ * weight-raised while it is deemed interactive, this maximum time
+ * interval happens to coincide with the (maximum) duration of the
+ * weight-raising for interactive queues.
+ *
+ * Finally, BFQ also features additional heuristics for
  * preserving both a low latency and a high throughput on NCQ-capable,
  * rotational or flash-based devices, and to get the job done quickly
  * for applications consisting in many I/O-bound processes.
@@ -61,14 +91,14 @@
  * all low-latency heuristics for that device, by setting low_latency
  * to 0.
  *
- * BFQ is described in [1], where also a reference to the initial, more
- * theoretical paper on BFQ can be found. The interested reader can find
- * in the latter paper full details on the main algorithm, as well as
- * formulas of the guarantees and formal proofs of all the properties.
- * With respect to the version of BFQ presented in these papers, this
- * implementation adds a few more heuristics, such as the one that
- * guarantees a low latency to soft real-time applications, and a
- * hierarchical extension based on H-WF2Q+.
+ * BFQ is described in [1], where also a reference to the initial,
+ * more theoretical paper on BFQ can be found. The interested reader
+ * can find in the latter paper full details on the main algorithm, as
+ * well as formulas of the guarantees and formal proofs of all the
+ * properties.  With respect to the version of BFQ presented in these
+ * papers, this implementation adds a few more heuristics, such as the
+ * ones that guarantee a low latency to interactive and soft real-time
+ * applications, and a hierarchical extension based on H-WF2Q+.
  *
  * B-WF2Q+ is based on WF2Q+, which is described in [2], together with
  * H-WF2Q+, while the augmented tree used here to implement B-WF2Q+
@@ -218,56 +248,46 @@ static struct kmem_cache *bfq_pool;
 #define BFQ_RATE_SHIFT		16
 
 /*
- * By default, BFQ computes the duration of the weight raising for
- * interactive applications automatically, using the following formula:
- * duration = (R / r) * T, where r is the peak rate of the device, and
- * R and T are two reference parameters.
- * In particular, R is the peak rate of the reference device (see
- * below), and T is a reference time: given the systems that are
- * likely to be installed on the reference device according to its
- * speed class, T is about the maximum time needed, under BFQ and
- * while reading two files in parallel, to load typical large
- * applications on these systems (see the comments on
- * max_service_from_wr below, for more details on how T is obtained).
- * In practice, the slower/faster the device at hand is, the more/less
- * it takes to load applications with respect to the reference device.
- * Accordingly, the longer/shorter BFQ grants weight raising to
- * interactive applications.
- *
- * BFQ uses four different reference pairs (R, T), depending on:
- * . whether the device is rotational or non-rotational;
- * . whether the device is slow, such as old or portable HDDs, as well as
- *   SD cards, or fast, such as newer HDDs and SSDs.
+ * When configured for computing the duration of the weight-raising
+ * for interactive queues automatically (see the comments at the
+ * beginning of this file), BFQ does it using the following formula:
+ * duration = (ref_rate / r) * ref_wr_duration,
+ * where r is the peak rate of the device, and ref_rate and
+ * ref_wr_duration are two reference parameters.  In particular,
+ * ref_rate is the peak rate of the reference storage device (see
+ * below), and ref_wr_duration is about the maximum time needed, with
+ * BFQ and while reading two files in parallel, to load typical large
+ * applications on the reference device (see the comments on
+ * max_service_from_wr below, for more details on how ref_wr_duration
+ * is obtained).  In practice, the slower/faster the device at hand
+ * is, the more/less it takes to load applications with respect to the
+ * reference device.  Accordingly, the longer/shorter BFQ grants
+ * weight raising to interactive applications.
  *
- * The device's speed class is dynamically (re)detected in
- * bfq_update_peak_rate() every time the estimated peak rate is updated.
+ * BFQ uses two different reference pairs (ref_rate, ref_wr_duration),
+ * depending on whether the device is rotational or non-rotational.
  *
- * In the following definitions, R_slow[0]/R_fast[0] and
- * T_slow[0]/T_fast[0] are the reference values for a slow/fast
- * rotational device, whereas R_slow[1]/R_fast[1] and
- * T_slow[1]/T_fast[1] are the reference values for a slow/fast
- * non-rotational device. Finally, device_speed_thresh are the
- * thresholds used to switch between speed classes. The reference
- * rates are not the actual peak rates of the devices used as a
- * reference, but slightly lower values. The reason for using these
- * slightly lower values is that the peak-rate estimator tends to
- * yield slightly lower values than the actual peak rate (it can yield
- * the actual peak rate only if there is only one process doing I/O,
- * and the process does sequential I/O).
+ * In the following definitions, ref_rate[0] and ref_wr_duration[0]
+ * are the reference values for a rotational device, whereas
+ * ref_rate[1] and ref_wr_duration[1] are the reference values for a
+ * non-rotational device. The reference rates are not the actual peak
+ * rates of the devices used as a reference, but slightly lower
+ * values. The reason for using slightly lower values is that the
+ * peak-rate estimator tends to yield slightly lower values than the
+ * actual peak rate (it can yield the actual peak rate only if there
+ * is only one process doing I/O, and the process does sequential
+ * I/O).
  *
- * Both the reference peak rates and the thresholds are measured in
- * sectors/usec, left-shifted by BFQ_RATE_SHIFT.
+ * The reference peak rates are measured in sectors/usec, left-shifted
+ * by BFQ_RATE_SHIFT.
  */
-static int R_slow[2] = {1000, 10700};
-static int R_fast[2] = {14000, 33000};
+static int ref_rate[2] = {14000, 33000};
 /*
- * To improve readability, a conversion function is used to initialize the
- * following arrays, which entails that they can be initialized only in a
- * function.
+ * To improve readability, a conversion function is used to initialize
+ * the following array, which entails that the array can be
+ * initialized only in a function.
  */
-static int T_slow[2];
-static int T_fast[2];
-static int device_speed_thresh[2];
+static int ref_wr_duration[2];
 
 /*
  * BFQ uses the above-detailed, time-based weight-raising mechanism to
@@ -487,46 +507,6 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd,
 }
 
 /*
- * See the comments on bfq_limit_depth for the purpose of
- * the depths set in the function.
- */
-static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
-{
-	bfqd->sb_shift = bt->sb.shift;
-
-	/*
-	 * In-word depths if no bfq_queue is being weight-raised:
-	 * leaving 25% of tags only for sync reads.
-	 *
-	 * In next formulas, right-shift the value
-	 * (1U<<bfqd->sb_shift), instead of computing directly
-	 * (1U<<(bfqd->sb_shift - something)), to be robust against
-	 * any possible value of bfqd->sb_shift, without having to
-	 * limit 'something'.
-	 */
-	/* no more than 50% of tags for async I/O */
-	bfqd->word_depths[0][0] = max((1U<<bfqd->sb_shift)>>1, 1U);
-	/*
-	 * no more than 75% of tags for sync writes (25% extra tags
-	 * w.r.t. async I/O, to prevent async I/O from starving sync
-	 * writes)
-	 */
-	bfqd->word_depths[0][1] = max(((1U<<bfqd->sb_shift) * 3)>>2, 1U);
-
-	/*
-	 * In-word depths in case some bfq_queue is being weight-
-	 * raised: leaving ~63% of tags for sync reads. This is the
-	 * highest percentage for which, in our tests, application
-	 * start-up times didn't suffer from any regression due to tag
-	 * shortage.
-	 */
-	/* no more than ~18% of tags for async I/O */
-	bfqd->word_depths[1][0] = max(((1U<<bfqd->sb_shift) * 3)>>4, 1U);
-	/* no more than ~37% of tags for sync writes (~20% extra tags) */
-	bfqd->word_depths[1][1] = max(((1U<<bfqd->sb_shift) * 6)>>4, 1U);
-}
-
-/*
  * Async I/O can easily starve sync I/O (both sync reads and sync
  * writes), by consuming all tags. Similarly, storms of sync writes,
  * such as those that sync(2) may trigger, can starve sync reads.
@@ -535,25 +515,11 @@ static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
  */
 static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
 {
-	struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
 	struct bfq_data *bfqd = data->q->elevator->elevator_data;
-	struct sbitmap_queue *bt;
 
 	if (op_is_sync(op) && !op_is_write(op))
 		return;
 
-	if (data->flags & BLK_MQ_REQ_RESERVED) {
-		if (unlikely(!tags->nr_reserved_tags)) {
-			WARN_ON_ONCE(1);
-			return;
-		}
-		bt = &tags->breserved_tags;
-	} else
-		bt = &tags->bitmap_tags;
-
-	if (unlikely(bfqd->sb_shift != bt->sb.shift))
-		bfq_update_depths(bfqd, bt);
-
 	data->shallow_depth =
 		bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)];
 
@@ -906,26 +872,30 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd)
 	if (bfqd->bfq_wr_max_time > 0)
 		return bfqd->bfq_wr_max_time;
 
-	dur = bfqd->RT_prod;
+	dur = bfqd->rate_dur_prod;
 	do_div(dur, bfqd->peak_rate);
 
 	/*
-	 * Limit duration between 3 and 13 seconds. Tests show that
-	 * higher values than 13 seconds often yield the opposite of
-	 * the desired result, i.e., worsen responsiveness by letting
-	 * non-interactive and non-soft-real-time applications
-	 * preserve weight raising for a too long time interval.
+	 * Limit duration between 3 and 25 seconds. The upper limit
+	 * has been conservatively set after the following worst case:
+	 * on a QEMU/KVM virtual machine
+	 * - running in a slow PC
+	 * - with a virtual disk stacked on a slow low-end 5400rpm HDD
+	 * - serving a heavy I/O workload, such as the sequential reading
+	 *   of several files
+	 * mplayer took 23 seconds to start, if constantly weight-raised.
+	 *
+	 * As for higher values than that accomodating the above bad
+	 * scenario, tests show that higher values would often yield
+	 * the opposite of the desired result, i.e., would worsen
+	 * responsiveness by allowing non-interactive applications to
+	 * preserve weight raising for too long.
 	 *
 	 * On the other end, lower values than 3 seconds make it
 	 * difficult for most interactive tasks to complete their jobs
 	 * before weight-raising finishes.
 	 */
-	if (dur > msecs_to_jiffies(13000))
-		dur = msecs_to_jiffies(13000);
-	else if (dur < msecs_to_jiffies(3000))
-		dur = msecs_to_jiffies(3000);
-
-	return dur;
+	return clamp_val(dur, msecs_to_jiffies(3000), msecs_to_jiffies(25000));
 }
 
 /* switch back from soft real-time to interactive weight raising */
@@ -1393,15 +1363,6 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
 }
 
 /*
- * Return the farthest future time instant according to jiffies
- * macros.
- */
-static unsigned long bfq_greatest_from_now(void)
-{
-	return jiffies + MAX_JIFFY_OFFSET;
-}
-
-/*
  * Return the farthest past time instant according to jiffies
  * macros.
  */
@@ -1545,7 +1506,8 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
 	in_burst = bfq_bfqq_in_large_burst(bfqq);
 	soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&
 		!in_burst &&
-		time_is_before_jiffies(bfqq->soft_rt_next_start);
+		time_is_before_jiffies(bfqq->soft_rt_next_start) &&
+		bfqq->dispatched == 0;
 	*interactive = !in_burst && idle_for_long_time;
 	wr_or_deserves_wr = bfqd->low_latency &&
 		(bfqq->wr_coeff > 1 ||
@@ -1858,6 +1820,8 @@ static int bfq_request_merge(struct request_queue *q, struct request **req,
 	return ELEVATOR_NO_MERGE;
 }
 
+static struct bfq_queue *bfq_init_rq(struct request *rq);
+
 static void bfq_request_merged(struct request_queue *q, struct request *req,
 			       enum elv_merge type)
 {
@@ -1866,7 +1830,7 @@ static void bfq_request_merged(struct request_queue *q, struct request *req,
 	    blk_rq_pos(req) <
 	    blk_rq_pos(container_of(rb_prev(&req->rb_node),
 				    struct request, rb_node))) {
-		struct bfq_queue *bfqq = RQ_BFQQ(req);
+		struct bfq_queue *bfqq = bfq_init_rq(req);
 		struct bfq_data *bfqd = bfqq->bfqd;
 		struct request *prev, *next_rq;
 
@@ -1891,14 +1855,25 @@ static void bfq_request_merged(struct request_queue *q, struct request *req,
 	}
 }
 
+/*
+ * This function is called to notify the scheduler that the requests
+ * rq and 'next' have been merged, with 'next' going away.  BFQ
+ * exploits this hook to address the following issue: if 'next' has a
+ * fifo_time lower that rq, then the fifo_time of rq must be set to
+ * the value of 'next', to not forget the greater age of 'next'.
+ *
+ * NOTE: in this function we assume that rq is in a bfq_queue, basing
+ * on that rq is picked from the hash table q->elevator->hash, which,
+ * in its turn, is filled only with I/O requests present in
+ * bfq_queues, while BFQ is in use for the request queue q. In fact,
+ * the function that fills this hash table (elv_rqhash_add) is called
+ * only by bfq_insert_request.
+ */
 static void bfq_requests_merged(struct request_queue *q, struct request *rq,
 				struct request *next)
 {
-	struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next);
-
-	if (!RB_EMPTY_NODE(&rq->rb_node))
-		goto end;
-	spin_lock_irq(&bfqq->bfqd->lock);
+	struct bfq_queue *bfqq = bfq_init_rq(rq),
+		*next_bfqq = bfq_init_rq(next);
 
 	/*
 	 * If next and rq belong to the same bfq_queue and next is older
@@ -1920,11 +1895,6 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq,
 	if (bfqq->next_rq == next)
 		bfqq->next_rq = rq;
 
-	bfq_remove_request(q, next);
-	bfqg_stats_update_io_remove(bfqq_group(bfqq), next->cmd_flags);
-
-	spin_unlock_irq(&bfqq->bfqd->lock);
-end:
 	bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags);
 }
 
@@ -2506,37 +2476,15 @@ static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd)
 /*
  * Update parameters related to throughput and responsiveness, as a
  * function of the estimated peak rate. See comments on
- * bfq_calc_max_budget(), and on T_slow and T_fast arrays.
+ * bfq_calc_max_budget(), and on the ref_wr_duration array.
  */
 static void update_thr_responsiveness_params(struct bfq_data *bfqd)
 {
-	int dev_type = blk_queue_nonrot(bfqd->queue);
-
-	if (bfqd->bfq_user_max_budget == 0)
+	if (bfqd->bfq_user_max_budget == 0) {
 		bfqd->bfq_max_budget =
 			bfq_calc_max_budget(bfqd);
-
-	if (bfqd->device_speed == BFQ_BFQD_FAST &&
-	    bfqd->peak_rate < device_speed_thresh[dev_type]) {
-		bfqd->device_speed = BFQ_BFQD_SLOW;
-		bfqd->RT_prod = R_slow[dev_type] *
-			T_slow[dev_type];
-	} else if (bfqd->device_speed == BFQ_BFQD_SLOW &&
-		   bfqd->peak_rate > device_speed_thresh[dev_type]) {
-		bfqd->device_speed = BFQ_BFQD_FAST;
-		bfqd->RT_prod = R_fast[dev_type] *
-			T_fast[dev_type];
+		bfq_log(bfqd, "new max_budget = %d", bfqd->bfq_max_budget);
 	}
-
-	bfq_log(bfqd,
-"dev_type %s dev_speed_class = %s (%llu sects/sec), thresh %llu setcs/sec",
-		dev_type == 0 ? "ROT" : "NONROT",
-		bfqd->device_speed == BFQ_BFQD_FAST ? "FAST" : "SLOW",
-		bfqd->device_speed == BFQ_BFQD_FAST ?
-		(USEC_PER_SEC*(u64)R_fast[dev_type])>>BFQ_RATE_SHIFT :
-		(USEC_PER_SEC*(u64)R_slow[dev_type])>>BFQ_RATE_SHIFT,
-		(USEC_PER_SEC*(u64)device_speed_thresh[dev_type])>>
-		BFQ_RATE_SHIFT);
 }
 
 static void bfq_reset_rate_computation(struct bfq_data *bfqd,
@@ -3266,23 +3214,6 @@ void bfq_bfqq_expire(struct bfq_data *bfqd,
 				bfq_bfqq_softrt_next_start(bfqd, bfqq);
 		else {
 			/*
-			 * The application is still waiting for the
-			 * completion of one or more requests:
-			 * prevent it from possibly being incorrectly
-			 * deemed as soft real-time by setting its
-			 * soft_rt_next_start to infinity. In fact,
-			 * without this assignment, the application
-			 * would be incorrectly deemed as soft
-			 * real-time if:
-			 * 1) it issued a new request before the
-			 *    completion of all its in-flight
-			 *    requests, and
-			 * 2) at that time, its soft_rt_next_start
-			 *    happened to be in the past.
-			 */
-			bfqq->soft_rt_next_start =
-				bfq_greatest_from_now();
-			/*
 			 * Schedule an update of soft_rt_next_start to when
 			 * the task may be discovered to be isochronous.
 			 */
@@ -4540,14 +4471,12 @@ static inline void bfq_update_insert_stats(struct request_queue *q,
 					   unsigned int cmd_flags) {}
 #endif
 
-static void bfq_prepare_request(struct request *rq, struct bio *bio);
-
 static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 			       bool at_head)
 {
 	struct request_queue *q = hctx->queue;
 	struct bfq_data *bfqd = q->elevator->elevator_data;
-	struct bfq_queue *bfqq = RQ_BFQQ(rq);
+	struct bfq_queue *bfqq;
 	bool idle_timer_disabled = false;
 	unsigned int cmd_flags;
 
@@ -4562,24 +4491,13 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 	blk_mq_sched_request_inserted(rq);
 
 	spin_lock_irq(&bfqd->lock);
+	bfqq = bfq_init_rq(rq);
 	if (at_head || blk_rq_is_passthrough(rq)) {
 		if (at_head)
 			list_add(&rq->queuelist, &bfqd->dispatch);
 		else
 			list_add_tail(&rq->queuelist, &bfqd->dispatch);
-	} else {
-		if (WARN_ON_ONCE(!bfqq)) {
-			/*
-			 * This should never happen. Most likely rq is
-			 * a requeued regular request, being
-			 * re-inserted without being first
-			 * re-prepared. Do a prepare, to avoid
-			 * failure.
-			 */
-			bfq_prepare_request(rq, rq->bio);
-			bfqq = RQ_BFQQ(rq);
-		}
-
+	} else { /* bfqq is assumed to be non null here */
 		idle_timer_disabled = __bfq_insert_request(bfqd, rq);
 		/*
 		 * Update bfqq, because, if a queue merge has occurred
@@ -4778,8 +4696,8 @@ static void bfq_finish_requeue_request(struct request *rq)
 
 	if (rq->rq_flags & RQF_STARTED)
 		bfqg_stats_update_completion(bfqq_group(bfqq),
-					     rq_start_time_ns(rq),
-					     rq_io_start_time_ns(rq),
+					     rq->start_time_ns,
+					     rq->io_start_time_ns,
 					     rq->cmd_flags);
 
 	if (likely(rq->rq_flags & RQF_STARTED)) {
@@ -4922,11 +4840,48 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd,
 }
 
 /*
- * Allocate bfq data structures associated with this request.
+ * Only reset private fields. The actual request preparation will be
+ * performed by bfq_init_rq, when rq is either inserted or merged. See
+ * comments on bfq_init_rq for the reason behind this delayed
+ * preparation.
  */
 static void bfq_prepare_request(struct request *rq, struct bio *bio)
 {
+	/*
+	 * Regardless of whether we have an icq attached, we have to
+	 * clear the scheduler pointers, as they might point to
+	 * previously allocated bic/bfqq structs.
+	 */
+	rq->elv.priv[0] = rq->elv.priv[1] = NULL;
+}
+
+/*
+ * If needed, init rq, allocate bfq data structures associated with
+ * rq, and increment reference counters in the destination bfq_queue
+ * for rq. Return the destination bfq_queue for rq, or NULL is rq is
+ * not associated with any bfq_queue.
+ *
+ * This function is invoked by the functions that perform rq insertion
+ * or merging. One may have expected the above preparation operations
+ * to be performed in bfq_prepare_request, and not delayed to when rq
+ * is inserted or merged. The rationale behind this delayed
+ * preparation is that, after the prepare_request hook is invoked for
+ * rq, rq may still be transformed into a request with no icq, i.e., a
+ * request not associated with any queue. No bfq hook is invoked to
+ * signal this tranformation. As a consequence, should these
+ * preparation operations be performed when the prepare_request hook
+ * is invoked, and should rq be transformed one moment later, bfq
+ * would end up in an inconsistent state, because it would have
+ * incremented some queue counters for an rq destined to
+ * transformation, without any chance to correctly lower these
+ * counters back. In contrast, no transformation can still happen for
+ * rq after rq has been inserted or merged. So, it is safe to execute
+ * these preparation operations when rq is finally inserted or merged.
+ */
+static struct bfq_queue *bfq_init_rq(struct request *rq)
+{
 	struct request_queue *q = rq->q;
+	struct bio *bio = rq->bio;
 	struct bfq_data *bfqd = q->elevator->elevator_data;
 	struct bfq_io_cq *bic;
 	const int is_sync = rq_is_sync(rq);
@@ -4934,20 +4889,21 @@ static void bfq_prepare_request(struct request *rq, struct bio *bio)
 	bool new_queue = false;
 	bool bfqq_already_existing = false, split = false;
 
+	if (unlikely(!rq->elv.icq))
+		return NULL;
+
 	/*
-	 * Even if we don't have an icq attached, we should still clear
-	 * the scheduler pointers, as they might point to previously
-	 * allocated bic/bfqq structs.
+	 * Assuming that elv.priv[1] is set only if everything is set
+	 * for this rq. This holds true, because this function is
+	 * invoked only for insertion or merging, and, after such
+	 * events, a request cannot be manipulated any longer before
+	 * being removed from bfq.
 	 */
-	if (!rq->elv.icq) {
-		rq->elv.priv[0] = rq->elv.priv[1] = NULL;
-		return;
-	}
+	if (rq->elv.priv[1])
+		return rq->elv.priv[1];
 
 	bic = icq_to_bic(rq->elv.icq);
 
-	spin_lock_irq(&bfqd->lock);
-
 	bfq_check_ioprio_change(bic, bio);
 
 	bfq_bic_update_cgroup(bic, bio);
@@ -5006,7 +4962,7 @@ static void bfq_prepare_request(struct request *rq, struct bio *bio)
 	if (unlikely(bfq_bfqq_just_created(bfqq)))
 		bfq_handle_burst(bfqd, bfqq);
 
-	spin_unlock_irq(&bfqd->lock);
+	return bfqq;
 }
 
 static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq)
@@ -5105,6 +5061,64 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
 	__bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
 }
 
+/*
+ * See the comments on bfq_limit_depth for the purpose of
+ * the depths set in the function. Return minimum shallow depth we'll use.
+ */
+static unsigned int bfq_update_depths(struct bfq_data *bfqd,
+				      struct sbitmap_queue *bt)
+{
+	unsigned int i, j, min_shallow = UINT_MAX;
+
+	/*
+	 * In-word depths if no bfq_queue is being weight-raised:
+	 * leaving 25% of tags only for sync reads.
+	 *
+	 * In next formulas, right-shift the value
+	 * (1U<<bt->sb.shift), instead of computing directly
+	 * (1U<<(bt->sb.shift - something)), to be robust against
+	 * any possible value of bt->sb.shift, without having to
+	 * limit 'something'.
+	 */
+	/* no more than 50% of tags for async I/O */
+	bfqd->word_depths[0][0] = max((1U << bt->sb.shift) >> 1, 1U);
+	/*
+	 * no more than 75% of tags for sync writes (25% extra tags
+	 * w.r.t. async I/O, to prevent async I/O from starving sync
+	 * writes)
+	 */
+	bfqd->word_depths[0][1] = max(((1U << bt->sb.shift) * 3) >> 2, 1U);
+
+	/*
+	 * In-word depths in case some bfq_queue is being weight-
+	 * raised: leaving ~63% of tags for sync reads. This is the
+	 * highest percentage for which, in our tests, application
+	 * start-up times didn't suffer from any regression due to tag
+	 * shortage.
+	 */
+	/* no more than ~18% of tags for async I/O */
+	bfqd->word_depths[1][0] = max(((1U << bt->sb.shift) * 3) >> 4, 1U);
+	/* no more than ~37% of tags for sync writes (~20% extra tags) */
+	bfqd->word_depths[1][1] = max(((1U << bt->sb.shift) * 6) >> 4, 1U);
+
+	for (i = 0; i < 2; i++)
+		for (j = 0; j < 2; j++)
+			min_shallow = min(min_shallow, bfqd->word_depths[i][j]);
+
+	return min_shallow;
+}
+
+static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index)
+{
+	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
+	struct blk_mq_tags *tags = hctx->sched_tags;
+	unsigned int min_shallow;
+
+	min_shallow = bfq_update_depths(bfqd, &tags->bitmap_tags);
+	sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, min_shallow);
+	return 0;
+}
+
 static void bfq_exit_queue(struct elevator_queue *e)
 {
 	struct bfq_data *bfqd = e->elevator_data;
@@ -5242,14 +5256,12 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
 	bfqd->wr_busy_queues = 0;
 
 	/*
-	 * Begin by assuming, optimistically, that the device is a
-	 * high-speed one, and that its peak rate is equal to 2/3 of
-	 * the highest reference rate.
+	 * Begin by assuming, optimistically, that the device peak
+	 * rate is equal to 2/3 of the highest reference rate.
 	 */
-	bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] *
-			T_fast[blk_queue_nonrot(bfqd->queue)];
-	bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)] * 2 / 3;
-	bfqd->device_speed = BFQ_BFQD_FAST;
+	bfqd->rate_dur_prod = ref_rate[blk_queue_nonrot(bfqd->queue)] *
+		ref_wr_duration[blk_queue_nonrot(bfqd->queue)];
+	bfqd->peak_rate = ref_rate[blk_queue_nonrot(bfqd->queue)] * 2 / 3;
 
 	spin_lock_init(&bfqd->lock);
 
@@ -5526,6 +5538,7 @@ static struct elevator_type iosched_bfq_mq = {
 		.requests_merged	= bfq_requests_merged,
 		.request_merged		= bfq_request_merged,
 		.has_work		= bfq_has_work,
+		.init_hctx		= bfq_init_hctx,
 		.init_sched		= bfq_init_queue,
 		.exit_sched		= bfq_exit_queue,
 	},
@@ -5556,8 +5569,8 @@ static int __init bfq_init(void)
 	/*
 	 * Times to load large popular applications for the typical
 	 * systems installed on the reference devices (see the
-	 * comments before the definitions of the next two
-	 * arrays). Actually, we use slightly slower values, as the
+	 * comments before the definition of the next
+	 * array). Actually, we use slightly lower values, as the
 	 * estimated peak rate tends to be smaller than the actual
 	 * peak rate.  The reason for this last fact is that estimates
 	 * are computed over much shorter time intervals than the long
@@ -5566,25 +5579,8 @@ static int __init bfq_init(void)
 	 * scheduler cannot rely on a peak-rate-evaluation workload to
 	 * be run for a long time.
 	 */
-	T_slow[0] = msecs_to_jiffies(3500); /* actually 4 sec */
-	T_slow[1] = msecs_to_jiffies(6000); /* actually 6.5 sec */
-	T_fast[0] = msecs_to_jiffies(7000); /* actually 8 sec */
-	T_fast[1] = msecs_to_jiffies(2500); /* actually 3 sec */
-
-	/*
-	 * Thresholds that determine the switch between speed classes
-	 * (see the comments before the definition of the array
-	 * device_speed_thresh). These thresholds are biased towards
-	 * transitions to the fast class. This is safer than the
-	 * opposite bias. In fact, a wrong transition to the slow
-	 * class results in short weight-raising periods, because the
-	 * speed of the device then tends to be higher that the
-	 * reference peak rate. On the opposite end, a wrong
-	 * transition to the fast class tends to increase
-	 * weight-raising periods, because of the opposite reason.
-	 */
-	device_speed_thresh[0] = (4 * R_slow[0]) / 3;
-	device_speed_thresh[1] = (4 * R_slow[1]) / 3;
+	ref_wr_duration[0] = msecs_to_jiffies(7000); /* actually 8 sec */
+	ref_wr_duration[1] = msecs_to_jiffies(2500); /* actually 3 sec */
 
 	ret = elv_register(&iosched_bfq_mq);
 	if (ret)
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index ae2f3da..0f712e0 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -399,11 +399,6 @@ struct bfq_io_cq {
 	struct bfq_ttime saved_ttime;
 };
 
-enum bfq_device_speed {
-	BFQ_BFQD_FAST,
-	BFQ_BFQD_SLOW,
-};
-
 /**
  * struct bfq_data - per-device data structure.
  *
@@ -611,12 +606,11 @@ struct bfq_data {
 	/* Max service-rate for a soft real-time queue, in sectors/sec */
 	unsigned int bfq_wr_max_softrt_rate;
 	/*
-	 * Cached value of the product R*T, used for computing the
-	 * maximum duration of weight raising automatically.
+	 * Cached value of the product ref_rate*ref_wr_duration, used
+	 * for computing the maximum duration of weight raising
+	 * automatically.
 	 */
-	u64 RT_prod;
-	/* device-speed class for the low-latency heuristic */
-	enum bfq_device_speed device_speed;
+	u64 rate_dur_prod;
 
 	/* fallback dummy bfqq for extreme OOM conditions */
 	struct bfq_queue oom_bfqq;
@@ -636,12 +630,6 @@ struct bfq_data {
 	struct bfq_queue *bio_bfqq;
 
 	/*
-	 * Cached sbitmap shift, used to compute depth limits in
-	 * bfq_update_depths.
-	 */
-	unsigned int sb_shift;
-
-	/*
 	 * Depth limits used in bfq_limit_depth (see comments on the
 	 * function)
 	 */
@@ -732,9 +720,9 @@ struct bfqg_stats {
 	/* total time with empty current active q with other requests queued */
 	struct blkg_stat		empty_time;
 	/* fields after this shouldn't be cleared on stat reset */
-	uint64_t			start_group_wait_time;
-	uint64_t			start_idle_time;
-	uint64_t			start_empty_time;
+	u64				start_group_wait_time;
+	u64				start_idle_time;
+	u64				start_empty_time;
 	uint16_t			flags;
 #endif	/* CONFIG_BFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
 };
@@ -856,8 +844,8 @@ void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
 			      unsigned int op);
 void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op);
 void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op);
-void bfqg_stats_update_completion(struct bfq_group *bfqg, uint64_t start_time,
-				  uint64_t io_start_time, unsigned int op);
+void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns,
+				  u64 io_start_time_ns, unsigned int op);
 void bfqg_stats_update_dequeue(struct bfq_group *bfqg);
 void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg);
 void bfqg_stats_update_idle_time(struct bfq_group *bfqg);
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 9cfdd6c..add7c7c 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -56,12 +56,12 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
 	struct bio_set *bs = bio->bi_pool;
 	unsigned inline_vecs;
 
-	if (!bs || !bs->bio_integrity_pool) {
+	if (!bs || !mempool_initialized(&bs->bio_integrity_pool)) {
 		bip = kmalloc(sizeof(struct bio_integrity_payload) +
 			      sizeof(struct bio_vec) * nr_vecs, gfp_mask);
 		inline_vecs = nr_vecs;
 	} else {
-		bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask);
+		bip = mempool_alloc(&bs->bio_integrity_pool, gfp_mask);
 		inline_vecs = BIP_INLINE_VECS;
 	}
 
@@ -74,7 +74,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
 		unsigned long idx = 0;
 
 		bip->bip_vec = bvec_alloc(gfp_mask, nr_vecs, &idx,
-					  bs->bvec_integrity_pool);
+					  &bs->bvec_integrity_pool);
 		if (!bip->bip_vec)
 			goto err;
 		bip->bip_max_vcnt = bvec_nr_vecs(idx);
@@ -90,7 +90,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
 
 	return bip;
 err:
-	mempool_free(bip, bs->bio_integrity_pool);
+	mempool_free(bip, &bs->bio_integrity_pool);
 	return ERR_PTR(-ENOMEM);
 }
 EXPORT_SYMBOL(bio_integrity_alloc);
@@ -111,10 +111,10 @@ static void bio_integrity_free(struct bio *bio)
 		kfree(page_address(bip->bip_vec->bv_page) +
 		      bip->bip_vec->bv_offset);
 
-	if (bs && bs->bio_integrity_pool) {
-		bvec_free(bs->bvec_integrity_pool, bip->bip_vec, bip->bip_slab);
+	if (bs && mempool_initialized(&bs->bio_integrity_pool)) {
+		bvec_free(&bs->bvec_integrity_pool, bip->bip_vec, bip->bip_slab);
 
-		mempool_free(bip, bs->bio_integrity_pool);
+		mempool_free(bip, &bs->bio_integrity_pool);
 	} else {
 		kfree(bip);
 	}
@@ -465,16 +465,15 @@ EXPORT_SYMBOL(bio_integrity_clone);
 
 int bioset_integrity_create(struct bio_set *bs, int pool_size)
 {
-	if (bs->bio_integrity_pool)
+	if (mempool_initialized(&bs->bio_integrity_pool))
 		return 0;
 
-	bs->bio_integrity_pool = mempool_create_slab_pool(pool_size, bip_slab);
-	if (!bs->bio_integrity_pool)
+	if (mempool_init_slab_pool(&bs->bio_integrity_pool,
+				   pool_size, bip_slab))
 		return -1;
 
-	bs->bvec_integrity_pool = biovec_create_pool(pool_size);
-	if (!bs->bvec_integrity_pool) {
-		mempool_destroy(bs->bio_integrity_pool);
+	if (biovec_init_pool(&bs->bvec_integrity_pool, pool_size)) {
+		mempool_exit(&bs->bio_integrity_pool);
 		return -1;
 	}
 
@@ -484,8 +483,8 @@ EXPORT_SYMBOL(bioset_integrity_create);
 
 void bioset_integrity_free(struct bio_set *bs)
 {
-	mempool_destroy(bs->bio_integrity_pool);
-	mempool_destroy(bs->bvec_integrity_pool);
+	mempool_exit(&bs->bio_integrity_pool);
+	mempool_exit(&bs->bvec_integrity_pool);
 }
 EXPORT_SYMBOL(bioset_integrity_free);
 
diff --git a/block/bio.c b/block/bio.c
index 53e0f0a..595663e 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -53,7 +53,7 @@ static struct biovec_slab bvec_slabs[BVEC_POOL_NR] __read_mostly = {
  * fs_bio_set is the bio_set containing bio and iovec memory pools used by
  * IO code that does not need private memory pools.
  */
-struct bio_set *fs_bio_set;
+struct bio_set fs_bio_set;
 EXPORT_SYMBOL(fs_bio_set);
 
 /*
@@ -254,7 +254,7 @@ static void bio_free(struct bio *bio)
 	bio_uninit(bio);
 
 	if (bs) {
-		bvec_free(bs->bvec_pool, bio->bi_io_vec, BVEC_POOL_IDX(bio));
+		bvec_free(&bs->bvec_pool, bio->bi_io_vec, BVEC_POOL_IDX(bio));
 
 		/*
 		 * If we have front padding, adjust the bio pointer before freeing
@@ -262,7 +262,7 @@ static void bio_free(struct bio *bio)
 		p = bio;
 		p -= bs->front_pad;
 
-		mempool_free(p, bs->bio_pool);
+		mempool_free(p, &bs->bio_pool);
 	} else {
 		/* Bio was allocated by bio_kmalloc() */
 		kfree(bio);
@@ -454,7 +454,8 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs,
 		inline_vecs = nr_iovecs;
 	} else {
 		/* should not use nobvec bioset for nr_iovecs > 0 */
-		if (WARN_ON_ONCE(!bs->bvec_pool && nr_iovecs > 0))
+		if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) &&
+				 nr_iovecs > 0))
 			return NULL;
 		/*
 		 * generic_make_request() converts recursion to iteration; this
@@ -483,11 +484,11 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs,
 		    bs->rescue_workqueue)
 			gfp_mask &= ~__GFP_DIRECT_RECLAIM;
 
-		p = mempool_alloc(bs->bio_pool, gfp_mask);
+		p = mempool_alloc(&bs->bio_pool, gfp_mask);
 		if (!p && gfp_mask != saved_gfp) {
 			punt_bios_to_rescuer(bs);
 			gfp_mask = saved_gfp;
-			p = mempool_alloc(bs->bio_pool, gfp_mask);
+			p = mempool_alloc(&bs->bio_pool, gfp_mask);
 		}
 
 		front_pad = bs->front_pad;
@@ -503,11 +504,11 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs,
 	if (nr_iovecs > inline_vecs) {
 		unsigned long idx = 0;
 
-		bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool);
+		bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool);
 		if (!bvl && gfp_mask != saved_gfp) {
 			punt_bios_to_rescuer(bs);
 			gfp_mask = saved_gfp;
-			bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool);
+			bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool);
 		}
 
 		if (unlikely(!bvl))
@@ -524,25 +525,25 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs,
 	return bio;
 
 err_free:
-	mempool_free(p, bs->bio_pool);
+	mempool_free(p, &bs->bio_pool);
 	return NULL;
 }
 EXPORT_SYMBOL(bio_alloc_bioset);
 
-void zero_fill_bio(struct bio *bio)
+void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start)
 {
 	unsigned long flags;
 	struct bio_vec bv;
 	struct bvec_iter iter;
 
-	bio_for_each_segment(bv, bio, iter) {
+	__bio_for_each_segment(bv, bio, iter, start) {
 		char *data = bvec_kmap_irq(&bv, &flags);
 		memset(data, 0, bv.bv_len);
 		flush_dcache_page(bv.bv_page);
 		bvec_kunmap_irq(data, &flags);
 	}
 }
-EXPORT_SYMBOL(zero_fill_bio);
+EXPORT_SYMBOL(zero_fill_bio_iter);
 
 /**
  * bio_put - release a reference to a bio
@@ -970,27 +971,68 @@ void bio_advance(struct bio *bio, unsigned bytes)
 }
 EXPORT_SYMBOL(bio_advance);
 
+void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
+			struct bio *src, struct bvec_iter *src_iter)
+{
+	struct bio_vec src_bv, dst_bv;
+	void *src_p, *dst_p;
+	unsigned bytes;
+
+	while (src_iter->bi_size && dst_iter->bi_size) {
+		src_bv = bio_iter_iovec(src, *src_iter);
+		dst_bv = bio_iter_iovec(dst, *dst_iter);
+
+		bytes = min(src_bv.bv_len, dst_bv.bv_len);
+
+		src_p = kmap_atomic(src_bv.bv_page);
+		dst_p = kmap_atomic(dst_bv.bv_page);
+
+		memcpy(dst_p + dst_bv.bv_offset,
+		       src_p + src_bv.bv_offset,
+		       bytes);
+
+		kunmap_atomic(dst_p);
+		kunmap_atomic(src_p);
+
+		flush_dcache_page(dst_bv.bv_page);
+
+		bio_advance_iter(src, src_iter, bytes);
+		bio_advance_iter(dst, dst_iter, bytes);
+	}
+}
+EXPORT_SYMBOL(bio_copy_data_iter);
+
 /**
- * bio_copy_data - copy contents of data buffers from one chain of bios to
- * another
- * @src: source bio list
- * @dst: destination bio list
- *
- * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats
- * @src and @dst as linked lists of bios.
+ * bio_copy_data - copy contents of data buffers from one bio to another
+ * @src: source bio
+ * @dst: destination bio
  *
  * Stops when it reaches the end of either @src or @dst - that is, copies
  * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios).
  */
 void bio_copy_data(struct bio *dst, struct bio *src)
 {
-	struct bvec_iter src_iter, dst_iter;
-	struct bio_vec src_bv, dst_bv;
-	void *src_p, *dst_p;
-	unsigned bytes;
+	struct bvec_iter src_iter = src->bi_iter;
+	struct bvec_iter dst_iter = dst->bi_iter;
 
-	src_iter = src->bi_iter;
-	dst_iter = dst->bi_iter;
+	bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
+}
+EXPORT_SYMBOL(bio_copy_data);
+
+/**
+ * bio_list_copy_data - copy contents of data buffers from one chain of bios to
+ * another
+ * @src: source bio list
+ * @dst: destination bio list
+ *
+ * Stops when it reaches the end of either the @src list or @dst list - that is,
+ * copies min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of
+ * bios).
+ */
+void bio_list_copy_data(struct bio *dst, struct bio *src)
+{
+	struct bvec_iter src_iter = src->bi_iter;
+	struct bvec_iter dst_iter = dst->bi_iter;
 
 	while (1) {
 		if (!src_iter.bi_size) {
@@ -1009,26 +1051,10 @@ void bio_copy_data(struct bio *dst, struct bio *src)
 			dst_iter = dst->bi_iter;
 		}
 
-		src_bv = bio_iter_iovec(src, src_iter);
-		dst_bv = bio_iter_iovec(dst, dst_iter);
-
-		bytes = min(src_bv.bv_len, dst_bv.bv_len);
-
-		src_p = kmap_atomic(src_bv.bv_page);
-		dst_p = kmap_atomic(dst_bv.bv_page);
-
-		memcpy(dst_p + dst_bv.bv_offset,
-		       src_p + src_bv.bv_offset,
-		       bytes);
-
-		kunmap_atomic(dst_p);
-		kunmap_atomic(src_p);
-
-		bio_advance_iter(src, &src_iter, bytes);
-		bio_advance_iter(dst, &dst_iter, bytes);
+		bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
 	}
 }
-EXPORT_SYMBOL(bio_copy_data);
+EXPORT_SYMBOL(bio_list_copy_data);
 
 struct bio_map_data {
 	int is_our_pages;
@@ -1584,6 +1610,7 @@ void bio_set_pages_dirty(struct bio *bio)
 			set_page_dirty_lock(page);
 	}
 }
+EXPORT_SYMBOL_GPL(bio_set_pages_dirty);
 
 static void bio_release_pages(struct bio *bio)
 {
@@ -1667,6 +1694,7 @@ void bio_check_pages_dirty(struct bio *bio)
 		bio_put(bio);
 	}
 }
+EXPORT_SYMBOL_GPL(bio_check_pages_dirty);
 
 void generic_start_io_acct(struct request_queue *q, int rw,
 			   unsigned long sectors, struct hd_struct *part)
@@ -1749,6 +1777,9 @@ again:
 	if (!bio_integrity_endio(bio))
 		return;
 
+	if (WARN_ONCE(bio->bi_next, "driver left bi_next not NULL"))
+		bio->bi_next = NULL;
+
 	/*
 	 * Need to have a real endio function for chained bios, otherwise
 	 * various corner cases will break (like stacking block devices that
@@ -1848,30 +1879,38 @@ EXPORT_SYMBOL_GPL(bio_trim);
  * create memory pools for biovec's in a bio_set.
  * use the global biovec slabs created for general use.
  */
-mempool_t *biovec_create_pool(int pool_entries)
+int biovec_init_pool(mempool_t *pool, int pool_entries)
 {
 	struct biovec_slab *bp = bvec_slabs + BVEC_POOL_MAX;
 
-	return mempool_create_slab_pool(pool_entries, bp->slab);
+	return mempool_init_slab_pool(pool, pool_entries, bp->slab);
 }
 
-void bioset_free(struct bio_set *bs)
+/*
+ * bioset_exit - exit a bioset initialized with bioset_init()
+ *
+ * May be called on a zeroed but uninitialized bioset (i.e. allocated with
+ * kzalloc()).
+ */
+void bioset_exit(struct bio_set *bs)
 {
 	if (bs->rescue_workqueue)
 		destroy_workqueue(bs->rescue_workqueue);
+	bs->rescue_workqueue = NULL;
 
-	mempool_destroy(bs->bio_pool);
-	mempool_destroy(bs->bvec_pool);
+	mempool_exit(&bs->bio_pool);
+	mempool_exit(&bs->bvec_pool);
 
 	bioset_integrity_free(bs);
-	bio_put_slab(bs);
-
-	kfree(bs);
+	if (bs->bio_slab)
+		bio_put_slab(bs);
+	bs->bio_slab = NULL;
 }
-EXPORT_SYMBOL(bioset_free);
+EXPORT_SYMBOL(bioset_exit);
 
 /**
- * bioset_create  - Create a bio_set
+ * bioset_init - Initialize a bio_set
+ * @bs:		pool to initialize
  * @pool_size:	Number of bio and bio_vecs to cache in the mempool
  * @front_pad:	Number of bytes to allocate in front of the returned bio
  * @flags:	Flags to modify behavior, currently %BIOSET_NEED_BVECS
@@ -1890,16 +1929,12 @@ EXPORT_SYMBOL(bioset_free);
  *    dispatch queued requests when the mempool runs out of space.
  *
  */
-struct bio_set *bioset_create(unsigned int pool_size,
-			      unsigned int front_pad,
-			      int flags)
+int bioset_init(struct bio_set *bs,
+		unsigned int pool_size,
+		unsigned int front_pad,
+		int flags)
 {
 	unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec);
-	struct bio_set *bs;
-
-	bs = kzalloc(sizeof(*bs), GFP_KERNEL);
-	if (!bs)
-		return NULL;
 
 	bs->front_pad = front_pad;
 
@@ -1908,34 +1943,29 @@ struct bio_set *bioset_create(unsigned int pool_size,
 	INIT_WORK(&bs->rescue_work, bio_alloc_rescue);
 
 	bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad);
-	if (!bs->bio_slab) {
-		kfree(bs);
-		return NULL;
-	}
+	if (!bs->bio_slab)
+		return -ENOMEM;
 
-	bs->bio_pool = mempool_create_slab_pool(pool_size, bs->bio_slab);
-	if (!bs->bio_pool)
+	if (mempool_init_slab_pool(&bs->bio_pool, pool_size, bs->bio_slab))
 		goto bad;
 
-	if (flags & BIOSET_NEED_BVECS) {
-		bs->bvec_pool = biovec_create_pool(pool_size);
-		if (!bs->bvec_pool)
-			goto bad;
-	}
+	if ((flags & BIOSET_NEED_BVECS) &&
+	    biovec_init_pool(&bs->bvec_pool, pool_size))
+		goto bad;
 
 	if (!(flags & BIOSET_NEED_RESCUER))
-		return bs;
+		return 0;
 
 	bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0);
 	if (!bs->rescue_workqueue)
 		goto bad;
 
-	return bs;
+	return 0;
 bad:
-	bioset_free(bs);
-	return NULL;
+	bioset_exit(bs);
+	return -ENOMEM;
 }
-EXPORT_SYMBOL(bioset_create);
+EXPORT_SYMBOL(bioset_init);
 
 #ifdef CONFIG_BLK_CGROUP
 
@@ -2020,11 +2050,10 @@ static int __init init_bio(void)
 	bio_integrity_init();
 	biovec_init_slabs();
 
-	fs_bio_set = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
-	if (!fs_bio_set)
+	if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS))
 		panic("bio: can't allocate bios\n");
 
-	if (bioset_integrity_create(fs_bio_set, BIO_POOL_SIZE))
+	if (bioset_integrity_create(&fs_bio_set, BIO_POOL_SIZE))
 		panic("bio: can't create integrity pool\n");
 
 	return 0;
diff --git a/block/blk-core.c b/block/blk-core.c
index 85909b4..3f56be1 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -196,15 +196,8 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
 	RB_CLEAR_NODE(&rq->rb_node);
 	rq->tag = -1;
 	rq->internal_tag = -1;
-	rq->start_time = jiffies;
-	set_start_time_ns(rq);
+	rq->start_time_ns = ktime_get_ns();
 	rq->part = NULL;
-	seqcount_init(&rq->gstate_seq);
-	u64_stats_init(&rq->aborted_gstate_sync);
-	/*
-	 * See comment of blk_mq_init_request
-	 */
-	WRITE_ONCE(rq->gstate, MQ_RQ_GEN_INC);
 }
 EXPORT_SYMBOL(blk_rq_init);
 
@@ -280,6 +273,10 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
 	bio_advance(bio, nbytes);
 
 	/* don't actually finish bio if it's part of flush sequence */
+	/*
+	 * XXX this code looks suspicious - it's not consistent with advancing
+	 * req->bio in caller
+	 */
 	if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ))
 		bio_endio(bio);
 }
@@ -360,7 +357,6 @@ EXPORT_SYMBOL(blk_start_queue_async);
 void blk_start_queue(struct request_queue *q)
 {
 	lockdep_assert_held(q->queue_lock);
-	WARN_ON(!in_interrupt() && !irqs_disabled());
 	WARN_ON_ONCE(q->mq_ops);
 
 	queue_flag_clear(QUEUE_FLAG_STOPPED, q);
@@ -996,18 +992,24 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id,
 					   spinlock_t *lock)
 {
 	struct request_queue *q;
+	int ret;
 
 	q = kmem_cache_alloc_node(blk_requestq_cachep,
 				gfp_mask | __GFP_ZERO, node_id);
 	if (!q)
 		return NULL;
 
+	INIT_LIST_HEAD(&q->queue_head);
+	q->last_merge = NULL;
+	q->end_sector = 0;
+	q->boundary_rq = NULL;
+
 	q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask);
 	if (q->id < 0)
 		goto fail_q;
 
-	q->bio_split = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
-	if (!q->bio_split)
+	ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
+	if (ret)
 		goto fail_id;
 
 	q->backing_dev_info = bdi_alloc_node(gfp_mask, node_id);
@@ -1079,7 +1081,7 @@ fail_bdi:
 fail_stats:
 	bdi_put(q->backing_dev_info);
 fail_split:
-	bioset_free(q->bio_split);
+	bioset_exit(&q->bio_split);
 fail_id:
 	ida_simple_remove(&blk_queue_ida, q->id);
 fail_q:
@@ -1173,16 +1175,8 @@ int blk_init_allocated_queue(struct request_queue *q)
 
 	q->sg_reserved_size = INT_MAX;
 
-	/* Protect q->elevator from elevator_change */
-	mutex_lock(&q->sysfs_lock);
-
-	/* init elevator */
-	if (elevator_init(q, NULL)) {
-		mutex_unlock(&q->sysfs_lock);
+	if (elevator_init(q))
 		goto out_exit_flush_rq;
-	}
-
-	mutex_unlock(&q->sysfs_lock);
 	return 0;
 
 out_exit_flush_rq:
@@ -1334,6 +1328,7 @@ int blk_update_nr_requests(struct request_queue *q, unsigned int nr)
  * @op: operation and flags
  * @bio: bio to allocate request for (can be %NULL)
  * @flags: BLQ_MQ_REQ_* flags
+ * @gfp_mask: allocator flags
  *
  * Get a free request from @q.  This function may fail under memory
  * pressure or if @q is dead.
@@ -1343,7 +1338,7 @@ int blk_update_nr_requests(struct request_queue *q, unsigned int nr)
  * Returns request pointer on success, with @q->queue_lock *not held*.
  */
 static struct request *__get_request(struct request_list *rl, unsigned int op,
-				     struct bio *bio, blk_mq_req_flags_t flags)
+		struct bio *bio, blk_mq_req_flags_t flags, gfp_t gfp_mask)
 {
 	struct request_queue *q = rl->q;
 	struct request *rq;
@@ -1352,8 +1347,6 @@ static struct request *__get_request(struct request_list *rl, unsigned int op,
 	struct io_cq *icq = NULL;
 	const bool is_sync = op_is_sync(op);
 	int may_queue;
-	gfp_t gfp_mask = flags & BLK_MQ_REQ_NOWAIT ? GFP_ATOMIC :
-			 __GFP_DIRECT_RECLAIM;
 	req_flags_t rq_flags = RQF_ALLOCED;
 
 	lockdep_assert_held(q->queue_lock);
@@ -1517,8 +1510,9 @@ rq_starved:
  * @op: operation and flags
  * @bio: bio to allocate request for (can be %NULL)
  * @flags: BLK_MQ_REQ_* flags.
+ * @gfp: allocator flags
  *
- * Get a free request from @q.  If %__GFP_DIRECT_RECLAIM is set in @gfp_mask,
+ * Get a free request from @q.  If %BLK_MQ_REQ_NOWAIT is set in @flags,
  * this function keeps retrying under memory pressure and fails iff @q is dead.
  *
  * Must be called with @q->queue_lock held and,
@@ -1526,7 +1520,7 @@ rq_starved:
  * Returns request pointer on success, with @q->queue_lock *not held*.
  */
 static struct request *get_request(struct request_queue *q, unsigned int op,
-				   struct bio *bio, blk_mq_req_flags_t flags)
+		struct bio *bio, blk_mq_req_flags_t flags, gfp_t gfp)
 {
 	const bool is_sync = op_is_sync(op);
 	DEFINE_WAIT(wait);
@@ -1538,7 +1532,7 @@ static struct request *get_request(struct request_queue *q, unsigned int op,
 
 	rl = blk_get_rl(q, bio);	/* transferred to @rq on success */
 retry:
-	rq = __get_request(rl, op, bio, flags);
+	rq = __get_request(rl, op, bio, flags, gfp);
 	if (!IS_ERR(rq))
 		return rq;
 
@@ -1579,8 +1573,7 @@ static struct request *blk_old_get_request(struct request_queue *q,
 				unsigned int op, blk_mq_req_flags_t flags)
 {
 	struct request *rq;
-	gfp_t gfp_mask = flags & BLK_MQ_REQ_NOWAIT ? GFP_ATOMIC :
-			 __GFP_DIRECT_RECLAIM;
+	gfp_t gfp_mask = flags & BLK_MQ_REQ_NOWAIT ? GFP_ATOMIC : GFP_NOIO;
 	int ret = 0;
 
 	WARN_ON_ONCE(q->mq_ops);
@@ -1592,7 +1585,7 @@ static struct request *blk_old_get_request(struct request_queue *q,
 	if (ret)
 		return ERR_PTR(ret);
 	spin_lock_irq(q->queue_lock);
-	rq = get_request(q, op, NULL, flags);
+	rq = get_request(q, op, NULL, flags, gfp_mask);
 	if (IS_ERR(rq)) {
 		spin_unlock_irq(q->queue_lock);
 		blk_queue_exit(q);
@@ -1607,13 +1600,13 @@ static struct request *blk_old_get_request(struct request_queue *q,
 }
 
 /**
- * blk_get_request_flags - allocate a request
+ * blk_get_request - allocate a request
  * @q: request queue to allocate a request for
  * @op: operation (REQ_OP_*) and REQ_* flags, e.g. REQ_SYNC.
  * @flags: BLK_MQ_REQ_* flags, e.g. BLK_MQ_REQ_NOWAIT.
  */
-struct request *blk_get_request_flags(struct request_queue *q, unsigned int op,
-				      blk_mq_req_flags_t flags)
+struct request *blk_get_request(struct request_queue *q, unsigned int op,
+				blk_mq_req_flags_t flags)
 {
 	struct request *req;
 
@@ -1632,14 +1625,6 @@ struct request *blk_get_request_flags(struct request_queue *q, unsigned int op,
 
 	return req;
 }
-EXPORT_SYMBOL(blk_get_request_flags);
-
-struct request *blk_get_request(struct request_queue *q, unsigned int op,
-				gfp_t gfp_mask)
-{
-	return blk_get_request_flags(q, op, gfp_mask & __GFP_DIRECT_RECLAIM ?
-				     0 : BLK_MQ_REQ_NOWAIT);
-}
 EXPORT_SYMBOL(blk_get_request);
 
 /**
@@ -1660,7 +1645,7 @@ void blk_requeue_request(struct request_queue *q, struct request *rq)
 	blk_delete_timer(rq);
 	blk_clear_rq_complete(rq);
 	trace_block_rq_requeue(q, rq);
-	wbt_requeue(q->rq_wb, &rq->issue_stat);
+	wbt_requeue(q->rq_wb, rq);
 
 	if (rq->rq_flags & RQF_QUEUED)
 		blk_queue_end_tag(q, rq);
@@ -1767,7 +1752,7 @@ void __blk_put_request(struct request_queue *q, struct request *req)
 	/* this is a bio leak */
 	WARN_ON(req->bio != NULL);
 
-	wbt_done(q->rq_wb, &req->issue_stat);
+	wbt_done(q->rq_wb, req);
 
 	/*
 	 * Request may not have originated from ll_rw_blk. if not,
@@ -2066,7 +2051,7 @@ get_rq:
 	 * Returns with the queue unlocked.
 	 */
 	blk_queue_enter_live(q);
-	req = get_request(q, bio->bi_opf, bio, 0);
+	req = get_request(q, bio->bi_opf, bio, 0, GFP_NOIO);
 	if (IS_ERR(req)) {
 		blk_queue_exit(q);
 		__wbt_done(q->rq_wb, wb_acct);
@@ -2078,7 +2063,7 @@ get_rq:
 		goto out_unlock;
 	}
 
-	wbt_track(&req->issue_stat, wb_acct);
+	wbt_track(req, wb_acct);
 
 	/*
 	 * After dropping the lock and possibly sleeping here, our request
@@ -2392,7 +2377,9 @@ blk_qc_t generic_make_request(struct bio *bio)
 
 	if (bio->bi_opf & REQ_NOWAIT)
 		flags = BLK_MQ_REQ_NOWAIT;
-	if (blk_queue_enter(q, flags) < 0) {
+	if (bio_flagged(bio, BIO_QUEUE_ENTERED))
+		blk_queue_enter_live(q);
+	else if (blk_queue_enter(q, flags) < 0) {
 		if (!blk_queue_dying(q) && (bio->bi_opf & REQ_NOWAIT))
 			bio_wouldblock_error(bio);
 		else
@@ -2727,7 +2714,7 @@ void blk_account_io_completion(struct request *req, unsigned int bytes)
 	}
 }
 
-void blk_account_io_done(struct request *req)
+void blk_account_io_done(struct request *req, u64 now)
 {
 	/*
 	 * Account IO completion.  flush_rq isn't accounted as a
@@ -2735,11 +2722,12 @@ void blk_account_io_done(struct request *req)
 	 * containing request is enough.
 	 */
 	if (blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) {
-		unsigned long duration = jiffies - req->start_time;
+		unsigned long duration;
 		const int rw = rq_data_dir(req);
 		struct hd_struct *part;
 		int cpu;
 
+		duration = nsecs_to_jiffies(now - req->start_time_ns);
 		cpu = part_stat_lock();
 		part = req->part;
 
@@ -2970,10 +2958,8 @@ static void blk_dequeue_request(struct request *rq)
 	 * and to it is freed is accounted as io that is in progress at
 	 * the driver side.
 	 */
-	if (blk_account_rq(rq)) {
+	if (blk_account_rq(rq))
 		q->in_flight[rq_is_sync(rq)]++;
-		set_io_start_time_ns(rq);
-	}
 }
 
 /**
@@ -2992,9 +2978,12 @@ void blk_start_request(struct request *req)
 	blk_dequeue_request(req);
 
 	if (test_bit(QUEUE_FLAG_STATS, &req->q->queue_flags)) {
-		blk_stat_set_issue(&req->issue_stat, blk_rq_sectors(req));
+		req->io_start_time_ns = ktime_get_ns();
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+		req->throtl_size = blk_rq_sectors(req);
+#endif
 		req->rq_flags |= RQF_STATS;
-		wbt_issue(req->q->rq_wb, &req->issue_stat);
+		wbt_issue(req->q->rq_wb, req);
 	}
 
 	BUG_ON(blk_rq_is_complete(req));
@@ -3092,8 +3081,10 @@ bool blk_update_request(struct request *req, blk_status_t error,
 		struct bio *bio = req->bio;
 		unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes);
 
-		if (bio_bytes == bio->bi_iter.bi_size)
+		if (bio_bytes == bio->bi_iter.bi_size) {
 			req->bio = bio->bi_next;
+			bio->bi_next = NULL;
+		}
 
 		/* Completion has already been traced */
 		bio_clear_flag(bio, BIO_TRACE_COMPLETION);
@@ -3190,12 +3181,13 @@ EXPORT_SYMBOL_GPL(blk_unprep_request);
 void blk_finish_request(struct request *req, blk_status_t error)
 {
 	struct request_queue *q = req->q;
+	u64 now = ktime_get_ns();
 
 	lockdep_assert_held(req->q->queue_lock);
 	WARN_ON_ONCE(q->mq_ops);
 
 	if (req->rq_flags & RQF_STATS)
-		blk_stat_add(req);
+		blk_stat_add(req, now);
 
 	if (req->rq_flags & RQF_QUEUED)
 		blk_queue_end_tag(q, req);
@@ -3210,10 +3202,10 @@ void blk_finish_request(struct request *req, blk_status_t error)
 	if (req->rq_flags & RQF_DONTPREP)
 		blk_unprep_request(req);
 
-	blk_account_io_done(req);
+	blk_account_io_done(req, now);
 
 	if (req->end_io) {
-		wbt_done(req->q->rq_wb, &req->issue_stat);
+		wbt_done(req->q->rq_wb, req);
 		req->end_io(req, error);
 	} else {
 		if (blk_bidi_rq(req))
@@ -3519,7 +3511,7 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
 	struct bio *bio, *bio_src;
 
 	if (!bs)
-		bs = fs_bio_set;
+		bs = &fs_bio_set;
 
 	__rq_for_each_bio(bio_src, rq_src) {
 		bio = bio_clone_fast(bio_src, gfp_mask, bs);
@@ -3630,7 +3622,7 @@ static void queue_unplugged(struct request_queue *q, unsigned int depth,
 		blk_run_queue_async(q);
 	else
 		__blk_run_queue(q);
-	spin_unlock(q->queue_lock);
+	spin_unlock_irq(q->queue_lock);
 }
 
 static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule)
@@ -3678,7 +3670,6 @@ EXPORT_SYMBOL(blk_check_plugged);
 void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 {
 	struct request_queue *q;
-	unsigned long flags;
 	struct request *rq;
 	LIST_HEAD(list);
 	unsigned int depth;
@@ -3698,11 +3689,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 	q = NULL;
 	depth = 0;
 
-	/*
-	 * Save and disable interrupts here, to avoid doing it for every
-	 * queue lock we have to take.
-	 */
-	local_irq_save(flags);
 	while (!list_empty(&list)) {
 		rq = list_entry_rq(list.next);
 		list_del_init(&rq->queuelist);
@@ -3715,7 +3701,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 				queue_unplugged(q, depth, from_schedule);
 			q = rq->q;
 			depth = 0;
-			spin_lock(q->queue_lock);
+			spin_lock_irq(q->queue_lock);
 		}
 
 		/*
@@ -3742,8 +3728,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 	 */
 	if (q)
 		queue_unplugged(q, depth, from_schedule);
-
-	local_irq_restore(flags);
 }
 
 void blk_finish_plug(struct blk_plug *plug)
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index feb3057..6121611 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -333,34 +333,34 @@ static ssize_t integrity_device_show(struct blk_integrity *bi, char *page)
 }
 
 static struct integrity_sysfs_entry integrity_format_entry = {
-	.attr = { .name = "format", .mode = S_IRUGO },
+	.attr = { .name = "format", .mode = 0444 },
 	.show = integrity_format_show,
 };
 
 static struct integrity_sysfs_entry integrity_tag_size_entry = {
-	.attr = { .name = "tag_size", .mode = S_IRUGO },
+	.attr = { .name = "tag_size", .mode = 0444 },
 	.show = integrity_tag_size_show,
 };
 
 static struct integrity_sysfs_entry integrity_interval_entry = {
-	.attr = { .name = "protection_interval_bytes", .mode = S_IRUGO },
+	.attr = { .name = "protection_interval_bytes", .mode = 0444 },
 	.show = integrity_interval_show,
 };
 
 static struct integrity_sysfs_entry integrity_verify_entry = {
-	.attr = { .name = "read_verify", .mode = S_IRUGO | S_IWUSR },
+	.attr = { .name = "read_verify", .mode = 0644 },
 	.show = integrity_verify_show,
 	.store = integrity_verify_store,
 };
 
 static struct integrity_sysfs_entry integrity_generate_entry = {
-	.attr = { .name = "write_generate", .mode = S_IRUGO | S_IWUSR },
+	.attr = { .name = "write_generate", .mode = 0644 },
 	.show = integrity_generate_show,
 	.store = integrity_generate_store,
 };
 
 static struct integrity_sysfs_entry integrity_device_entry = {
-	.attr = { .name = "device_is_integrity_capable", .mode = S_IRUGO },
+	.attr = { .name = "device_is_integrity_capable", .mode = 0444 },
 	.show = integrity_device_show,
 };
 
diff --git a/block/blk-lib.c b/block/blk-lib.c
index a676084..8faa70f 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -62,10 +62,16 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		unsigned int req_sects;
 		sector_t end_sect, tmp;
 
-		/* Make sure bi_size doesn't overflow */
-		req_sects = min_t(sector_t, nr_sects, UINT_MAX >> 9);
+		/*
+		 * Issue in chunks of the user defined max discard setting,
+		 * ensuring that bi_size doesn't overflow
+		 */
+		req_sects = min_t(sector_t, nr_sects,
+					q->limits.max_discard_sectors);
+		if (req_sects > UINT_MAX >> 9)
+			req_sects = UINT_MAX >> 9;
 
-		/**
+		/*
 		 * If splitting a request, and the next starting sector would be
 		 * misaligned, stop the discard at the previous aligned sector.
 		 */
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 782940c..aaec38c 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -188,16 +188,16 @@ void blk_queue_split(struct request_queue *q, struct bio **bio)
 	switch (bio_op(*bio)) {
 	case REQ_OP_DISCARD:
 	case REQ_OP_SECURE_ERASE:
-		split = blk_bio_discard_split(q, *bio, q->bio_split, &nsegs);
+		split = blk_bio_discard_split(q, *bio, &q->bio_split, &nsegs);
 		break;
 	case REQ_OP_WRITE_ZEROES:
-		split = blk_bio_write_zeroes_split(q, *bio, q->bio_split, &nsegs);
+		split = blk_bio_write_zeroes_split(q, *bio, &q->bio_split, &nsegs);
 		break;
 	case REQ_OP_WRITE_SAME:
-		split = blk_bio_write_same_split(q, *bio, q->bio_split, &nsegs);
+		split = blk_bio_write_same_split(q, *bio, &q->bio_split, &nsegs);
 		break;
 	default:
-		split = blk_bio_segment_split(q, *bio, q->bio_split, &nsegs);
+		split = blk_bio_segment_split(q, *bio, &q->bio_split, &nsegs);
 		break;
 	}
 
@@ -210,6 +210,16 @@ void blk_queue_split(struct request_queue *q, struct bio **bio)
 		/* there isn't chance to merge the splitted bio */
 		split->bi_opf |= REQ_NOMERGE;
 
+		/*
+		 * Since we're recursing into make_request here, ensure
+		 * that we mark this bio as already having entered the queue.
+		 * If not, and the queue is going away, we can get stuck
+		 * forever on waiting for the queue reference to drop. But
+		 * that will never happen, as we're already holding a
+		 * reference to it.
+		 */
+		bio_set_flag(*bio, BIO_QUEUE_ENTERED);
+
 		bio_chain(split, *bio);
 		trace_block_split(q, split, (*bio)->bi_iter.bi_sector);
 		generic_make_request(*bio);
@@ -724,13 +734,12 @@ static struct request *attempt_merge(struct request_queue *q,
 	}
 
 	/*
-	 * At this point we have either done a back merge
-	 * or front merge. We need the smaller start_time of
-	 * the merged requests to be the current request
-	 * for accounting purposes.
+	 * At this point we have either done a back merge or front merge. We
+	 * need the smaller start_time_ns of the merged requests to be the
+	 * current request for accounting purposes.
 	 */
-	if (time_after(req->start_time, next->start_time))
-		req->start_time = next->start_time;
+	if (next->start_time_ns < req->start_time_ns)
+		req->start_time_ns = next->start_time_ns;
 
 	req->biotail->bi_next = next->bio;
 	req->biotail = next->biotail;
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 3080e18..ffa6223 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -344,7 +344,6 @@ static const char *const rqf_name[] = {
 	RQF_NAME(STATS),
 	RQF_NAME(SPECIAL_PAYLOAD),
 	RQF_NAME(ZONE_WRITE_LOCKED),
-	RQF_NAME(MQ_TIMEOUT_EXPIRED),
 	RQF_NAME(MQ_POLL_SLEPT),
 };
 #undef RQF_NAME
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 25c14c5..56c493c 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -268,19 +268,16 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
 EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge);
 
 /*
- * Reverse check our software queue for entries that we could potentially
- * merge with. Currently includes a hand-wavy stop count of 8, to not spend
- * too much time checking for merges.
+ * Iterate list of requests and see if we can merge this bio with any
+ * of them.
  */
-static bool blk_mq_attempt_merge(struct request_queue *q,
-				 struct blk_mq_ctx *ctx, struct bio *bio)
+bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list,
+			   struct bio *bio)
 {
 	struct request *rq;
 	int checked = 8;
 
-	lockdep_assert_held(&ctx->lock);
-
-	list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {
+	list_for_each_entry_reverse(rq, list, queuelist) {
 		bool merged = false;
 
 		if (!checked--)
@@ -305,13 +302,30 @@ static bool blk_mq_attempt_merge(struct request_queue *q,
 			continue;
 		}
 
-		if (merged)
-			ctx->rq_merged++;
 		return merged;
 	}
 
 	return false;
 }
+EXPORT_SYMBOL_GPL(blk_mq_bio_list_merge);
+
+/*
+ * Reverse check our software queue for entries that we could potentially
+ * merge with. Currently includes a hand-wavy stop count of 8, to not spend
+ * too much time checking for merges.
+ */
+static bool blk_mq_attempt_merge(struct request_queue *q,
+				 struct blk_mq_ctx *ctx, struct bio *bio)
+{
+	lockdep_assert_held(&ctx->lock);
+
+	if (blk_mq_bio_list_merge(q, &ctx->rq_list, bio)) {
+		ctx->rq_merged++;
+		return true;
+	}
+
+	return false;
+}
 
 bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
 {
@@ -571,6 +585,7 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
 
 	if (!e) {
 		q->elevator = NULL;
+		q->nr_requests = q->tag_set->queue_depth;
 		return 0;
 	}
 
@@ -633,14 +648,3 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
 	blk_mq_sched_tags_teardown(q);
 	q->elevator = NULL;
 }
-
-int blk_mq_sched_init(struct request_queue *q)
-{
-	int ret;
-
-	mutex_lock(&q->sysfs_lock);
-	ret = elevator_init(q, NULL);
-	mutex_unlock(&q->sysfs_lock);
-
-	return ret;
-}
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index 1e9c901..0cb8f93 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -33,8 +33,6 @@ int blk_mq_sched_init_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
 void blk_mq_sched_exit_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
 			    unsigned int hctx_idx);
 
-int blk_mq_sched_init(struct request_queue *q);
-
 static inline bool
 blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
 {
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index a54b4b0..aafb442 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -166,15 +166,15 @@ static struct attribute *default_ctx_attrs[] = {
 };
 
 static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_tags = {
-	.attr = {.name = "nr_tags", .mode = S_IRUGO },
+	.attr = {.name = "nr_tags", .mode = 0444 },
 	.show = blk_mq_hw_sysfs_nr_tags_show,
 };
 static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_reserved_tags = {
-	.attr = {.name = "nr_reserved_tags", .mode = S_IRUGO },
+	.attr = {.name = "nr_reserved_tags", .mode = 0444 },
 	.show = blk_mq_hw_sysfs_nr_reserved_tags_show,
 };
 static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_cpus = {
-	.attr = {.name = "cpu_list", .mode = S_IRUGO },
+	.attr = {.name = "cpu_list", .mode = 0444 },
 	.show = blk_mq_hw_sysfs_cpus_show,
 };
 
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 336dde0..70356a2a 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -134,6 +134,8 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 	ws = bt_wait_ptr(bt, data->hctx);
 	drop_ctx = data->ctx == NULL;
 	do {
+		struct sbitmap_queue *bt_prev;
+
 		/*
 		 * We're out of tags on this hardware queue, kick any
 		 * pending IO submits before going to sleep waiting for
@@ -159,6 +161,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 		if (data->ctx)
 			blk_mq_put_ctx(data->ctx);
 
+		bt_prev = bt;
 		io_schedule();
 
 		data->ctx = blk_mq_get_ctx(data->q);
@@ -170,6 +173,15 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 			bt = &tags->bitmap_tags;
 
 		finish_wait(&ws->wait, &wait);
+
+		/*
+		 * If destination hw queue is changed, fake wake up on
+		 * previous queue for compensating the wake up miss, so
+		 * other allocations on previous queue won't be starved.
+		 */
+		if (bt != bt_prev)
+			sbitmap_queue_wake_up(bt_prev);
+
 		ws = bt_wait_ptr(bt, data->hctx);
 	} while (1);
 
@@ -259,7 +271,7 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
 	 * test and set the bit before assining ->rqs[].
 	 */
 	rq = tags->rqs[bitnr];
-	if (rq)
+	if (rq && blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT)
 		iter_data->fn(rq, iter_data->data, reserved);
 
 	return true;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 9ce9cac..6332940 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -309,7 +309,8 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 	RB_CLEAR_NODE(&rq->rb_node);
 	rq->rq_disk = NULL;
 	rq->part = NULL;
-	rq->start_time = jiffies;
+	rq->start_time_ns = ktime_get_ns();
+	rq->io_start_time_ns = 0;
 	rq->nr_phys_segments = 0;
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
 	rq->nr_integrity_segments = 0;
@@ -328,11 +329,10 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 
 #ifdef CONFIG_BLK_CGROUP
 	rq->rl = NULL;
-	set_start_time_ns(rq);
-	rq->io_start_time_ns = 0;
 #endif
 
 	data->ctx->rq_dispatched[op_is_sync(op)]++;
+	refcount_set(&rq->ref, 1);
 	return rq;
 }
 
@@ -361,9 +361,11 @@ static struct request *blk_mq_get_request(struct request_queue *q,
 
 		/*
 		 * Flush requests are special and go directly to the
-		 * dispatch list.
+		 * dispatch list. Don't include reserved tags in the
+		 * limiting, as it isn't useful.
 		 */
-		if (!op_is_flush(op) && e->type->ops.mq.limit_depth)
+		if (!op_is_flush(op) && e->type->ops.mq.limit_depth &&
+		    !(data->flags & BLK_MQ_REQ_RESERVED))
 			e->type->ops.mq.limit_depth(op, data);
 	}
 
@@ -464,13 +466,27 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
 }
 EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
 
+static void __blk_mq_free_request(struct request *rq)
+{
+	struct request_queue *q = rq->q;
+	struct blk_mq_ctx *ctx = rq->mq_ctx;
+	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
+	const int sched_tag = rq->internal_tag;
+
+	if (rq->tag != -1)
+		blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
+	if (sched_tag != -1)
+		blk_mq_put_tag(hctx, hctx->sched_tags, ctx, sched_tag);
+	blk_mq_sched_restart(hctx);
+	blk_queue_exit(q);
+}
+
 void blk_mq_free_request(struct request *rq)
 {
 	struct request_queue *q = rq->q;
 	struct elevator_queue *e = q->elevator;
 	struct blk_mq_ctx *ctx = rq->mq_ctx;
 	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
-	const int sched_tag = rq->internal_tag;
 
 	if (rq->rq_flags & RQF_ELVPRIV) {
 		if (e && e->type->ops.mq.finish_request)
@@ -488,27 +504,30 @@ void blk_mq_free_request(struct request *rq)
 	if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
 		laptop_io_completion(q->backing_dev_info);
 
-	wbt_done(q->rq_wb, &rq->issue_stat);
+	wbt_done(q->rq_wb, rq);
 
 	if (blk_rq_rl(rq))
 		blk_put_rl(blk_rq_rl(rq));
 
-	blk_mq_rq_update_state(rq, MQ_RQ_IDLE);
-	if (rq->tag != -1)
-		blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
-	if (sched_tag != -1)
-		blk_mq_put_tag(hctx, hctx->sched_tags, ctx, sched_tag);
-	blk_mq_sched_restart(hctx);
-	blk_queue_exit(q);
+	WRITE_ONCE(rq->state, MQ_RQ_IDLE);
+	if (refcount_dec_and_test(&rq->ref))
+		__blk_mq_free_request(rq);
 }
 EXPORT_SYMBOL_GPL(blk_mq_free_request);
 
 inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
 {
-	blk_account_io_done(rq);
+	u64 now = ktime_get_ns();
+
+	if (rq->rq_flags & RQF_STATS) {
+		blk_mq_poll_stats_start(rq->q);
+		blk_stat_add(rq, now);
+	}
+
+	blk_account_io_done(rq, now);
 
 	if (rq->end_io) {
-		wbt_done(rq->q->rq_wb, &rq->issue_stat);
+		wbt_done(rq->q->rq_wb, rq);
 		rq->end_io(rq, error);
 	} else {
 		if (unlikely(blk_bidi_rq(rq)))
@@ -539,15 +558,12 @@ static void __blk_mq_complete_request(struct request *rq)
 	bool shared = false;
 	int cpu;
 
-	WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT);
-	blk_mq_rq_update_state(rq, MQ_RQ_COMPLETE);
+	if (cmpxchg(&rq->state, MQ_RQ_IN_FLIGHT, MQ_RQ_COMPLETE) !=
+			MQ_RQ_IN_FLIGHT)
+		return;
 
 	if (rq->internal_tag != -1)
 		blk_mq_sched_completed_request(rq);
-	if (rq->rq_flags & RQF_STATS) {
-		blk_mq_poll_stats_start(rq->q);
-		blk_stat_add(rq);
-	}
 
 	if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) {
 		rq->q->softirq_done_fn(rq);
@@ -589,36 +605,6 @@ static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
 		*srcu_idx = srcu_read_lock(hctx->srcu);
 }
 
-static void blk_mq_rq_update_aborted_gstate(struct request *rq, u64 gstate)
-{
-	unsigned long flags;
-
-	/*
-	 * blk_mq_rq_aborted_gstate() is used from the completion path and
-	 * can thus be called from irq context.  u64_stats_fetch in the
-	 * middle of update on the same CPU leads to lockup.  Disable irq
-	 * while updating.
-	 */
-	local_irq_save(flags);
-	u64_stats_update_begin(&rq->aborted_gstate_sync);
-	rq->aborted_gstate = gstate;
-	u64_stats_update_end(&rq->aborted_gstate_sync);
-	local_irq_restore(flags);
-}
-
-static u64 blk_mq_rq_aborted_gstate(struct request *rq)
-{
-	unsigned int start;
-	u64 aborted_gstate;
-
-	do {
-		start = u64_stats_fetch_begin(&rq->aborted_gstate_sync);
-		aborted_gstate = rq->aborted_gstate;
-	} while (u64_stats_fetch_retry(&rq->aborted_gstate_sync, start));
-
-	return aborted_gstate;
-}
-
 /**
  * blk_mq_complete_request - end I/O on a request
  * @rq:		the request being processed
@@ -629,28 +615,9 @@ static u64 blk_mq_rq_aborted_gstate(struct request *rq)
  **/
 void blk_mq_complete_request(struct request *rq)
 {
-	struct request_queue *q = rq->q;
-	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu);
-	int srcu_idx;
-
-	if (unlikely(blk_should_fake_timeout(q)))
+	if (unlikely(blk_should_fake_timeout(rq->q)))
 		return;
-
-	/*
-	 * If @rq->aborted_gstate equals the current instance, timeout is
-	 * claiming @rq and we lost.  This is synchronized through
-	 * hctx_lock().  See blk_mq_timeout_work() for details.
-	 *
-	 * Completion path never blocks and we can directly use RCU here
-	 * instead of hctx_lock() which can be either RCU or SRCU.
-	 * However, that would complicate paths which want to synchronize
-	 * against us.  Let stay in sync with the issue path so that
-	 * hctx_lock() covers both issue and completion paths.
-	 */
-	hctx_lock(hctx, &srcu_idx);
-	if (blk_mq_rq_aborted_gstate(rq) != rq->gstate)
-		__blk_mq_complete_request(rq);
-	hctx_unlock(hctx, srcu_idx);
+	__blk_mq_complete_request(rq);
 }
 EXPORT_SYMBOL(blk_mq_complete_request);
 
@@ -669,32 +636,18 @@ void blk_mq_start_request(struct request *rq)
 	trace_block_rq_issue(q, rq);
 
 	if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
-		blk_stat_set_issue(&rq->issue_stat, blk_rq_sectors(rq));
+		rq->io_start_time_ns = ktime_get_ns();
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+		rq->throtl_size = blk_rq_sectors(rq);
+#endif
 		rq->rq_flags |= RQF_STATS;
-		wbt_issue(q->rq_wb, &rq->issue_stat);
+		wbt_issue(q->rq_wb, rq);
 	}
 
 	WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);
 
-	/*
-	 * Mark @rq in-flight which also advances the generation number,
-	 * and register for timeout.  Protect with a seqcount to allow the
-	 * timeout path to read both @rq->gstate and @rq->deadline
-	 * coherently.
-	 *
-	 * This is the only place where a request is marked in-flight.  If
-	 * the timeout path reads an in-flight @rq->gstate, the
-	 * @rq->deadline it reads together under @rq->gstate_seq is
-	 * guaranteed to be the matching one.
-	 */
-	preempt_disable();
-	write_seqcount_begin(&rq->gstate_seq);
-
-	blk_mq_rq_update_state(rq, MQ_RQ_IN_FLIGHT);
 	blk_add_timer(rq);
-
-	write_seqcount_end(&rq->gstate_seq);
-	preempt_enable();
+	WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT);
 
 	if (q->dma_drain_size && blk_rq_bytes(rq)) {
 		/*
@@ -707,11 +660,6 @@ void blk_mq_start_request(struct request *rq)
 }
 EXPORT_SYMBOL(blk_mq_start_request);
 
-/*
- * When we reach here because queue is busy, it's safe to change the state
- * to IDLE without checking @rq->aborted_gstate because we should still be
- * holding the RCU read lock and thus protected against timeout.
- */
 static void __blk_mq_requeue_request(struct request *rq)
 {
 	struct request_queue *q = rq->q;
@@ -719,10 +667,10 @@ static void __blk_mq_requeue_request(struct request *rq)
 	blk_mq_put_driver_tag(rq);
 
 	trace_block_rq_requeue(q, rq);
-	wbt_requeue(q->rq_wb, &rq->issue_stat);
+	wbt_requeue(q->rq_wb, rq);
 
-	if (blk_mq_rq_state(rq) != MQ_RQ_IDLE) {
-		blk_mq_rq_update_state(rq, MQ_RQ_IDLE);
+	if (blk_mq_request_started(rq)) {
+		WRITE_ONCE(rq->state, MQ_RQ_IDLE);
 		if (q->dma_drain_size && blk_rq_bytes(rq))
 			rq->nr_phys_segments--;
 	}
@@ -820,101 +768,79 @@ struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
 }
 EXPORT_SYMBOL(blk_mq_tag_to_rq);
 
-struct blk_mq_timeout_data {
-	unsigned long next;
-	unsigned int next_set;
-	unsigned int nr_expired;
-};
-
 static void blk_mq_rq_timed_out(struct request *req, bool reserved)
 {
-	const struct blk_mq_ops *ops = req->q->mq_ops;
-	enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;
-
-	req->rq_flags |= RQF_MQ_TIMEOUT_EXPIRED;
-
-	if (ops->timeout)
-		ret = ops->timeout(req, reserved);
+	if (req->q->mq_ops->timeout) {
+		enum blk_eh_timer_return ret;
 
-	switch (ret) {
-	case BLK_EH_HANDLED:
-		__blk_mq_complete_request(req);
-		break;
-	case BLK_EH_RESET_TIMER:
-		/*
-		 * As nothing prevents from completion happening while
-		 * ->aborted_gstate is set, this may lead to ignored
-		 * completions and further spurious timeouts.
-		 */
-		blk_mq_rq_update_aborted_gstate(req, 0);
-		blk_add_timer(req);
-		break;
-	case BLK_EH_NOT_HANDLED:
-		break;
-	default:
-		printk(KERN_ERR "block: bad eh return: %d\n", ret);
-		break;
+		ret = req->q->mq_ops->timeout(req, reserved);
+		if (ret == BLK_EH_DONE)
+			return;
+		WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER);
 	}
+
+	blk_add_timer(req);
 }
 
-static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
-		struct request *rq, void *priv, bool reserved)
+static bool blk_mq_req_expired(struct request *rq, unsigned long *next)
 {
-	struct blk_mq_timeout_data *data = priv;
-	unsigned long gstate, deadline;
-	int start;
+	unsigned long deadline;
 
-	might_sleep();
-
-	if (rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED)
-		return;
+	if (blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT)
+		return false;
 
-	/* read coherent snapshots of @rq->state_gen and @rq->deadline */
-	while (true) {
-		start = read_seqcount_begin(&rq->gstate_seq);
-		gstate = READ_ONCE(rq->gstate);
-		deadline = blk_rq_deadline(rq);
-		if (!read_seqcount_retry(&rq->gstate_seq, start))
-			break;
-		cond_resched();
-	}
+	deadline = blk_rq_deadline(rq);
+	if (time_after_eq(jiffies, deadline))
+		return true;
 
-	/* if in-flight && overdue, mark for abortion */
-	if ((gstate & MQ_RQ_STATE_MASK) == MQ_RQ_IN_FLIGHT &&
-	    time_after_eq(jiffies, deadline)) {
-		blk_mq_rq_update_aborted_gstate(rq, gstate);
-		data->nr_expired++;
-		hctx->nr_expired++;
-	} else if (!data->next_set || time_after(data->next, deadline)) {
-		data->next = deadline;
-		data->next_set = 1;
-	}
+	if (*next == 0)
+		*next = deadline;
+	else if (time_after(*next, deadline))
+		*next = deadline;
+	return false;
 }
 
-static void blk_mq_terminate_expired(struct blk_mq_hw_ctx *hctx,
+static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
 		struct request *rq, void *priv, bool reserved)
 {
+	unsigned long *next = priv;
+
 	/*
-	 * We marked @rq->aborted_gstate and waited for RCU.  If there were
-	 * completions that we lost to, they would have finished and
-	 * updated @rq->gstate by now; otherwise, the completion path is
-	 * now guaranteed to see @rq->aborted_gstate and yield.  If
-	 * @rq->aborted_gstate still matches @rq->gstate, @rq is ours.
+	 * Just do a quick check if it is expired before locking the request in
+	 * so we're not unnecessarilly synchronizing across CPUs.
+	 */
+	if (!blk_mq_req_expired(rq, next))
+		return;
+
+	/*
+	 * We have reason to believe the request may be expired. Take a
+	 * reference on the request to lock this request lifetime into its
+	 * currently allocated context to prevent it from being reallocated in
+	 * the event the completion by-passes this timeout handler.
+	 *
+	 * If the reference was already released, then the driver beat the
+	 * timeout handler to posting a natural completion.
 	 */
-	if (!(rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED) &&
-	    READ_ONCE(rq->gstate) == rq->aborted_gstate)
+	if (!refcount_inc_not_zero(&rq->ref))
+		return;
+
+	/*
+	 * The request is now locked and cannot be reallocated underneath the
+	 * timeout handler's processing. Re-verify this exact request is truly
+	 * expired; if it is not expired, then the request was completed and
+	 * reallocated as a new request.
+	 */
+	if (blk_mq_req_expired(rq, next))
 		blk_mq_rq_timed_out(rq, reserved);
+	if (refcount_dec_and_test(&rq->ref))
+		__blk_mq_free_request(rq);
 }
 
 static void blk_mq_timeout_work(struct work_struct *work)
 {
 	struct request_queue *q =
 		container_of(work, struct request_queue, timeout_work);
-	struct blk_mq_timeout_data data = {
-		.next		= 0,
-		.next_set	= 0,
-		.nr_expired	= 0,
-	};
+	unsigned long next = 0;
 	struct blk_mq_hw_ctx *hctx;
 	int i;
 
@@ -934,39 +860,10 @@ static void blk_mq_timeout_work(struct work_struct *work)
 	if (!percpu_ref_tryget(&q->q_usage_counter))
 		return;
 
-	/* scan for the expired ones and set their ->aborted_gstate */
-	blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data);
-
-	if (data.nr_expired) {
-		bool has_rcu = false;
-
-		/*
-		 * Wait till everyone sees ->aborted_gstate.  The
-		 * sequential waits for SRCUs aren't ideal.  If this ever
-		 * becomes a problem, we can add per-hw_ctx rcu_head and
-		 * wait in parallel.
-		 */
-		queue_for_each_hw_ctx(q, hctx, i) {
-			if (!hctx->nr_expired)
-				continue;
-
-			if (!(hctx->flags & BLK_MQ_F_BLOCKING))
-				has_rcu = true;
-			else
-				synchronize_srcu(hctx->srcu);
-
-			hctx->nr_expired = 0;
-		}
-		if (has_rcu)
-			synchronize_rcu();
-
-		/* terminate the ones we won */
-		blk_mq_queue_tag_busy_iter(q, blk_mq_terminate_expired, NULL);
-	}
+	blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next);
 
-	if (data.next_set) {
-		data.next = blk_rq_timeout(round_jiffies_up(data.next));
-		mod_timer(&q->timeout, data.next);
+	if (next != 0) {
+		mod_timer(&q->timeout, next);
 	} else {
 		/*
 		 * Request timeouts are handled as a forward rolling timer. If
@@ -1029,7 +926,7 @@ static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr,
 	struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
 
 	spin_lock(&ctx->lock);
-	if (unlikely(!list_empty(&ctx->rq_list))) {
+	if (!list_empty(&ctx->rq_list)) {
 		dispatch_data->rq = list_entry_rq(ctx->rq_list.next);
 		list_del_init(&dispatch_data->rq->queuelist);
 		if (list_empty(&ctx->rq_list))
@@ -1716,15 +1613,6 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
 	blk_account_io_start(rq, true);
 }
 
-static inline void blk_mq_queue_io(struct blk_mq_hw_ctx *hctx,
-				   struct blk_mq_ctx *ctx,
-				   struct request *rq)
-{
-	spin_lock(&ctx->lock);
-	__blk_mq_insert_request(hctx, rq, false);
-	spin_unlock(&ctx->lock);
-}
-
 static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq)
 {
 	if (rq->tag != -1)
@@ -1882,7 +1770,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		return BLK_QC_T_NONE;
 	}
 
-	wbt_track(&rq->issue_stat, wb_acct);
+	wbt_track(rq, wb_acct);
 
 	cookie = request_to_qc_t(data.hctx, rq);
 
@@ -1949,15 +1837,10 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		blk_mq_put_ctx(data.ctx);
 		blk_mq_bio_to_request(rq, bio);
 		blk_mq_try_issue_directly(data.hctx, rq, &cookie);
-	} else if (q->elevator) {
-		blk_mq_put_ctx(data.ctx);
-		blk_mq_bio_to_request(rq, bio);
-		blk_mq_sched_insert_request(rq, false, true, true);
 	} else {
 		blk_mq_put_ctx(data.ctx);
 		blk_mq_bio_to_request(rq, bio);
-		blk_mq_queue_io(data.hctx, data.ctx, rq);
-		blk_mq_run_hw_queue(data.hctx, true);
+		blk_mq_sched_insert_request(rq, false, true, true);
 	}
 
 	return cookie;
@@ -2056,15 +1939,7 @@ static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
 			return ret;
 	}
 
-	seqcount_init(&rq->gstate_seq);
-	u64_stats_init(&rq->aborted_gstate_sync);
-	/*
-	 * start gstate with gen 1 instead of 0, otherwise it will be equal
-	 * to aborted_gstate, and be identified timed out by
-	 * blk_mq_terminate_expired.
-	 */
-	WRITE_ONCE(rq->gstate, MQ_RQ_GEN_INC);
-
+	WRITE_ONCE(rq->state, MQ_RQ_IDLE);
 	return 0;
 }
 
@@ -2365,6 +2240,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 	queue_for_each_hw_ctx(q, hctx, i) {
 		cpumask_clear(hctx->cpumask);
 		hctx->nr_ctx = 0;
+		hctx->dispatch_from = NULL;
 	}
 
 	/*
@@ -2697,7 +2573,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	if (!(set->flags & BLK_MQ_F_NO_SCHED)) {
 		int ret;
 
-		ret = blk_mq_sched_init(q);
+		ret = elevator_init_mq(q);
 		if (ret)
 			return ERR_PTR(ret);
 	}
diff --git a/block/blk-mq.h b/block/blk-mq.h
index e1bb420..89231e4 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -30,20 +30,6 @@ struct blk_mq_ctx {
 	struct kobject		kobj;
 } ____cacheline_aligned_in_smp;
 
-/*
- * Bits for request->gstate.  The lower two bits carry MQ_RQ_* state value
- * and the upper bits the generation number.
- */
-enum mq_rq_state {
-	MQ_RQ_IDLE		= 0,
-	MQ_RQ_IN_FLIGHT		= 1,
-	MQ_RQ_COMPLETE		= 2,
-
-	MQ_RQ_STATE_BITS	= 2,
-	MQ_RQ_STATE_MASK	= (1 << MQ_RQ_STATE_BITS) - 1,
-	MQ_RQ_GEN_INC		= 1 << MQ_RQ_STATE_BITS,
-};
-
 void blk_mq_freeze_queue(struct request_queue *q);
 void blk_mq_free_queue(struct request_queue *q);
 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
@@ -107,33 +93,9 @@ void blk_mq_release(struct request_queue *q);
  * blk_mq_rq_state() - read the current MQ_RQ_* state of a request
  * @rq: target request.
  */
-static inline int blk_mq_rq_state(struct request *rq)
+static inline enum mq_rq_state blk_mq_rq_state(struct request *rq)
 {
-	return READ_ONCE(rq->gstate) & MQ_RQ_STATE_MASK;
-}
-
-/**
- * blk_mq_rq_update_state() - set the current MQ_RQ_* state of a request
- * @rq: target request.
- * @state: new state to set.
- *
- * Set @rq's state to @state.  The caller is responsible for ensuring that
- * there are no other updaters.  A request can transition into IN_FLIGHT
- * only from IDLE and doing so increments the generation number.
- */
-static inline void blk_mq_rq_update_state(struct request *rq,
-					  enum mq_rq_state state)
-{
-	u64 old_val = READ_ONCE(rq->gstate);
-	u64 new_val = (old_val & ~MQ_RQ_STATE_MASK) | state;
-
-	if (state == MQ_RQ_IN_FLIGHT) {
-		WARN_ON_ONCE((old_val & MQ_RQ_STATE_MASK) != MQ_RQ_IDLE);
-		new_val += MQ_RQ_GEN_INC;
-	}
-
-	/* avoid exposing interim values */
-	WRITE_ONCE(rq->gstate, new_val);
+	return READ_ONCE(rq->state);
 }
 
 static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
diff --git a/block/blk-stat.c b/block/blk-stat.c
index bd365a9..175c143 100644
--- a/block/blk-stat.c
+++ b/block/blk-stat.c
@@ -47,19 +47,15 @@ static void __blk_stat_add(struct blk_rq_stat *stat, u64 value)
 	stat->nr_samples++;
 }
 
-void blk_stat_add(struct request *rq)
+void blk_stat_add(struct request *rq, u64 now)
 {
 	struct request_queue *q = rq->q;
 	struct blk_stat_callback *cb;
 	struct blk_rq_stat *stat;
 	int bucket;
-	u64 now, value;
+	u64 value;
 
-	now = __blk_stat_time(ktime_to_ns(ktime_get()));
-	if (now < blk_stat_time(&rq->issue_stat))
-		return;
-
-	value = now - blk_stat_time(&rq->issue_stat);
+	value = (now >= rq->io_start_time_ns) ? now - rq->io_start_time_ns : 0;
 
 	blk_throtl_stat_add(rq, value);
 
diff --git a/block/blk-stat.h b/block/blk-stat.h
index 2dd3634..78399cd 100644
--- a/block/blk-stat.h
+++ b/block/blk-stat.h
@@ -8,21 +8,6 @@
 #include <linux/rcupdate.h>
 #include <linux/timer.h>
 
-/*
- * from upper:
- * 3 bits: reserved for other usage
- * 12 bits: size
- * 49 bits: time
- */
-#define BLK_STAT_RES_BITS	3
-#define BLK_STAT_SIZE_BITS	12
-#define BLK_STAT_RES_SHIFT	(64 - BLK_STAT_RES_BITS)
-#define BLK_STAT_SIZE_SHIFT	(BLK_STAT_RES_SHIFT - BLK_STAT_SIZE_BITS)
-#define BLK_STAT_TIME_MASK	((1ULL << BLK_STAT_SIZE_SHIFT) - 1)
-#define BLK_STAT_SIZE_MASK	\
-	(((1ULL << BLK_STAT_SIZE_BITS) - 1) << BLK_STAT_SIZE_SHIFT)
-#define BLK_STAT_RES_MASK	(~((1ULL << BLK_STAT_RES_SHIFT) - 1))
-
 /**
  * struct blk_stat_callback - Block statistics callback.
  *
@@ -80,35 +65,7 @@ struct blk_stat_callback {
 struct blk_queue_stats *blk_alloc_queue_stats(void);
 void blk_free_queue_stats(struct blk_queue_stats *);
 
-void blk_stat_add(struct request *);
-
-static inline u64 __blk_stat_time(u64 time)
-{
-	return time & BLK_STAT_TIME_MASK;
-}
-
-static inline u64 blk_stat_time(struct blk_issue_stat *stat)
-{
-	return __blk_stat_time(stat->stat);
-}
-
-static inline sector_t blk_capped_size(sector_t size)
-{
-	return size & ((1ULL << BLK_STAT_SIZE_BITS) - 1);
-}
-
-static inline sector_t blk_stat_size(struct blk_issue_stat *stat)
-{
-	return (stat->stat & BLK_STAT_SIZE_MASK) >> BLK_STAT_SIZE_SHIFT;
-}
-
-static inline void blk_stat_set_issue(struct blk_issue_stat *stat,
-	sector_t size)
-{
-	stat->stat = (stat->stat & BLK_STAT_RES_MASK) |
-		(ktime_to_ns(ktime_get()) & BLK_STAT_TIME_MASK) |
-		(((u64)blk_capped_size(size)) << BLK_STAT_SIZE_SHIFT);
-}
+void blk_stat_add(struct request *rq, u64 now);
 
 /* record time/size info in request but not add a callback */
 void blk_stat_enable_accounting(struct request_queue *q);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index d00d1b0..94987b1 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -491,188 +491,198 @@ static ssize_t queue_wc_store(struct request_queue *q, const char *page,
 	return count;
 }
 
+static ssize_t queue_fua_show(struct request_queue *q, char *page)
+{
+	return sprintf(page, "%u\n", test_bit(QUEUE_FLAG_FUA, &q->queue_flags));
+}
+
 static ssize_t queue_dax_show(struct request_queue *q, char *page)
 {
 	return queue_var_show(blk_queue_dax(q), page);
 }
 
 static struct queue_sysfs_entry queue_requests_entry = {
-	.attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
+	.attr = {.name = "nr_requests", .mode = 0644 },
 	.show = queue_requests_show,
 	.store = queue_requests_store,
 };
 
 static struct queue_sysfs_entry queue_ra_entry = {
-	.attr = {.name = "read_ahead_kb", .mode = S_IRUGO | S_IWUSR },
+	.attr = {.name = "read_ahead_kb", .mode = 0644 },
 	.show = queue_ra_show,
 	.store = queue_ra_store,
 };
 
 static struct queue_sysfs_entry queue_max_sectors_entry = {
-	.attr = {.name = "max_sectors_kb", .mode = S_IRUGO | S_IWUSR },
+	.attr = {.name = "max_sectors_kb", .mode = 0644 },
 	.show = queue_max_sectors_show,
 	.store = queue_max_sectors_store,
 };
 
 static struct queue_sysfs_entry queue_max_hw_sectors_entry = {
-	.attr = {.name = "max_hw_sectors_kb", .mode = S_IRUGO },
+	.attr = {.name = "max_hw_sectors_kb", .mode = 0444 },
 	.show = queue_max_hw_sectors_show,
 };
 
 static struct queue_sysfs_entry queue_max_segments_entry = {
-	.attr = {.name = "max_segments", .mode = S_IRUGO },
+	.attr = {.name = "max_segments", .mode = 0444 },
 	.show = queue_max_segments_show,
 };
 
 static struct queue_sysfs_entry queue_max_discard_segments_entry = {
-	.attr = {.name = "max_discard_segments", .mode = S_IRUGO },
+	.attr = {.name = "max_discard_segments", .mode = 0444 },
 	.show = queue_max_discard_segments_show,
 };
 
 static struct queue_sysfs_entry queue_max_integrity_segments_entry = {
-	.attr = {.name = "max_integrity_segments", .mode = S_IRUGO },
+	.attr = {.name = "max_integrity_segments", .mode = 0444 },
 	.show = queue_max_integrity_segments_show,
 };
 
 static struct queue_sysfs_entry queue_max_segment_size_entry = {
-	.attr = {.name = "max_segment_size", .mode = S_IRUGO },
+	.attr = {.name = "max_segment_size", .mode = 0444 },
 	.show = queue_max_segment_size_show,
 };
 
 static struct queue_sysfs_entry queue_iosched_entry = {
-	.attr = {.name = "scheduler", .mode = S_IRUGO | S_IWUSR },
+	.attr = {.name = "scheduler", .mode = 0644 },
 	.show = elv_iosched_show,
 	.store = elv_iosched_store,
 };
 
 static struct queue_sysfs_entry queue_hw_sector_size_entry = {
-	.attr = {.name = "hw_sector_size", .mode = S_IRUGO },
+	.attr = {.name = "hw_sector_size", .mode = 0444 },
 	.show = queue_logical_block_size_show,
 };
 
 static struct queue_sysfs_entry queue_logical_block_size_entry = {
-	.attr = {.name = "logical_block_size", .mode = S_IRUGO },
+	.attr = {.name = "logical_block_size", .mode = 0444 },
 	.show = queue_logical_block_size_show,
 };
 
 static struct queue_sysfs_entry queue_physical_block_size_entry = {
-	.attr = {.name = "physical_block_size", .mode = S_IRUGO },
+	.attr = {.name = "physical_block_size", .mode = 0444 },
 	.show = queue_physical_block_size_show,
 };
 
 static struct queue_sysfs_entry queue_chunk_sectors_entry = {
-	.attr = {.name = "chunk_sectors", .mode = S_IRUGO },
+	.attr = {.name = "chunk_sectors", .mode = 0444 },
 	.show = queue_chunk_sectors_show,
 };
 
 static struct queue_sysfs_entry queue_io_min_entry = {
-	.attr = {.name = "minimum_io_size", .mode = S_IRUGO },
+	.attr = {.name = "minimum_io_size", .mode = 0444 },
 	.show = queue_io_min_show,
 };
 
 static struct queue_sysfs_entry queue_io_opt_entry = {
-	.attr = {.name = "optimal_io_size", .mode = S_IRUGO },
+	.attr = {.name = "optimal_io_size", .mode = 0444 },
 	.show = queue_io_opt_show,
 };
 
 static struct queue_sysfs_entry queue_discard_granularity_entry = {
-	.attr = {.name = "discard_granularity", .mode = S_IRUGO },
+	.attr = {.name = "discard_granularity", .mode = 0444 },
 	.show = queue_discard_granularity_show,
 };
 
 static struct queue_sysfs_entry queue_discard_max_hw_entry = {
-	.attr = {.name = "discard_max_hw_bytes", .mode = S_IRUGO },
+	.attr = {.name = "discard_max_hw_bytes", .mode = 0444 },
 	.show = queue_discard_max_hw_show,
 };
 
 static struct queue_sysfs_entry queue_discard_max_entry = {
-	.attr = {.name = "discard_max_bytes", .mode = S_IRUGO | S_IWUSR },
+	.attr = {.name = "discard_max_bytes", .mode = 0644 },
 	.show = queue_discard_max_show,
 	.store = queue_discard_max_store,
 };
 
 static struct queue_sysfs_entry queue_discard_zeroes_data_entry = {
-	.attr = {.name = "discard_zeroes_data", .mode = S_IRUGO },
+	.attr = {.name = "discard_zeroes_data", .mode = 0444 },
 	.show = queue_discard_zeroes_data_show,
 };
 
 static struct queue_sysfs_entry queue_write_same_max_entry = {
-	.attr = {.name = "write_same_max_bytes", .mode = S_IRUGO },
+	.attr = {.name = "write_same_max_bytes", .mode = 0444 },
 	.show = queue_write_same_max_show,
 };
 
 static struct queue_sysfs_entry queue_write_zeroes_max_entry = {
-	.attr = {.name = "write_zeroes_max_bytes", .mode = S_IRUGO },
+	.attr = {.name = "write_zeroes_max_bytes", .mode = 0444 },
 	.show = queue_write_zeroes_max_show,
 };
 
 static struct queue_sysfs_entry queue_nonrot_entry = {
-	.attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR },
+	.attr = {.name = "rotational", .mode = 0644 },
 	.show = queue_show_nonrot,
 	.store = queue_store_nonrot,
 };
 
 static struct queue_sysfs_entry queue_zoned_entry = {
-	.attr = {.name = "zoned", .mode = S_IRUGO },
+	.attr = {.name = "zoned", .mode = 0444 },
 	.show = queue_zoned_show,
 };
 
 static struct queue_sysfs_entry queue_nomerges_entry = {
-	.attr = {.name = "nomerges", .mode = S_IRUGO | S_IWUSR },
+	.attr = {.name = "nomerges", .mode = 0644 },
 	.show = queue_nomerges_show,
 	.store = queue_nomerges_store,
 };
 
 static struct queue_sysfs_entry queue_rq_affinity_entry = {
-	.attr = {.name = "rq_affinity", .mode = S_IRUGO | S_IWUSR },
+	.attr = {.name = "rq_affinity", .mode = 0644 },
 	.show = queue_rq_affinity_show,
 	.store = queue_rq_affinity_store,
 };
 
 static struct queue_sysfs_entry queue_iostats_entry = {
-	.attr = {.name = "iostats", .mode = S_IRUGO | S_IWUSR },
+	.attr = {.name = "iostats", .mode = 0644 },
 	.show = queue_show_iostats,
 	.store = queue_store_iostats,
 };
 
 static struct queue_sysfs_entry queue_random_entry = {
-	.attr = {.name = "add_random", .mode = S_IRUGO | S_IWUSR },
+	.attr = {.name = "add_random", .mode = 0644 },
 	.show = queue_show_random,
 	.store = queue_store_random,
 };
 
 static struct queue_sysfs_entry queue_poll_entry = {
-	.attr = {.name = "io_poll", .mode = S_IRUGO | S_IWUSR },
+	.attr = {.name = "io_poll", .mode = 0644 },
 	.show = queue_poll_show,
 	.store = queue_poll_store,
 };
 
 static struct queue_sysfs_entry queue_poll_delay_entry = {
-	.attr = {.name = "io_poll_delay", .mode = S_IRUGO | S_IWUSR },
+	.attr = {.name = "io_poll_delay", .mode = 0644 },
 	.show = queue_poll_delay_show,
 	.store = queue_poll_delay_store,
 };
 
 static struct queue_sysfs_entry queue_wc_entry = {
-	.attr = {.name = "write_cache", .mode = S_IRUGO | S_IWUSR },
+	.attr = {.name = "write_cache", .mode = 0644 },
 	.show = queue_wc_show,
 	.store = queue_wc_store,
 };
 
+static struct queue_sysfs_entry queue_fua_entry = {
+	.attr = {.name = "fua", .mode = 0444 },
+	.show = queue_fua_show,
+};
+
 static struct queue_sysfs_entry queue_dax_entry = {
-	.attr = {.name = "dax", .mode = S_IRUGO },
+	.attr = {.name = "dax", .mode = 0444 },
 	.show = queue_dax_show,
 };
 
 static struct queue_sysfs_entry queue_wb_lat_entry = {
-	.attr = {.name = "wbt_lat_usec", .mode = S_IRUGO | S_IWUSR },
+	.attr = {.name = "wbt_lat_usec", .mode = 0644 },
 	.show = queue_wb_lat_show,
 	.store = queue_wb_lat_store,
 };
 
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
 static struct queue_sysfs_entry throtl_sample_time_entry = {
-	.attr = {.name = "throttle_sample_time", .mode = S_IRUGO | S_IWUSR },
+	.attr = {.name = "throttle_sample_time", .mode = 0644 },
 	.show = blk_throtl_sample_time_show,
 	.store = blk_throtl_sample_time_store,
 };
@@ -708,6 +718,7 @@ static struct attribute *default_attrs[] = {
 	&queue_random_entry.attr,
 	&queue_poll_entry.attr,
 	&queue_wc_entry.attr,
+	&queue_fua_entry.attr,
 	&queue_dax_entry.attr,
 	&queue_wb_lat_entry.attr,
 	&queue_poll_delay_entry.attr,
@@ -813,8 +824,7 @@ static void __blk_release_queue(struct work_struct *work)
 	if (q->mq_ops)
 		blk_mq_debugfs_unregister(q);
 
-	if (q->bio_split)
-		bioset_free(q->bio_split);
+	bioset_exit(&q->bio_split);
 
 	ida_simple_remove(&blk_queue_ida, q->id);
 	call_rcu(&q->rcu_head, blk_free_queue_rcu);
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index c5a1316..82282e6 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -36,8 +36,6 @@ static int throtl_quantum = 32;
  */
 #define LATENCY_FILTERED_HD (1000L) /* 1ms */
 
-#define SKIP_LATENCY (((u64)1) << BLK_STAT_RES_SHIFT)
-
 static struct blkcg_policy blkcg_policy_throtl;
 
 /* A workqueue to queue throttle related work */
@@ -821,7 +819,7 @@ static bool throtl_slice_used(struct throtl_grp *tg, bool rw)
 	if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
 		return false;
 
-	return 1;
+	return true;
 }
 
 /* Trim the used slices and adjust slice start accordingly */
@@ -931,7 +929,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
 
 	if (wait)
 		*wait = jiffy_wait;
-	return 0;
+	return false;
 }
 
 static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
@@ -974,7 +972,7 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
 	jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);
 	if (wait)
 		*wait = jiffy_wait;
-	return 0;
+	return false;
 }
 
 /*
@@ -1024,7 +1022,7 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
 	    tg_with_in_iops_limit(tg, bio, &iops_wait)) {
 		if (wait)
 			*wait = 0;
-		return 1;
+		return true;
 	}
 
 	max_wait = max(bps_wait, iops_wait);
@@ -1035,7 +1033,7 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
 	if (time_before(tg->slice_end[rw], jiffies + max_wait))
 		throtl_extend_slice(tg, rw, jiffies + max_wait);
 
-	return 0;
+	return false;
 }
 
 static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
@@ -1209,7 +1207,7 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
 
 	while (1) {
 		struct throtl_grp *tg = throtl_rb_first(parent_sq);
-		struct throtl_service_queue *sq = &tg->service_queue;
+		struct throtl_service_queue *sq;
 
 		if (!tg)
 			break;
@@ -1221,6 +1219,7 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
 
 		nr_disp += throtl_dispatch_tg(tg);
 
+		sq = &tg->service_queue;
 		if (sq->nr_queued[0] || sq->nr_queued[1])
 			tg_update_disptime(tg);
 
@@ -2139,7 +2138,7 @@ static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio)
 		bio->bi_cg_private = tg;
 		blkg_get(tg_to_blkg(tg));
 	}
-	blk_stat_set_issue(&bio->bi_issue_stat, bio_sectors(bio));
+	bio_issue_init(&bio->bi_issue, bio_sectors(bio));
 #endif
 }
 
@@ -2251,7 +2250,7 @@ out:
 
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
 	if (throttled || !td->track_bio_latency)
-		bio->bi_issue_stat.stat |= SKIP_LATENCY;
+		bio->bi_issue.value |= BIO_ISSUE_THROTL_SKIP_LATENCY;
 #endif
 	return throttled;
 }
@@ -2281,8 +2280,7 @@ void blk_throtl_stat_add(struct request *rq, u64 time_ns)
 	struct request_queue *q = rq->q;
 	struct throtl_data *td = q->td;
 
-	throtl_track_latency(td, blk_stat_size(&rq->issue_stat),
-		req_op(rq), time_ns >> 10);
+	throtl_track_latency(td, rq->throtl_size, req_op(rq), time_ns >> 10);
 }
 
 void blk_throtl_bio_endio(struct bio *bio)
@@ -2302,8 +2300,8 @@ void blk_throtl_bio_endio(struct bio *bio)
 	finish_time_ns = ktime_get_ns();
 	tg->last_finish_time = finish_time_ns >> 10;
 
-	start_time = blk_stat_time(&bio->bi_issue_stat) >> 10;
-	finish_time = __blk_stat_time(finish_time_ns) >> 10;
+	start_time = bio_issue_time(&bio->bi_issue) >> 10;
+	finish_time = __bio_issue_time(finish_time_ns) >> 10;
 	if (!start_time || finish_time <= start_time) {
 		blkg_put(tg_to_blkg(tg));
 		return;
@@ -2311,16 +2309,15 @@ void blk_throtl_bio_endio(struct bio *bio)
 
 	lat = finish_time - start_time;
 	/* this is only for bio based driver */
-	if (!(bio->bi_issue_stat.stat & SKIP_LATENCY))
-		throtl_track_latency(tg->td, blk_stat_size(&bio->bi_issue_stat),
-			bio_op(bio), lat);
+	if (!(bio->bi_issue.value & BIO_ISSUE_THROTL_SKIP_LATENCY))
+		throtl_track_latency(tg->td, bio_issue_size(&bio->bi_issue),
+				     bio_op(bio), lat);
 
 	if (tg->latency_target && lat >= tg->td->filtered_latency) {
 		int bucket;
 		unsigned int threshold;
 
-		bucket = request_bucket_index(
-			blk_stat_size(&bio->bi_issue_stat));
+		bucket = request_bucket_index(bio_issue_size(&bio->bi_issue));
 		threshold = tg->td->avg_buckets[rw][bucket].latency +
 			tg->latency_target;
 		if (lat > threshold)
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index 652d4d4..4b8a48d 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -86,14 +86,11 @@ static void blk_rq_timed_out(struct request *req)
 	if (q->rq_timed_out_fn)
 		ret = q->rq_timed_out_fn(req);
 	switch (ret) {
-	case BLK_EH_HANDLED:
-		__blk_complete_request(req);
-		break;
 	case BLK_EH_RESET_TIMER:
 		blk_add_timer(req);
 		blk_clear_rq_complete(req);
 		break;
-	case BLK_EH_NOT_HANDLED:
+	case BLK_EH_DONE:
 		/*
 		 * LLD handles this for now but in the future
 		 * we can send a request msg to abort the command
@@ -214,7 +211,6 @@ void blk_add_timer(struct request *req)
 		req->timeout = q->rq_timeout;
 
 	blk_rq_set_deadline(req, jiffies + req->timeout);
-	req->rq_flags &= ~RQF_MQ_TIMEOUT_EXPIRED;
 
 	/*
 	 * Only the non-mq case needs to add the request to a protected list.
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index f92fc84..4f89b28 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -29,6 +29,26 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/wbt.h>
 
+static inline void wbt_clear_state(struct request *rq)
+{
+	rq->wbt_flags = 0;
+}
+
+static inline enum wbt_flags wbt_flags(struct request *rq)
+{
+	return rq->wbt_flags;
+}
+
+static inline bool wbt_is_tracked(struct request *rq)
+{
+	return rq->wbt_flags & WBT_TRACKED;
+}
+
+static inline bool wbt_is_read(struct request *rq)
+{
+	return rq->wbt_flags & WBT_READ;
+}
+
 enum {
 	/*
 	 * Default setting, we'll scale up (to 75% of QD max) or down (min 1)
@@ -101,9 +121,15 @@ static bool wb_recent_wait(struct rq_wb *rwb)
 	return time_before(jiffies, wb->dirty_sleep + HZ);
 }
 
-static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb, bool is_kswapd)
+static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb,
+					  enum wbt_flags wb_acct)
 {
-	return &rwb->rq_wait[is_kswapd];
+	if (wb_acct & WBT_KSWAPD)
+		return &rwb->rq_wait[WBT_RWQ_KSWAPD];
+	else if (wb_acct & WBT_DISCARD)
+		return &rwb->rq_wait[WBT_RWQ_DISCARD];
+
+	return &rwb->rq_wait[WBT_RWQ_BG];
 }
 
 static void rwb_wake_all(struct rq_wb *rwb)
@@ -126,7 +152,7 @@ void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct)
 	if (!(wb_acct & WBT_TRACKED))
 		return;
 
-	rqw = get_rq_wait(rwb, wb_acct & WBT_KSWAPD);
+	rqw = get_rq_wait(rwb, wb_acct);
 	inflight = atomic_dec_return(&rqw->inflight);
 
 	/*
@@ -139,10 +165,13 @@ void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct)
 	}
 
 	/*
-	 * If the device does write back caching, drop further down
-	 * before we wake people up.
+	 * For discards, our limit is always the background. For writes, if
+	 * the device does write back caching, drop further down before we
+	 * wake people up.
 	 */
-	if (rwb->wc && !wb_recent_wait(rwb))
+	if (wb_acct & WBT_DISCARD)
+		limit = rwb->wb_background;
+	else if (rwb->wc && !wb_recent_wait(rwb))
 		limit = 0;
 	else
 		limit = rwb->wb_normal;
@@ -165,24 +194,24 @@ void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct)
  * Called on completion of a request. Note that it's also called when
  * a request is merged, when the request gets freed.
  */
-void wbt_done(struct rq_wb *rwb, struct blk_issue_stat *stat)
+void wbt_done(struct rq_wb *rwb, struct request *rq)
 {
 	if (!rwb)
 		return;
 
-	if (!wbt_is_tracked(stat)) {
-		if (rwb->sync_cookie == stat) {
+	if (!wbt_is_tracked(rq)) {
+		if (rwb->sync_cookie == rq) {
 			rwb->sync_issue = 0;
 			rwb->sync_cookie = NULL;
 		}
 
-		if (wbt_is_read(stat))
+		if (wbt_is_read(rq))
 			wb_timestamp(rwb, &rwb->last_comp);
 	} else {
-		WARN_ON_ONCE(stat == rwb->sync_cookie);
-		__wbt_done(rwb, wbt_stat_to_mask(stat));
+		WARN_ON_ONCE(rq == rwb->sync_cookie);
+		__wbt_done(rwb, wbt_flags(rq));
 	}
-	wbt_clear_state(stat);
+	wbt_clear_state(rq);
 }
 
 /*
@@ -479,6 +508,9 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
 {
 	unsigned int limit;
 
+	if ((rw & REQ_OP_MASK) == REQ_OP_DISCARD)
+		return rwb->wb_background;
+
 	/*
 	 * At this point we know it's a buffered write. If this is
 	 * kswapd trying to free memory, or REQ_SYNC is set, then
@@ -529,11 +561,12 @@ static inline bool may_queue(struct rq_wb *rwb, struct rq_wait *rqw,
  * Block if we will exceed our limit, or if we are currently waiting for
  * the timer to kick off queuing again.
  */
-static void __wbt_wait(struct rq_wb *rwb, unsigned long rw, spinlock_t *lock)
+static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct,
+		       unsigned long rw, spinlock_t *lock)
 	__releases(lock)
 	__acquires(lock)
 {
-	struct rq_wait *rqw = get_rq_wait(rwb, current_is_kswapd());
+	struct rq_wait *rqw = get_rq_wait(rwb, wb_acct);
 	DEFINE_WAIT(wait);
 
 	if (may_queue(rwb, rqw, &wait, rw))
@@ -559,21 +592,20 @@ static void __wbt_wait(struct rq_wb *rwb, unsigned long rw, spinlock_t *lock)
 
 static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio)
 {
-	const int op = bio_op(bio);
-
-	/*
-	 * If not a WRITE, do nothing
-	 */
-	if (op != REQ_OP_WRITE)
-		return false;
-
-	/*
-	 * Don't throttle WRITE_ODIRECT
-	 */
-	if ((bio->bi_opf & (REQ_SYNC | REQ_IDLE)) == (REQ_SYNC | REQ_IDLE))
+	switch (bio_op(bio)) {
+	case REQ_OP_WRITE:
+		/*
+		 * Don't throttle WRITE_ODIRECT
+		 */
+		if ((bio->bi_opf & (REQ_SYNC | REQ_IDLE)) ==
+		    (REQ_SYNC | REQ_IDLE))
+			return false;
+		/* fallthrough */
+	case REQ_OP_DISCARD:
+		return true;
+	default:
 		return false;
-
-	return true;
+	}
 }
 
 /*
@@ -584,7 +616,7 @@ static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio)
  */
 enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock)
 {
-	unsigned int ret = 0;
+	enum wbt_flags ret = 0;
 
 	if (!rwb_enabled(rwb))
 		return 0;
@@ -598,41 +630,42 @@ enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock)
 		return ret;
 	}
 
-	__wbt_wait(rwb, bio->bi_opf, lock);
+	if (current_is_kswapd())
+		ret |= WBT_KSWAPD;
+	if (bio_op(bio) == REQ_OP_DISCARD)
+		ret |= WBT_DISCARD;
+
+	__wbt_wait(rwb, ret, bio->bi_opf, lock);
 
 	if (!blk_stat_is_active(rwb->cb))
 		rwb_arm_timer(rwb);
 
-	if (current_is_kswapd())
-		ret |= WBT_KSWAPD;
-
 	return ret | WBT_TRACKED;
 }
 
-void wbt_issue(struct rq_wb *rwb, struct blk_issue_stat *stat)
+void wbt_issue(struct rq_wb *rwb, struct request *rq)
 {
 	if (!rwb_enabled(rwb))
 		return;
 
 	/*
-	 * Track sync issue, in case it takes a long time to complete. Allows
-	 * us to react quicker, if a sync IO takes a long time to complete.
-	 * Note that this is just a hint. 'stat' can go away when the
-	 * request completes, so it's important we never dereference it. We
-	 * only use the address to compare with, which is why we store the
-	 * sync_issue time locally.
+	 * Track sync issue, in case it takes a long time to complete. Allows us
+	 * to react quicker, if a sync IO takes a long time to complete. Note
+	 * that this is just a hint. The request can go away when it completes,
+	 * so it's important we never dereference it. We only use the address to
+	 * compare with, which is why we store the sync_issue time locally.
 	 */
-	if (wbt_is_read(stat) && !rwb->sync_issue) {
-		rwb->sync_cookie = stat;
-		rwb->sync_issue = blk_stat_time(stat);
+	if (wbt_is_read(rq) && !rwb->sync_issue) {
+		rwb->sync_cookie = rq;
+		rwb->sync_issue = rq->io_start_time_ns;
 	}
 }
 
-void wbt_requeue(struct rq_wb *rwb, struct blk_issue_stat *stat)
+void wbt_requeue(struct rq_wb *rwb, struct request *rq)
 {
 	if (!rwb_enabled(rwb))
 		return;
-	if (stat == rwb->sync_cookie) {
+	if (rq == rwb->sync_cookie) {
 		rwb->sync_issue = 0;
 		rwb->sync_cookie = NULL;
 	}
@@ -701,7 +734,7 @@ static int wbt_data_dir(const struct request *rq)
 
 	if (op == REQ_OP_READ)
 		return READ;
-	else if (op == REQ_OP_WRITE || op == REQ_OP_FLUSH)
+	else if (op_is_write(op))
 		return WRITE;
 
 	/* don't account */
@@ -713,8 +746,6 @@ int wbt_init(struct request_queue *q)
 	struct rq_wb *rwb;
 	int i;
 
-	BUILD_BUG_ON(WBT_NR_BITS > BLK_STAT_RES_BITS);
-
 	rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
 	if (!rwb)
 		return -ENOMEM;
diff --git a/block/blk-wbt.h b/block/blk-wbt.h
index a232c98..300df53 100644
--- a/block/blk-wbt.h
+++ b/block/blk-wbt.h
@@ -14,12 +14,16 @@ enum wbt_flags {
 	WBT_TRACKED		= 1,	/* write, tracked for throttling */
 	WBT_READ		= 2,	/* read */
 	WBT_KSWAPD		= 4,	/* write, from kswapd */
+	WBT_DISCARD		= 8,	/* discard */
 
-	WBT_NR_BITS		= 3,	/* number of bits */
+	WBT_NR_BITS		= 4,	/* number of bits */
 };
 
 enum {
-	WBT_NUM_RWQ		= 2,
+	WBT_RWQ_BG		= 0,
+	WBT_RWQ_KSWAPD,
+	WBT_RWQ_DISCARD,
+	WBT_NUM_RWQ,
 };
 
 /*
@@ -31,31 +35,6 @@ enum {
 	WBT_STATE_ON_MANUAL	= 2,
 };
 
-static inline void wbt_clear_state(struct blk_issue_stat *stat)
-{
-	stat->stat &= ~BLK_STAT_RES_MASK;
-}
-
-static inline enum wbt_flags wbt_stat_to_mask(struct blk_issue_stat *stat)
-{
-	return (stat->stat & BLK_STAT_RES_MASK) >> BLK_STAT_RES_SHIFT;
-}
-
-static inline void wbt_track(struct blk_issue_stat *stat, enum wbt_flags wb_acct)
-{
-	stat->stat |= ((u64) wb_acct) << BLK_STAT_RES_SHIFT;
-}
-
-static inline bool wbt_is_tracked(struct blk_issue_stat *stat)
-{
-	return (stat->stat >> BLK_STAT_RES_SHIFT) & WBT_TRACKED;
-}
-
-static inline bool wbt_is_read(struct blk_issue_stat *stat)
-{
-	return (stat->stat >> BLK_STAT_RES_SHIFT) & WBT_READ;
-}
-
 struct rq_wait {
 	wait_queue_head_t wait;
 	atomic_t inflight;
@@ -84,7 +63,7 @@ struct rq_wb {
 
 	struct blk_stat_callback *cb;
 
-	s64 sync_issue;
+	u64 sync_issue;
 	void *sync_cookie;
 
 	unsigned int wc;
@@ -109,14 +88,19 @@ static inline unsigned int wbt_inflight(struct rq_wb *rwb)
 
 #ifdef CONFIG_BLK_WBT
 
+static inline void wbt_track(struct request *rq, enum wbt_flags flags)
+{
+	rq->wbt_flags |= flags;
+}
+
 void __wbt_done(struct rq_wb *, enum wbt_flags);
-void wbt_done(struct rq_wb *, struct blk_issue_stat *);
+void wbt_done(struct rq_wb *, struct request *);
 enum wbt_flags wbt_wait(struct rq_wb *, struct bio *, spinlock_t *);
 int wbt_init(struct request_queue *);
 void wbt_exit(struct request_queue *);
 void wbt_update_limits(struct rq_wb *);
-void wbt_requeue(struct rq_wb *, struct blk_issue_stat *);
-void wbt_issue(struct rq_wb *, struct blk_issue_stat *);
+void wbt_requeue(struct rq_wb *, struct request *);
+void wbt_issue(struct rq_wb *, struct request *);
 void wbt_disable_default(struct request_queue *);
 void wbt_enable_default(struct request_queue *);
 
@@ -127,10 +111,13 @@ u64 wbt_default_latency_nsec(struct request_queue *);
 
 #else
 
+static inline void wbt_track(struct request *rq, enum wbt_flags flags)
+{
+}
 static inline void __wbt_done(struct rq_wb *rwb, enum wbt_flags flags)
 {
 }
-static inline void wbt_done(struct rq_wb *rwb, struct blk_issue_stat *stat)
+static inline void wbt_done(struct rq_wb *rwb, struct request *rq)
 {
 }
 static inline enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio,
@@ -148,10 +135,10 @@ static inline void wbt_exit(struct request_queue *q)
 static inline void wbt_update_limits(struct rq_wb *rwb)
 {
 }
-static inline void wbt_requeue(struct rq_wb *rwb, struct blk_issue_stat *stat)
+static inline void wbt_requeue(struct rq_wb *rwb, struct request *rq)
 {
 }
-static inline void wbt_issue(struct rq_wb *rwb, struct blk_issue_stat *stat)
+static inline void wbt_issue(struct rq_wb *rwb, struct request *rq)
 {
 }
 static inline void wbt_disable_default(struct request_queue *q)
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 08e84ef..3d08dc84 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -328,7 +328,11 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode,
 	if (!rep.nr_zones)
 		return -EINVAL;
 
-	zones = kcalloc(rep.nr_zones, sizeof(struct blk_zone), GFP_KERNEL);
+	if (rep.nr_zones > INT_MAX / sizeof(struct blk_zone))
+		return -ERANGE;
+
+	zones = kvmalloc(rep.nr_zones * sizeof(struct blk_zone),
+			GFP_KERNEL | __GFP_ZERO);
 	if (!zones)
 		return -ENOMEM;
 
@@ -350,7 +354,7 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode,
 	}
 
  out:
-	kfree(zones);
+	kvfree(zones);
 
 	return ret;
 }
diff --git a/block/blk.h b/block/blk.h
index b034fd2..8d23aea9 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -186,7 +186,7 @@ unsigned int blk_plug_queued_count(struct request_queue *q);
 
 void blk_account_io_start(struct request *req, bool new_io);
 void blk_account_io_completion(struct request *req, unsigned int bytes);
-void blk_account_io_done(struct request *req);
+void blk_account_io_done(struct request *req, u64 now);
 
 /*
  * EH timer and IO completion will both attempt to 'grab' the request, make
@@ -231,6 +231,9 @@ static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq
 		e->type->ops.sq.elevator_deactivate_req_fn(q, rq);
 }
 
+int elevator_init(struct request_queue *);
+int elevator_init_mq(struct request_queue *q);
+void elevator_exit(struct request_queue *, struct elevator_queue *);
 int elv_register_queue(struct request_queue *q);
 void elv_unregister_queue(struct request_queue *q);
 
diff --git a/block/bounce.c b/block/bounce.c
index dd0b93f..fd31347 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -28,28 +28,29 @@
 #define POOL_SIZE	64
 #define ISA_POOL_SIZE	16
 
-static struct bio_set *bounce_bio_set, *bounce_bio_split;
-static mempool_t *page_pool, *isa_page_pool;
+static struct bio_set bounce_bio_set, bounce_bio_split;
+static mempool_t page_pool, isa_page_pool;
 
 #if defined(CONFIG_HIGHMEM)
 static __init int init_emergency_pool(void)
 {
+	int ret;
 #if defined(CONFIG_HIGHMEM) && !defined(CONFIG_MEMORY_HOTPLUG)
 	if (max_pfn <= max_low_pfn)
 		return 0;
 #endif
 
-	page_pool = mempool_create_page_pool(POOL_SIZE, 0);
-	BUG_ON(!page_pool);
+	ret = mempool_init_page_pool(&page_pool, POOL_SIZE, 0);
+	BUG_ON(ret);
 	pr_info("pool size: %d pages\n", POOL_SIZE);
 
-	bounce_bio_set = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
-	BUG_ON(!bounce_bio_set);
-	if (bioset_integrity_create(bounce_bio_set, BIO_POOL_SIZE))
+	ret = bioset_init(&bounce_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
+	BUG_ON(ret);
+	if (bioset_integrity_create(&bounce_bio_set, BIO_POOL_SIZE))
 		BUG_ON(1);
 
-	bounce_bio_split = bioset_create(BIO_POOL_SIZE, 0, 0);
-	BUG_ON(!bounce_bio_split);
+	ret = bioset_init(&bounce_bio_split, BIO_POOL_SIZE, 0, 0);
+	BUG_ON(ret);
 
 	return 0;
 }
@@ -63,14 +64,11 @@ __initcall(init_emergency_pool);
  */
 static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
 {
-	unsigned long flags;
 	unsigned char *vto;
 
-	local_irq_save(flags);
 	vto = kmap_atomic(to->bv_page);
 	memcpy(vto + to->bv_offset, vfrom, to->bv_len);
 	kunmap_atomic(vto);
-	local_irq_restore(flags);
 }
 
 #else /* CONFIG_HIGHMEM */
@@ -94,12 +92,14 @@ static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data)
  */
 int init_emergency_isa_pool(void)
 {
-	if (isa_page_pool)
+	int ret;
+
+	if (mempool_initialized(&isa_page_pool))
 		return 0;
 
-	isa_page_pool = mempool_create(ISA_POOL_SIZE, mempool_alloc_pages_isa,
-				       mempool_free_pages, (void *) 0);
-	BUG_ON(!isa_page_pool);
+	ret = mempool_init(&isa_page_pool, ISA_POOL_SIZE, mempool_alloc_pages_isa,
+			   mempool_free_pages, (void *) 0);
+	BUG_ON(ret);
 
 	pr_info("isa pool size: %d pages\n", ISA_POOL_SIZE);
 	return 0;
@@ -166,13 +166,13 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool)
 
 static void bounce_end_io_write(struct bio *bio)
 {
-	bounce_end_io(bio, page_pool);
+	bounce_end_io(bio, &page_pool);
 }
 
 static void bounce_end_io_write_isa(struct bio *bio)
 {
 
-	bounce_end_io(bio, isa_page_pool);
+	bounce_end_io(bio, &isa_page_pool);
 }
 
 static void __bounce_end_io_read(struct bio *bio, mempool_t *pool)
@@ -187,12 +187,12 @@ static void __bounce_end_io_read(struct bio *bio, mempool_t *pool)
 
 static void bounce_end_io_read(struct bio *bio)
 {
-	__bounce_end_io_read(bio, page_pool);
+	__bounce_end_io_read(bio, &page_pool);
 }
 
 static void bounce_end_io_read_isa(struct bio *bio)
 {
-	__bounce_end_io_read(bio, isa_page_pool);
+	__bounce_end_io_read(bio, &isa_page_pool);
 }
 
 static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
@@ -217,13 +217,13 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
 		return;
 
 	if (!passthrough && sectors < bio_sectors(*bio_orig)) {
-		bio = bio_split(*bio_orig, sectors, GFP_NOIO, bounce_bio_split);
+		bio = bio_split(*bio_orig, sectors, GFP_NOIO, &bounce_bio_split);
 		bio_chain(bio, *bio_orig);
 		generic_make_request(*bio_orig);
 		*bio_orig = bio;
 	}
 	bio = bio_clone_bioset(*bio_orig, GFP_NOIO, passthrough ? NULL :
-			bounce_bio_set);
+			&bounce_bio_set);
 
 	bio_for_each_segment_all(to, bio, i) {
 		struct page *page = to->bv_page;
@@ -250,7 +250,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
 
 	bio->bi_flags |= (1 << BIO_BOUNCED);
 
-	if (pool == page_pool) {
+	if (pool == &page_pool) {
 		bio->bi_end_io = bounce_end_io_write;
 		if (rw == READ)
 			bio->bi_end_io = bounce_end_io_read;
@@ -282,10 +282,10 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
 	if (!(q->bounce_gfp & GFP_DMA)) {
 		if (q->limits.bounce_pfn >= blk_max_pfn)
 			return;
-		pool = page_pool;
+		pool = &page_pool;
 	} else {
-		BUG_ON(!isa_page_pool);
-		pool = isa_page_pool;
+		BUG_ON(!mempool_initialized(&isa_page_pool));
+		pool = &isa_page_pool;
 	}
 
 	/*
diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index fc2e5ff..9419def 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -303,11 +303,9 @@ static void bsg_exit_rq(struct request_queue *q, struct request *req)
  * @name: device to give bsg device
  * @job_fn: bsg job handler
  * @dd_job_size: size of LLD data needed for each job
- * @release: @dev release function
  */
 struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
-		bsg_job_fn *job_fn, int dd_job_size,
-		void (*release)(struct device *))
+		bsg_job_fn *job_fn, int dd_job_size)
 {
 	struct request_queue *q;
 	int ret;
@@ -331,7 +329,7 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
 	blk_queue_softirq_done(q, bsg_softirq_done);
 	blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT);
 
-	ret = bsg_register_queue(q, dev, name, &bsg_transport_ops, release);
+	ret = bsg_register_queue(q, dev, name, &bsg_transport_ops);
 	if (ret) {
 		printk(KERN_ERR "%s: bsg interface failed to "
 		       "initialize - register queue\n", dev->kobj.name);
diff --git a/block/bsg.c b/block/bsg.c
index defa06c..132e657 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -226,8 +226,7 @@ bsg_map_hdr(struct request_queue *q, struct sg_io_v4 *hdr, fmode_t mode)
 		return ERR_PTR(ret);
 
 	rq = blk_get_request(q, hdr->dout_xfer_len ?
-			REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN,
-			GFP_KERNEL);
+			REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, 0);
 	if (IS_ERR(rq))
 		return rq;
 
@@ -249,7 +248,7 @@ bsg_map_hdr(struct request_queue *q, struct sg_io_v4 *hdr, fmode_t mode)
 			goto out;
 		}
 
-		next_rq = blk_get_request(q, REQ_OP_SCSI_IN, GFP_KERNEL);
+		next_rq = blk_get_request(q, REQ_OP_SCSI_IN, 0);
 		if (IS_ERR(next_rq)) {
 			ret = PTR_ERR(next_rq);
 			goto out;
@@ -650,18 +649,6 @@ static struct bsg_device *bsg_alloc_device(void)
 	return bd;
 }
 
-static void bsg_kref_release_function(struct kref *kref)
-{
-	struct bsg_class_device *bcd =
-		container_of(kref, struct bsg_class_device, ref);
-	struct device *parent = bcd->parent;
-
-	if (bcd->release)
-		bcd->release(bcd->parent);
-
-	put_device(parent);
-}
-
 static int bsg_put_device(struct bsg_device *bd)
 {
 	int ret = 0, do_free;
@@ -694,7 +681,6 @@ static int bsg_put_device(struct bsg_device *bd)
 
 	kfree(bd);
 out:
-	kref_put(&q->bsg_dev.ref, bsg_kref_release_function);
 	if (do_free)
 		blk_put_queue(q);
 	return ret;
@@ -760,8 +746,6 @@ static struct bsg_device *bsg_get_device(struct inode *inode, struct file *file)
 	 */
 	mutex_lock(&bsg_mutex);
 	bcd = idr_find(&bsg_minor_idr, iminor(inode));
-	if (bcd)
-		kref_get(&bcd->ref);
 	mutex_unlock(&bsg_mutex);
 
 	if (!bcd)
@@ -772,8 +756,6 @@ static struct bsg_device *bsg_get_device(struct inode *inode, struct file *file)
 		return bd;
 
 	bd = bsg_add_device(inode, bcd->queue, file);
-	if (IS_ERR(bd))
-		kref_put(&bcd->ref, bsg_kref_release_function);
 
 	return bd;
 }
@@ -913,25 +895,17 @@ void bsg_unregister_queue(struct request_queue *q)
 		sysfs_remove_link(&q->kobj, "bsg");
 	device_unregister(bcd->class_dev);
 	bcd->class_dev = NULL;
-	kref_put(&bcd->ref, bsg_kref_release_function);
 	mutex_unlock(&bsg_mutex);
 }
 EXPORT_SYMBOL_GPL(bsg_unregister_queue);
 
 int bsg_register_queue(struct request_queue *q, struct device *parent,
-		const char *name, const struct bsg_ops *ops,
-		void (*release)(struct device *))
+		const char *name, const struct bsg_ops *ops)
 {
 	struct bsg_class_device *bcd;
 	dev_t dev;
 	int ret;
 	struct device *class_dev = NULL;
-	const char *devname;
-
-	if (name)
-		devname = name;
-	else
-		devname = dev_name(parent);
 
 	/*
 	 * we need a proper transport to send commands, not a stacked device
@@ -955,15 +929,12 @@ int bsg_register_queue(struct request_queue *q, struct device *parent,
 
 	bcd->minor = ret;
 	bcd->queue = q;
-	bcd->parent = get_device(parent);
-	bcd->release = release;
 	bcd->ops = ops;
-	kref_init(&bcd->ref);
 	dev = MKDEV(bsg_major, bcd->minor);
-	class_dev = device_create(bsg_class, parent, dev, NULL, "%s", devname);
+	class_dev = device_create(bsg_class, parent, dev, NULL, "%s", name);
 	if (IS_ERR(class_dev)) {
 		ret = PTR_ERR(class_dev);
-		goto put_dev;
+		goto idr_remove;
 	}
 	bcd->class_dev = class_dev;
 
@@ -978,8 +949,7 @@ int bsg_register_queue(struct request_queue *q, struct device *parent,
 
 unregister_class_dev:
 	device_unregister(class_dev);
-put_dev:
-	put_device(parent);
+idr_remove:
 	idr_remove(&bsg_minor_idr, bcd->minor);
 unlock:
 	mutex_unlock(&bsg_mutex);
@@ -993,7 +963,7 @@ int bsg_scsi_register_queue(struct request_queue *q, struct device *parent)
 		return -EINVAL;
 	}
 
-	return bsg_register_queue(q, parent, NULL, &bsg_scsi_ops, NULL);
+	return bsg_register_queue(q, parent, dev_name(parent), &bsg_scsi_ops);
 }
 EXPORT_SYMBOL_GPL(bsg_scsi_register_queue);
 
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 9f342ef..82b6c27 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -210,9 +210,9 @@ struct cfqg_stats {
 	/* total time with empty current active q with other requests queued */
 	struct blkg_stat		empty_time;
 	/* fields after this shouldn't be cleared on stat reset */
-	uint64_t			start_group_wait_time;
-	uint64_t			start_idle_time;
-	uint64_t			start_empty_time;
+	u64				start_group_wait_time;
+	u64				start_idle_time;
+	u64				start_empty_time;
 	uint16_t			flags;
 #endif	/* CONFIG_DEBUG_BLK_CGROUP */
 #endif	/* CONFIG_CFQ_GROUP_IOSCHED */
@@ -491,13 +491,13 @@ CFQG_FLAG_FNS(empty)
 /* This should be called with the queue_lock held. */
 static void cfqg_stats_update_group_wait_time(struct cfqg_stats *stats)
 {
-	unsigned long long now;
+	u64 now;
 
 	if (!cfqg_stats_waiting(stats))
 		return;
 
-	now = sched_clock();
-	if (time_after64(now, stats->start_group_wait_time))
+	now = ktime_get_ns();
+	if (now > stats->start_group_wait_time)
 		blkg_stat_add(&stats->group_wait_time,
 			      now - stats->start_group_wait_time);
 	cfqg_stats_clear_waiting(stats);
@@ -513,20 +513,20 @@ static void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg,
 		return;
 	if (cfqg == curr_cfqg)
 		return;
-	stats->start_group_wait_time = sched_clock();
+	stats->start_group_wait_time = ktime_get_ns();
 	cfqg_stats_mark_waiting(stats);
 }
 
 /* This should be called with the queue_lock held. */
 static void cfqg_stats_end_empty_time(struct cfqg_stats *stats)
 {
-	unsigned long long now;
+	u64 now;
 
 	if (!cfqg_stats_empty(stats))
 		return;
 
-	now = sched_clock();
-	if (time_after64(now, stats->start_empty_time))
+	now = ktime_get_ns();
+	if (now > stats->start_empty_time)
 		blkg_stat_add(&stats->empty_time,
 			      now - stats->start_empty_time);
 	cfqg_stats_clear_empty(stats);
@@ -552,7 +552,7 @@ static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg)
 	if (cfqg_stats_empty(stats))
 		return;
 
-	stats->start_empty_time = sched_clock();
+	stats->start_empty_time = ktime_get_ns();
 	cfqg_stats_mark_empty(stats);
 }
 
@@ -561,9 +561,9 @@ static void cfqg_stats_update_idle_time(struct cfq_group *cfqg)
 	struct cfqg_stats *stats = &cfqg->stats;
 
 	if (cfqg_stats_idling(stats)) {
-		unsigned long long now = sched_clock();
+		u64 now = ktime_get_ns();
 
-		if (time_after64(now, stats->start_idle_time))
+		if (now > stats->start_idle_time)
 			blkg_stat_add(&stats->idle_time,
 				      now - stats->start_idle_time);
 		cfqg_stats_clear_idling(stats);
@@ -576,7 +576,7 @@ static void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg)
 
 	BUG_ON(cfqg_stats_idling(stats));
 
-	stats->start_idle_time = sched_clock();
+	stats->start_idle_time = ktime_get_ns();
 	cfqg_stats_mark_idling(stats);
 }
 
@@ -701,17 +701,19 @@ static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg,
 }
 
 static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
-			uint64_t start_time, uint64_t io_start_time,
-			unsigned int op)
+						u64 start_time_ns,
+						u64 io_start_time_ns,
+						unsigned int op)
 {
 	struct cfqg_stats *stats = &cfqg->stats;
-	unsigned long long now = sched_clock();
+	u64 now = ktime_get_ns();
 
-	if (time_after64(now, io_start_time))
-		blkg_rwstat_add(&stats->service_time, op, now - io_start_time);
-	if (time_after64(io_start_time, start_time))
+	if (now > io_start_time_ns)
+		blkg_rwstat_add(&stats->service_time, op,
+				now - io_start_time_ns);
+	if (io_start_time_ns > start_time_ns)
 		blkg_rwstat_add(&stats->wait_time, op,
-				io_start_time - start_time);
+				io_start_time_ns - start_time_ns);
 }
 
 /* @stats = 0 */
@@ -797,8 +799,9 @@ static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg,
 static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg,
 			unsigned int op) { }
 static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
-			uint64_t start_time, uint64_t io_start_time,
-			unsigned int op) { }
+						u64 start_time_ns,
+						u64 io_start_time_ns,
+						unsigned int op) { }
 
 #endif	/* CONFIG_CFQ_GROUP_IOSCHED */
 
@@ -4225,8 +4228,8 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
 	cfqd->rq_in_driver--;
 	cfqq->dispatched--;
 	(RQ_CFQG(rq))->dispatched--;
-	cfqg_stats_update_completion(cfqq->cfqg, rq_start_time_ns(rq),
-				     rq_io_start_time_ns(rq), rq->cmd_flags);
+	cfqg_stats_update_completion(cfqq->cfqg, rq->start_time_ns,
+				     rq->io_start_time_ns, rq->cmd_flags);
 
 	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
 
@@ -4242,16 +4245,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
 					cfqq_type(cfqq));
 
 		st->ttime.last_end_request = now;
-		/*
-		 * We have to do this check in jiffies since start_time is in
-		 * jiffies and it is not trivial to convert to ns. If
-		 * cfq_fifo_expire[1] ever comes close to 1 jiffie, this test
-		 * will become problematic but so far we are fine (the default
-		 * is 128 ms).
-		 */
-		if (!time_after(rq->start_time +
-				  nsecs_to_jiffies(cfqd->cfq_fifo_expire[1]),
-				jiffies))
+		if (rq->start_time_ns + cfqd->cfq_fifo_expire[1] <= now)
 			cfqd->last_delayed_sync = now;
 	}
 
@@ -4792,7 +4786,7 @@ USEC_STORE_FUNCTION(cfq_target_latency_us_store, &cfqd->cfq_target_latency, 1, U
 #undef USEC_STORE_FUNCTION
 
 #define CFQ_ATTR(name) \
-	__ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store)
+	__ATTR(name, 0644, cfq_##name##_show, cfq_##name##_store)
 
 static struct elv_fs_entry cfq_attrs[] = {
 	CFQ_ATTR(quantum),
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index 9de9f15..ef2f1f0 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -512,8 +512,7 @@ STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0);
 #undef STORE_FUNCTION
 
 #define DD_ATTR(name) \
-	__ATTR(name, S_IRUGO|S_IWUSR, deadline_##name##_show, \
-				      deadline_##name##_store)
+	__ATTR(name, 0644, deadline_##name##_show, deadline_##name##_store)
 
 static struct elv_fs_entry deadline_attrs[] = {
 	DD_ATTR(read_expire),
diff --git a/block/elevator.c b/block/elevator.c
index e87e9b4..fa828b5 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -199,76 +199,46 @@ static void elevator_release(struct kobject *kobj)
 	kfree(e);
 }
 
-int elevator_init(struct request_queue *q, char *name)
+/*
+ * Use the default elevator specified by config boot param for non-mq devices,
+ * or by config option.  Don't try to load modules as we could be running off
+ * async and request_module() isn't allowed from async.
+ */
+int elevator_init(struct request_queue *q)
 {
 	struct elevator_type *e = NULL;
-	int err;
+	int err = 0;
 
 	/*
 	 * q->sysfs_lock must be held to provide mutual exclusion between
 	 * elevator_switch() and here.
 	 */
-	lockdep_assert_held(&q->sysfs_lock);
-
+	mutex_lock(&q->sysfs_lock);
 	if (unlikely(q->elevator))
-		return 0;
-
-	INIT_LIST_HEAD(&q->queue_head);
-	q->last_merge = NULL;
-	q->end_sector = 0;
-	q->boundary_rq = NULL;
-
-	if (name) {
-		e = elevator_get(q, name, true);
-		if (!e)
-			return -EINVAL;
-	}
+		goto out_unlock;
 
-	/*
-	 * Use the default elevator specified by config boot param for
-	 * non-mq devices, or by config option. Don't try to load modules
-	 * as we could be running off async and request_module() isn't
-	 * allowed from async.
-	 */
-	if (!e && !q->mq_ops && *chosen_elevator) {
+	if (*chosen_elevator) {
 		e = elevator_get(q, chosen_elevator, false);
 		if (!e)
 			printk(KERN_ERR "I/O scheduler %s not found\n",
 							chosen_elevator);
 	}
 
+	if (!e)
+		e = elevator_get(q, CONFIG_DEFAULT_IOSCHED, false);
 	if (!e) {
-		/*
-		 * For blk-mq devices, we default to using mq-deadline,
-		 * if available, for single queue devices. If deadline
-		 * isn't available OR we have multiple queues, default
-		 * to "none".
-		 */
-		if (q->mq_ops) {
-			if (q->nr_hw_queues == 1)
-				e = elevator_get(q, "mq-deadline", false);
-			if (!e)
-				return 0;
-		} else
-			e = elevator_get(q, CONFIG_DEFAULT_IOSCHED, false);
-
-		if (!e) {
-			printk(KERN_ERR
-				"Default I/O scheduler not found. " \
-				"Using noop.\n");
-			e = elevator_get(q, "noop", false);
-		}
+		printk(KERN_ERR
+			"Default I/O scheduler not found. Using noop.\n");
+		e = elevator_get(q, "noop", false);
 	}
 
-	if (e->uses_mq)
-		err = blk_mq_init_sched(q, e);
-	else
-		err = e->ops.sq.elevator_init_fn(q, e);
+	err = e->ops.sq.elevator_init_fn(q, e);
 	if (err)
 		elevator_put(e);
+out_unlock:
+	mutex_unlock(&q->sysfs_lock);
 	return err;
 }
-EXPORT_SYMBOL(elevator_init);
 
 void elevator_exit(struct request_queue *q, struct elevator_queue *e)
 {
@@ -281,7 +251,6 @@ void elevator_exit(struct request_queue *q, struct elevator_queue *e)
 
 	kobject_put(&e->kobj);
 }
-EXPORT_SYMBOL(elevator_exit);
 
 static inline void __elv_rqhash_del(struct request *rq)
 {
@@ -1005,6 +974,40 @@ out:
 }
 
 /*
+ * For blk-mq devices, we default to using mq-deadline, if available, for single
+ * queue devices.  If deadline isn't available OR we have multiple queues,
+ * default to "none".
+ */
+int elevator_init_mq(struct request_queue *q)
+{
+	struct elevator_type *e;
+	int err = 0;
+
+	if (q->nr_hw_queues != 1)
+		return 0;
+
+	/*
+	 * q->sysfs_lock must be held to provide mutual exclusion between
+	 * elevator_switch() and here.
+	 */
+	mutex_lock(&q->sysfs_lock);
+	if (unlikely(q->elevator))
+		goto out_unlock;
+
+	e = elevator_get(q, "mq-deadline", false);
+	if (!e)
+		goto out_unlock;
+
+	err = blk_mq_init_sched(q, e);
+	if (err)
+		elevator_put(e);
+out_unlock:
+	mutex_unlock(&q->sysfs_lock);
+	return err;
+}
+
+
+/*
  * switch to new_e io scheduler. be careful not to introduce deadlocks -
  * we don't free the old io scheduler, before we have allocated what we
  * need for the new one. this way we have a chance of going back to the old
diff --git a/block/genhd.c b/block/genhd.c
index c4513fe..f1543a4 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1027,18 +1027,6 @@ static const struct seq_operations partitions_op = {
 	.stop	= disk_seqf_stop,
 	.show	= show_partition
 };
-
-static int partitions_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &partitions_op);
-}
-
-static const struct file_operations proc_partitions_operations = {
-	.open		= partitions_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
 #endif
 
 
@@ -1139,28 +1127,25 @@ static ssize_t disk_discard_alignment_show(struct device *dev,
 	return sprintf(buf, "%d\n", queue_discard_alignment(disk->queue));
 }
 
-static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL);
-static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL);
-static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL);
-static DEVICE_ATTR(hidden, S_IRUGO, disk_hidden_show, NULL);
-static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL);
-static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
-static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL);
-static DEVICE_ATTR(discard_alignment, S_IRUGO, disk_discard_alignment_show,
-		   NULL);
-static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL);
-static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
-static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
-static DEVICE_ATTR(badblocks, S_IRUGO | S_IWUSR, disk_badblocks_show,
-		disk_badblocks_store);
+static DEVICE_ATTR(range, 0444, disk_range_show, NULL);
+static DEVICE_ATTR(ext_range, 0444, disk_ext_range_show, NULL);
+static DEVICE_ATTR(removable, 0444, disk_removable_show, NULL);
+static DEVICE_ATTR(hidden, 0444, disk_hidden_show, NULL);
+static DEVICE_ATTR(ro, 0444, disk_ro_show, NULL);
+static DEVICE_ATTR(size, 0444, part_size_show, NULL);
+static DEVICE_ATTR(alignment_offset, 0444, disk_alignment_offset_show, NULL);
+static DEVICE_ATTR(discard_alignment, 0444, disk_discard_alignment_show, NULL);
+static DEVICE_ATTR(capability, 0444, disk_capability_show, NULL);
+static DEVICE_ATTR(stat, 0444, part_stat_show, NULL);
+static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL);
+static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store);
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 static struct device_attribute dev_attr_fail =
-	__ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
+	__ATTR(make-it-fail, 0644, part_fail_show, part_fail_store);
 #endif
 #ifdef CONFIG_FAIL_IO_TIMEOUT
 static struct device_attribute dev_attr_fail_timeout =
-	__ATTR(io-timeout-fail,  S_IRUGO|S_IWUSR, part_timeout_show,
-		part_timeout_store);
+	__ATTR(io-timeout-fail, 0644, part_timeout_show, part_timeout_store);
 #endif
 
 static struct attribute *disk_attrs[] = {
@@ -1377,22 +1362,10 @@ static const struct seq_operations diskstats_op = {
 	.show	= diskstats_show
 };
 
-static int diskstats_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &diskstats_op);
-}
-
-static const struct file_operations proc_diskstats_operations = {
-	.open		= diskstats_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
 static int __init proc_genhd_init(void)
 {
-	proc_create("diskstats", 0, NULL, &proc_diskstats_operations);
-	proc_create("partitions", 0, NULL, &proc_partitions_operations);
+	proc_create_seq("diskstats", 0, NULL, &diskstats_op);
+	proc_create_seq("partitions", 0, NULL, &partitions_op);
 	return 0;
 }
 module_init(proc_genhd_init);
@@ -1924,9 +1897,9 @@ static ssize_t disk_events_poll_msecs_store(struct device *dev,
 	return count;
 }
 
-static const DEVICE_ATTR(events, S_IRUGO, disk_events_show, NULL);
-static const DEVICE_ATTR(events_async, S_IRUGO, disk_events_async_show, NULL);
-static const DEVICE_ATTR(events_poll_msecs, S_IRUGO|S_IWUSR,
+static const DEVICE_ATTR(events, 0444, disk_events_show, NULL);
+static const DEVICE_ATTR(events_async, 0444, disk_events_async_show, NULL);
+static const DEVICE_ATTR(events_poll_msecs, 0644,
 			 disk_events_poll_msecs_show,
 			 disk_events_poll_msecs_store);
 
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index 0d6d25e3..a1660ba 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -72,6 +72,19 @@ static const unsigned int kyber_batch_size[] = {
 	[KYBER_OTHER] = 8,
 };
 
+/*
+ * There is a same mapping between ctx & hctx and kcq & khd,
+ * we use request->mq_ctx->index_hw to index the kcq in khd.
+ */
+struct kyber_ctx_queue {
+	/*
+	 * Used to ensure operations on rq_list and kcq_map to be an atmoic one.
+	 * Also protect the rqs on rq_list when merge.
+	 */
+	spinlock_t lock;
+	struct list_head rq_list[KYBER_NUM_DOMAINS];
+} ____cacheline_aligned_in_smp;
+
 struct kyber_queue_data {
 	struct request_queue *q;
 
@@ -99,6 +112,8 @@ struct kyber_hctx_data {
 	struct list_head rqs[KYBER_NUM_DOMAINS];
 	unsigned int cur_domain;
 	unsigned int batching;
+	struct kyber_ctx_queue *kcqs;
+	struct sbitmap kcq_map[KYBER_NUM_DOMAINS];
 	wait_queue_entry_t domain_wait[KYBER_NUM_DOMAINS];
 	struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS];
 	atomic_t wait_index[KYBER_NUM_DOMAINS];
@@ -107,10 +122,8 @@ struct kyber_hctx_data {
 static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
 			     void *key);
 
-static int rq_sched_domain(const struct request *rq)
+static unsigned int kyber_sched_domain(unsigned int op)
 {
-	unsigned int op = rq->cmd_flags;
-
 	if ((op & REQ_OP_MASK) == REQ_OP_READ)
 		return KYBER_READ;
 	else if ((op & REQ_OP_MASK) == REQ_OP_WRITE && op_is_sync(op))
@@ -284,6 +297,11 @@ static unsigned int kyber_sched_tags_shift(struct kyber_queue_data *kqd)
 	return kqd->q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift;
 }
 
+static int kyber_bucket_fn(const struct request *rq)
+{
+	return kyber_sched_domain(rq->cmd_flags);
+}
+
 static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
 {
 	struct kyber_queue_data *kqd;
@@ -297,7 +315,7 @@ static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
 		goto err;
 	kqd->q = q;
 
-	kqd->cb = blk_stat_alloc_callback(kyber_stat_timer_fn, rq_sched_domain,
+	kqd->cb = blk_stat_alloc_callback(kyber_stat_timer_fn, kyber_bucket_fn,
 					  KYBER_NUM_DOMAINS, kqd);
 	if (!kqd->cb)
 		goto err_kqd;
@@ -376,8 +394,18 @@ static void kyber_exit_sched(struct elevator_queue *e)
 	kfree(kqd);
 }
 
+static void kyber_ctx_queue_init(struct kyber_ctx_queue *kcq)
+{
+	unsigned int i;
+
+	spin_lock_init(&kcq->lock);
+	for (i = 0; i < KYBER_NUM_DOMAINS; i++)
+		INIT_LIST_HEAD(&kcq->rq_list[i]);
+}
+
 static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
 {
+	struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
 	struct kyber_hctx_data *khd;
 	int i;
 
@@ -385,6 +413,24 @@ static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
 	if (!khd)
 		return -ENOMEM;
 
+	khd->kcqs = kmalloc_array_node(hctx->nr_ctx,
+				       sizeof(struct kyber_ctx_queue),
+				       GFP_KERNEL, hctx->numa_node);
+	if (!khd->kcqs)
+		goto err_khd;
+
+	for (i = 0; i < hctx->nr_ctx; i++)
+		kyber_ctx_queue_init(&khd->kcqs[i]);
+
+	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
+		if (sbitmap_init_node(&khd->kcq_map[i], hctx->nr_ctx,
+				      ilog2(8), GFP_KERNEL, hctx->numa_node)) {
+			while (--i >= 0)
+				sbitmap_free(&khd->kcq_map[i]);
+			goto err_kcqs;
+		}
+	}
+
 	spin_lock_init(&khd->lock);
 
 	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
@@ -400,12 +446,26 @@ static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
 	khd->batching = 0;
 
 	hctx->sched_data = khd;
+	sbitmap_queue_min_shallow_depth(&hctx->sched_tags->bitmap_tags,
+					kqd->async_depth);
 
 	return 0;
+
+err_kcqs:
+	kfree(khd->kcqs);
+err_khd:
+	kfree(khd);
+	return -ENOMEM;
 }
 
 static void kyber_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
 {
+	struct kyber_hctx_data *khd = hctx->sched_data;
+	int i;
+
+	for (i = 0; i < KYBER_NUM_DOMAINS; i++)
+		sbitmap_free(&khd->kcq_map[i]);
+	kfree(khd->kcqs);
 	kfree(hctx->sched_data);
 }
 
@@ -427,7 +487,7 @@ static void rq_clear_domain_token(struct kyber_queue_data *kqd,
 
 	nr = rq_get_domain_token(rq);
 	if (nr != -1) {
-		sched_domain = rq_sched_domain(rq);
+		sched_domain = kyber_sched_domain(rq->cmd_flags);
 		sbitmap_queue_clear(&kqd->domain_tokens[sched_domain], nr,
 				    rq->mq_ctx->cpu);
 	}
@@ -446,11 +506,51 @@ static void kyber_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
 	}
 }
 
+static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
+{
+	struct kyber_hctx_data *khd = hctx->sched_data;
+	struct blk_mq_ctx *ctx = blk_mq_get_ctx(hctx->queue);
+	struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw];
+	unsigned int sched_domain = kyber_sched_domain(bio->bi_opf);
+	struct list_head *rq_list = &kcq->rq_list[sched_domain];
+	bool merged;
+
+	spin_lock(&kcq->lock);
+	merged = blk_mq_bio_list_merge(hctx->queue, rq_list, bio);
+	spin_unlock(&kcq->lock);
+	blk_mq_put_ctx(ctx);
+
+	return merged;
+}
+
 static void kyber_prepare_request(struct request *rq, struct bio *bio)
 {
 	rq_set_domain_token(rq, -1);
 }
 
+static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx,
+				  struct list_head *rq_list, bool at_head)
+{
+	struct kyber_hctx_data *khd = hctx->sched_data;
+	struct request *rq, *next;
+
+	list_for_each_entry_safe(rq, next, rq_list, queuelist) {
+		unsigned int sched_domain = kyber_sched_domain(rq->cmd_flags);
+		struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw];
+		struct list_head *head = &kcq->rq_list[sched_domain];
+
+		spin_lock(&kcq->lock);
+		if (at_head)
+			list_move(&rq->queuelist, head);
+		else
+			list_move_tail(&rq->queuelist, head);
+		sbitmap_set_bit(&khd->kcq_map[sched_domain],
+				rq->mq_ctx->index_hw);
+		blk_mq_sched_request_inserted(rq);
+		spin_unlock(&kcq->lock);
+	}
+}
+
 static void kyber_finish_request(struct request *rq)
 {
 	struct kyber_queue_data *kqd = rq->q->elevator->elevator_data;
@@ -469,7 +569,7 @@ static void kyber_completed_request(struct request *rq)
 	 * Check if this request met our latency goal. If not, quickly gather
 	 * some statistics and start throttling.
 	 */
-	sched_domain = rq_sched_domain(rq);
+	sched_domain = kyber_sched_domain(rq->cmd_flags);
 	switch (sched_domain) {
 	case KYBER_READ:
 		target = kqd->read_lat_nsec;
@@ -485,29 +585,48 @@ static void kyber_completed_request(struct request *rq)
 	if (blk_stat_is_active(kqd->cb))
 		return;
 
-	now = __blk_stat_time(ktime_to_ns(ktime_get()));
-	if (now < blk_stat_time(&rq->issue_stat))
+	now = ktime_get_ns();
+	if (now < rq->io_start_time_ns)
 		return;
 
-	latency = now - blk_stat_time(&rq->issue_stat);
+	latency = now - rq->io_start_time_ns;
 
 	if (latency > target)
 		blk_stat_activate_msecs(kqd->cb, 10);
 }
 
-static void kyber_flush_busy_ctxs(struct kyber_hctx_data *khd,
-				  struct blk_mq_hw_ctx *hctx)
+struct flush_kcq_data {
+	struct kyber_hctx_data *khd;
+	unsigned int sched_domain;
+	struct list_head *list;
+};
+
+static bool flush_busy_kcq(struct sbitmap *sb, unsigned int bitnr, void *data)
 {
-	LIST_HEAD(rq_list);
-	struct request *rq, *next;
+	struct flush_kcq_data *flush_data = data;
+	struct kyber_ctx_queue *kcq = &flush_data->khd->kcqs[bitnr];
 
-	blk_mq_flush_busy_ctxs(hctx, &rq_list);
-	list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
-		unsigned int sched_domain;
+	spin_lock(&kcq->lock);
+	list_splice_tail_init(&kcq->rq_list[flush_data->sched_domain],
+			      flush_data->list);
+	sbitmap_clear_bit(sb, bitnr);
+	spin_unlock(&kcq->lock);
 
-		sched_domain = rq_sched_domain(rq);
-		list_move_tail(&rq->queuelist, &khd->rqs[sched_domain]);
-	}
+	return true;
+}
+
+static void kyber_flush_busy_kcqs(struct kyber_hctx_data *khd,
+				  unsigned int sched_domain,
+				  struct list_head *list)
+{
+	struct flush_kcq_data data = {
+		.khd = khd,
+		.sched_domain = sched_domain,
+		.list = list,
+	};
+
+	sbitmap_for_each_set(&khd->kcq_map[sched_domain],
+			     flush_busy_kcq, &data);
 }
 
 static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
@@ -570,26 +689,23 @@ static int kyber_get_domain_token(struct kyber_queue_data *kqd,
 static struct request *
 kyber_dispatch_cur_domain(struct kyber_queue_data *kqd,
 			  struct kyber_hctx_data *khd,
-			  struct blk_mq_hw_ctx *hctx,
-			  bool *flushed)
+			  struct blk_mq_hw_ctx *hctx)
 {
 	struct list_head *rqs;
 	struct request *rq;
 	int nr;
 
 	rqs = &khd->rqs[khd->cur_domain];
-	rq = list_first_entry_or_null(rqs, struct request, queuelist);
 
 	/*
-	 * If there wasn't already a pending request and we haven't flushed the
-	 * software queues yet, flush the software queues and check again.
+	 * If we already have a flushed request, then we just need to get a
+	 * token for it. Otherwise, if there are pending requests in the kcqs,
+	 * flush the kcqs, but only if we can get a token. If not, we should
+	 * leave the requests in the kcqs so that they can be merged. Note that
+	 * khd->lock serializes the flushes, so if we observed any bit set in
+	 * the kcq_map, we will always get a request.
 	 */
-	if (!rq && !*flushed) {
-		kyber_flush_busy_ctxs(khd, hctx);
-		*flushed = true;
-		rq = list_first_entry_or_null(rqs, struct request, queuelist);
-	}
-
+	rq = list_first_entry_or_null(rqs, struct request, queuelist);
 	if (rq) {
 		nr = kyber_get_domain_token(kqd, khd, hctx);
 		if (nr >= 0) {
@@ -598,6 +714,16 @@ kyber_dispatch_cur_domain(struct kyber_queue_data *kqd,
 			list_del_init(&rq->queuelist);
 			return rq;
 		}
+	} else if (sbitmap_any_bit_set(&khd->kcq_map[khd->cur_domain])) {
+		nr = kyber_get_domain_token(kqd, khd, hctx);
+		if (nr >= 0) {
+			kyber_flush_busy_kcqs(khd, khd->cur_domain, rqs);
+			rq = list_first_entry(rqs, struct request, queuelist);
+			khd->batching++;
+			rq_set_domain_token(rq, nr);
+			list_del_init(&rq->queuelist);
+			return rq;
+		}
 	}
 
 	/* There were either no pending requests or no tokens. */
@@ -608,7 +734,6 @@ static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
 {
 	struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
 	struct kyber_hctx_data *khd = hctx->sched_data;
-	bool flushed = false;
 	struct request *rq;
 	int i;
 
@@ -619,7 +744,7 @@ static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
 	 * from the batch.
 	 */
 	if (khd->batching < kyber_batch_size[khd->cur_domain]) {
-		rq = kyber_dispatch_cur_domain(kqd, khd, hctx, &flushed);
+		rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
 		if (rq)
 			goto out;
 	}
@@ -640,7 +765,7 @@ static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
 		else
 			khd->cur_domain++;
 
-		rq = kyber_dispatch_cur_domain(kqd, khd, hctx, &flushed);
+		rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
 		if (rq)
 			goto out;
 	}
@@ -657,10 +782,12 @@ static bool kyber_has_work(struct blk_mq_hw_ctx *hctx)
 	int i;
 
 	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
-		if (!list_empty_careful(&khd->rqs[i]))
+		if (!list_empty_careful(&khd->rqs[i]) ||
+		    sbitmap_any_bit_set(&khd->kcq_map[i]))
 			return true;
 	}
-	return sbitmap_any_bit_set(&hctx->ctx_map);
+
+	return false;
 }
 
 #define KYBER_LAT_SHOW_STORE(op)					\
@@ -831,7 +958,9 @@ static struct elevator_type kyber_sched = {
 		.init_hctx = kyber_init_hctx,
 		.exit_hctx = kyber_exit_hctx,
 		.limit_depth = kyber_limit_depth,
+		.bio_merge = kyber_bio_merge,
 		.prepare_request = kyber_prepare_request,
+		.insert_requests = kyber_insert_requests,
 		.finish_request = kyber_finish_request,
 		.requeue_request = kyber_finish_request,
 		.completed_request = kyber_completed_request,
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 8ec0ba9..099a9e05 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -630,8 +630,7 @@ STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0);
 #undef STORE_FUNCTION
 
 #define DD_ATTR(name) \
-	__ATTR(name, S_IRUGO|S_IWUSR, deadline_##name##_show, \
-				      deadline_##name##_store)
+	__ATTR(name, 0644, deadline_##name##_show, deadline_##name##_store)
 
 static struct elv_fs_entry deadline_attrs[] = {
 	DD_ATTR(read_expire),
diff --git a/block/partition-generic.c b/block/partition-generic.c
index db57cce..3dcfd4e 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -179,18 +179,17 @@ ssize_t part_fail_store(struct device *dev,
 }
 #endif
 
-static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL);
-static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
-static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
-static DEVICE_ATTR(ro, S_IRUGO, part_ro_show, NULL);
-static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL);
-static DEVICE_ATTR(discard_alignment, S_IRUGO, part_discard_alignment_show,
-		   NULL);
-static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
-static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
+static DEVICE_ATTR(partition, 0444, part_partition_show, NULL);
+static DEVICE_ATTR(start, 0444, part_start_show, NULL);
+static DEVICE_ATTR(size, 0444, part_size_show, NULL);
+static DEVICE_ATTR(ro, 0444, part_ro_show, NULL);
+static DEVICE_ATTR(alignment_offset, 0444, part_alignment_offset_show, NULL);
+static DEVICE_ATTR(discard_alignment, 0444, part_discard_alignment_show, NULL);
+static DEVICE_ATTR(stat, 0444, part_stat_show, NULL);
+static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL);
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 static struct device_attribute dev_attr_fail =
-	__ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
+	__ATTR(make-it-fail, 0644, part_fail_show, part_fail_store);
 #endif
 
 static struct attribute *part_attrs[] = {
@@ -291,8 +290,7 @@ static ssize_t whole_disk_show(struct device *dev,
 {
 	return 0;
 }
-static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH,
-		   whole_disk_show, NULL);
+static DEVICE_ATTR(whole_disk, 0444, whole_disk_show, NULL);
 
 /*
  * Must be called either with bd_mutex held, before a disk can be opened or
@@ -518,7 +516,7 @@ rescan:
 
 	if (disk->fops->revalidate_disk)
 		disk->fops->revalidate_disk(disk);
-	check_disk_size_change(disk, bdev);
+	check_disk_size_change(disk, bdev, true);
 	bdev->bd_invalidated = 0;
 	if (!get_capacity(disk) || !(state = check_partition(disk, bdev)))
 		return 0;
@@ -643,7 +641,7 @@ int invalidate_partitions(struct gendisk *disk, struct block_device *bdev)
 		return res;
 
 	set_capacity(disk, 0);
-	check_disk_size_change(disk, bdev);
+	check_disk_size_change(disk, bdev, false);
 	bdev->bd_invalidated = 0;
 	/* tell userspace that the media / partition table may have changed */
 	kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 60b471f..533f4ae 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -321,8 +321,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
 		at_head = 1;
 
 	ret = -ENOMEM;
-	rq = blk_get_request(q, writing ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN,
-			GFP_KERNEL);
+	rq = blk_get_request(q, writing ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, 0);
 	if (IS_ERR(rq))
 		return PTR_ERR(rq);
 	req = scsi_req(rq);
@@ -449,8 +448,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
 
 	}
 
-	rq = blk_get_request(q, in_len ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN,
-			__GFP_RECLAIM);
+	rq = blk_get_request(q, in_len ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, 0);
 	if (IS_ERR(rq)) {
 		err = PTR_ERR(rq);
 		goto error_free_buffer;
@@ -501,7 +499,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
 		break;
 	}
 
-	if (bytes && blk_rq_map_kern(q, rq, buffer, bytes, __GFP_RECLAIM)) {
+	if (bytes && blk_rq_map_kern(q, rq, buffer, bytes, GFP_NOIO)) {
 		err = DRIVER_ERROR << 24;
 		goto error;
 	}
@@ -538,7 +536,7 @@ static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk,
 	struct request *rq;
 	int err;
 
-	rq = blk_get_request(q, REQ_OP_SCSI_OUT, __GFP_RECLAIM);
+	rq = blk_get_request(q, REQ_OP_SCSI_OUT, 0);
 	if (IS_ERR(rq))
 		return PTR_ERR(rq);
 	rq->timeout = BLK_DEFAULT_SG_TIMEOUT;
diff --git a/crypto/af_alg.c b/crypto/af_alg.c
index 7846c0c..89ed613 100644
--- a/crypto/af_alg.c
+++ b/crypto/af_alg.c
@@ -347,7 +347,6 @@ static const struct proto_ops alg_proto_ops = {
 	.sendpage	=	sock_no_sendpage,
 	.sendmsg	=	sock_no_sendmsg,
 	.recvmsg	=	sock_no_recvmsg,
-	.poll		=	sock_no_poll,
 
 	.bind		=	alg_bind,
 	.release	=	af_alg_release,
@@ -1061,19 +1060,12 @@ void af_alg_async_cb(struct crypto_async_request *_req, int err)
 }
 EXPORT_SYMBOL_GPL(af_alg_async_cb);
 
-/**
- * af_alg_poll - poll system call handler
- */
-__poll_t af_alg_poll(struct file *file, struct socket *sock,
-			 poll_table *wait)
+__poll_t af_alg_poll_mask(struct socket *sock, __poll_t events)
 {
 	struct sock *sk = sock->sk;
 	struct alg_sock *ask = alg_sk(sk);
 	struct af_alg_ctx *ctx = ask->private;
-	__poll_t mask;
-
-	sock_poll_wait(file, sk_sleep(sk), wait);
-	mask = 0;
+	__poll_t mask = 0;
 
 	if (!ctx->more || ctx->used)
 		mask |= EPOLLIN | EPOLLRDNORM;
@@ -1083,7 +1075,7 @@ __poll_t af_alg_poll(struct file *file, struct socket *sock,
 
 	return mask;
 }
-EXPORT_SYMBOL_GPL(af_alg_poll);
+EXPORT_SYMBOL_GPL(af_alg_poll_mask);
 
 /**
  * af_alg_alloc_areq - allocate struct af_alg_async_req
diff --git a/crypto/algif_aead.c b/crypto/algif_aead.c
index 4b07edd..330cf9f 100644
--- a/crypto/algif_aead.c
+++ b/crypto/algif_aead.c
@@ -375,7 +375,7 @@ static struct proto_ops algif_aead_ops = {
 	.sendmsg	=	aead_sendmsg,
 	.sendpage	=	af_alg_sendpage,
 	.recvmsg	=	aead_recvmsg,
-	.poll		=	af_alg_poll,
+	.poll_mask	=	af_alg_poll_mask,
 };
 
 static int aead_check_key(struct socket *sock)
@@ -471,7 +471,7 @@ static struct proto_ops algif_aead_ops_nokey = {
 	.sendmsg	=	aead_sendmsg_nokey,
 	.sendpage	=	aead_sendpage_nokey,
 	.recvmsg	=	aead_recvmsg_nokey,
-	.poll		=	af_alg_poll,
+	.poll_mask	=	af_alg_poll_mask,
 };
 
 static void *aead_bind(const char *name, u32 type, u32 mask)
diff --git a/crypto/algif_hash.c b/crypto/algif_hash.c
index 6c9b192..bfcf595 100644
--- a/crypto/algif_hash.c
+++ b/crypto/algif_hash.c
@@ -288,7 +288,6 @@ static struct proto_ops algif_hash_ops = {
 	.mmap		=	sock_no_mmap,
 	.bind		=	sock_no_bind,
 	.setsockopt	=	sock_no_setsockopt,
-	.poll		=	sock_no_poll,
 
 	.release	=	af_alg_release,
 	.sendmsg	=	hash_sendmsg,
@@ -396,7 +395,6 @@ static struct proto_ops algif_hash_ops_nokey = {
 	.mmap		=	sock_no_mmap,
 	.bind		=	sock_no_bind,
 	.setsockopt	=	sock_no_setsockopt,
-	.poll		=	sock_no_poll,
 
 	.release	=	af_alg_release,
 	.sendmsg	=	hash_sendmsg_nokey,
diff --git a/crypto/algif_rng.c b/crypto/algif_rng.c
index 150c2b6..22df379 100644
--- a/crypto/algif_rng.c
+++ b/crypto/algif_rng.c
@@ -106,7 +106,6 @@ static struct proto_ops algif_rng_ops = {
 	.bind		=	sock_no_bind,
 	.accept		=	sock_no_accept,
 	.setsockopt	=	sock_no_setsockopt,
-	.poll		=	sock_no_poll,
 	.sendmsg	=	sock_no_sendmsg,
 	.sendpage	=	sock_no_sendpage,
 
diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c
index c4e885d..15cf3c5 100644
--- a/crypto/algif_skcipher.c
+++ b/crypto/algif_skcipher.c
@@ -205,7 +205,7 @@ static struct proto_ops algif_skcipher_ops = {
 	.sendmsg	=	skcipher_sendmsg,
 	.sendpage	=	af_alg_sendpage,
 	.recvmsg	=	skcipher_recvmsg,
-	.poll		=	af_alg_poll,
+	.poll_mask	=	af_alg_poll_mask,
 };
 
 static int skcipher_check_key(struct socket *sock)
@@ -301,7 +301,7 @@ static struct proto_ops algif_skcipher_ops_nokey = {
 	.sendmsg	=	skcipher_sendmsg_nokey,
 	.sendpage	=	skcipher_sendpage_nokey,
 	.recvmsg	=	skcipher_recvmsg_nokey,
-	.poll		=	af_alg_poll,
+	.poll_mask	=	af_alg_poll_mask,
 };
 
 static void *skcipher_bind(const char *name, u32 type, u32 mask)
diff --git a/crypto/proc.c b/crypto/proc.c
index 822fcef..f4eb613 100644
--- a/crypto/proc.c
+++ b/crypto/proc.c
@@ -94,21 +94,9 @@ static const struct seq_operations crypto_seq_ops = {
 	.show		= c_show
 };
 
-static int crypto_info_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &crypto_seq_ops);
-}
-        
-static const struct file_operations proc_crypto_ops = {
-	.open		= crypto_info_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release
-};
-
 void __init crypto_init_proc(void)
 {
-	proc_create("crypto", 0, NULL, &proc_crypto_ops);
+	proc_create_seq("crypto", 0, NULL, &crypto_seq_ops);
 }
 
 void __exit crypto_exit_proc(void)
diff --git a/drivers/acpi/ac.c b/drivers/acpi/ac.c
index 2d8de2f..84fdfa7 100644
--- a/drivers/acpi/ac.c
+++ b/drivers/acpi/ac.c
@@ -82,7 +82,6 @@ static SIMPLE_DEV_PM_OPS(acpi_ac_pm, NULL, acpi_ac_resume);
 #ifdef CONFIG_ACPI_PROCFS_POWER
 extern struct proc_dir_entry *acpi_lock_ac_dir(void);
 extern void *acpi_unlock_ac_dir(struct proc_dir_entry *acpi_ac_dir);
-static int acpi_ac_open_fs(struct inode *inode, struct file *file);
 #endif
 
 
@@ -111,16 +110,6 @@ struct acpi_ac {
 
 #define to_acpi_ac(x) power_supply_get_drvdata(x)
 
-#ifdef CONFIG_ACPI_PROCFS_POWER
-static const struct file_operations acpi_ac_fops = {
-	.owner = THIS_MODULE,
-	.open = acpi_ac_open_fs,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
-#endif
-
 /* --------------------------------------------------------------------------
                                AC Adapter Management
    -------------------------------------------------------------------------- */
@@ -209,11 +198,6 @@ static int acpi_ac_seq_show(struct seq_file *seq, void *offset)
 	return 0;
 }
 
-static int acpi_ac_open_fs(struct inode *inode, struct file *file)
-{
-	return single_open(file, acpi_ac_seq_show, PDE_DATA(inode));
-}
-
 static int acpi_ac_add_fs(struct acpi_ac *ac)
 {
 	struct proc_dir_entry *entry = NULL;
@@ -228,9 +212,8 @@ static int acpi_ac_add_fs(struct acpi_ac *ac)
 	}
 
 	/* 'state' [R] */
-	entry = proc_create_data(ACPI_AC_FILE_STATE,
-				 S_IRUGO, acpi_device_dir(ac->device),
-				 &acpi_ac_fops, ac);
+	entry = proc_create_single_data(ACPI_AC_FILE_STATE, S_IRUGO,
+			acpi_device_dir(ac->device), acpi_ac_seq_show, ac);
 	if (!entry)
 		return -ENODEV;
 	return 0;
diff --git a/drivers/acpi/battery.c b/drivers/acpi/battery.c
index bdb24d6..7655068 100644
--- a/drivers/acpi/battery.c
+++ b/drivers/acpi/battery.c
@@ -81,14 +81,6 @@ MODULE_PARM_DESC(cache_time, "cache time in milliseconds");
 #ifdef CONFIG_ACPI_PROCFS_POWER
 extern struct proc_dir_entry *acpi_lock_battery_dir(void);
 extern void *acpi_unlock_battery_dir(struct proc_dir_entry *acpi_battery_dir);
-
-enum acpi_battery_files {
-	info_tag = 0,
-	state_tag,
-	alarm_tag,
-	ACPI_BATTERY_NUMFILES,
-};
-
 #endif
 
 static const struct acpi_device_id battery_device_ids[] = {
@@ -985,9 +977,10 @@ static const char *acpi_battery_units(const struct acpi_battery *battery)
 		"mA" : "mW";
 }
 
-static int acpi_battery_print_info(struct seq_file *seq, int result)
+static int acpi_battery_info_proc_show(struct seq_file *seq, void *offset)
 {
 	struct acpi_battery *battery = seq->private;
+	int result = acpi_battery_update(battery, false);
 
 	if (result)
 		goto end;
@@ -1041,9 +1034,10 @@ static int acpi_battery_print_info(struct seq_file *seq, int result)
 	return result;
 }
 
-static int acpi_battery_print_state(struct seq_file *seq, int result)
+static int acpi_battery_state_proc_show(struct seq_file *seq, void *offset)
 {
 	struct acpi_battery *battery = seq->private;
+	int result = acpi_battery_update(battery, false);
 
 	if (result)
 		goto end;
@@ -1088,9 +1082,10 @@ static int acpi_battery_print_state(struct seq_file *seq, int result)
 	return result;
 }
 
-static int acpi_battery_print_alarm(struct seq_file *seq, int result)
+static int acpi_battery_alarm_proc_show(struct seq_file *seq, void *offset)
 {
 	struct acpi_battery *battery = seq->private;
+	int result = acpi_battery_update(battery, false);
 
 	if (result)
 		goto end;
@@ -1142,82 +1137,22 @@ static ssize_t acpi_battery_write_alarm(struct file *file,
 	return result;
 }
 
-typedef int(*print_func)(struct seq_file *seq, int result);
-
-static print_func acpi_print_funcs[ACPI_BATTERY_NUMFILES] = {
-	acpi_battery_print_info,
-	acpi_battery_print_state,
-	acpi_battery_print_alarm,
-};
-
-static int acpi_battery_read(int fid, struct seq_file *seq)
+static int acpi_battery_alarm_proc_open(struct inode *inode, struct file *file)
 {
-	struct acpi_battery *battery = seq->private;
-	int result = acpi_battery_update(battery, false);
-	return acpi_print_funcs[fid](seq, result);
+	return single_open(file, acpi_battery_alarm_proc_show, PDE_DATA(inode));
 }
 
-#define DECLARE_FILE_FUNCTIONS(_name) \
-static int acpi_battery_read_##_name(struct seq_file *seq, void *offset) \
-{ \
-	return acpi_battery_read(_name##_tag, seq); \
-} \
-static int acpi_battery_##_name##_open_fs(struct inode *inode, struct file *file) \
-{ \
-	return single_open(file, acpi_battery_read_##_name, PDE_DATA(inode)); \
-}
-
-DECLARE_FILE_FUNCTIONS(info);
-DECLARE_FILE_FUNCTIONS(state);
-DECLARE_FILE_FUNCTIONS(alarm);
-
-#undef DECLARE_FILE_FUNCTIONS
-
-#define FILE_DESCRIPTION_RO(_name) \
-	{ \
-	.name = __stringify(_name), \
-	.mode = S_IRUGO, \
-	.ops = { \
-		.open = acpi_battery_##_name##_open_fs, \
-		.read = seq_read, \
-		.llseek = seq_lseek, \
-		.release = single_release, \
-		.owner = THIS_MODULE, \
-		}, \
-	}
-
-#define FILE_DESCRIPTION_RW(_name) \
-	{ \
-	.name = __stringify(_name), \
-	.mode = S_IFREG | S_IRUGO | S_IWUSR, \
-	.ops = { \
-		.open = acpi_battery_##_name##_open_fs, \
-		.read = seq_read, \
-		.llseek = seq_lseek, \
-		.write = acpi_battery_write_##_name, \
-		.release = single_release, \
-		.owner = THIS_MODULE, \
-		}, \
-	}
-
-static const struct battery_file {
-	struct file_operations ops;
-	umode_t mode;
-	const char *name;
-} acpi_battery_file[] = {
-	FILE_DESCRIPTION_RO(info),
-	FILE_DESCRIPTION_RO(state),
-	FILE_DESCRIPTION_RW(alarm),
+static const struct file_operations acpi_battery_alarm_fops = {
+	.owner		= THIS_MODULE,
+	.open		= acpi_battery_alarm_proc_open,
+	.read		= seq_read,
+	.write		= acpi_battery_write_alarm,
+	.llseek		= seq_lseek,
+	.release	= single_release,
 };
 
-#undef FILE_DESCRIPTION_RO
-#undef FILE_DESCRIPTION_RW
-
 static int acpi_battery_add_fs(struct acpi_device *device)
 {
-	struct proc_dir_entry *entry = NULL;
-	int i;
-
 	printk(KERN_WARNING PREFIX "Deprecated procfs I/F for battery is loaded,"
 			" please retry with CONFIG_ACPI_PROCFS_POWER cleared\n");
 	if (!acpi_device_dir(device)) {
@@ -1227,28 +1162,24 @@ static int acpi_battery_add_fs(struct acpi_device *device)
 			return -ENODEV;
 	}
 
-	for (i = 0; i < ACPI_BATTERY_NUMFILES; ++i) {
-		entry = proc_create_data(acpi_battery_file[i].name,
-					 acpi_battery_file[i].mode,
-					 acpi_device_dir(device),
-					 &acpi_battery_file[i].ops,
-					 acpi_driver_data(device));
-		if (!entry)
-			return -ENODEV;
-	}
+	if (!proc_create_single_data("info", S_IRUGO, acpi_device_dir(device),
+			acpi_battery_info_proc_show, acpi_driver_data(device)))
+		return -ENODEV;
+	if (!proc_create_single_data("state", S_IRUGO, acpi_device_dir(device),
+			acpi_battery_state_proc_show, acpi_driver_data(device)))
+		return -ENODEV;
+	if (!proc_create_data("alarm", S_IFREG | S_IRUGO | S_IWUSR,
+			acpi_device_dir(device), &acpi_battery_alarm_fops,
+			acpi_driver_data(device)))
+		return -ENODEV;
 	return 0;
 }
 
 static void acpi_battery_remove_fs(struct acpi_device *device)
 {
-	int i;
 	if (!acpi_device_dir(device))
 		return;
-	for (i = 0; i < ACPI_BATTERY_NUMFILES; ++i)
-		remove_proc_entry(acpi_battery_file[i].name,
-				  acpi_device_dir(device));
-
-	remove_proc_entry(acpi_device_bid(device), acpi_battery_dir);
+	remove_proc_subtree(acpi_device_bid(device), acpi_battery_dir);
 	acpi_device_dir(device) = NULL;
 }
 
diff --git a/drivers/acpi/button.c b/drivers/acpi/button.c
index f1cc4f9..2345a5e 100644
--- a/drivers/acpi/button.c
+++ b/drivers/acpi/button.c
@@ -263,19 +263,6 @@ static int acpi_button_state_seq_show(struct seq_file *seq, void *offset)
 	return 0;
 }
 
-static int acpi_button_state_open_fs(struct inode *inode, struct file *file)
-{
-	return single_open(file, acpi_button_state_seq_show, PDE_DATA(inode));
-}
-
-static const struct file_operations acpi_button_state_fops = {
-	.owner = THIS_MODULE,
-	.open = acpi_button_state_open_fs,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
-
 static int acpi_button_add_fs(struct acpi_device *device)
 {
 	struct acpi_button *button = acpi_driver_data(device);
@@ -311,9 +298,9 @@ static int acpi_button_add_fs(struct acpi_device *device)
 	}
 
 	/* create /proc/acpi/button/lid/LID/state */
-	entry = proc_create_data(ACPI_BUTTON_FILE_STATE,
-				 S_IRUGO, acpi_device_dir(device),
-				 &acpi_button_state_fops, device);
+	entry = proc_create_single_data(ACPI_BUTTON_FILE_STATE, S_IRUGO,
+			acpi_device_dir(device), acpi_button_state_seq_show,
+			device);
 	if (!entry) {
 		ret = -ENODEV;
 		goto remove_dev_dir;
diff --git a/drivers/amba/bus.c b/drivers/amba/bus.c
index 4a3ac31..3b01187 100644
--- a/drivers/amba/bus.c
+++ b/drivers/amba/bus.c
@@ -20,6 +20,7 @@
 #include <linux/sizes.h>
 #include <linux/limits.h>
 #include <linux/clk/clk-conf.h>
+#include <linux/platform_device.h>
 
 #include <asm/irq.h>
 
@@ -193,14 +194,16 @@ static const struct dev_pm_ops amba_pm = {
 /*
  * Primecells are part of the Advanced Microcontroller Bus Architecture,
  * so we call the bus "amba".
+ * DMA configuration for platform and AMBA bus is same. So here we reuse
+ * platform's DMA config routine.
  */
 struct bus_type amba_bustype = {
 	.name		= "amba",
 	.dev_groups	= amba_dev_groups,
 	.match		= amba_match,
 	.uevent		= amba_uevent,
+	.dma_configure	= platform_dma_configure,
 	.pm		= &amba_pm,
-	.force_dma	= true,
 };
 
 static int __init amba_init(void)
diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index 513b260b..a2398e2 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -500,57 +500,6 @@ void ata_eh_release(struct ata_port *ap)
 	mutex_unlock(&ap->host->eh_mutex);
 }
 
-/**
- *	ata_scsi_timed_out - SCSI layer time out callback
- *	@cmd: timed out SCSI command
- *
- *	Handles SCSI layer timeout.  We race with normal completion of
- *	the qc for @cmd.  If the qc is already gone, we lose and let
- *	the scsi command finish (EH_HANDLED).  Otherwise, the qc has
- *	timed out and EH should be invoked.  Prevent ata_qc_complete()
- *	from finishing it by setting EH_SCHEDULED and return
- *	EH_NOT_HANDLED.
- *
- *	TODO: kill this function once old EH is gone.
- *
- *	LOCKING:
- *	Called from timer context
- *
- *	RETURNS:
- *	EH_HANDLED or EH_NOT_HANDLED
- */
-enum blk_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd)
-{
-	struct Scsi_Host *host = cmd->device->host;
-	struct ata_port *ap = ata_shost_to_port(host);
-	unsigned long flags;
-	struct ata_queued_cmd *qc;
-	enum blk_eh_timer_return ret;
-
-	DPRINTK("ENTER\n");
-
-	if (ap->ops->error_handler) {
-		ret = BLK_EH_NOT_HANDLED;
-		goto out;
-	}
-
-	ret = BLK_EH_HANDLED;
-	spin_lock_irqsave(ap->lock, flags);
-	qc = ata_qc_from_tag(ap, ap->link.active_tag);
-	if (qc) {
-		WARN_ON(qc->scsicmd != cmd);
-		qc->flags |= ATA_QCFLAG_EH_SCHEDULED;
-		qc->err_mask |= AC_ERR_TIMEOUT;
-		ret = BLK_EH_NOT_HANDLED;
-	}
-	spin_unlock_irqrestore(ap->lock, flags);
-
- out:
-	DPRINTK("EXIT, ret=%d\n", ret);
-	return ret;
-}
-EXPORT_SYMBOL(ata_scsi_timed_out);
-
 static void ata_eh_unload(struct ata_port *ap)
 {
 	struct ata_link *link;
diff --git a/drivers/base/dma-mapping.c b/drivers/base/dma-mapping.c
index d82566d..f831a58 100644
--- a/drivers/base/dma-mapping.c
+++ b/drivers/base/dma-mapping.c
@@ -329,36 +329,13 @@ void dma_common_free_remap(void *cpu_addr, size_t size, unsigned long vm_flags)
 #endif
 
 /*
- * Common configuration to enable DMA API use for a device
+ * enables DMA API use for a device
  */
-#include <linux/pci.h>
-
 int dma_configure(struct device *dev)
 {
-	struct device *bridge = NULL, *dma_dev = dev;
-	enum dev_dma_attr attr;
-	int ret = 0;
-
-	if (dev_is_pci(dev)) {
-		bridge = pci_get_host_bridge_device(to_pci_dev(dev));
-		dma_dev = bridge;
-		if (IS_ENABLED(CONFIG_OF) && dma_dev->parent &&
-		    dma_dev->parent->of_node)
-			dma_dev = dma_dev->parent;
-	}
-
-	if (dma_dev->of_node) {
-		ret = of_dma_configure(dev, dma_dev->of_node);
-	} else if (has_acpi_companion(dma_dev)) {
-		attr = acpi_get_dma_attr(to_acpi_device_node(dma_dev->fwnode));
-		if (attr != DEV_DMA_NOT_SUPPORTED)
-			ret = acpi_dma_configure(dev, attr);
-	}
-
-	if (bridge)
-		pci_put_host_bridge_device(bridge);
-
-	return ret;
+	if (dev->bus->dma_configure)
+		return dev->bus->dma_configure(dev);
+	return 0;
 }
 
 void dma_deconfigure(struct device *dev)
diff --git a/drivers/base/platform.c b/drivers/base/platform.c
index 8075ddc..c0ff1e7 100644
--- a/drivers/base/platform.c
+++ b/drivers/base/platform.c
@@ -1130,6 +1130,22 @@ int platform_pm_restore(struct device *dev)
 
 #endif /* CONFIG_HIBERNATE_CALLBACKS */
 
+int platform_dma_configure(struct device *dev)
+{
+	enum dev_dma_attr attr;
+	int ret = 0;
+
+	if (dev->of_node) {
+		ret = of_dma_configure(dev, dev->of_node, true);
+	} else if (has_acpi_companion(dev)) {
+		attr = acpi_get_dma_attr(to_acpi_device_node(dev->fwnode));
+		if (attr != DEV_DMA_NOT_SUPPORTED)
+			ret = acpi_dma_configure(dev, attr);
+	}
+
+	return ret;
+}
+
 static const struct dev_pm_ops platform_dev_pm_ops = {
 	.runtime_suspend = pm_generic_runtime_suspend,
 	.runtime_resume = pm_generic_runtime_resume,
@@ -1141,8 +1157,8 @@ struct bus_type platform_bus_type = {
 	.dev_groups	= platform_dev_groups,
 	.match		= platform_match,
 	.uevent		= platform_uevent,
+	.dma_configure	= platform_dma_configure,
 	.pm		= &platform_dev_pm_ops,
-	.force_dma	= true,
 };
 EXPORT_SYMBOL_GPL(platform_bus_type);
 
diff --git a/drivers/base/regmap/regmap-mmio.c b/drivers/base/regmap/regmap-mmio.c
index 5cadfd3..8741fb5 100644
--- a/drivers/base/regmap/regmap-mmio.c
+++ b/drivers/base/regmap/regmap-mmio.c
@@ -206,7 +206,8 @@ static void regmap_mmio_free_context(void *context)
 
 	if (!IS_ERR(ctx->clk)) {
 		clk_unprepare(ctx->clk);
-		clk_put(ctx->clk);
+		if (!ctx->attached_clk)
+			clk_put(ctx->clk);
 	}
 	kfree(context);
 }
diff --git a/drivers/base/regmap/regmap-slimbus.c b/drivers/base/regmap/regmap-slimbus.c
index c90bee8..91d501e 100644
--- a/drivers/base/regmap/regmap-slimbus.c
+++ b/drivers/base/regmap/regmap-slimbus.c
@@ -41,7 +41,7 @@ static struct regmap_bus regmap_slimbus_bus = {
 static const struct regmap_bus *regmap_get_slimbus(struct slim_device *slim,
 					const struct regmap_config *config)
 {
-	if (config->val_bits == 8 && config->reg_bits == 8)
+	if (config->val_bits == 8 && config->reg_bits == 16)
 		return &regmap_slimbus_bus;
 
 	return ERR_PTR(-ENOTSUPP);
diff --git a/drivers/bcma/main.c b/drivers/bcma/main.c
index e6986c7..fc1f4ac 100644
--- a/drivers/bcma/main.c
+++ b/drivers/bcma/main.c
@@ -207,7 +207,7 @@ static void bcma_of_fill_device(struct device *parent,
 
 	core->irq = bcma_of_get_irq(parent, core, 0);
 
-	of_dma_configure(&core->dev, node);
+	of_dma_configure(&core->dev, node, false);
 }
 
 unsigned int bcma_core_irq(struct bcma_device *core, int num)
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c
index f781eff..6ca77d6 100644
--- a/drivers/block/DAC960.c
+++ b/drivers/block/DAC960.c
@@ -1179,7 +1179,6 @@ static bool DAC960_V1_EnableMemoryMailboxInterface(DAC960_Controller_T
 
   if (pci_set_dma_mask(Controller->PCIDevice, DMA_BIT_MASK(32)))
 	return DAC960_Failure(Controller, "DMA mask out of range");
-  Controller->BounceBufferLimit = DMA_BIT_MASK(32);
 
   if ((hw_type == DAC960_PD_Controller) || (hw_type == DAC960_P_Controller)) {
     CommandMailboxesSize =  0;
@@ -1380,11 +1379,8 @@ static bool DAC960_V2_EnableMemoryMailboxInterface(DAC960_Controller_T
   dma_addr_t	CommandMailboxDMA;
   DAC960_V2_CommandStatus_T CommandStatus;
 
-	if (!pci_set_dma_mask(Controller->PCIDevice, DMA_BIT_MASK(64)))
-		Controller->BounceBufferLimit = DMA_BIT_MASK(64);
-	else if (!pci_set_dma_mask(Controller->PCIDevice, DMA_BIT_MASK(32)))
-		Controller->BounceBufferLimit = DMA_BIT_MASK(32);
-	else
+	if (pci_set_dma_mask(Controller->PCIDevice, DMA_BIT_MASK(64)) &&
+	    pci_set_dma_mask(Controller->PCIDevice, DMA_BIT_MASK(32)))
 		return DAC960_Failure(Controller, "DMA mask out of range");
 
   /* This is a temporary dma mapping, used only in the scope of this function */
@@ -2540,7 +2536,6 @@ static bool DAC960_RegisterBlockDevice(DAC960_Controller_T *Controller)
 		continue;
   	}
   	Controller->RequestQueue[n] = RequestQueue;
-  	blk_queue_bounce_limit(RequestQueue, Controller->BounceBufferLimit);
   	RequestQueue->queuedata = Controller;
 	blk_queue_max_segments(RequestQueue, Controller->DriverScatterGatherLimit);
 	blk_queue_max_hw_sectors(RequestQueue, Controller->MaxBlocksPerCommand);
@@ -6451,19 +6446,6 @@ static int dac960_proc_show(struct seq_file *m, void *v)
   return 0;
 }
 
-static int dac960_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, dac960_proc_show, NULL);
-}
-
-static const struct file_operations dac960_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= dac960_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int dac960_initial_status_proc_show(struct seq_file *m, void *v)
 {
 	DAC960_Controller_T *Controller = (DAC960_Controller_T *)m->private;
@@ -6471,19 +6453,6 @@ static int dac960_initial_status_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int dac960_initial_status_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, dac960_initial_status_proc_show, PDE_DATA(inode));
-}
-
-static const struct file_operations dac960_initial_status_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= dac960_initial_status_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int dac960_current_status_proc_show(struct seq_file *m, void *v)
 {
   DAC960_Controller_T *Controller = (DAC960_Controller_T *) m->private;
@@ -6517,19 +6486,6 @@ static int dac960_current_status_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int dac960_current_status_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, dac960_current_status_proc_show, PDE_DATA(inode));
-}
-
-static const struct file_operations dac960_current_status_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= dac960_current_status_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int dac960_user_command_proc_show(struct seq_file *m, void *v)
 {
 	DAC960_Controller_T *Controller = (DAC960_Controller_T *)m->private;
@@ -6584,17 +6540,19 @@ static void DAC960_CreateProcEntries(DAC960_Controller_T *Controller)
 
 	if (DAC960_ProcDirectoryEntry == NULL) {
 		DAC960_ProcDirectoryEntry = proc_mkdir("rd", NULL);
-		proc_create("status", 0, DAC960_ProcDirectoryEntry,
-			    &dac960_proc_fops);
+		proc_create_single("status", 0, DAC960_ProcDirectoryEntry,
+				dac960_proc_show);
 	}
 
 	snprintf(Controller->ControllerName, sizeof(Controller->ControllerName),
 		 "c%d", Controller->ControllerNumber);
 	ControllerProcEntry = proc_mkdir(Controller->ControllerName,
 					 DAC960_ProcDirectoryEntry);
-	proc_create_data("initial_status", 0, ControllerProcEntry, &dac960_initial_status_proc_fops, Controller);
-	proc_create_data("current_status", 0, ControllerProcEntry, &dac960_current_status_proc_fops, Controller);
-	proc_create_data("user_command", S_IWUSR | S_IRUSR, ControllerProcEntry, &dac960_user_command_proc_fops, Controller);
+	proc_create_single_data("initial_status", 0, ControllerProcEntry,
+			dac960_initial_status_proc_show, Controller);
+	proc_create_single_data("current_status", 0, ControllerProcEntry,
+			dac960_current_status_proc_show, Controller);
+	proc_create_data("user_command", 0600, ControllerProcEntry, &dac960_user_command_proc_fops, Controller);
 	Controller->ControllerProcEntry = ControllerProcEntry;
 }
 
diff --git a/drivers/block/DAC960.h b/drivers/block/DAC960.h
index 21aff47..1439e65 100644
--- a/drivers/block/DAC960.h
+++ b/drivers/block/DAC960.h
@@ -2295,7 +2295,6 @@ typedef struct DAC960_Controller
   unsigned short MaxBlocksPerCommand;
   unsigned short ControllerScatterGatherLimit;
   unsigned short DriverScatterGatherLimit;
-  u64		BounceBufferLimit;
   unsigned int CombinedStatusBufferLength;
   unsigned int InitialStatusLength;
   unsigned int CurrentStatusLength;
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index 6797e6c..429ebb8 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -159,14 +159,14 @@ static int aoe_debugfs_open(struct inode *inode, struct file *file)
 	return single_open(file, aoedisk_debugfs_show, inode->i_private);
 }
 
-static DEVICE_ATTR(state, S_IRUGO, aoedisk_show_state, NULL);
-static DEVICE_ATTR(mac, S_IRUGO, aoedisk_show_mac, NULL);
-static DEVICE_ATTR(netif, S_IRUGO, aoedisk_show_netif, NULL);
+static DEVICE_ATTR(state, 0444, aoedisk_show_state, NULL);
+static DEVICE_ATTR(mac, 0444, aoedisk_show_mac, NULL);
+static DEVICE_ATTR(netif, 0444, aoedisk_show_netif, NULL);
 static struct device_attribute dev_attr_firmware_version = {
-	.attr = { .name = "firmware-version", .mode = S_IRUGO },
+	.attr = { .name = "firmware-version", .mode = 0444 },
 	.show = aoedisk_show_fwver,
 };
-static DEVICE_ATTR(payload, S_IRUGO, aoedisk_show_payload, NULL);
+static DEVICE_ATTR(payload, 0444, aoedisk_show_payload, NULL);
 
 static struct attribute *aoe_attrs[] = {
 	&dev_attr_state.attr,
@@ -388,7 +388,6 @@ aoeblk_gdalloc(void *vp)
 			d->aoemajor, d->aoeminor);
 		goto err_mempool;
 	}
-	blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
 
 	spin_lock_irqsave(&d->lock, flags);
 	WARN_ON(!(d->flags & DEVFL_GD_NOW));
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index 540bb60..096882e 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -1032,8 +1032,9 @@ bvcpy(struct sk_buff *skb, struct bio *bio, struct bvec_iter iter, long cnt)
 	iter.bi_size = cnt;
 
 	__bio_for_each_segment(bv, bio, iter, iter) {
-		char *p = page_address(bv.bv_page) + bv.bv_offset;
+		char *p = kmap_atomic(bv.bv_page) + bv.bv_offset;
 		skb_copy_bits(skb, soff, p, bv.bv_len);
+		kunmap_atomic(p);
 		soff += bv.bv_len;
 	}
 }
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 66cb0f8..bb97659 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -331,15 +331,15 @@ static const struct block_device_operations brd_fops = {
  * And now the modules code and kernel interface.
  */
 static int rd_nr = CONFIG_BLK_DEV_RAM_COUNT;
-module_param(rd_nr, int, S_IRUGO);
+module_param(rd_nr, int, 0444);
 MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices");
 
 unsigned long rd_size = CONFIG_BLK_DEV_RAM_SIZE;
-module_param(rd_size, ulong, S_IRUGO);
+module_param(rd_size, ulong, 0444);
 MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes.");
 
 static int max_part = 1;
-module_param(max_part, int, S_IRUGO);
+module_param(max_part, int, 0444);
 MODULE_PARM_DESC(max_part, "Num Minors to reserve between devices");
 
 MODULE_LICENSE("GPL");
@@ -402,6 +402,10 @@ static struct brd_device *brd_alloc(int i)
 	set_capacity(disk, rd_size * 2);
 	disk->queue->backing_dev_info->capabilities |= BDI_CAP_SYNCHRONOUS_IO;
 
+	/* Tell the block layer that this is not a rotational device */
+	blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
+	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue);
+
 	return brd;
 
 out_free_queue:
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index 9f4e6f5..11a85b7 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -977,7 +977,7 @@ static void drbd_bm_endio(struct bio *bio)
 	bm_page_unlock_io(device, idx);
 
 	if (ctx->flags & BM_AIO_COPY_PAGES)
-		mempool_free(bio->bi_io_vec[0].bv_page, drbd_md_io_page_pool);
+		mempool_free(bio->bi_io_vec[0].bv_page, &drbd_md_io_page_pool);
 
 	bio_put(bio);
 
@@ -1014,7 +1014,8 @@ static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_ho
 	bm_set_page_unchanged(b->bm_pages[page_nr]);
 
 	if (ctx->flags & BM_AIO_COPY_PAGES) {
-		page = mempool_alloc(drbd_md_io_page_pool, __GFP_HIGHMEM|__GFP_RECLAIM);
+		page = mempool_alloc(&drbd_md_io_page_pool,
+				GFP_NOIO | __GFP_HIGHMEM);
 		copy_highpage(page, b->bm_pages[page_nr]);
 		bm_store_page_idx(page, page_nr);
 	} else
diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c
index ab21976..5d5e8d6 100644
--- a/drivers/block/drbd/drbd_debugfs.c
+++ b/drivers/block/drbd/drbd_debugfs.c
@@ -481,9 +481,9 @@ void drbd_debugfs_resource_add(struct drbd_resource *resource)
 		goto fail;
 	resource->debugfs_res_connections = dentry;
 
-	dentry = debugfs_create_file("in_flight_summary", S_IRUSR|S_IRGRP,
-			resource->debugfs_res, resource,
-			&in_flight_summary_fops);
+	dentry = debugfs_create_file("in_flight_summary", 0440,
+				     resource->debugfs_res, resource,
+				     &in_flight_summary_fops);
 	if (IS_ERR_OR_NULL(dentry))
 		goto fail;
 	resource->debugfs_res_in_flight_summary = dentry;
@@ -645,16 +645,16 @@ void drbd_debugfs_connection_add(struct drbd_connection *connection)
 		goto fail;
 	connection->debugfs_conn = dentry;
 
-	dentry = debugfs_create_file("callback_history", S_IRUSR|S_IRGRP,
-			connection->debugfs_conn, connection,
-			&connection_callback_history_fops);
+	dentry = debugfs_create_file("callback_history", 0440,
+				     connection->debugfs_conn, connection,
+				     &connection_callback_history_fops);
 	if (IS_ERR_OR_NULL(dentry))
 		goto fail;
 	connection->debugfs_conn_callback_history = dentry;
 
-	dentry = debugfs_create_file("oldest_requests", S_IRUSR|S_IRGRP,
-			connection->debugfs_conn, connection,
-			&connection_oldest_requests_fops);
+	dentry = debugfs_create_file("oldest_requests", 0440,
+				     connection->debugfs_conn, connection,
+				     &connection_oldest_requests_fops);
 	if (IS_ERR_OR_NULL(dentry))
 		goto fail;
 	connection->debugfs_conn_oldest_requests = dentry;
@@ -824,7 +824,7 @@ void drbd_debugfs_device_add(struct drbd_device *device)
 	device->debugfs_minor = dentry;
 
 #define DCF(name)	do {					\
-	dentry = debugfs_create_file(#name, S_IRUSR|S_IRGRP,	\
+	dentry = debugfs_create_file(#name, 0440,	\
 			device->debugfs_vol, device,		\
 			&device_ ## name ## _fops);		\
 	if (IS_ERR_OR_NULL(dentry))				\
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 06ecee1..bc4ed2e 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1405,8 +1405,8 @@ extern struct kmem_cache *drbd_request_cache;
 extern struct kmem_cache *drbd_ee_cache;	/* peer requests */
 extern struct kmem_cache *drbd_bm_ext_cache;	/* bitmap extents */
 extern struct kmem_cache *drbd_al_ext_cache;	/* activity log extents */
-extern mempool_t *drbd_request_mempool;
-extern mempool_t *drbd_ee_mempool;
+extern mempool_t drbd_request_mempool;
+extern mempool_t drbd_ee_mempool;
 
 /* drbd's page pool, used to buffer data received from the peer,
  * or data requested by the peer.
@@ -1432,16 +1432,16 @@ extern wait_queue_head_t drbd_pp_wait;
  * 128 should be plenty, currently we probably can get away with as few as 1.
  */
 #define DRBD_MIN_POOL_PAGES	128
-extern mempool_t *drbd_md_io_page_pool;
+extern mempool_t drbd_md_io_page_pool;
 
 /* We also need to make sure we get a bio
  * when we need it for housekeeping purposes */
-extern struct bio_set *drbd_md_io_bio_set;
+extern struct bio_set drbd_md_io_bio_set;
 /* to allocate from that set */
 extern struct bio *bio_alloc_drbd(gfp_t gfp_mask);
 
 /* And a bio_set for cloning */
-extern struct bio_set *drbd_io_bio_set;
+extern struct bio_set drbd_io_bio_set;
 
 extern struct mutex resources_mutex;
 
@@ -1643,7 +1643,7 @@ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backin
 
 /* drbd_proc.c */
 extern struct proc_dir_entry *drbd_proc;
-extern const struct file_operations drbd_proc_fops;
+int drbd_seq_show(struct seq_file *seq, void *v);
 
 /* drbd_actlog.c */
 extern bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i);
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 185f1ef..7655d61 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -124,11 +124,11 @@ struct kmem_cache *drbd_request_cache;
 struct kmem_cache *drbd_ee_cache;	/* peer requests */
 struct kmem_cache *drbd_bm_ext_cache;	/* bitmap extents */
 struct kmem_cache *drbd_al_ext_cache;	/* activity log extents */
-mempool_t *drbd_request_mempool;
-mempool_t *drbd_ee_mempool;
-mempool_t *drbd_md_io_page_pool;
-struct bio_set *drbd_md_io_bio_set;
-struct bio_set *drbd_io_bio_set;
+mempool_t drbd_request_mempool;
+mempool_t drbd_ee_mempool;
+mempool_t drbd_md_io_page_pool;
+struct bio_set drbd_md_io_bio_set;
+struct bio_set drbd_io_bio_set;
 
 /* I do not use a standard mempool, because:
    1) I want to hand out the pre-allocated objects first.
@@ -153,10 +153,10 @@ struct bio *bio_alloc_drbd(gfp_t gfp_mask)
 {
 	struct bio *bio;
 
-	if (!drbd_md_io_bio_set)
+	if (!bioset_initialized(&drbd_md_io_bio_set))
 		return bio_alloc(gfp_mask, 1);
 
-	bio = bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set);
+	bio = bio_alloc_bioset(gfp_mask, 1, &drbd_md_io_bio_set);
 	if (!bio)
 		return NULL;
 	return bio;
@@ -2097,16 +2097,11 @@ static void drbd_destroy_mempools(void)
 
 	/* D_ASSERT(device, atomic_read(&drbd_pp_vacant)==0); */
 
-	if (drbd_io_bio_set)
-		bioset_free(drbd_io_bio_set);
-	if (drbd_md_io_bio_set)
-		bioset_free(drbd_md_io_bio_set);
-	if (drbd_md_io_page_pool)
-		mempool_destroy(drbd_md_io_page_pool);
-	if (drbd_ee_mempool)
-		mempool_destroy(drbd_ee_mempool);
-	if (drbd_request_mempool)
-		mempool_destroy(drbd_request_mempool);
+	bioset_exit(&drbd_io_bio_set);
+	bioset_exit(&drbd_md_io_bio_set);
+	mempool_exit(&drbd_md_io_page_pool);
+	mempool_exit(&drbd_ee_mempool);
+	mempool_exit(&drbd_request_mempool);
 	if (drbd_ee_cache)
 		kmem_cache_destroy(drbd_ee_cache);
 	if (drbd_request_cache)
@@ -2116,11 +2111,6 @@ static void drbd_destroy_mempools(void)
 	if (drbd_al_ext_cache)
 		kmem_cache_destroy(drbd_al_ext_cache);
 
-	drbd_io_bio_set      = NULL;
-	drbd_md_io_bio_set   = NULL;
-	drbd_md_io_page_pool = NULL;
-	drbd_ee_mempool      = NULL;
-	drbd_request_mempool = NULL;
 	drbd_ee_cache        = NULL;
 	drbd_request_cache   = NULL;
 	drbd_bm_ext_cache    = NULL;
@@ -2133,18 +2123,7 @@ static int drbd_create_mempools(void)
 {
 	struct page *page;
 	const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * drbd_minor_count;
-	int i;
-
-	/* prepare our caches and mempools */
-	drbd_request_mempool = NULL;
-	drbd_ee_cache        = NULL;
-	drbd_request_cache   = NULL;
-	drbd_bm_ext_cache    = NULL;
-	drbd_al_ext_cache    = NULL;
-	drbd_pp_pool         = NULL;
-	drbd_md_io_page_pool = NULL;
-	drbd_md_io_bio_set   = NULL;
-	drbd_io_bio_set      = NULL;
+	int i, ret;
 
 	/* caches */
 	drbd_request_cache = kmem_cache_create(
@@ -2168,26 +2147,26 @@ static int drbd_create_mempools(void)
 		goto Enomem;
 
 	/* mempools */
-	drbd_io_bio_set = bioset_create(BIO_POOL_SIZE, 0, 0);
-	if (drbd_io_bio_set == NULL)
+	ret = bioset_init(&drbd_io_bio_set, BIO_POOL_SIZE, 0, 0);
+	if (ret)
 		goto Enomem;
 
-	drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0,
-					   BIOSET_NEED_BVECS);
-	if (drbd_md_io_bio_set == NULL)
+	ret = bioset_init(&drbd_md_io_bio_set, DRBD_MIN_POOL_PAGES, 0,
+			  BIOSET_NEED_BVECS);
+	if (ret)
 		goto Enomem;
 
-	drbd_md_io_page_pool = mempool_create_page_pool(DRBD_MIN_POOL_PAGES, 0);
-	if (drbd_md_io_page_pool == NULL)
+	ret = mempool_init_page_pool(&drbd_md_io_page_pool, DRBD_MIN_POOL_PAGES, 0);
+	if (ret)
 		goto Enomem;
 
-	drbd_request_mempool = mempool_create_slab_pool(number,
-		drbd_request_cache);
-	if (drbd_request_mempool == NULL)
+	ret = mempool_init_slab_pool(&drbd_request_mempool, number,
+				     drbd_request_cache);
+	if (ret)
 		goto Enomem;
 
-	drbd_ee_mempool = mempool_create_slab_pool(number, drbd_ee_cache);
-	if (drbd_ee_mempool == NULL)
+	ret = mempool_init_slab_pool(&drbd_ee_mempool, number, drbd_ee_cache);
+	if (ret)
 		goto Enomem;
 
 	/* drbd's page pool */
@@ -3010,7 +2989,7 @@ static int __init drbd_init(void)
 		goto fail;
 
 	err = -ENOMEM;
-	drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
+	drbd_proc = proc_create_single("drbd", S_IFREG | 0444 , NULL, drbd_seq_show);
 	if (!drbd_proc)	{
 		pr_err("unable to register proc file\n");
 		goto fail;
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
index 582caeb..74ef292 100644
--- a/drivers/block/drbd/drbd_proc.c
+++ b/drivers/block/drbd/drbd_proc.c
@@ -33,18 +33,7 @@
 #include <linux/drbd.h>
 #include "drbd_int.h"
 
-static int drbd_proc_open(struct inode *inode, struct file *file);
-static int drbd_proc_release(struct inode *inode, struct file *file);
-
-
 struct proc_dir_entry *drbd_proc;
-const struct file_operations drbd_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= drbd_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= drbd_proc_release,
-};
 
 static void seq_printf_with_thousands_grouping(struct seq_file *seq, long v)
 {
@@ -235,7 +224,7 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se
 	}
 }
 
-static int drbd_seq_show(struct seq_file *seq, void *v)
+int drbd_seq_show(struct seq_file *seq, void *v)
 {
 	int i, prev_i = -1;
 	const char *sn;
@@ -345,24 +334,3 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
 
 	return 0;
 }
-
-static int drbd_proc_open(struct inode *inode, struct file *file)
-{
-	int err;
-
-	if (try_module_get(THIS_MODULE)) {
-		err = single_open(file, drbd_seq_show, NULL);
-		if (err)
-			module_put(THIS_MODULE);
-		return err;
-	}
-	return -ENODEV;
-}
-
-static int drbd_proc_release(struct inode *inode, struct file *file)
-{
-	module_put(THIS_MODULE);
-	return single_release(inode, file);
-}
-
-/* PROC FS stuff end */
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index c72dee0..be9450f 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -378,7 +378,7 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto
 	if (drbd_insert_fault(device, DRBD_FAULT_AL_EE))
 		return NULL;
 
-	peer_req = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM);
+	peer_req = mempool_alloc(&drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM);
 	if (!peer_req) {
 		if (!(gfp_mask & __GFP_NOWARN))
 			drbd_err(device, "%s: allocation failed\n", __func__);
@@ -409,7 +409,7 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto
 	return peer_req;
 
  fail:
-	mempool_free(peer_req, drbd_ee_mempool);
+	mempool_free(peer_req, &drbd_ee_mempool);
 	return NULL;
 }
 
@@ -426,7 +426,7 @@ void __drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *
 		peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
 		drbd_al_complete_io(device, &peer_req->i);
 	}
-	mempool_free(peer_req, drbd_ee_mempool);
+	mempool_free(peer_req, &drbd_ee_mempool);
 }
 
 int drbd_free_peer_reqs(struct drbd_device *device, struct list_head *list)
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index a500e73..a47e498 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -55,7 +55,7 @@ static struct drbd_request *drbd_req_new(struct drbd_device *device, struct bio
 {
 	struct drbd_request *req;
 
-	req = mempool_alloc(drbd_request_mempool, GFP_NOIO);
+	req = mempool_alloc(&drbd_request_mempool, GFP_NOIO);
 	if (!req)
 		return NULL;
 	memset(req, 0, sizeof(*req));
@@ -184,7 +184,7 @@ void drbd_req_destroy(struct kref *kref)
 		}
 	}
 
-	mempool_free(req, drbd_request_mempool);
+	mempool_free(req, &drbd_request_mempool);
 }
 
 static void wake_all_senders(struct drbd_connection *connection)
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
index cb97b3b..94c6540 100644
--- a/drivers/block/drbd/drbd_req.h
+++ b/drivers/block/drbd/drbd_req.h
@@ -269,7 +269,7 @@ enum drbd_req_state_bits {
 static inline void drbd_req_make_private_bio(struct drbd_request *req, struct bio *bio_src)
 {
 	struct bio *bio;
-	bio = bio_clone_fast(bio_src, GFP_NOIO, drbd_io_bio_set);
+	bio = bio_clone_fast(bio_src, GFP_NOIO, &drbd_io_bio_set);
 
 	req->private_bio = bio;
 
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 8ec7235..8871b50 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -4450,7 +4450,7 @@ static ssize_t floppy_cmos_show(struct device *dev,
 	return sprintf(buf, "%X\n", UDP->cmos);
 }
 
-static DEVICE_ATTR(cmos, S_IRUGO, floppy_cmos_show, NULL);
+static DEVICE_ATTR(cmos, 0444, floppy_cmos_show, NULL);
 
 static struct attribute *floppy_dev_attrs[] = {
 	&dev_attr_cmos.attr,
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 55cf554..4838b0d 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -732,7 +732,7 @@ static ssize_t loop_attr_do_show_##_name(struct device *d,		\
 	return loop_attr_show(d, b, loop_attr_##_name##_show);		\
 }									\
 static struct device_attribute loop_attr_##_name =			\
-	__ATTR(_name, S_IRUGO, loop_attr_do_show_##_name, NULL);
+	__ATTR(_name, 0444, loop_attr_do_show_##_name, NULL);
 
 static ssize_t loop_attr_backing_file_show(struct loop_device *lo, char *buf)
 {
@@ -809,16 +809,17 @@ static struct attribute_group loop_attribute_group = {
 	.attrs= loop_attrs,
 };
 
-static int loop_sysfs_init(struct loop_device *lo)
+static void loop_sysfs_init(struct loop_device *lo)
 {
-	return sysfs_create_group(&disk_to_dev(lo->lo_disk)->kobj,
-				  &loop_attribute_group);
+	lo->sysfs_inited = !sysfs_create_group(&disk_to_dev(lo->lo_disk)->kobj,
+						&loop_attribute_group);
 }
 
 static void loop_sysfs_exit(struct loop_device *lo)
 {
-	sysfs_remove_group(&disk_to_dev(lo->lo_disk)->kobj,
-			   &loop_attribute_group);
+	if (lo->sysfs_inited)
+		sysfs_remove_group(&disk_to_dev(lo->lo_disk)->kobj,
+				   &loop_attribute_group);
 }
 
 static void loop_config_discard(struct loop_device *lo)
@@ -1677,9 +1678,9 @@ static const struct block_device_operations lo_fops = {
  * And now the modules code and kernel interface.
  */
 static int max_loop;
-module_param(max_loop, int, S_IRUGO);
+module_param(max_loop, int, 0444);
 MODULE_PARM_DESC(max_loop, "Maximum number of loop devices");
-module_param(max_part, int, S_IRUGO);
+module_param(max_part, int, 0444);
 MODULE_PARM_DESC(max_part, "Maximum number of partitions per loop device");
 MODULE_LICENSE("GPL");
 MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR);
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index b78de98..4d42c7a 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -58,6 +58,7 @@ struct loop_device {
 	struct kthread_worker	worker;
 	struct task_struct	*worker_task;
 	bool			use_dio;
+	bool			sysfs_inited;
 
 	struct request_queue	*lo_queue;
 	struct blk_mq_tag_set	tag_set;
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 769c551..c73626d 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -2285,7 +2285,7 @@ static ssize_t mtip_hw_show_status(struct device *dev,
 	return size;
 }
 
-static DEVICE_ATTR(status, S_IRUGO, mtip_hw_show_status, NULL);
+static DEVICE_ATTR(status, 0444, mtip_hw_show_status, NULL);
 
 /* debugsfs entries */
 
@@ -2566,10 +2566,9 @@ static int mtip_hw_debugfs_init(struct driver_data *dd)
 		return -1;
 	}
 
-	debugfs_create_file("flags", S_IRUGO, dd->dfs_node, dd,
-							&mtip_flags_fops);
-	debugfs_create_file("registers", S_IRUGO, dd->dfs_node, dd,
-							&mtip_regs_fops);
+	debugfs_create_file("flags", 0444, dd->dfs_node, dd, &mtip_flags_fops);
+	debugfs_create_file("registers", 0444, dd->dfs_node, dd,
+			    &mtip_regs_fops);
 
 	return 0;
 }
@@ -2726,15 +2725,11 @@ static void mtip_softirq_done_fn(struct request *rq)
 	blk_mq_end_request(rq, cmd->status);
 }
 
-static void mtip_abort_cmd(struct request *req, void *data,
-							bool reserved)
+static void mtip_abort_cmd(struct request *req, void *data, bool reserved)
 {
 	struct mtip_cmd *cmd = blk_mq_rq_to_pdu(req);
 	struct driver_data *dd = data;
 
-	if (!blk_mq_request_started(req))
-		return;
-
 	dbg_printk(MTIP_DRV_NAME " Aborting request, tag = %d\n", req->tag);
 
 	clear_bit(req->tag, dd->port->cmds_to_issue);
@@ -2742,14 +2737,10 @@ static void mtip_abort_cmd(struct request *req, void *data,
 	mtip_softirq_done_fn(req);
 }
 
-static void mtip_queue_cmd(struct request *req, void *data,
-							bool reserved)
+static void mtip_queue_cmd(struct request *req, void *data, bool reserved)
 {
 	struct driver_data *dd = data;
 
-	if (!blk_mq_request_started(req))
-		return;
-
 	set_bit(req->tag, dd->port->cmds_to_issue);
 	blk_abort_request(req);
 }
@@ -3720,7 +3711,8 @@ static enum blk_eh_timer_return mtip_cmd_timeout(struct request *req,
 		struct mtip_cmd *cmd = blk_mq_rq_to_pdu(req);
 
 		cmd->status = BLK_STS_TIMEOUT;
-		return BLK_EH_HANDLED;
+		blk_mq_complete_request(req);
+		return BLK_EH_DONE;
 	}
 
 	if (test_bit(req->tag, dd->port->cmds_to_issue))
@@ -3862,7 +3854,6 @@ skip_create_disk:
 	blk_queue_max_hw_sectors(dd->queue, 0xffff);
 	blk_queue_max_segment_size(dd->queue, 0x400000);
 	blk_queue_io_min(dd->queue, 4096);
-	blk_queue_bounce_limit(dd->queue, dd->pdev->dma_mask);
 
 	/* Signal trim support */
 	if (dd->trim_supp == true) {
@@ -4273,7 +4264,7 @@ static int mtip_pci_probe(struct pci_dev *pdev,
 	if (!dd->isr_workq) {
 		dev_warn(&pdev->dev, "Can't create wq %d\n", dd->instance);
 		rv = -ENOMEM;
-		goto block_initialize_err;
+		goto setmask_err;
 	}
 
 	memset(cpu_list, 0, sizeof(cpu_list));
@@ -4614,7 +4605,7 @@ static int __init mtip_init(void)
 	}
 	if (dfs_parent) {
 		dfs_device_status = debugfs_create_file("device_status",
-					S_IRUGO, dfs_parent, NULL,
+					0444, dfs_parent, NULL,
 					&mtip_device_status_fops);
 		if (IS_ERR_OR_NULL(dfs_device_status)) {
 			pr_err("Error creating device_status node\n");
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index afbc202..3ed1ef8 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -166,16 +166,19 @@ static ssize_t pid_show(struct device *dev,
 }
 
 static const struct device_attribute pid_attr = {
-	.attr = { .name = "pid", .mode = S_IRUGO},
+	.attr = { .name = "pid", .mode = 0444},
 	.show = pid_show,
 };
 
 static void nbd_dev_remove(struct nbd_device *nbd)
 {
 	struct gendisk *disk = nbd->disk;
+	struct request_queue *q;
+
 	if (disk) {
+		q = disk->queue;
 		del_gendisk(disk);
-		blk_cleanup_queue(disk->queue);
+		blk_cleanup_queue(q);
 		blk_mq_free_tag_set(&nbd->tag_set);
 		disk->private_data = NULL;
 		put_disk(disk);
@@ -213,7 +216,15 @@ static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock,
 	}
 	if (!nsock->dead) {
 		kernel_sock_shutdown(nsock->sock, SHUT_RDWR);
-		atomic_dec(&nbd->config->live_connections);
+		if (atomic_dec_return(&nbd->config->live_connections) == 0) {
+			if (test_and_clear_bit(NBD_DISCONNECT_REQUESTED,
+					       &nbd->config->runtime_flags)) {
+				set_bit(NBD_DISCONNECTED,
+					&nbd->config->runtime_flags);
+				dev_info(nbd_to_dev(nbd),
+					"Disconnected due to user request.\n");
+			}
+		}
 	}
 	nsock->dead = true;
 	nsock->pending = NULL;
@@ -231,9 +242,22 @@ static void nbd_size_clear(struct nbd_device *nbd)
 static void nbd_size_update(struct nbd_device *nbd)
 {
 	struct nbd_config *config = nbd->config;
+	struct block_device *bdev = bdget_disk(nbd->disk, 0);
+
+	if (config->flags & NBD_FLAG_SEND_TRIM) {
+		nbd->disk->queue->limits.discard_granularity = config->blksize;
+		blk_queue_max_discard_sectors(nbd->disk->queue, UINT_MAX);
+	}
 	blk_queue_logical_block_size(nbd->disk->queue, config->blksize);
 	blk_queue_physical_block_size(nbd->disk->queue, config->blksize);
 	set_capacity(nbd->disk, config->bytesize >> 9);
+	if (bdev) {
+		if (bdev->bd_disk)
+			bd_set_size(bdev, config->bytesize);
+		else
+			bdev->bd_invalidated = 1;
+		bdput(bdev);
+	}
 	kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
 }
 
@@ -243,6 +267,8 @@ static void nbd_size_set(struct nbd_device *nbd, loff_t blocksize,
 	struct nbd_config *config = nbd->config;
 	config->blksize = blocksize;
 	config->bytesize = blocksize * nr_blocks;
+	if (nbd->task_recv != NULL)
+		nbd_size_update(nbd);
 }
 
 static void nbd_complete_rq(struct request *req)
@@ -286,13 +312,15 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
 
 	if (!refcount_inc_not_zero(&nbd->config_refs)) {
 		cmd->status = BLK_STS_TIMEOUT;
-		return BLK_EH_HANDLED;
+		goto done;
 	}
 	config = nbd->config;
 
 	if (config->num_connections > 1) {
 		dev_err_ratelimited(nbd_to_dev(nbd),
-				    "Connection timed out, retrying\n");
+				    "Connection timed out, retrying (%d/%d alive)\n",
+				    atomic_read(&config->live_connections),
+				    config->num_connections);
 		/*
 		 * Hooray we have more connections, requeue this IO, the submit
 		 * path will put it on a real connection.
@@ -314,7 +342,7 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
 			}
 			blk_mq_requeue_request(req, true);
 			nbd_config_put(nbd);
-			return BLK_EH_NOT_HANDLED;
+			return BLK_EH_DONE;
 		}
 	} else {
 		dev_err_ratelimited(nbd_to_dev(nbd),
@@ -324,8 +352,9 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
 	cmd->status = BLK_STS_IOERR;
 	sock_shutdown(nbd);
 	nbd_config_put(nbd);
-
-	return BLK_EH_HANDLED;
+done:
+	blk_mq_complete_request(req);
+	return BLK_EH_DONE;
 }
 
 /*
@@ -647,11 +676,8 @@ static void recv_work(struct work_struct *work)
 
 static void nbd_clear_req(struct request *req, void *data, bool reserved)
 {
-	struct nbd_cmd *cmd;
+	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
 
-	if (!blk_mq_request_started(req))
-		return;
-	cmd = blk_mq_rq_to_pdu(req);
 	cmd->status = BLK_STS_IOERR;
 	blk_mq_complete_request(req);
 }
@@ -714,10 +740,9 @@ static int wait_for_reconnect(struct nbd_device *nbd)
 		return 0;
 	if (test_bit(NBD_DISCONNECTED, &config->runtime_flags))
 		return 0;
-	wait_event_timeout(config->conn_wait,
-			   atomic_read(&config->live_connections),
-			   config->dead_conn_timeout);
-	return atomic_read(&config->live_connections);
+	return wait_event_timeout(config->conn_wait,
+				  atomic_read(&config->live_connections) > 0,
+				  config->dead_conn_timeout) > 0;
 }
 
 static int nbd_handle_cmd(struct nbd_cmd *cmd, int index)
@@ -950,10 +975,6 @@ static void nbd_bdev_reset(struct block_device *bdev)
 	if (bdev->bd_openers > 1)
 		return;
 	bd_set_size(bdev, 0);
-	if (max_part > 0) {
-		blkdev_reread_part(bdev);
-		bdev->bd_invalidated = 1;
-	}
 }
 
 static void nbd_parse_flags(struct nbd_device *nbd)
@@ -1040,6 +1061,8 @@ static void nbd_config_put(struct nbd_device *nbd)
 		nbd->config = NULL;
 
 		nbd->tag_set.timeout = 0;
+		nbd->disk->queue->limits.discard_granularity = 0;
+		blk_queue_max_discard_sectors(nbd->disk->queue, UINT_MAX);
 		blk_queue_flag_clear(QUEUE_FLAG_DISCARD, nbd->disk->queue);
 
 		mutex_unlock(&nbd->config_lock);
@@ -1109,7 +1132,6 @@ static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *b
 	if (ret)
 		return ret;
 
-	bd_set_size(bdev, config->bytesize);
 	if (max_part)
 		bdev->bd_invalidated = 1;
 	mutex_unlock(&nbd->config_lock);
@@ -1118,7 +1140,7 @@ static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *b
 	if (ret)
 		sock_shutdown(nbd);
 	mutex_lock(&nbd->config_lock);
-	bd_set_size(bdev, 0);
+	nbd_bdev_reset(bdev);
 	/* user requested, ignore socket errors */
 	if (test_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags))
 		ret = 0;
@@ -1269,6 +1291,9 @@ static int nbd_open(struct block_device *bdev, fmode_t mode)
 		refcount_set(&nbd->config_refs, 1);
 		refcount_inc(&nbd->refs);
 		mutex_unlock(&nbd->config_lock);
+		bdev->bd_invalidated = 1;
+	} else if (nbd_disconnected(nbd->config)) {
+		bdev->bd_invalidated = 1;
 	}
 out:
 	mutex_unlock(&nbd_index_mutex);
@@ -1490,8 +1515,8 @@ static int nbd_dev_add(int index)
 	 */
 	blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
 	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue);
-	disk->queue->limits.discard_granularity = 512;
-	blk_queue_max_discard_sectors(disk->queue, UINT_MAX);
+	disk->queue->limits.discard_granularity = 0;
+	blk_queue_max_discard_sectors(disk->queue, 0);
 	blk_queue_max_segment_size(disk->queue, UINT_MAX);
 	blk_queue_max_segments(disk->queue, USHRT_MAX);
 	blk_queue_max_hw_sectors(disk->queue, 65536);
@@ -1755,6 +1780,7 @@ static int nbd_genl_disconnect(struct sk_buff *skb, struct genl_info *info)
 	}
 	mutex_lock(&nbd->config_lock);
 	nbd_disconnect(nbd);
+	nbd_clear_sock(nbd);
 	mutex_unlock(&nbd->config_lock);
 	if (test_and_clear_bit(NBD_HAS_CONFIG_REF,
 			       &nbd->config->runtime_flags))
@@ -2093,7 +2119,8 @@ static int __init nbd_init(void)
 	if (nbds_max > 1UL << (MINORBITS - part_shift))
 		return -EINVAL;
 	recv_workqueue = alloc_workqueue("knbd-recv",
-					 WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
+					 WQ_MEM_RECLAIM | WQ_HIGHPRI |
+					 WQ_UNBOUND, 0);
 	if (!recv_workqueue)
 		return -ENOMEM;
 
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index a765532..2bdadd7 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -157,23 +157,23 @@ enum {
 };
 
 static int g_no_sched;
-module_param_named(no_sched, g_no_sched, int, S_IRUGO);
+module_param_named(no_sched, g_no_sched, int, 0444);
 MODULE_PARM_DESC(no_sched, "No io scheduler");
 
 static int g_submit_queues = 1;
-module_param_named(submit_queues, g_submit_queues, int, S_IRUGO);
+module_param_named(submit_queues, g_submit_queues, int, 0444);
 MODULE_PARM_DESC(submit_queues, "Number of submission queues");
 
 static int g_home_node = NUMA_NO_NODE;
-module_param_named(home_node, g_home_node, int, S_IRUGO);
+module_param_named(home_node, g_home_node, int, 0444);
 MODULE_PARM_DESC(home_node, "Home node for the device");
 
 #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
 static char g_timeout_str[80];
-module_param_string(timeout, g_timeout_str, sizeof(g_timeout_str), S_IRUGO);
+module_param_string(timeout, g_timeout_str, sizeof(g_timeout_str), 0444);
 
 static char g_requeue_str[80];
-module_param_string(requeue, g_requeue_str, sizeof(g_requeue_str), S_IRUGO);
+module_param_string(requeue, g_requeue_str, sizeof(g_requeue_str), 0444);
 #endif
 
 static int g_queue_mode = NULL_Q_MQ;
@@ -203,27 +203,27 @@ static const struct kernel_param_ops null_queue_mode_param_ops = {
 	.get	= param_get_int,
 };
 
-device_param_cb(queue_mode, &null_queue_mode_param_ops, &g_queue_mode, S_IRUGO);
+device_param_cb(queue_mode, &null_queue_mode_param_ops, &g_queue_mode, 0444);
 MODULE_PARM_DESC(queue_mode, "Block interface to use (0=bio,1=rq,2=multiqueue)");
 
 static int g_gb = 250;
-module_param_named(gb, g_gb, int, S_IRUGO);
+module_param_named(gb, g_gb, int, 0444);
 MODULE_PARM_DESC(gb, "Size in GB");
 
 static int g_bs = 512;
-module_param_named(bs, g_bs, int, S_IRUGO);
+module_param_named(bs, g_bs, int, 0444);
 MODULE_PARM_DESC(bs, "Block size (in bytes)");
 
 static int nr_devices = 1;
-module_param(nr_devices, int, S_IRUGO);
+module_param(nr_devices, int, 0444);
 MODULE_PARM_DESC(nr_devices, "Number of devices to register");
 
 static bool g_blocking;
-module_param_named(blocking, g_blocking, bool, S_IRUGO);
+module_param_named(blocking, g_blocking, bool, 0444);
 MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device");
 
 static bool shared_tags;
-module_param(shared_tags, bool, S_IRUGO);
+module_param(shared_tags, bool, 0444);
 MODULE_PARM_DESC(shared_tags, "Share tag set between devices for blk-mq");
 
 static int g_irqmode = NULL_IRQ_SOFTIRQ;
@@ -239,19 +239,19 @@ static const struct kernel_param_ops null_irqmode_param_ops = {
 	.get	= param_get_int,
 };
 
-device_param_cb(irqmode, &null_irqmode_param_ops, &g_irqmode, S_IRUGO);
+device_param_cb(irqmode, &null_irqmode_param_ops, &g_irqmode, 0444);
 MODULE_PARM_DESC(irqmode, "IRQ completion handler. 0-none, 1-softirq, 2-timer");
 
 static unsigned long g_completion_nsec = 10000;
-module_param_named(completion_nsec, g_completion_nsec, ulong, S_IRUGO);
+module_param_named(completion_nsec, g_completion_nsec, ulong, 0444);
 MODULE_PARM_DESC(completion_nsec, "Time in ns to complete a request in hardware. Default: 10,000ns");
 
 static int g_hw_queue_depth = 64;
-module_param_named(hw_queue_depth, g_hw_queue_depth, int, S_IRUGO);
+module_param_named(hw_queue_depth, g_hw_queue_depth, int, 0444);
 MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: 64");
 
 static bool g_use_per_node_hctx;
-module_param_named(use_per_node_hctx, g_use_per_node_hctx, bool, S_IRUGO);
+module_param_named(use_per_node_hctx, g_use_per_node_hctx, bool, 0444);
 MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: false");
 
 static struct nullb_device *null_alloc_dev(void);
@@ -1365,7 +1365,8 @@ static blk_qc_t null_queue_bio(struct request_queue *q, struct bio *bio)
 static enum blk_eh_timer_return null_rq_timed_out_fn(struct request *rq)
 {
 	pr_info("null: rq %p timed out\n", rq);
-	return BLK_EH_HANDLED;
+	blk_mq_complete_request(rq);
+	return BLK_EH_DONE;
 }
 
 static int null_rq_prep_fn(struct request_queue *q, struct request *req)
@@ -1427,7 +1428,8 @@ static void null_request_fn(struct request_queue *q)
 static enum blk_eh_timer_return null_timeout_rq(struct request *rq, bool res)
 {
 	pr_info("null: rq %p timed out\n", rq);
-	return BLK_EH_HANDLED;
+	blk_mq_complete_request(rq);
+	return BLK_EH_DONE;
 }
 
 static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index 27a44b9..8961b190 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -740,7 +740,7 @@ static int pd_special_command(struct pd_unit *disk,
 {
 	struct request *rq;
 
-	rq = blk_get_request(disk->gd->queue, REQ_OP_DRV_IN, __GFP_RECLAIM);
+	rq = blk_get_request(disk->gd->queue, REQ_OP_DRV_IN, 0);
 	if (IS_ERR(rq))
 		return PTR_ERR(rq);
 
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index c61d20c..b3f83cd 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -97,8 +97,8 @@ static int pktdev_major;
 static int write_congestion_on  = PKT_WRITE_CONGESTION_ON;
 static int write_congestion_off = PKT_WRITE_CONGESTION_OFF;
 static struct mutex ctl_mutex;	/* Serialize open/close/setup/teardown */
-static mempool_t *psd_pool;
-static struct bio_set *pkt_bio_set;
+static mempool_t psd_pool;
+static struct bio_set pkt_bio_set;
 
 static struct class	*class_pktcdvd = NULL;    /* /sys/class/pktcdvd */
 static struct dentry	*pkt_debugfs_root = NULL; /* /sys/kernel/debug/pktcdvd */
@@ -478,8 +478,8 @@ static void pkt_debugfs_dev_new(struct pktcdvd_device *pd)
 	if (!pd->dfs_d_root)
 		return;
 
-	pd->dfs_f_info = debugfs_create_file("info", S_IRUGO,
-				pd->dfs_d_root, pd, &debug_fops);
+	pd->dfs_f_info = debugfs_create_file("info", 0444,
+					     pd->dfs_d_root, pd, &debug_fops);
 }
 
 static void pkt_debugfs_dev_remove(struct pktcdvd_device *pd)
@@ -631,7 +631,7 @@ static inline struct pkt_rb_node *pkt_rbtree_next(struct pkt_rb_node *node)
 static void pkt_rbtree_erase(struct pktcdvd_device *pd, struct pkt_rb_node *node)
 {
 	rb_erase(&node->rb_node, &pd->bio_queue);
-	mempool_free(node, pd->rb_pool);
+	mempool_free(node, &pd->rb_pool);
 	pd->bio_queue_size--;
 	BUG_ON(pd->bio_queue_size < 0);
 }
@@ -704,13 +704,13 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *
 	int ret = 0;
 
 	rq = blk_get_request(q, (cgc->data_direction == CGC_DATA_WRITE) ?
-			     REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, __GFP_RECLAIM);
+			     REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, 0);
 	if (IS_ERR(rq))
 		return PTR_ERR(rq);
 
 	if (cgc->buflen) {
 		ret = blk_rq_map_kern(q, rq, cgc->buffer, cgc->buflen,
-				      __GFP_RECLAIM);
+				      GFP_NOIO);
 		if (ret)
 			goto out;
 	}
@@ -1285,7 +1285,7 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt)
 	 * Fill-in bvec with data from orig_bios.
 	 */
 	spin_lock(&pkt->lock);
-	bio_copy_data(pkt->w_bio, pkt->orig_bios.head);
+	bio_list_copy_data(pkt->w_bio, pkt->orig_bios.head);
 
 	pkt_set_state(pkt, PACKET_WRITE_WAIT_STATE);
 	spin_unlock(&pkt->lock);
@@ -2303,14 +2303,14 @@ static void pkt_end_io_read_cloned(struct bio *bio)
 	psd->bio->bi_status = bio->bi_status;
 	bio_put(bio);
 	bio_endio(psd->bio);
-	mempool_free(psd, psd_pool);
+	mempool_free(psd, &psd_pool);
 	pkt_bio_finished(pd);
 }
 
 static void pkt_make_request_read(struct pktcdvd_device *pd, struct bio *bio)
 {
-	struct bio *cloned_bio = bio_clone_fast(bio, GFP_NOIO, pkt_bio_set);
-	struct packet_stacked_data *psd = mempool_alloc(psd_pool, GFP_NOIO);
+	struct bio *cloned_bio = bio_clone_fast(bio, GFP_NOIO, &pkt_bio_set);
+	struct packet_stacked_data *psd = mempool_alloc(&psd_pool, GFP_NOIO);
 
 	psd->pd = pd;
 	psd->bio = bio;
@@ -2381,7 +2381,7 @@ static void pkt_make_request_write(struct request_queue *q, struct bio *bio)
 	/*
 	 * No matching packet found. Store the bio in the work queue.
 	 */
-	node = mempool_alloc(pd->rb_pool, GFP_NOIO);
+	node = mempool_alloc(&pd->rb_pool, GFP_NOIO);
 	node->bio = bio;
 	spin_lock(&pd->lock);
 	BUG_ON(pd->bio_queue_size < 0);
@@ -2451,7 +2451,7 @@ static blk_qc_t pkt_make_request(struct request_queue *q, struct bio *bio)
 
 			split = bio_split(bio, last_zone -
 					  bio->bi_iter.bi_sector,
-					  GFP_NOIO, pkt_bio_set);
+					  GFP_NOIO, &pkt_bio_set);
 			bio_chain(split, bio);
 		} else {
 			split = bio;
@@ -2538,18 +2538,6 @@ static int pkt_seq_show(struct seq_file *m, void *p)
 	return 0;
 }
 
-static int pkt_seq_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, pkt_seq_show, PDE_DATA(inode));
-}
-
-static const struct file_operations pkt_proc_fops = {
-	.open	= pkt_seq_open,
-	.read	= seq_read,
-	.llseek	= seq_lseek,
-	.release = single_release
-};
-
 static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev)
 {
 	int i;
@@ -2604,7 +2592,7 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev)
 		goto out_mem;
 	}
 
-	proc_create_data(pd->name, 0, pkt_proc, &pkt_proc_fops, pd);
+	proc_create_single_data(pd->name, 0, pkt_proc, pkt_seq_show, pd);
 	pkt_dbg(1, pd, "writer mapped to %s\n", bdevname(bdev, b));
 	return 0;
 
@@ -2707,9 +2695,9 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev)
 	if (!pd)
 		goto out_mutex;
 
-	pd->rb_pool = mempool_create_kmalloc_pool(PKT_RB_POOL_SIZE,
-						  sizeof(struct pkt_rb_node));
-	if (!pd->rb_pool)
+	ret = mempool_init_kmalloc_pool(&pd->rb_pool, PKT_RB_POOL_SIZE,
+					sizeof(struct pkt_rb_node));
+	if (ret)
 		goto out_mem;
 
 	INIT_LIST_HEAD(&pd->cdrw.pkt_free_list);
@@ -2766,7 +2754,7 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev)
 out_mem2:
 	put_disk(disk);
 out_mem:
-	mempool_destroy(pd->rb_pool);
+	mempool_exit(&pd->rb_pool);
 	kfree(pd);
 out_mutex:
 	mutex_unlock(&ctl_mutex);
@@ -2817,7 +2805,7 @@ static int pkt_remove_dev(dev_t pkt_dev)
 	blk_cleanup_queue(pd->disk->queue);
 	put_disk(pd->disk);
 
-	mempool_destroy(pd->rb_pool);
+	mempool_exit(&pd->rb_pool);
 	kfree(pd);
 
 	/* This is safe: open() is still holding a reference. */
@@ -2914,14 +2902,14 @@ static int __init pkt_init(void)
 
 	mutex_init(&ctl_mutex);
 
-	psd_pool = mempool_create_kmalloc_pool(PSD_POOL_SIZE,
-					sizeof(struct packet_stacked_data));
-	if (!psd_pool)
-		return -ENOMEM;
-	pkt_bio_set = bioset_create(BIO_POOL_SIZE, 0, 0);
-	if (!pkt_bio_set) {
-		mempool_destroy(psd_pool);
-		return -ENOMEM;
+	ret = mempool_init_kmalloc_pool(&psd_pool, PSD_POOL_SIZE,
+				    sizeof(struct packet_stacked_data));
+	if (ret)
+		return ret;
+	ret = bioset_init(&pkt_bio_set, BIO_POOL_SIZE, 0, 0);
+	if (ret) {
+		mempool_exit(&psd_pool);
+		return ret;
 	}
 
 	ret = register_blkdev(pktdev_major, DRIVER_NAME);
@@ -2954,8 +2942,8 @@ out_misc:
 out:
 	unregister_blkdev(pktdev_major, DRIVER_NAME);
 out2:
-	mempool_destroy(psd_pool);
-	bioset_free(pkt_bio_set);
+	mempool_exit(&psd_pool);
+	bioset_exit(&pkt_bio_set);
 	return ret;
 }
 
@@ -2968,8 +2956,8 @@ static void __exit pkt_exit(void)
 	pkt_sysfs_cleanup();
 
 	unregister_blkdev(pktdev_major, DRIVER_NAME);
-	mempool_destroy(psd_pool);
-	bioset_free(pkt_bio_set);
+	mempool_exit(&psd_pool);
+	bioset_exit(&pkt_bio_set);
 }
 
 MODULE_DESCRIPTION("Packet writing layer for CD/DVD drives");
diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
index 075662f..afe1508 100644
--- a/drivers/block/ps3disk.c
+++ b/drivers/block/ps3disk.c
@@ -465,8 +465,6 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev)
 	priv->queue = queue;
 	queue->queuedata = dev;
 
-	blk_queue_bounce_limit(queue, BLK_BOUNCE_HIGH);
-
 	blk_queue_max_hw_sectors(queue, dev->bounce_size >> 9);
 	blk_queue_segment_boundary(queue, -1UL);
 	blk_queue_dma_alignment(queue, dev->blk_size-1);
diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c
index 6a55959..8fa4533 100644
--- a/drivers/block/ps3vram.c
+++ b/drivers/block/ps3vram.c
@@ -521,26 +521,13 @@ static int ps3vram_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int ps3vram_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, ps3vram_proc_show, PDE_DATA(inode));
-}
-
-static const struct file_operations ps3vram_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= ps3vram_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static void ps3vram_proc_init(struct ps3_system_bus_device *dev)
 {
 	struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
 	struct proc_dir_entry *pde;
 
-	pde = proc_create_data(DEVICE_NAME, 0444, NULL, &ps3vram_proc_fops,
-			       priv);
+	pde = proc_create_single_data(DEVICE_NAME, 0444, NULL,
+			ps3vram_proc_show, priv);
 	if (!pde)
 		dev_warn(&dev->core, "failed to create /proc entry\n");
 }
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 33b36fe..af35404 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -424,7 +424,7 @@ static struct workqueue_struct *rbd_wq;
  * single-major requires >= 0.75 version of userspace rbd utility.
  */
 static bool single_major = true;
-module_param(single_major, bool, S_IRUGO);
+module_param(single_major, bool, 0444);
 MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)");
 
 static ssize_t rbd_add(struct bus_type *bus, const char *buf,
@@ -468,11 +468,11 @@ static ssize_t rbd_supported_features_show(struct bus_type *bus, char *buf)
 	return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED);
 }
 
-static BUS_ATTR(add, S_IWUSR, NULL, rbd_add);
-static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove);
-static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major);
-static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major);
-static BUS_ATTR(supported_features, S_IRUGO, rbd_supported_features_show, NULL);
+static BUS_ATTR(add, 0200, NULL, rbd_add);
+static BUS_ATTR(remove, 0200, NULL, rbd_remove);
+static BUS_ATTR(add_single_major, 0200, NULL, rbd_add_single_major);
+static BUS_ATTR(remove_single_major, 0200, NULL, rbd_remove_single_major);
+static BUS_ATTR(supported_features, 0444, rbd_supported_features_show, NULL);
 
 static struct attribute *rbd_bus_attrs[] = {
 	&bus_attr_add.attr,
@@ -4204,22 +4204,22 @@ static ssize_t rbd_image_refresh(struct device *dev,
 	return size;
 }
 
-static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
-static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
-static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
-static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL);
-static DEVICE_ATTR(client_addr, S_IRUGO, rbd_client_addr_show, NULL);
-static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
-static DEVICE_ATTR(cluster_fsid, S_IRUGO, rbd_cluster_fsid_show, NULL);
-static DEVICE_ATTR(config_info, S_IRUSR, rbd_config_info_show, NULL);
-static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
-static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
-static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
-static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
-static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
-static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
-static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
-static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
+static DEVICE_ATTR(size, 0444, rbd_size_show, NULL);
+static DEVICE_ATTR(features, 0444, rbd_features_show, NULL);
+static DEVICE_ATTR(major, 0444, rbd_major_show, NULL);
+static DEVICE_ATTR(minor, 0444, rbd_minor_show, NULL);
+static DEVICE_ATTR(client_addr, 0444, rbd_client_addr_show, NULL);
+static DEVICE_ATTR(client_id, 0444, rbd_client_id_show, NULL);
+static DEVICE_ATTR(cluster_fsid, 0444, rbd_cluster_fsid_show, NULL);
+static DEVICE_ATTR(config_info, 0400, rbd_config_info_show, NULL);
+static DEVICE_ATTR(pool, 0444, rbd_pool_show, NULL);
+static DEVICE_ATTR(pool_id, 0444, rbd_pool_id_show, NULL);
+static DEVICE_ATTR(name, 0444, rbd_name_show, NULL);
+static DEVICE_ATTR(image_id, 0444, rbd_image_id_show, NULL);
+static DEVICE_ATTR(refresh, 0200, NULL, rbd_image_refresh);
+static DEVICE_ATTR(current_snap, 0444, rbd_snap_show, NULL);
+static DEVICE_ATTR(snap_id, 0444, rbd_snap_id_show, NULL);
+static DEVICE_ATTR(parent, 0444, rbd_parent_show, NULL);
 
 static struct attribute *rbd_attrs[] = {
 	&dev_attr_size.attr,
diff --git a/drivers/block/rsxx/core.c b/drivers/block/rsxx/core.c
index 34997df..09537be 100644
--- a/drivers/block/rsxx/core.c
+++ b/drivers/block/rsxx/core.c
@@ -247,19 +247,19 @@ static void rsxx_debugfs_dev_new(struct rsxx_cardinfo *card)
 	if (IS_ERR_OR_NULL(card->debugfs_dir))
 		goto failed_debugfs_dir;
 
-	debugfs_stats = debugfs_create_file("stats", S_IRUGO,
+	debugfs_stats = debugfs_create_file("stats", 0444,
 					    card->debugfs_dir, card,
 					    &debugfs_stats_fops);
 	if (IS_ERR_OR_NULL(debugfs_stats))
 		goto failed_debugfs_stats;
 
-	debugfs_pci_regs = debugfs_create_file("pci_regs", S_IRUGO,
+	debugfs_pci_regs = debugfs_create_file("pci_regs", 0444,
 					       card->debugfs_dir, card,
 					       &debugfs_pci_regs_fops);
 	if (IS_ERR_OR_NULL(debugfs_pci_regs))
 		goto failed_debugfs_pci_regs;
 
-	debugfs_cram = debugfs_create_file("cram", S_IRUGO | S_IWUSR,
+	debugfs_cram = debugfs_create_file("cram", 0644,
 					   card->debugfs_dir, card,
 					   &debugfs_cram_fops);
 	if (IS_ERR_OR_NULL(debugfs_cram))
diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c
index 08586dc..4d90e5e 100644
--- a/drivers/block/sx8.c
+++ b/drivers/block/sx8.c
@@ -567,7 +567,7 @@ static struct carm_request *carm_get_special(struct carm_host *host)
 	if (!crq)
 		return NULL;
 
-	rq = blk_get_request(host->oob_q, REQ_OP_DRV_OUT, GFP_KERNEL);
+	rq = blk_get_request(host->oob_q, REQ_OP_DRV_OUT, 0);
 	if (IS_ERR(rq)) {
 		spin_lock_irqsave(&host->lock, flags);
 		carm_put_request(host, crq);
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 4a07593c..23752dc 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -298,7 +298,7 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str)
 	struct request *req;
 	int err;
 
-	req = blk_get_request(q, REQ_OP_DRV_IN, GFP_KERNEL);
+	req = blk_get_request(q, REQ_OP_DRV_IN, 0);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
@@ -371,7 +371,7 @@ static ssize_t virtblk_serial_show(struct device *dev,
 	return err;
 }
 
-static DEVICE_ATTR(serial, S_IRUGO, virtblk_serial_show, NULL);
+static DEVICE_ATTR(serial, 0444, virtblk_serial_show, NULL);
 
 /* The queue's logical block size must be set before calling this */
 static void virtblk_update_capacity(struct virtio_blk *vblk, bool resize)
@@ -576,10 +576,10 @@ virtblk_cache_type_show(struct device *dev, struct device_attribute *attr,
 }
 
 static const struct device_attribute dev_attr_cache_type_ro =
-	__ATTR(cache_type, S_IRUGO,
+	__ATTR(cache_type, 0444,
 	       virtblk_cache_type_show, NULL);
 static const struct device_attribute dev_attr_cache_type_rw =
-	__ATTR(cache_type, S_IRUGO|S_IWUSR,
+	__ATTR(cache_type, 0644,
 	       virtblk_cache_type_show, virtblk_cache_type_store);
 
 static int virtblk_init_request(struct blk_mq_tag_set *set, struct request *rq,
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index 987d665..b55b245 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -98,7 +98,7 @@ MODULE_PARM_DESC(max_queues,
  * backend, 4KB page granularity is used.
  */
 unsigned int xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER;
-module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, S_IRUGO);
+module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, 0444);
 MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the shared ring");
 /*
  * The LRU mechanism to clean the lists of persistent grants needs to
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index 21c1be1..66412eed 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -367,7 +367,7 @@ int __init xen_blkif_interface_init(void)
 out:									\
 		return sprintf(buf, format, result);			\
 	}								\
-	static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
+	static DEVICE_ATTR(name, 0444, show_##name, NULL)
 
 VBD_SHOW_ALLRING(oo_req,  "%llu\n");
 VBD_SHOW_ALLRING(rd_req,  "%llu\n");
@@ -403,7 +403,7 @@ static const struct attribute_group xen_vbdstat_group = {
 									\
 		return sprintf(buf, format, ##args);			\
 	}								\
-	static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
+	static DEVICE_ATTR(name, 0444, show_##name, NULL)
 
 VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor);
 VBD_SHOW(mode, "%s\n", be->mode);
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 2a8e781..ae00a82 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -129,13 +129,12 @@ static const struct block_device_operations xlvbd_block_fops;
  */
 
 static unsigned int xen_blkif_max_segments = 32;
-module_param_named(max_indirect_segments, xen_blkif_max_segments, uint,
-		   S_IRUGO);
+module_param_named(max_indirect_segments, xen_blkif_max_segments, uint, 0444);
 MODULE_PARM_DESC(max_indirect_segments,
 		 "Maximum amount of segments in indirect requests (default is 32)");
 
 static unsigned int xen_blkif_max_queues = 4;
-module_param_named(max_queues, xen_blkif_max_queues, uint, S_IRUGO);
+module_param_named(max_queues, xen_blkif_max_queues, uint, 0444);
 MODULE_PARM_DESC(max_queues, "Maximum number of hardware queues/rings used per virtual disk");
 
 /*
@@ -143,7 +142,7 @@ MODULE_PARM_DESC(max_queues, "Maximum number of hardware queues/rings used per v
  * backend, 4KB page granularity is used.
  */
 static unsigned int xen_blkif_max_ring_order;
-module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, S_IRUGO);
+module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, 0444);
 MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the shared ring");
 
 #define BLK_RING_SIZE(info)	\
diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index bfc566d..9adc8c3 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -2192,7 +2192,7 @@ static int cdrom_read_cdda_bpc(struct cdrom_device_info *cdi, __u8 __user *ubuf,
 
 		len = nr * CD_FRAMESIZE_RAW;
 
-		rq = blk_get_request(q, REQ_OP_SCSI_IN, GFP_KERNEL);
+		rq = blk_get_request(q, REQ_OP_SCSI_IN, 0);
 		if (IS_ERR(rq)) {
 			ret = PTR_ERR(rq);
 			break;
diff --git a/drivers/char/apm-emulation.c b/drivers/char/apm-emulation.c
index a5e2f9e..53436c0 100644
--- a/drivers/char/apm-emulation.c
+++ b/drivers/char/apm-emulation.c
@@ -461,19 +461,6 @@ static int proc_apm_show(struct seq_file *m, void *v)
 
 	return 0;
 }
-
-static int proc_apm_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, proc_apm_show, NULL);
-}
-
-static const struct file_operations apm_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= proc_apm_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
 #endif
 
 static int kapmd(void *arg)
@@ -657,7 +644,7 @@ static int __init apm_init(void)
 	wake_up_process(kapmd_tsk);
 
 #ifdef CONFIG_PROC_FS
-	proc_create("apm", 0, NULL, &apm_proc_fops);
+	proc_create_single("apm", 0, NULL, proc_apm_show);
 #endif
 
 	ret = misc_register(&apm_device);
diff --git a/drivers/char/ds1620.c b/drivers/char/ds1620.c
index eb53cba..a5ecf6d 100644
--- a/drivers/char/ds1620.c
+++ b/drivers/char/ds1620.c
@@ -345,18 +345,6 @@ static int ds1620_proc_therm_show(struct seq_file *m, void *v)
 		   fan_state[netwinder_get_fan()]);
 	return 0;
 }
-
-static int ds1620_proc_therm_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, ds1620_proc_therm_show, NULL);
-}
-
-static const struct file_operations ds1620_proc_therm_fops = {
-	.open		= ds1620_proc_therm_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
 #endif
 
 static const struct file_operations ds1620_fops = {
@@ -404,7 +392,7 @@ static int __init ds1620_init(void)
 		return ret;
 
 #ifdef THERM_USE_PROC
-	if (!proc_create("therm", 0, NULL, &ds1620_proc_therm_fops))
+	if (!proc_create_single("therm", 0, NULL, ds1620_proc_therm_show))
 		printk(KERN_ERR "therm: unable to register /proc/therm\n");
 #endif
 
diff --git a/drivers/char/efirtc.c b/drivers/char/efirtc.c
index dc62568..d9aab64 100644
--- a/drivers/char/efirtc.c
+++ b/drivers/char/efirtc.c
@@ -358,19 +358,6 @@ static int efi_rtc_proc_show(struct seq_file *m, void *v)
 
 	return 0;
 }
-
-static int efi_rtc_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, efi_rtc_proc_show, NULL);
-}
-
-static const struct file_operations efi_rtc_proc_fops = {
-	.open		= efi_rtc_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int __init 
 efi_rtc_init(void)
 {
@@ -386,7 +373,7 @@ efi_rtc_init(void)
 		return ret;
 	}
 
-	dir = proc_create("driver/efirtc", 0, NULL, &efi_rtc_proc_fops);
+	dir = proc_create_single("driver/efirtc", 0, NULL, efi_rtc_proc_show);
 	if (dir == NULL) {
 		printk(KERN_ERR "efirtc: can't create /proc/driver/efirtc.\n");
 		misc_deregister(&efi_rtc_dev);
diff --git a/drivers/char/misc.c b/drivers/char/misc.c
index 1bb9e7c..53cfe57 100644
--- a/drivers/char/misc.c
+++ b/drivers/char/misc.c
@@ -95,19 +95,6 @@ static const struct seq_operations misc_seq_ops = {
 	.stop  = misc_seq_stop,
 	.show  = misc_seq_show,
 };
-
-static int misc_seq_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &misc_seq_ops);
-}
-
-static const struct file_operations misc_proc_fops = {
-	.owner	 = THIS_MODULE,
-	.open    = misc_seq_open,
-	.read    = seq_read,
-	.llseek  = seq_lseek,
-	.release = seq_release,
-};
 #endif
 
 static int misc_open(struct inode *inode, struct file *file)
@@ -282,7 +269,7 @@ static int __init misc_init(void)
 	int err;
 	struct proc_dir_entry *ret;
 
-	ret = proc_create("misc", 0, NULL, &misc_proc_fops);
+	ret = proc_create_seq("misc", 0, NULL, &misc_seq_ops);
 	misc_class = class_create(THIS_MODULE, "misc");
 	err = PTR_ERR(misc_class);
 	if (IS_ERR(misc_class))
diff --git a/drivers/char/nvram.c b/drivers/char/nvram.c
index 678fa97..25264d6 100644
--- a/drivers/char/nvram.c
+++ b/drivers/char/nvram.c
@@ -389,22 +389,9 @@ static int nvram_proc_read(struct seq_file *seq, void *offset)
 	return 0;
 }
 
-static int nvram_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, nvram_proc_read, NULL);
-}
-
-static const struct file_operations nvram_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= nvram_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int nvram_add_proc_fs(void)
 {
-	if (!proc_create("driver/nvram", 0, NULL, &nvram_proc_fops))
+	if (!proc_create_single("driver/nvram", 0, NULL, nvram_proc_read))
 		return -ENOMEM;
 	return 0;
 }
diff --git a/drivers/char/pcmcia/synclink_cs.c b/drivers/char/pcmcia/synclink_cs.c
index aa502e9..66b0419 100644
--- a/drivers/char/pcmcia/synclink_cs.c
+++ b/drivers/char/pcmcia/synclink_cs.c
@@ -2616,19 +2616,6 @@ static int mgslpc_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int mgslpc_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, mgslpc_proc_show, NULL);
-}
-
-static const struct file_operations mgslpc_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= mgslpc_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int rx_alloc_buffers(MGSLPC_INFO *info)
 {
 	/* each buffer has header and data */
@@ -2815,7 +2802,7 @@ static const struct tty_operations mgslpc_ops = {
 	.tiocmget = tiocmget,
 	.tiocmset = tiocmset,
 	.get_icount = mgslpc_get_icount,
-	.proc_fops = &mgslpc_proc_fops,
+	.proc_show = mgslpc_proc_show,
 };
 
 static int __init synclink_cs_init(void)
diff --git a/drivers/char/random.c b/drivers/char/random.c
index cd888d4..a8fb002 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -402,8 +402,7 @@ static struct poolinfo {
 /*
  * Static global variables
  */
-static DECLARE_WAIT_QUEUE_HEAD(random_read_wait);
-static DECLARE_WAIT_QUEUE_HEAD(random_write_wait);
+static DECLARE_WAIT_QUEUE_HEAD(random_wait);
 static struct fasync_struct *fasync;
 
 static DEFINE_SPINLOCK(random_ready_list_lock);
@@ -722,8 +721,8 @@ retry:
 
 		/* should we wake readers? */
 		if (entropy_bits >= random_read_wakeup_bits &&
-		    wq_has_sleeper(&random_read_wait)) {
-			wake_up_interruptible(&random_read_wait);
+		    wq_has_sleeper(&random_wait)) {
+			wake_up_interruptible_poll(&random_wait, POLLIN);
 			kill_fasync(&fasync, SIGIO, POLL_IN);
 		}
 		/* If the input pool is getting full, send some
@@ -1397,7 +1396,7 @@ retry:
 	trace_debit_entropy(r->name, 8 * ibytes);
 	if (ibytes &&
 	    (r->entropy_count >> ENTROPY_SHIFT) < random_write_wakeup_bits) {
-		wake_up_interruptible(&random_write_wait);
+		wake_up_interruptible_poll(&random_wait, POLLOUT);
 		kill_fasync(&fasync, SIGIO, POLL_OUT);
 	}
 
@@ -1839,7 +1838,7 @@ _random_read(int nonblock, char __user *buf, size_t nbytes)
 		if (nonblock)
 			return -EAGAIN;
 
-		wait_event_interruptible(random_read_wait,
+		wait_event_interruptible(random_wait,
 			ENTROPY_BITS(&input_pool) >=
 			random_read_wakeup_bits);
 		if (signal_pending(current))
@@ -1876,14 +1875,17 @@ urandom_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
 	return ret;
 }
 
+static struct wait_queue_head *
+random_get_poll_head(struct file *file, __poll_t events)
+{
+	return &random_wait;
+}
+
 static __poll_t
-random_poll(struct file *file, poll_table * wait)
+random_poll_mask(struct file *file, __poll_t events)
 {
-	__poll_t mask;
+	__poll_t mask = 0;
 
-	poll_wait(file, &random_read_wait, wait);
-	poll_wait(file, &random_write_wait, wait);
-	mask = 0;
 	if (ENTROPY_BITS(&input_pool) >= random_read_wakeup_bits)
 		mask |= EPOLLIN | EPOLLRDNORM;
 	if (ENTROPY_BITS(&input_pool) < random_write_wakeup_bits)
@@ -1990,7 +1992,8 @@ static int random_fasync(int fd, struct file *filp, int on)
 const struct file_operations random_fops = {
 	.read  = random_read,
 	.write = random_write,
-	.poll  = random_poll,
+	.get_poll_head  = random_get_poll_head,
+	.poll_mask  = random_poll_mask,
 	.unlocked_ioctl = random_ioctl,
 	.fasync = random_fasync,
 	.llseek = noop_llseek,
@@ -2323,7 +2326,7 @@ void add_hwgenerator_randomness(const char *buffer, size_t count,
 	 * We'll be woken up again once below random_write_wakeup_thresh,
 	 * or when the calling thread is about to terminate.
 	 */
-	wait_event_interruptible(random_write_wait, kthread_should_stop() ||
+	wait_event_interruptible(random_wait, kthread_should_stop() ||
 			ENTROPY_BITS(&input_pool) <= random_write_wakeup_bits);
 	mix_pool_bytes(poolp, buffer, count);
 	credit_entropy_bits(poolp, entropy);
diff --git a/drivers/char/rtc.c b/drivers/char/rtc.c
index 57dc546..94fedee 100644
--- a/drivers/char/rtc.c
+++ b/drivers/char/rtc.c
@@ -171,7 +171,7 @@ static void mask_rtc_irq_bit(unsigned char bit)
 #endif
 
 #ifdef CONFIG_PROC_FS
-static int rtc_proc_open(struct inode *inode, struct file *file);
+static int rtc_proc_show(struct seq_file *seq, void *v);
 #endif
 
 /*
@@ -832,16 +832,6 @@ static struct miscdevice rtc_dev = {
 	.fops		= &rtc_fops,
 };
 
-#ifdef CONFIG_PROC_FS
-static const struct file_operations rtc_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= rtc_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-#endif
-
 static resource_size_t rtc_size;
 
 static struct resource * __init rtc_request_region(resource_size_t size)
@@ -982,7 +972,7 @@ no_irq:
 	}
 
 #ifdef CONFIG_PROC_FS
-	ent = proc_create("driver/rtc", 0, NULL, &rtc_proc_fops);
+	ent = proc_create_single("driver/rtc", 0, NULL, rtc_proc_show);
 	if (!ent)
 		printk(KERN_WARNING "rtc: Failed to register with procfs.\n");
 #endif
@@ -1201,11 +1191,6 @@ static int rtc_proc_show(struct seq_file *seq, void *v)
 #undef YN
 #undef NY
 }
-
-static int rtc_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, rtc_proc_show, NULL);
-}
 #endif
 
 static void rtc_get_rtc_time(struct rtc_time *rtc_tm)
diff --git a/drivers/char/toshiba.c b/drivers/char/toshiba.c
index 5488516..802376f 100644
--- a/drivers/char/toshiba.c
+++ b/drivers/char/toshiba.c
@@ -326,19 +326,6 @@ static int proc_toshiba_show(struct seq_file *m, void *v)
 		key);
 	return 0;
 }
-
-static int proc_toshiba_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, proc_toshiba_show, NULL);
-}
-
-static const struct file_operations proc_toshiba_fops = {
-	.owner		= THIS_MODULE,
-	.open		= proc_toshiba_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
 #endif
 
 
@@ -524,7 +511,7 @@ static int __init toshiba_init(void)
 	{
 		struct proc_dir_entry *pde;
 
-		pde = proc_create("toshiba", 0, NULL, &proc_toshiba_fops);
+		pde = proc_create_single("toshiba", 0, NULL, proc_toshiba_show);
 		if (!pde) {
 			misc_deregister(&tosh_device);
 			return -ENOMEM;
diff --git a/drivers/connector/connector.c b/drivers/connector/connector.c
index 8615594b..e718b8c 100644
--- a/drivers/connector/connector.c
+++ b/drivers/connector/connector.c
@@ -260,19 +260,6 @@ static int cn_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int cn_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, cn_proc_show, NULL);
-}
-
-static const struct file_operations cn_file_ops = {
-	.owner   = THIS_MODULE,
-	.open    = cn_proc_open,
-	.read    = seq_read,
-	.llseek  = seq_lseek,
-	.release = single_release
-};
-
 static struct cn_dev cdev = {
 	.input   = cn_rx_skb,
 };
@@ -297,7 +284,7 @@ static int cn_init(void)
 
 	cn_already_initialized = 1;
 
-	proc_create("connector", S_IRUGO, init_net.proc_net, &cn_file_ops);
+	proc_create_single("connector", S_IRUGO, init_net.proc_net, cn_proc_show);
 
 	return 0;
 }
diff --git a/drivers/dma/qcom/hidma_mgmt.c b/drivers/dma/qcom/hidma_mgmt.c
index 000c7019..d64edeb 100644
--- a/drivers/dma/qcom/hidma_mgmt.c
+++ b/drivers/dma/qcom/hidma_mgmt.c
@@ -398,7 +398,7 @@ static int __init hidma_mgmt_of_populate_channels(struct device_node *np)
 		}
 		of_node_get(child);
 		new_pdev->dev.of_node = child;
-		of_dma_configure(&new_pdev->dev, child);
+		of_dma_configure(&new_pdev->dev, child, true);
 		/*
 		 * It is assumed that calling of_msi_configure is safe on
 		 * platforms with or without MSI support.
diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig
index 3098410..781a4a3 100644
--- a/drivers/firmware/efi/Kconfig
+++ b/drivers/firmware/efi/Kconfig
@@ -174,6 +174,11 @@ config UEFI_CPER_ARM
 	depends on UEFI_CPER && ( ARM || ARM64 )
 	default y
 
+config UEFI_CPER_X86
+	bool
+	depends on UEFI_CPER && X86
+	default y
+
 config EFI_DEV_PATH_PARSER
 	bool
 	depends on ACPI
diff --git a/drivers/firmware/efi/Makefile b/drivers/firmware/efi/Makefile
index cb80537..5f9f503 100644
--- a/drivers/firmware/efi/Makefile
+++ b/drivers/firmware/efi/Makefile
@@ -31,3 +31,4 @@ obj-$(CONFIG_ARM)			+= $(arm-obj-y)
 obj-$(CONFIG_ARM64)			+= $(arm-obj-y)
 obj-$(CONFIG_EFI_CAPSULE_LOADER)	+= capsule-loader.o
 obj-$(CONFIG_UEFI_CPER_ARM)		+= cper-arm.o
+obj-$(CONFIG_UEFI_CPER_X86)		+= cper-x86.o
diff --git a/drivers/firmware/efi/capsule-loader.c b/drivers/firmware/efi/capsule-loader.c
index e456f46..9668898 100644
--- a/drivers/firmware/efi/capsule-loader.c
+++ b/drivers/firmware/efi/capsule-loader.c
@@ -134,10 +134,16 @@ static ssize_t efi_capsule_submit_update(struct capsule_info *cap_info)
 
 	/* Indicate capsule binary uploading is done */
 	cap_info->index = NO_FURTHER_WRITE_ACTION;
-	pr_info("Successfully upload capsule file with reboot type '%s'\n",
-		!cap_info->reset_type ? "RESET_COLD" :
-		cap_info->reset_type == 1 ? "RESET_WARM" :
-		"RESET_SHUTDOWN");
+
+	if (cap_info->header.flags & EFI_CAPSULE_PERSIST_ACROSS_RESET) {
+		pr_info("Successfully uploaded capsule file with reboot type '%s'\n",
+			!cap_info->reset_type ? "RESET_COLD" :
+			cap_info->reset_type == 1 ? "RESET_WARM" :
+			"RESET_SHUTDOWN");
+	} else {
+		pr_info("Successfully processed capsule file\n");
+	}
+
 	return 0;
 }
 
diff --git a/drivers/firmware/efi/cper-arm.c b/drivers/firmware/efi/cper-arm.c
index 698e5c8..5028113 100644
--- a/drivers/firmware/efi/cper-arm.c
+++ b/drivers/firmware/efi/cper-arm.c
@@ -30,8 +30,6 @@
 #include <acpi/ghes.h>
 #include <ras/ras_event.h>
 
-#define INDENT_SP	" "
-
 static const char * const arm_reg_ctx_strs[] = {
 	"AArch32 general purpose registers",
 	"AArch32 EL1 context registers",
@@ -283,7 +281,7 @@ void cper_print_proc_arm(const char *pfx,
 			pfx, proc->psci_state);
 	}
 
-	snprintf(newpfx, sizeof(newpfx), "%s%s", pfx, INDENT_SP);
+	snprintf(newpfx, sizeof(newpfx), "%s ", pfx);
 
 	err_info = (struct cper_arm_err_info *)(proc + 1);
 	for (i = 0; i < proc->err_info_num; i++) {
@@ -310,7 +308,7 @@ void cper_print_proc_arm(const char *pfx,
 		if (err_info->validation_bits & CPER_ARM_INFO_VALID_ERR_INFO) {
 			printk("%serror_info: 0x%016llx\n", newpfx,
 			       err_info->error_info);
-			snprintf(infopfx, sizeof(infopfx), "%s%s", newpfx, INDENT_SP);
+			snprintf(infopfx, sizeof(infopfx), "%s ", newpfx);
 			cper_print_arm_err_info(infopfx, err_info->type,
 						err_info->error_info);
 		}
diff --git a/drivers/firmware/efi/cper-x86.c b/drivers/firmware/efi/cper-x86.c
new file mode 100644
index 0000000..2531de4
--- /dev/null
+++ b/drivers/firmware/efi/cper-x86.c
@@ -0,0 +1,356 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2018, Advanced Micro Devices, Inc.
+
+#include <linux/cper.h>
+
+/*
+ * We don't need a "CPER_IA" prefix since these are all locally defined.
+ * This will save us a lot of line space.
+ */
+#define VALID_LAPIC_ID			BIT_ULL(0)
+#define VALID_CPUID_INFO		BIT_ULL(1)
+#define VALID_PROC_ERR_INFO_NUM(bits)	(((bits) & GENMASK_ULL(7, 2)) >> 2)
+#define VALID_PROC_CXT_INFO_NUM(bits)	(((bits) & GENMASK_ULL(13, 8)) >> 8)
+
+#define INFO_ERR_STRUCT_TYPE_CACHE					\
+	GUID_INIT(0xA55701F5, 0xE3EF, 0x43DE, 0xAC, 0x72, 0x24, 0x9B,	\
+		  0x57, 0x3F, 0xAD, 0x2C)
+#define INFO_ERR_STRUCT_TYPE_TLB					\
+	GUID_INIT(0xFC06B535, 0x5E1F, 0x4562, 0x9F, 0x25, 0x0A, 0x3B,	\
+		  0x9A, 0xDB, 0x63, 0xC3)
+#define INFO_ERR_STRUCT_TYPE_BUS					\
+	GUID_INIT(0x1CF3F8B3, 0xC5B1, 0x49a2, 0xAA, 0x59, 0x5E, 0xEF,	\
+		  0x92, 0xFF, 0xA6, 0x3C)
+#define INFO_ERR_STRUCT_TYPE_MS						\
+	GUID_INIT(0x48AB7F57, 0xDC34, 0x4f6c, 0xA7, 0xD3, 0xB0, 0xB5,	\
+		  0xB0, 0xA7, 0x43, 0x14)
+
+#define INFO_VALID_CHECK_INFO		BIT_ULL(0)
+#define INFO_VALID_TARGET_ID		BIT_ULL(1)
+#define INFO_VALID_REQUESTOR_ID		BIT_ULL(2)
+#define INFO_VALID_RESPONDER_ID		BIT_ULL(3)
+#define INFO_VALID_IP			BIT_ULL(4)
+
+#define CHECK_VALID_TRANS_TYPE		BIT_ULL(0)
+#define CHECK_VALID_OPERATION		BIT_ULL(1)
+#define CHECK_VALID_LEVEL		BIT_ULL(2)
+#define CHECK_VALID_PCC			BIT_ULL(3)
+#define CHECK_VALID_UNCORRECTED		BIT_ULL(4)
+#define CHECK_VALID_PRECISE_IP		BIT_ULL(5)
+#define CHECK_VALID_RESTARTABLE_IP	BIT_ULL(6)
+#define CHECK_VALID_OVERFLOW		BIT_ULL(7)
+
+#define CHECK_VALID_BUS_PART_TYPE	BIT_ULL(8)
+#define CHECK_VALID_BUS_TIME_OUT	BIT_ULL(9)
+#define CHECK_VALID_BUS_ADDR_SPACE	BIT_ULL(10)
+
+#define CHECK_VALID_BITS(check)		(((check) & GENMASK_ULL(15, 0)))
+#define CHECK_TRANS_TYPE(check)		(((check) & GENMASK_ULL(17, 16)) >> 16)
+#define CHECK_OPERATION(check)		(((check) & GENMASK_ULL(21, 18)) >> 18)
+#define CHECK_LEVEL(check)		(((check) & GENMASK_ULL(24, 22)) >> 22)
+#define CHECK_PCC			BIT_ULL(25)
+#define CHECK_UNCORRECTED		BIT_ULL(26)
+#define CHECK_PRECISE_IP		BIT_ULL(27)
+#define CHECK_RESTARTABLE_IP		BIT_ULL(28)
+#define CHECK_OVERFLOW			BIT_ULL(29)
+
+#define CHECK_BUS_PART_TYPE(check)	(((check) & GENMASK_ULL(31, 30)) >> 30)
+#define CHECK_BUS_TIME_OUT		BIT_ULL(32)
+#define CHECK_BUS_ADDR_SPACE(check)	(((check) & GENMASK_ULL(34, 33)) >> 33)
+
+#define CHECK_VALID_MS_ERR_TYPE		BIT_ULL(0)
+#define CHECK_VALID_MS_PCC		BIT_ULL(1)
+#define CHECK_VALID_MS_UNCORRECTED	BIT_ULL(2)
+#define CHECK_VALID_MS_PRECISE_IP	BIT_ULL(3)
+#define CHECK_VALID_MS_RESTARTABLE_IP	BIT_ULL(4)
+#define CHECK_VALID_MS_OVERFLOW		BIT_ULL(5)
+
+#define CHECK_MS_ERR_TYPE(check)	(((check) & GENMASK_ULL(18, 16)) >> 16)
+#define CHECK_MS_PCC			BIT_ULL(19)
+#define CHECK_MS_UNCORRECTED		BIT_ULL(20)
+#define CHECK_MS_PRECISE_IP		BIT_ULL(21)
+#define CHECK_MS_RESTARTABLE_IP		BIT_ULL(22)
+#define CHECK_MS_OVERFLOW		BIT_ULL(23)
+
+#define CTX_TYPE_MSR			1
+#define CTX_TYPE_MMREG			7
+
+enum err_types {
+	ERR_TYPE_CACHE = 0,
+	ERR_TYPE_TLB,
+	ERR_TYPE_BUS,
+	ERR_TYPE_MS,
+	N_ERR_TYPES
+};
+
+static enum err_types cper_get_err_type(const guid_t *err_type)
+{
+	if (guid_equal(err_type, &INFO_ERR_STRUCT_TYPE_CACHE))
+		return ERR_TYPE_CACHE;
+	else if (guid_equal(err_type, &INFO_ERR_STRUCT_TYPE_TLB))
+		return ERR_TYPE_TLB;
+	else if (guid_equal(err_type, &INFO_ERR_STRUCT_TYPE_BUS))
+		return ERR_TYPE_BUS;
+	else if (guid_equal(err_type, &INFO_ERR_STRUCT_TYPE_MS))
+		return ERR_TYPE_MS;
+	else
+		return N_ERR_TYPES;
+}
+
+static const char * const ia_check_trans_type_strs[] = {
+	"Instruction",
+	"Data Access",
+	"Generic",
+};
+
+static const char * const ia_check_op_strs[] = {
+	"generic error",
+	"generic read",
+	"generic write",
+	"data read",
+	"data write",
+	"instruction fetch",
+	"prefetch",
+	"eviction",
+	"snoop",
+};
+
+static const char * const ia_check_bus_part_type_strs[] = {
+	"Local Processor originated request",
+	"Local Processor responded to request",
+	"Local Processor observed",
+	"Generic",
+};
+
+static const char * const ia_check_bus_addr_space_strs[] = {
+	"Memory Access",
+	"Reserved",
+	"I/O",
+	"Other Transaction",
+};
+
+static const char * const ia_check_ms_error_type_strs[] = {
+	"No Error",
+	"Unclassified",
+	"Microcode ROM Parity Error",
+	"External Error",
+	"FRC Error",
+	"Internal Unclassified",
+};
+
+static const char * const ia_reg_ctx_strs[] = {
+	"Unclassified Data",
+	"MSR Registers (Machine Check and other MSRs)",
+	"32-bit Mode Execution Context",
+	"64-bit Mode Execution Context",
+	"FXSAVE Context",
+	"32-bit Mode Debug Registers (DR0-DR7)",
+	"64-bit Mode Debug Registers (DR0-DR7)",
+	"Memory Mapped Registers",
+};
+
+static inline void print_bool(char *str, const char *pfx, u64 check, u64 bit)
+{
+	printk("%s%s: %s\n", pfx, str, (check & bit) ? "true" : "false");
+}
+
+static void print_err_info_ms(const char *pfx, u16 validation_bits, u64 check)
+{
+	if (validation_bits & CHECK_VALID_MS_ERR_TYPE) {
+		u8 err_type = CHECK_MS_ERR_TYPE(check);
+
+		printk("%sError Type: %u, %s\n", pfx, err_type,
+		       err_type < ARRAY_SIZE(ia_check_ms_error_type_strs) ?
+		       ia_check_ms_error_type_strs[err_type] : "unknown");
+	}
+
+	if (validation_bits & CHECK_VALID_MS_PCC)
+		print_bool("Processor Context Corrupt", pfx, check, CHECK_MS_PCC);
+
+	if (validation_bits & CHECK_VALID_MS_UNCORRECTED)
+		print_bool("Uncorrected", pfx, check, CHECK_MS_UNCORRECTED);
+
+	if (validation_bits & CHECK_VALID_MS_PRECISE_IP)
+		print_bool("Precise IP", pfx, check, CHECK_MS_PRECISE_IP);
+
+	if (validation_bits & CHECK_VALID_MS_RESTARTABLE_IP)
+		print_bool("Restartable IP", pfx, check, CHECK_MS_RESTARTABLE_IP);
+
+	if (validation_bits & CHECK_VALID_MS_OVERFLOW)
+		print_bool("Overflow", pfx, check, CHECK_MS_OVERFLOW);
+}
+
+static void print_err_info(const char *pfx, u8 err_type, u64 check)
+{
+	u16 validation_bits = CHECK_VALID_BITS(check);
+
+	/*
+	 * The MS Check structure varies a lot from the others, so use a
+	 * separate function for decoding.
+	 */
+	if (err_type == ERR_TYPE_MS)
+		return print_err_info_ms(pfx, validation_bits, check);
+
+	if (validation_bits & CHECK_VALID_TRANS_TYPE) {
+		u8 trans_type = CHECK_TRANS_TYPE(check);
+
+		printk("%sTransaction Type: %u, %s\n", pfx, trans_type,
+		       trans_type < ARRAY_SIZE(ia_check_trans_type_strs) ?
+		       ia_check_trans_type_strs[trans_type] : "unknown");
+	}
+
+	if (validation_bits & CHECK_VALID_OPERATION) {
+		u8 op = CHECK_OPERATION(check);
+
+		/*
+		 * CACHE has more operation types than TLB or BUS, though the
+		 * name and the order are the same.
+		 */
+		u8 max_ops = (err_type == ERR_TYPE_CACHE) ? 9 : 7;
+
+		printk("%sOperation: %u, %s\n", pfx, op,
+		       op < max_ops ? ia_check_op_strs[op] : "unknown");
+	}
+
+	if (validation_bits & CHECK_VALID_LEVEL)
+		printk("%sLevel: %llu\n", pfx, CHECK_LEVEL(check));
+
+	if (validation_bits & CHECK_VALID_PCC)
+		print_bool("Processor Context Corrupt", pfx, check, CHECK_PCC);
+
+	if (validation_bits & CHECK_VALID_UNCORRECTED)
+		print_bool("Uncorrected", pfx, check, CHECK_UNCORRECTED);
+
+	if (validation_bits & CHECK_VALID_PRECISE_IP)
+		print_bool("Precise IP", pfx, check, CHECK_PRECISE_IP);
+
+	if (validation_bits & CHECK_VALID_RESTARTABLE_IP)
+		print_bool("Restartable IP", pfx, check, CHECK_RESTARTABLE_IP);
+
+	if (validation_bits & CHECK_VALID_OVERFLOW)
+		print_bool("Overflow", pfx, check, CHECK_OVERFLOW);
+
+	if (err_type != ERR_TYPE_BUS)
+		return;
+
+	if (validation_bits & CHECK_VALID_BUS_PART_TYPE) {
+		u8 part_type = CHECK_BUS_PART_TYPE(check);
+
+		printk("%sParticipation Type: %u, %s\n", pfx, part_type,
+		       part_type < ARRAY_SIZE(ia_check_bus_part_type_strs) ?
+		       ia_check_bus_part_type_strs[part_type] : "unknown");
+	}
+
+	if (validation_bits & CHECK_VALID_BUS_TIME_OUT)
+		print_bool("Time Out", pfx, check, CHECK_BUS_TIME_OUT);
+
+	if (validation_bits & CHECK_VALID_BUS_ADDR_SPACE) {
+		u8 addr_space = CHECK_BUS_ADDR_SPACE(check);
+
+		printk("%sAddress Space: %u, %s\n", pfx, addr_space,
+		       addr_space < ARRAY_SIZE(ia_check_bus_addr_space_strs) ?
+		       ia_check_bus_addr_space_strs[addr_space] : "unknown");
+	}
+}
+
+void cper_print_proc_ia(const char *pfx, const struct cper_sec_proc_ia *proc)
+{
+	int i;
+	struct cper_ia_err_info *err_info;
+	struct cper_ia_proc_ctx *ctx_info;
+	char newpfx[64], infopfx[64];
+	u8 err_type;
+
+	if (proc->validation_bits & VALID_LAPIC_ID)
+		printk("%sLocal APIC_ID: 0x%llx\n", pfx, proc->lapic_id);
+
+	if (proc->validation_bits & VALID_CPUID_INFO) {
+		printk("%sCPUID Info:\n", pfx);
+		print_hex_dump(pfx, "", DUMP_PREFIX_OFFSET, 16, 4, proc->cpuid,
+			       sizeof(proc->cpuid), 0);
+	}
+
+	snprintf(newpfx, sizeof(newpfx), "%s ", pfx);
+
+	err_info = (struct cper_ia_err_info *)(proc + 1);
+	for (i = 0; i < VALID_PROC_ERR_INFO_NUM(proc->validation_bits); i++) {
+		printk("%sError Information Structure %d:\n", pfx, i);
+
+		err_type = cper_get_err_type(&err_info->err_type);
+		printk("%sError Structure Type: %s\n", newpfx,
+		       err_type < ARRAY_SIZE(cper_proc_error_type_strs) ?
+		       cper_proc_error_type_strs[err_type] : "unknown");
+
+		if (err_type >= N_ERR_TYPES) {
+			printk("%sError Structure Type: %pUl\n", newpfx,
+			       &err_info->err_type);
+		}
+
+		if (err_info->validation_bits & INFO_VALID_CHECK_INFO) {
+			printk("%sCheck Information: 0x%016llx\n", newpfx,
+			       err_info->check_info);
+
+			if (err_type < N_ERR_TYPES) {
+				snprintf(infopfx, sizeof(infopfx), "%s ",
+					 newpfx);
+
+				print_err_info(infopfx, err_type,
+					       err_info->check_info);
+			}
+		}
+
+		if (err_info->validation_bits & INFO_VALID_TARGET_ID) {
+			printk("%sTarget Identifier: 0x%016llx\n",
+			       newpfx, err_info->target_id);
+		}
+
+		if (err_info->validation_bits & INFO_VALID_REQUESTOR_ID) {
+			printk("%sRequestor Identifier: 0x%016llx\n",
+			       newpfx, err_info->requestor_id);
+		}
+
+		if (err_info->validation_bits & INFO_VALID_RESPONDER_ID) {
+			printk("%sResponder Identifier: 0x%016llx\n",
+			       newpfx, err_info->responder_id);
+		}
+
+		if (err_info->validation_bits & INFO_VALID_IP) {
+			printk("%sInstruction Pointer: 0x%016llx\n",
+			       newpfx, err_info->ip);
+		}
+
+		err_info++;
+	}
+
+	ctx_info = (struct cper_ia_proc_ctx *)err_info;
+	for (i = 0; i < VALID_PROC_CXT_INFO_NUM(proc->validation_bits); i++) {
+		int size = sizeof(*ctx_info) + ctx_info->reg_arr_size;
+		int groupsize = 4;
+
+		printk("%sContext Information Structure %d:\n", pfx, i);
+
+		printk("%sRegister Context Type: %s\n", newpfx,
+		       ctx_info->reg_ctx_type < ARRAY_SIZE(ia_reg_ctx_strs) ?
+		       ia_reg_ctx_strs[ctx_info->reg_ctx_type] : "unknown");
+
+		printk("%sRegister Array Size: 0x%04x\n", newpfx,
+		       ctx_info->reg_arr_size);
+
+		if (ctx_info->reg_ctx_type == CTX_TYPE_MSR) {
+			groupsize = 8; /* MSRs are 8 bytes wide. */
+			printk("%sMSR Address: 0x%08x\n", newpfx,
+			       ctx_info->msr_addr);
+		}
+
+		if (ctx_info->reg_ctx_type == CTX_TYPE_MMREG) {
+			printk("%sMM Register Address: 0x%016llx\n", newpfx,
+			       ctx_info->mm_reg_addr);
+		}
+
+		printk("%sRegister Array:\n", newpfx);
+		print_hex_dump(newpfx, "", DUMP_PREFIX_OFFSET, 16, groupsize,
+			       (ctx_info + 1), ctx_info->reg_arr_size, 0);
+
+		ctx_info = (struct cper_ia_proc_ctx *)((long)ctx_info + size);
+	}
+}
diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c
index c165933..3bf0dca 100644
--- a/drivers/firmware/efi/cper.c
+++ b/drivers/firmware/efi/cper.c
@@ -37,8 +37,6 @@
 #include <acpi/ghes.h>
 #include <ras/ras_event.h>
 
-#define INDENT_SP	" "
-
 static char rcd_decode_str[CPER_REC_LEN];
 
 /*
@@ -433,7 +431,7 @@ cper_estatus_print_section(const char *pfx, struct acpi_hest_generic_data *gdata
 	if (gdata->validation_bits & CPER_SEC_VALID_FRU_TEXT)
 		printk("%s""fru_text: %.20s\n", pfx, gdata->fru_text);
 
-	snprintf(newpfx, sizeof(newpfx), "%s%s", pfx, INDENT_SP);
+	snprintf(newpfx, sizeof(newpfx), "%s ", pfx);
 	if (guid_equal(sec_type, &CPER_SEC_PROC_GENERIC)) {
 		struct cper_sec_proc_generic *proc_err = acpi_hest_get_payload(gdata);
 
@@ -470,6 +468,16 @@ cper_estatus_print_section(const char *pfx, struct acpi_hest_generic_data *gdata
 		else
 			goto err_section_too_small;
 #endif
+#if defined(CONFIG_UEFI_CPER_X86)
+	} else if (guid_equal(sec_type, &CPER_SEC_PROC_IA)) {
+		struct cper_sec_proc_ia *ia_err = acpi_hest_get_payload(gdata);
+
+		printk("%ssection_type: IA32/X64 processor error\n", newpfx);
+		if (gdata->error_data_length >= sizeof(*ia_err))
+			cper_print_proc_ia(newpfx, ia_err);
+		else
+			goto err_section_too_small;
+#endif
 	} else {
 		const void *err = acpi_hest_get_payload(gdata);
 
@@ -500,7 +508,7 @@ void cper_estatus_print(const char *pfx,
 		       "It has been corrected by h/w "
 		       "and requires no further action");
 	printk("%s""event severity: %s\n", pfx, cper_severity_str(severity));
-	snprintf(newpfx, sizeof(newpfx), "%s%s", pfx, INDENT_SP);
+	snprintf(newpfx, sizeof(newpfx), "%s ", pfx);
 
 	apei_estatus_for_each_section(estatus, gdata) {
 		cper_estatus_print_section(newpfx, gdata, sec_no);
diff --git a/drivers/firmware/efi/libstub/secureboot.c b/drivers/firmware/efi/libstub/secureboot.c
index 8f07eb4..72d9dfb 100644
--- a/drivers/firmware/efi/libstub/secureboot.c
+++ b/drivers/firmware/efi/libstub/secureboot.c
@@ -30,6 +30,9 @@ static const efi_char16_t shim_MokSBState_name[] = L"MokSBState";
 
 /*
  * Determine whether we're in secure boot mode.
+ *
+ * Please keep the logic in sync with
+ * arch/x86/xen/efi.c:xen_efi_get_secureboot().
  */
 enum efi_secureboot_mode efi_get_secureboot(efi_system_table_t *sys_table_arg)
 {
diff --git a/drivers/firmware/efi/libstub/tpm.c b/drivers/firmware/efi/libstub/tpm.c
index 9d08cea..caa37a6 100644
--- a/drivers/firmware/efi/libstub/tpm.c
+++ b/drivers/firmware/efi/libstub/tpm.c
@@ -59,7 +59,7 @@ void efi_enable_reset_attack_mitigation(efi_system_table_t *sys_table_arg)
 
 #endif
 
-void efi_retrieve_tpm2_eventlog_1_2(efi_system_table_t *sys_table_arg)
+static void efi_retrieve_tpm2_eventlog_1_2(efi_system_table_t *sys_table_arg)
 {
 	efi_guid_t tcg2_guid = EFI_TCG2_PROTOCOL_GUID;
 	efi_guid_t linux_eventlog_guid = LINUX_EFI_TPM_EVENT_LOG_GUID;
diff --git a/drivers/gpu/host1x/bus.c b/drivers/gpu/host1x/bus.c
index 88a3558..815bdb4 100644
--- a/drivers/gpu/host1x/bus.c
+++ b/drivers/gpu/host1x/bus.c
@@ -314,6 +314,11 @@ static int host1x_device_match(struct device *dev, struct device_driver *drv)
 	return strcmp(dev_name(dev), drv->name) == 0;
 }
 
+static int host1x_dma_configure(struct device *dev)
+{
+	return of_dma_configure(dev, dev->of_node, true);
+}
+
 static const struct dev_pm_ops host1x_device_pm_ops = {
 	.suspend = pm_generic_suspend,
 	.resume = pm_generic_resume,
@@ -326,8 +331,8 @@ static const struct dev_pm_ops host1x_device_pm_ops = {
 struct bus_type host1x_bus_type = {
 	.name = "host1x",
 	.match = host1x_device_match,
+	.dma_configure	= host1x_dma_configure,
 	.pm = &host1x_device_pm_ops,
-	.force_dma = true,
 };
 
 static void __host1x_device_del(struct host1x_device *device)
@@ -416,7 +421,7 @@ static int host1x_device_add(struct host1x *host1x,
 	device->dev.bus = &host1x_bus_type;
 	device->dev.parent = host1x->dev;
 
-	of_dma_configure(&device->dev, host1x->dev->of_node);
+	of_dma_configure(&device->dev, host1x->dev->of_node, true);
 
 	err = host1x_device_parse_dt(device, driver);
 	if (err < 0) {
diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig
index 6ec307c..f10840a 100644
--- a/drivers/hwmon/Kconfig
+++ b/drivers/hwmon/Kconfig
@@ -717,15 +717,12 @@ config SENSORS_LTC2945
 	  be called ltc2945.
 
 config SENSORS_LTC2990
-	tristate "Linear Technology LTC2990 (current monitoring mode only)"
+	tristate "Linear Technology LTC2990"
 	depends on I2C
 	help
 	  If you say yes here you get support for Linear Technology LTC2990
 	  I2C System Monitor. The LTC2990 supports a combination of voltage,
-	  current and temperature monitoring, but in addition to the Vcc supply
-	  voltage and chip temperature, this driver currently only supports
-	  reading two currents by measuring two differential voltages across
-	  series resistors.
+	  current and temperature monitoring.
 
 	  This driver can also be built as a module. If so, the module will
 	  be called ltc2990.
diff --git a/drivers/hwmon/asus_atk0110.c b/drivers/hwmon/asus_atk0110.c
index 975c43d..a6636fe 100644
--- a/drivers/hwmon/asus_atk0110.c
+++ b/drivers/hwmon/asus_atk0110.c
@@ -125,6 +125,8 @@ struct atk_data {
 	int temperature_count;
 	int fan_count;
 	struct list_head sensor_list;
+	struct attribute_group attr_group;
+	const struct attribute_group *attr_groups[2];
 
 	struct {
 		struct dentry *root;
@@ -188,7 +190,6 @@ static int atk_add(struct acpi_device *device);
 static int atk_remove(struct acpi_device *device);
 static void atk_print_sensor(struct atk_data *data, union acpi_object *obj);
 static int atk_read_value(struct atk_sensor_data *sensor, u64 *value);
-static void atk_free_sensors(struct atk_data *data);
 
 static struct acpi_driver atk_driver = {
 	.name	= ATK_HID,
@@ -262,14 +263,6 @@ static ssize_t atk_limit2_show(struct device *dev,
 	return sprintf(buf, "%lld\n", value);
 }
 
-static ssize_t atk_name_show(struct device *dev,
-				struct device_attribute *attr, char *buf)
-{
-	return sprintf(buf, "atk0110\n");
-}
-static struct device_attribute atk_name_attr =
-		__ATTR(name, 0444, atk_name_show, NULL);
-
 static void atk_init_attribute(struct device_attribute *attr, char *name,
 		sysfs_show_func show)
 {
@@ -912,15 +905,13 @@ static int atk_add_sensor(struct atk_data *data, union acpi_object *obj)
 	limit1 = atk_get_pack_member(data, obj, HWMON_PACK_LIMIT1);
 	limit2 = atk_get_pack_member(data, obj, HWMON_PACK_LIMIT2);
 
-	sensor = kzalloc(sizeof(*sensor), GFP_KERNEL);
+	sensor = devm_kzalloc(dev, sizeof(*sensor), GFP_KERNEL);
 	if (!sensor)
 		return -ENOMEM;
 
-	sensor->acpi_name = kstrdup(name->string.pointer, GFP_KERNEL);
-	if (!sensor->acpi_name) {
-		err = -ENOMEM;
-		goto out;
-	}
+	sensor->acpi_name = devm_kstrdup(dev, name->string.pointer, GFP_KERNEL);
+	if (!sensor->acpi_name)
+		return -ENOMEM;
 
 	INIT_LIST_HEAD(&sensor->list);
 	sensor->type = type;
@@ -961,9 +952,6 @@ static int atk_add_sensor(struct atk_data *data, union acpi_object *obj)
 	(*num)++;
 
 	return 1;
-out:
-	kfree(sensor);
-	return err;
 }
 
 static int atk_enumerate_old_hwmon(struct atk_data *data)
@@ -1004,8 +992,7 @@ static int atk_enumerate_old_hwmon(struct atk_data *data)
 		dev_warn(dev, METHOD_OLD_ENUM_TMP ": ACPI exception: %s\n",
 				acpi_format_exception(status));
 
-		ret = -ENODEV;
-		goto cleanup;
+		return -ENODEV;
 	}
 
 	pack = buf.pointer;
@@ -1026,8 +1013,7 @@ static int atk_enumerate_old_hwmon(struct atk_data *data)
 		dev_warn(dev, METHOD_OLD_ENUM_FAN ": ACPI exception: %s\n",
 				acpi_format_exception(status));
 
-		ret = -ENODEV;
-		goto cleanup;
+		return -ENODEV;
 	}
 
 	pack = buf.pointer;
@@ -1041,9 +1027,6 @@ static int atk_enumerate_old_hwmon(struct atk_data *data)
 	ACPI_FREE(buf.pointer);
 
 	return count;
-cleanup:
-	atk_free_sensors(data);
-	return ret;
 }
 
 static int atk_ec_present(struct atk_data *data)
@@ -1193,76 +1176,44 @@ static int atk_enumerate_new_hwmon(struct atk_data *data)
 	return err;
 }
 
-static int atk_create_files(struct atk_data *data)
+static int atk_init_attribute_groups(struct atk_data *data)
 {
+	struct device *dev = &data->acpi_dev->dev;
 	struct atk_sensor_data *s;
-	int err;
+	struct attribute **attrs;
+	int i = 0;
+	int len = (data->voltage_count + data->temperature_count
+			+ data->fan_count) * 4 + 1;
 
-	list_for_each_entry(s, &data->sensor_list, list) {
-		err = device_create_file(data->hwmon_dev, &s->input_attr);
-		if (err)
-			return err;
-		err = device_create_file(data->hwmon_dev, &s->label_attr);
-		if (err)
-			return err;
-		err = device_create_file(data->hwmon_dev, &s->limit1_attr);
-		if (err)
-			return err;
-		err = device_create_file(data->hwmon_dev, &s->limit2_attr);
-		if (err)
-			return err;
-	}
-
-	err = device_create_file(data->hwmon_dev, &atk_name_attr);
-
-	return err;
-}
-
-static void atk_remove_files(struct atk_data *data)
-{
-	struct atk_sensor_data *s;
+	attrs = devm_kcalloc(dev, len, sizeof(struct attribute *), GFP_KERNEL);
+	if (!attrs)
+		return -ENOMEM;
 
 	list_for_each_entry(s, &data->sensor_list, list) {
-		device_remove_file(data->hwmon_dev, &s->input_attr);
-		device_remove_file(data->hwmon_dev, &s->label_attr);
-		device_remove_file(data->hwmon_dev, &s->limit1_attr);
-		device_remove_file(data->hwmon_dev, &s->limit2_attr);
+		attrs[i++] = &s->input_attr.attr;
+		attrs[i++] = &s->label_attr.attr;
+		attrs[i++] = &s->limit1_attr.attr;
+		attrs[i++] = &s->limit2_attr.attr;
 	}
-	device_remove_file(data->hwmon_dev, &atk_name_attr);
-}
 
-static void atk_free_sensors(struct atk_data *data)
-{
-	struct list_head *head = &data->sensor_list;
-	struct atk_sensor_data *s, *tmp;
+	data->attr_group.attrs = attrs;
+	data->attr_groups[0] = &data->attr_group;
 
-	list_for_each_entry_safe(s, tmp, head, list) {
-		kfree(s->acpi_name);
-		kfree(s);
-	}
+	return 0;
 }
 
 static int atk_register_hwmon(struct atk_data *data)
 {
 	struct device *dev = &data->acpi_dev->dev;
-	int err;
 
 	dev_dbg(dev, "registering hwmon device\n");
-	data->hwmon_dev = hwmon_device_register(dev);
+	data->hwmon_dev = hwmon_device_register_with_groups(dev, "atk0110",
+							    data,
+							    data->attr_groups);
 	if (IS_ERR(data->hwmon_dev))
 		return PTR_ERR(data->hwmon_dev);
 
-	dev_dbg(dev, "populating sysfs directory\n");
-	err = atk_create_files(data);
-	if (err)
-		goto remove;
-
 	return 0;
-remove:
-	/* Cleanup the registered files */
-	atk_remove_files(data);
-	hwmon_device_unregister(data->hwmon_dev);
-	return err;
 }
 
 static int atk_probe_if(struct atk_data *data)
@@ -1350,7 +1301,7 @@ static int atk_add(struct acpi_device *device)
 
 	dev_dbg(&device->dev, "adding...\n");
 
-	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	data = devm_kzalloc(&device->dev, sizeof(*data), GFP_KERNEL);
 	if (!data)
 		return -ENOMEM;
 
@@ -1397,20 +1348,20 @@ static int atk_add(struct acpi_device *device)
 		goto out;
 	}
 
+	err = atk_init_attribute_groups(data);
+	if (err)
+		goto out;
 	err = atk_register_hwmon(data);
 	if (err)
-		goto cleanup;
+		goto out;
 
 	atk_debugfs_init(data);
 
 	device->driver_data = data;
 	return 0;
-cleanup:
-	atk_free_sensors(data);
 out:
 	if (data->disable_ec)
 		atk_ec_ctl(data, 0);
-	kfree(data);
 	return err;
 }
 
@@ -1423,8 +1374,6 @@ static int atk_remove(struct acpi_device *device)
 
 	atk_debugfs_cleanup(data);
 
-	atk_remove_files(data);
-	atk_free_sensors(data);
 	hwmon_device_unregister(data->hwmon_dev);
 
 	if (data->disable_ec) {
@@ -1432,8 +1381,6 @@ static int atk_remove(struct acpi_device *device)
 			dev_err(&device->dev, "Failed to disable EC\n");
 	}
 
-	kfree(data);
-
 	return 0;
 }
 
diff --git a/drivers/hwmon/fschmd.c b/drivers/hwmon/fschmd.c
index 5e78229..22d3a84 100644
--- a/drivers/hwmon/fschmd.c
+++ b/drivers/hwmon/fschmd.c
@@ -105,7 +105,7 @@ static const u8 FSCHMD_REG_VOLT[7][6] = {
 static const int FSCHMD_NO_VOLT_SENSORS[7] = { 3, 3, 3, 3, 3, 3, 6 };
 
 /*
- * minimum pwm at which the fan is driven (pwm can by increased depending on
+ * minimum pwm at which the fan is driven (pwm can be increased depending on
  * the temp. Notice that for the scy some fans share there minimum speed.
  * Also notice that with the scy the sensor order is different than with the
  * other chips, this order was in the 2.4 driver and kept for consistency.
diff --git a/drivers/hwmon/hwmon.c b/drivers/hwmon/hwmon.c
index 32083e4..e88c019 100644
--- a/drivers/hwmon/hwmon.c
+++ b/drivers/hwmon/hwmon.c
@@ -698,6 +698,9 @@ hwmon_device_register_with_info(struct device *dev, const char *name,
 	if (chip && (!chip->ops || !chip->ops->is_visible || !chip->info))
 		return ERR_PTR(-EINVAL);
 
+	if (chip && !dev)
+		return ERR_PTR(-EINVAL);
+
 	return __hwmon_device_register(dev, name, drvdata, chip, extra_groups);
 }
 EXPORT_SYMBOL_GPL(hwmon_device_register_with_info);
diff --git a/drivers/hwmon/k10temp.c b/drivers/hwmon/k10temp.c
index 3b73dee..17c6460 100644
--- a/drivers/hwmon/k10temp.c
+++ b/drivers/hwmon/k10temp.c
@@ -37,6 +37,10 @@ MODULE_PARM_DESC(force, "force loading on processors with erratum 319");
 /* Provide lock for writing to NB_SMU_IND_ADDR */
 static DEFINE_MUTEX(nb_smu_ind_mutex);
 
+#ifndef PCI_DEVICE_ID_AMD_15H_M70H_NB_F3
+#define PCI_DEVICE_ID_AMD_15H_M70H_NB_F3	0x15b3
+#endif
+
 #ifndef PCI_DEVICE_ID_AMD_17H_DF_F3
 #define PCI_DEVICE_ID_AMD_17H_DF_F3	0x1463
 #endif
@@ -81,6 +85,7 @@ struct k10temp_data {
 	void (*read_tempreg)(struct pci_dev *pdev, u32 *regval);
 	int temp_offset;
 	u32 temp_adjust_mask;
+	bool show_tdie;
 };
 
 struct tctl_offset {
@@ -141,17 +146,24 @@ static void read_tempreg_nb_f17(struct pci_dev *pdev, u32 *regval)
 		     F17H_M01H_REPORTED_TEMP_CTRL_OFFSET, regval);
 }
 
-static ssize_t temp1_input_show(struct device *dev,
-				struct device_attribute *attr, char *buf)
+static unsigned int get_raw_temp(struct k10temp_data *data)
 {
-	struct k10temp_data *data = dev_get_drvdata(dev);
-	u32 regval;
 	unsigned int temp;
+	u32 regval;
 
 	data->read_tempreg(data->pdev, &regval);
 	temp = (regval >> 21) * 125;
 	if (regval & data->temp_adjust_mask)
 		temp -= 49000;
+	return temp;
+}
+
+static ssize_t temp1_input_show(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct k10temp_data *data = dev_get_drvdata(dev);
+	unsigned int temp = get_raw_temp(data);
+
 	if (temp > data->temp_offset)
 		temp -= data->temp_offset;
 	else
@@ -160,6 +172,23 @@ static ssize_t temp1_input_show(struct device *dev,
 	return sprintf(buf, "%u\n", temp);
 }
 
+static ssize_t temp2_input_show(struct device *dev,
+				struct device_attribute *devattr, char *buf)
+{
+	struct k10temp_data *data = dev_get_drvdata(dev);
+	unsigned int temp = get_raw_temp(data);
+
+	return sprintf(buf, "%u\n", temp);
+}
+
+static ssize_t temp_label_show(struct device *dev,
+			       struct device_attribute *devattr, char *buf)
+{
+	struct sensor_device_attribute *attr = to_sensor_dev_attr(devattr);
+
+	return sprintf(buf, "%s\n", attr->index ? "Tctl" : "Tdie");
+}
+
 static ssize_t temp1_max_show(struct device *dev,
 			      struct device_attribute *attr, char *buf)
 {
@@ -187,16 +216,23 @@ static DEVICE_ATTR_RO(temp1_max);
 static SENSOR_DEVICE_ATTR(temp1_crit, S_IRUGO, show_temp_crit, NULL, 0);
 static SENSOR_DEVICE_ATTR(temp1_crit_hyst, S_IRUGO, show_temp_crit, NULL, 1);
 
+static SENSOR_DEVICE_ATTR(temp1_label, 0444, temp_label_show, NULL, 0);
+static DEVICE_ATTR_RO(temp2_input);
+static SENSOR_DEVICE_ATTR(temp2_label, 0444, temp_label_show, NULL, 1);
+
 static umode_t k10temp_is_visible(struct kobject *kobj,
 				  struct attribute *attr, int index)
 {
 	struct device *dev = container_of(kobj, struct device, kobj);
 	struct k10temp_data *data = dev_get_drvdata(dev);
 	struct pci_dev *pdev = data->pdev;
+	u32 reg;
 
-	if (index >= 2) {
-		u32 reg;
-
+	switch (index) {
+	case 0 ... 1:	/* temp1_input, temp1_max */
+	default:
+		break;
+	case 2 ... 3:	/* temp1_crit, temp1_crit_hyst */
 		if (!data->read_htcreg)
 			return 0;
 
@@ -208,6 +244,11 @@ static umode_t k10temp_is_visible(struct kobject *kobj,
 		data->read_htcreg(data->pdev, &reg);
 		if (!(reg & HTC_ENABLE))
 			return 0;
+		break;
+	case 4 ... 6:	/* temp1_label, temp2_input, temp2_label */
+		if (!data->show_tdie)
+			return 0;
+		break;
 	}
 	return attr->mode;
 }
@@ -217,6 +258,9 @@ static struct attribute *k10temp_attrs[] = {
 	&dev_attr_temp1_max.attr,
 	&sensor_dev_attr_temp1_crit.dev_attr.attr,
 	&sensor_dev_attr_temp1_crit_hyst.dev_attr.attr,
+	&sensor_dev_attr_temp1_label.dev_attr.attr,
+	&dev_attr_temp2_input.attr,
+	&sensor_dev_attr_temp2_label.dev_attr.attr,
 	NULL
 };
 
@@ -292,6 +336,7 @@ static int k10temp_probe(struct pci_dev *pdev,
 	} else if (boot_cpu_data.x86 == 0x17) {
 		data->temp_adjust_mask = 0x80000;
 		data->read_tempreg = read_tempreg_nb_f17;
+		data->show_tdie = true;
 	} else {
 		data->read_htcreg = read_htcreg_pci;
 		data->read_tempreg = read_tempreg_pci;
@@ -320,6 +365,7 @@ static const struct pci_device_id k10temp_id_table[] = {
 	{ PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_15H_M10H_F3) },
 	{ PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F3) },
 	{ PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F3) },
+	{ PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_15H_M70H_NB_F3) },
 	{ PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) },
 	{ PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) },
 	{ PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_17H_DF_F3) },
diff --git a/drivers/hwmon/ltc2990.c b/drivers/hwmon/ltc2990.c
index 8f8fe05..2aefdc5 100644
--- a/drivers/hwmon/ltc2990.c
+++ b/drivers/hwmon/ltc2990.c
@@ -5,18 +5,16 @@
  * Author: Mike Looijmans <mike.looijmans@topic.nl>
  *
  * License: GPLv2
- *
- * This driver assumes the chip is wired as a dual current monitor, and
- * reports the voltage drop across two series resistors. It also reports
- * the chip's internal temperature and Vcc power supply voltage.
  */
 
+#include <linux/bitops.h>
 #include <linux/err.h>
 #include <linux/hwmon.h>
 #include <linux/hwmon-sysfs.h>
 #include <linux/i2c.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/of.h>
 
 #define LTC2990_STATUS	0x00
 #define LTC2990_CONTROL	0x01
@@ -28,45 +26,108 @@
 #define LTC2990_V4_MSB	0x0C
 #define LTC2990_VCC_MSB	0x0E
 
-#define LTC2990_CONTROL_KELVIN		BIT(7)
-#define LTC2990_CONTROL_SINGLE		BIT(6)
-#define LTC2990_CONTROL_MEASURE_ALL	(0x3 << 3)
-#define LTC2990_CONTROL_MODE_CURRENT	0x06
-#define LTC2990_CONTROL_MODE_VOLTAGE	0x07
+#define LTC2990_IN0	BIT(0)
+#define LTC2990_IN1	BIT(1)
+#define LTC2990_IN2	BIT(2)
+#define LTC2990_IN3	BIT(3)
+#define LTC2990_IN4	BIT(4)
+#define LTC2990_CURR1	BIT(5)
+#define LTC2990_CURR2	BIT(6)
+#define LTC2990_TEMP1	BIT(7)
+#define LTC2990_TEMP2	BIT(8)
+#define LTC2990_TEMP3	BIT(9)
+#define LTC2990_NONE	0
+#define LTC2990_ALL	GENMASK(9, 0)
 
-/* convert raw register value to sign-extended integer in 16-bit range */
-static int ltc2990_voltage_to_int(int raw)
-{
-	if (raw & BIT(14))
-		return -(0x4000 - (raw & 0x3FFF)) << 2;
-	else
-		return (raw & 0x3FFF) << 2;
-}
+#define LTC2990_MODE0_SHIFT	0
+#define LTC2990_MODE0_MASK	GENMASK(2, 0)
+#define LTC2990_MODE1_SHIFT	3
+#define LTC2990_MODE1_MASK	GENMASK(1, 0)
+
+/* Enabled measurements for mode bits 2..0 */
+static const int ltc2990_attrs_ena_0[] = {
+	LTC2990_IN1 | LTC2990_IN2 | LTC2990_TEMP3,
+	LTC2990_CURR1 | LTC2990_TEMP3,
+	LTC2990_CURR1 | LTC2990_IN3 | LTC2990_IN4,
+	LTC2990_TEMP2 | LTC2990_IN3 | LTC2990_IN4,
+	LTC2990_TEMP2 | LTC2990_CURR2,
+	LTC2990_TEMP2 | LTC2990_TEMP3,
+	LTC2990_CURR1 | LTC2990_CURR2,
+	LTC2990_IN1 | LTC2990_IN2 | LTC2990_IN3 | LTC2990_IN4
+};
+
+/* Enabled measurements for mode bits 4..3 */
+static const int ltc2990_attrs_ena_1[] = {
+	LTC2990_NONE,
+	LTC2990_TEMP2 | LTC2990_IN1 | LTC2990_CURR1,
+	LTC2990_TEMP3 | LTC2990_IN3 | LTC2990_CURR2,
+	LTC2990_ALL
+};
+
+struct ltc2990_data {
+	struct i2c_client *i2c;
+	u32 mode[2];
+};
 
 /* Return the converted value from the given register in uV or mC */
-static int ltc2990_get_value(struct i2c_client *i2c, u8 reg, int *result)
+static int ltc2990_get_value(struct i2c_client *i2c, int index, int *result)
 {
 	int val;
+	u8 reg;
+
+	switch (index) {
+	case LTC2990_IN0:
+		reg = LTC2990_VCC_MSB;
+		break;
+	case LTC2990_IN1:
+	case LTC2990_CURR1:
+	case LTC2990_TEMP2:
+		reg = LTC2990_V1_MSB;
+		break;
+	case LTC2990_IN2:
+		reg = LTC2990_V2_MSB;
+		break;
+	case LTC2990_IN3:
+	case LTC2990_CURR2:
+	case LTC2990_TEMP3:
+		reg = LTC2990_V3_MSB;
+		break;
+	case LTC2990_IN4:
+		reg = LTC2990_V4_MSB;
+		break;
+	case LTC2990_TEMP1:
+		reg = LTC2990_TINT_MSB;
+		break;
+	default:
+		return -EINVAL;
+	}
 
 	val = i2c_smbus_read_word_swapped(i2c, reg);
 	if (unlikely(val < 0))
 		return val;
 
-	switch (reg) {
-	case LTC2990_TINT_MSB:
-		/* internal temp, 0.0625 degrees/LSB, 13-bit  */
-		val = (val & 0x1FFF) << 3;
-		*result = (val * 1000) >> 7;
+	switch (index) {
+	case LTC2990_TEMP1:
+	case LTC2990_TEMP2:
+	case LTC2990_TEMP3:
+		/* temp, 0.0625 degrees/LSB */
+		*result = sign_extend32(val, 12) * 1000 / 16;
 		break;
-	case LTC2990_V1_MSB:
-	case LTC2990_V3_MSB:
-		 /* Vx-Vy, 19.42uV/LSB. Depends on mode. */
-		*result = ltc2990_voltage_to_int(val) * 1942 / (4 * 100);
+	case LTC2990_CURR1:
+	case LTC2990_CURR2:
+		 /* Vx-Vy, 19.42uV/LSB */
+		*result = sign_extend32(val, 14) * 1942 / 100;
 		break;
-	case LTC2990_VCC_MSB:
-		/* Vcc, 305.18μV/LSB, 2.5V offset */
-		*result = (ltc2990_voltage_to_int(val) * 30518 /
-			   (4 * 100 * 1000)) + 2500;
+	case LTC2990_IN0:
+		/* Vcc, 305.18uV/LSB, 2.5V offset */
+		*result = sign_extend32(val, 14) * 30518 / (100 * 1000) + 2500;
+		break;
+	case LTC2990_IN1:
+	case LTC2990_IN2:
+	case LTC2990_IN3:
+	case LTC2990_IN4:
+		/* Vx, 305.18uV/LSB */
+		*result = sign_extend32(val, 14) * 30518 / (100 * 1000);
 		break;
 	default:
 		return -EINVAL; /* won't happen, keep compiler happy */
@@ -79,48 +140,117 @@ static ssize_t ltc2990_show_value(struct device *dev,
 				  struct device_attribute *da, char *buf)
 {
 	struct sensor_device_attribute *attr = to_sensor_dev_attr(da);
+	struct ltc2990_data *data = dev_get_drvdata(dev);
 	int value;
 	int ret;
 
-	ret = ltc2990_get_value(dev_get_drvdata(dev), attr->index, &value);
+	ret = ltc2990_get_value(data->i2c, attr->index, &value);
 	if (unlikely(ret < 0))
 		return ret;
 
 	return snprintf(buf, PAGE_SIZE, "%d\n", value);
 }
 
+static umode_t ltc2990_attrs_visible(struct kobject *kobj,
+				     struct attribute *a, int n)
+{
+	struct device *dev = container_of(kobj, struct device, kobj);
+	struct ltc2990_data *data = dev_get_drvdata(dev);
+	struct device_attribute *da =
+			container_of(a, struct device_attribute, attr);
+	struct sensor_device_attribute *attr = to_sensor_dev_attr(da);
+
+	int attrs_mask = LTC2990_IN0 | LTC2990_TEMP1 |
+			 (ltc2990_attrs_ena_0[data->mode[0]] &
+			  ltc2990_attrs_ena_1[data->mode[1]]);
+
+	if (attr->index & attrs_mask)
+		return a->mode;
+
+	return 0;
+}
+
 static SENSOR_DEVICE_ATTR(temp1_input, S_IRUGO, ltc2990_show_value, NULL,
-			  LTC2990_TINT_MSB);
+			  LTC2990_TEMP1);
+static SENSOR_DEVICE_ATTR(temp2_input, S_IRUGO, ltc2990_show_value, NULL,
+			  LTC2990_TEMP2);
+static SENSOR_DEVICE_ATTR(temp3_input, S_IRUGO, ltc2990_show_value, NULL,
+			  LTC2990_TEMP3);
 static SENSOR_DEVICE_ATTR(curr1_input, S_IRUGO, ltc2990_show_value, NULL,
-			  LTC2990_V1_MSB);
+			  LTC2990_CURR1);
 static SENSOR_DEVICE_ATTR(curr2_input, S_IRUGO, ltc2990_show_value, NULL,
-			  LTC2990_V3_MSB);
+			  LTC2990_CURR2);
 static SENSOR_DEVICE_ATTR(in0_input, S_IRUGO, ltc2990_show_value, NULL,
-			  LTC2990_VCC_MSB);
+			  LTC2990_IN0);
+static SENSOR_DEVICE_ATTR(in1_input, S_IRUGO, ltc2990_show_value, NULL,
+			  LTC2990_IN1);
+static SENSOR_DEVICE_ATTR(in2_input, S_IRUGO, ltc2990_show_value, NULL,
+			  LTC2990_IN2);
+static SENSOR_DEVICE_ATTR(in3_input, S_IRUGO, ltc2990_show_value, NULL,
+			  LTC2990_IN3);
+static SENSOR_DEVICE_ATTR(in4_input, S_IRUGO, ltc2990_show_value, NULL,
+			  LTC2990_IN4);
 
 static struct attribute *ltc2990_attrs[] = {
 	&sensor_dev_attr_temp1_input.dev_attr.attr,
+	&sensor_dev_attr_temp2_input.dev_attr.attr,
+	&sensor_dev_attr_temp3_input.dev_attr.attr,
 	&sensor_dev_attr_curr1_input.dev_attr.attr,
 	&sensor_dev_attr_curr2_input.dev_attr.attr,
 	&sensor_dev_attr_in0_input.dev_attr.attr,
+	&sensor_dev_attr_in1_input.dev_attr.attr,
+	&sensor_dev_attr_in2_input.dev_attr.attr,
+	&sensor_dev_attr_in3_input.dev_attr.attr,
+	&sensor_dev_attr_in4_input.dev_attr.attr,
 	NULL,
 };
-ATTRIBUTE_GROUPS(ltc2990);
+
+static const struct attribute_group ltc2990_group = {
+	.attrs = ltc2990_attrs,
+	.is_visible = ltc2990_attrs_visible,
+};
+__ATTRIBUTE_GROUPS(ltc2990);
 
 static int ltc2990_i2c_probe(struct i2c_client *i2c,
 			     const struct i2c_device_id *id)
 {
 	int ret;
 	struct device *hwmon_dev;
+	struct ltc2990_data *data;
+	struct device_node *of_node = i2c->dev.of_node;
 
 	if (!i2c_check_functionality(i2c->adapter, I2C_FUNC_SMBUS_BYTE_DATA |
 				     I2C_FUNC_SMBUS_WORD_DATA))
 		return -ENODEV;
 
-	/* Setup continuous mode, current monitor */
+	data = devm_kzalloc(&i2c->dev, sizeof(struct ltc2990_data), GFP_KERNEL);
+	if (unlikely(!data))
+		return -ENOMEM;
+
+	data->i2c = i2c;
+
+	if (of_node) {
+		ret = of_property_read_u32_array(of_node, "lltc,meas-mode",
+						 data->mode, 2);
+		if (ret < 0)
+			return ret;
+
+		if (data->mode[0] & ~LTC2990_MODE0_MASK ||
+		    data->mode[1] & ~LTC2990_MODE1_MASK)
+			return -EINVAL;
+	} else {
+		ret = i2c_smbus_read_byte_data(i2c, LTC2990_CONTROL);
+		if (ret < 0)
+			return ret;
+
+		data->mode[0] = ret >> LTC2990_MODE0_SHIFT & LTC2990_MODE0_MASK;
+		data->mode[1] = ret >> LTC2990_MODE1_SHIFT & LTC2990_MODE1_MASK;
+	}
+
+	/* Setup continuous mode */
 	ret = i2c_smbus_write_byte_data(i2c, LTC2990_CONTROL,
-					LTC2990_CONTROL_MEASURE_ALL |
-					LTC2990_CONTROL_MODE_CURRENT);
+					data->mode[0] << LTC2990_MODE0_SHIFT |
+					data->mode[1] << LTC2990_MODE1_SHIFT);
 	if (ret < 0) {
 		dev_err(&i2c->dev, "Error: Failed to set control mode.\n");
 		return ret;
@@ -134,7 +264,7 @@ static int ltc2990_i2c_probe(struct i2c_client *i2c,
 
 	hwmon_dev = devm_hwmon_device_register_with_groups(&i2c->dev,
 							   i2c->name,
-							   i2c,
+							   data,
 							   ltc2990_groups);
 
 	return PTR_ERR_OR_ZERO(hwmon_dev);
diff --git a/drivers/hwmon/mc13783-adc.c b/drivers/hwmon/mc13783-adc.c
index 960a1db..67860ad 100644
--- a/drivers/hwmon/mc13783-adc.c
+++ b/drivers/hwmon/mc13783-adc.c
@@ -63,6 +63,10 @@ static int mc13783_adc_read(struct device *dev,
 	if (ret)
 		return ret;
 
+	/* ADIN7 subchannels */
+	if (channel >= 16)
+		channel = 7;
+
 	channel &= 0x7;
 
 	*val = (sample[channel % 4] >> (channel > 3 ? 14 : 2)) & 0x3ff;
@@ -111,6 +115,57 @@ static ssize_t mc13783_adc_read_gp(struct device *dev,
 	return sprintf(buf, "%u\n", val);
 }
 
+static ssize_t mc13783_adc_read_uid(struct device *dev,
+		struct device_attribute *devattr, char *buf)
+{
+	unsigned int val;
+	struct platform_device *pdev = to_platform_device(dev);
+	kernel_ulong_t driver_data = platform_get_device_id(pdev)->driver_data;
+	int ret = mc13783_adc_read(dev, devattr, &val);
+
+	if (ret)
+		return ret;
+
+	if (driver_data & MC13783_ADC_BPDIV2)
+		/* MC13892 have 1/2 divider, input range is [0, 4.800V] */
+		val = DIV_ROUND_CLOSEST(val * 4800, 1024);
+	else
+		/* MC13783 have 0.9 divider, input range is [0, 2.555V] */
+		val = DIV_ROUND_CLOSEST(val * 2555, 1024);
+
+	return sprintf(buf, "%u\n", val);
+}
+
+static ssize_t mc13783_adc_read_temp(struct device *dev,
+		struct device_attribute *devattr, char *buf)
+{
+	unsigned int val;
+	struct platform_device *pdev = to_platform_device(dev);
+	kernel_ulong_t driver_data = platform_get_device_id(pdev)->driver_data;
+	int ret = mc13783_adc_read(dev, devattr, &val);
+
+	if (ret)
+		return ret;
+
+	if (driver_data & MC13783_ADC_BPDIV2) {
+		/*
+		 * MC13892:
+		 * Die Temperature Read Out Code at 25C 680
+		 * Temperature change per LSB +0.4244C
+		 */
+		ret = DIV_ROUND_CLOSEST(-2635920 + val * 4244, 10);
+	} else {
+		/*
+		 * MC13783:
+		 * Die Temperature Read Out Code at 25C 282
+		 * Temperature change per LSB -1.14C
+		 */
+		ret = 346480 - 1140 * val;
+	}
+
+	return sprintf(buf, "%d\n", ret);
+}
+
 static DEVICE_ATTR_RO(name);
 static SENSOR_DEVICE_ATTR(in2_input, S_IRUGO, mc13783_adc_read_bp, NULL, 2);
 static SENSOR_DEVICE_ATTR(in5_input, S_IRUGO, mc13783_adc_read_gp, NULL, 5);
@@ -124,6 +179,9 @@ static SENSOR_DEVICE_ATTR(in12_input, S_IRUGO, mc13783_adc_read_gp, NULL, 12);
 static SENSOR_DEVICE_ATTR(in13_input, S_IRUGO, mc13783_adc_read_gp, NULL, 13);
 static SENSOR_DEVICE_ATTR(in14_input, S_IRUGO, mc13783_adc_read_gp, NULL, 14);
 static SENSOR_DEVICE_ATTR(in15_input, S_IRUGO, mc13783_adc_read_gp, NULL, 15);
+static SENSOR_DEVICE_ATTR(in16_input, S_IRUGO, mc13783_adc_read_uid, NULL, 16);
+static SENSOR_DEVICE_ATTR(temp1_input, S_IRUGO,
+			  mc13783_adc_read_temp, NULL, 17);
 
 static struct attribute *mc13783_attr_base[] = {
 	&dev_attr_name.attr,
@@ -131,6 +189,8 @@ static struct attribute *mc13783_attr_base[] = {
 	&sensor_dev_attr_in5_input.dev_attr.attr,
 	&sensor_dev_attr_in6_input.dev_attr.attr,
 	&sensor_dev_attr_in7_input.dev_attr.attr,
+	&sensor_dev_attr_in16_input.dev_attr.attr,
+	&sensor_dev_attr_temp1_input.dev_attr.attr,
 	NULL
 };
 
diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index 0e6bc63..8b2b72b 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -92,7 +92,7 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk,
 	struct request *rq;
 	int error;
 
-	rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM);
+	rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0);
 	ide_req(rq)->type = ATA_PRIV_MISC;
 	rq->special = (char *)pc;
 
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 5a8e8e3..5f17838 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -437,7 +437,7 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
 		bool delay = false;
 
 		rq = blk_get_request(drive->queue,
-			write ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN,  __GFP_RECLAIM);
+			write ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
 		memcpy(scsi_req(rq)->cmd, cmd, BLK_MAX_CDB);
 		ide_req(rq)->type = ATA_PRIV_PC;
 		rq->rq_flags |= rq_flags;
@@ -1426,21 +1426,8 @@ static int idecd_capacity_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int idecd_capacity_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, idecd_capacity_proc_show, PDE_DATA(inode));
-}
-
-static const struct file_operations idecd_capacity_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= idecd_capacity_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static ide_proc_entry_t idecd_proc[] = {
-	{ "capacity", S_IFREG|S_IRUGO, &idecd_capacity_proc_fops },
+	{ "capacity", S_IFREG|S_IRUGO, idecd_capacity_proc_show },
 	{}
 };
 
diff --git a/drivers/ide/ide-cd_ioctl.c b/drivers/ide/ide-cd_ioctl.c
index 2acca12..b132240 100644
--- a/drivers/ide/ide-cd_ioctl.c
+++ b/drivers/ide/ide-cd_ioctl.c
@@ -304,7 +304,7 @@ int ide_cdrom_reset(struct cdrom_device_info *cdi)
 	struct request *rq;
 	int ret;
 
-	rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM);
+	rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0);
 	ide_req(rq)->type = ATA_PRIV_MISC;
 	rq->rq_flags = RQF_QUIET;
 	blk_execute_rq(drive->queue, cd->disk, rq, 0);
diff --git a/drivers/ide/ide-devsets.c b/drivers/ide/ide-devsets.c
index 4e20747..f4f8afd 100644
--- a/drivers/ide/ide-devsets.c
+++ b/drivers/ide/ide-devsets.c
@@ -166,7 +166,7 @@ int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting,
 	if (!(setting->flags & DS_SYNC))
 		return setting->set(drive, arg);
 
-	rq = blk_get_request(q, REQ_OP_DRV_IN, __GFP_RECLAIM);
+	rq = blk_get_request(q, REQ_OP_DRV_IN, 0);
 	ide_req(rq)->type = ATA_PRIV_MISC;
 	scsi_req(rq)->cmd_len = 5;
 	scsi_req(rq)->cmd[0] = REQ_DEVSET_EXEC;
diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index f1a7c58..e3b4e65 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -478,7 +478,7 @@ static int set_multcount(ide_drive_t *drive, int arg)
 	if (drive->special_flags & IDE_SFLAG_SET_MULTMODE)
 		return -EBUSY;
 
-	rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM);
+	rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0);
 	ide_req(rq)->type = ATA_PRIV_TASKFILE;
 
 	drive->mult_req = arg;
diff --git a/drivers/ide/ide-disk_proc.c b/drivers/ide/ide-disk_proc.c
index 82a36ce..95d239b 100644
--- a/drivers/ide/ide-disk_proc.c
+++ b/drivers/ide/ide-disk_proc.c
@@ -52,19 +52,6 @@ static int idedisk_cache_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int idedisk_cache_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, idedisk_cache_proc_show, PDE_DATA(inode));
-}
-
-static const struct file_operations idedisk_cache_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= idedisk_cache_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int idedisk_capacity_proc_show(struct seq_file *m, void *v)
 {
 	ide_drive_t*drive = (ide_drive_t *)m->private;
@@ -73,19 +60,6 @@ static int idedisk_capacity_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int idedisk_capacity_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, idedisk_capacity_proc_show, PDE_DATA(inode));
-}
-
-static const struct file_operations idedisk_capacity_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= idedisk_capacity_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int __idedisk_proc_show(struct seq_file *m, ide_drive_t *drive, u8 sub_cmd)
 {
 	u8 *buf;
@@ -114,43 +88,17 @@ static int idedisk_sv_proc_show(struct seq_file *m, void *v)
 	return __idedisk_proc_show(m, m->private, ATA_SMART_READ_VALUES);
 }
 
-static int idedisk_sv_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, idedisk_sv_proc_show, PDE_DATA(inode));
-}
-
-static const struct file_operations idedisk_sv_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= idedisk_sv_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int idedisk_st_proc_show(struct seq_file *m, void *v)
 {
 	return __idedisk_proc_show(m, m->private, ATA_SMART_READ_THRESHOLDS);
 }
 
-static int idedisk_st_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, idedisk_st_proc_show, PDE_DATA(inode));
-}
-
-static const struct file_operations idedisk_st_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= idedisk_st_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 ide_proc_entry_t ide_disk_proc[] = {
-	{ "cache",	  S_IFREG|S_IRUGO, &idedisk_cache_proc_fops	},
-	{ "capacity",	  S_IFREG|S_IRUGO, &idedisk_capacity_proc_fops	},
-	{ "geometry",	  S_IFREG|S_IRUGO, &ide_geometry_proc_fops	},
-	{ "smart_values", S_IFREG|S_IRUSR, &idedisk_sv_proc_fops	},
-	{ "smart_thresholds", S_IFREG|S_IRUSR, &idedisk_st_proc_fops	},
+	{ "cache",	  S_IFREG|S_IRUGO, idedisk_cache_proc_show	},
+	{ "capacity",	  S_IFREG|S_IRUGO, idedisk_capacity_proc_show	},
+	{ "geometry",	  S_IFREG|S_IRUGO, ide_geometry_proc_show	},
+	{ "smart_values", S_IFREG|S_IRUSR, idedisk_sv_proc_show		},
+	{ "smart_thresholds", S_IFREG|S_IRUSR, idedisk_st_proc_show	},
 	{}
 };
 
diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c
index 54d4d78..6f34465 100644
--- a/drivers/ide/ide-dma.c
+++ b/drivers/ide/ide-dma.c
@@ -180,7 +180,6 @@ EXPORT_SYMBOL_GPL(ide_dma_unmap_sg);
 void ide_dma_off_quietly(ide_drive_t *drive)
 {
 	drive->dev_flags &= ~IDE_DFLAG_USING_DMA;
-	ide_toggle_bounce(drive, 0);
 
 	drive->hwif->dma_ops->dma_host_set(drive, 0);
 }
@@ -211,7 +210,6 @@ EXPORT_SYMBOL(ide_dma_off);
 void ide_dma_on(ide_drive_t *drive)
 {
 	drive->dev_flags |= IDE_DFLAG_USING_DMA;
-	ide_toggle_bounce(drive, 1);
 
 	drive->hwif->dma_ops->dma_host_set(drive, 1);
 }
diff --git a/drivers/ide/ide-floppy_proc.c b/drivers/ide/ide-floppy_proc.c
index 471457e..7f697ddb 100644
--- a/drivers/ide/ide-floppy_proc.c
+++ b/drivers/ide/ide-floppy_proc.c
@@ -14,22 +14,9 @@ static int idefloppy_capacity_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int idefloppy_capacity_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, idefloppy_capacity_proc_show, PDE_DATA(inode));
-}
-
-static const struct file_operations idefloppy_capacity_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= idefloppy_capacity_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 ide_proc_entry_t ide_floppy_proc[] = {
-	{ "capacity",	S_IFREG|S_IRUGO, &idefloppy_capacity_proc_fops	},
-	{ "geometry",	S_IFREG|S_IRUGO, &ide_geometry_proc_fops	},
+	{ "capacity",	S_IFREG|S_IRUGO, idefloppy_capacity_proc_show	},
+	{ "geometry",	S_IFREG|S_IRUGO, ide_geometry_proc_show		},
 	{}
 };
 
diff --git a/drivers/ide/ide-ioctls.c b/drivers/ide/ide-ioctls.c
index 3661abb..af5119a 100644
--- a/drivers/ide/ide-ioctls.c
+++ b/drivers/ide/ide-ioctls.c
@@ -125,7 +125,7 @@ static int ide_cmd_ioctl(ide_drive_t *drive, unsigned long arg)
 	if (NULL == (void *) arg) {
 		struct request *rq;
 
-		rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM);
+		rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0);
 		ide_req(rq)->type = ATA_PRIV_TASKFILE;
 		blk_execute_rq(drive->queue, NULL, rq, 0);
 		err = scsi_req(rq)->result ? -EIO : 0;
@@ -222,7 +222,7 @@ static int generic_drive_reset(ide_drive_t *drive)
 	struct request *rq;
 	int ret = 0;
 
-	rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM);
+	rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0);
 	ide_req(rq)->type = ATA_PRIV_MISC;
 	scsi_req(rq)->cmd_len = 1;
 	scsi_req(rq)->cmd[0] = REQ_DRIVE_RESET;
diff --git a/drivers/ide/ide-lib.c b/drivers/ide/ide-lib.c
index e1180fa..78cb79e 100644
--- a/drivers/ide/ide-lib.c
+++ b/drivers/ide/ide-lib.c
@@ -6,32 +6,6 @@
 #include <linux/ide.h>
 #include <linux/bitops.h>
 
-/**
- *	ide_toggle_bounce	-	handle bounce buffering
- *	@drive: drive to update
- *	@on: on/off boolean
- *
- *	Enable or disable bounce buffering for the device. Drives move
- *	between PIO and DMA and that changes the rules we need.
- */
-
-void ide_toggle_bounce(ide_drive_t *drive, int on)
-{
-	u64 addr = BLK_BOUNCE_HIGH;	/* dma64_addr_t */
-
-	if (!PCI_DMA_BUS_IS_PHYS) {
-		addr = BLK_BOUNCE_ANY;
-	} else if (on && drive->media == ide_disk) {
-		struct device *dev = drive->hwif->dev;
-
-		if (dev && dev->dma_mask)
-			addr = *dev->dma_mask;
-	}
-
-	if (drive->queue)
-		blk_queue_bounce_limit(drive->queue, addr);
-}
-
 u64 ide_get_lba_addr(struct ide_cmd *cmd, int lba48)
 {
 	struct ide_taskfile *tf = &cmd->tf;
diff --git a/drivers/ide/ide-park.c b/drivers/ide/ide-park.c
index 6465bcc..622f0ed 100644
--- a/drivers/ide/ide-park.c
+++ b/drivers/ide/ide-park.c
@@ -32,7 +32,7 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout)
 	}
 	spin_unlock_irq(&hwif->lock);
 
-	rq = blk_get_request(q, REQ_OP_DRV_IN, __GFP_RECLAIM);
+	rq = blk_get_request(q, REQ_OP_DRV_IN, 0);
 	scsi_req(rq)->cmd[0] = REQ_PARK_HEADS;
 	scsi_req(rq)->cmd_len = 1;
 	ide_req(rq)->type = ATA_PRIV_MISC;
@@ -47,7 +47,7 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout)
 	 * Make sure that *some* command is sent to the drive after the
 	 * timeout has expired, so power management will be reenabled.
 	 */
-	rq = blk_get_request(q, REQ_OP_DRV_IN, GFP_NOWAIT);
+	rq = blk_get_request(q, REQ_OP_DRV_IN, BLK_MQ_REQ_NOWAIT);
 	if (IS_ERR(rq))
 		goto out;
 
diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c
index ad8a125..59217aa 100644
--- a/drivers/ide/ide-pm.c
+++ b/drivers/ide/ide-pm.c
@@ -19,7 +19,7 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg)
 	}
 
 	memset(&rqpm, 0, sizeof(rqpm));
-	rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM);
+	rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0);
 	ide_req(rq)->type = ATA_PRIV_PM_SUSPEND;
 	rq->special = &rqpm;
 	rqpm.pm_step = IDE_PM_START_SUSPEND;
@@ -90,8 +90,7 @@ int generic_ide_resume(struct device *dev)
 	}
 
 	memset(&rqpm, 0, sizeof(rqpm));
-	rq = blk_get_request_flags(drive->queue, REQ_OP_DRV_IN,
-				   BLK_MQ_REQ_PREEMPT);
+	rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, BLK_MQ_REQ_PREEMPT);
 	ide_req(rq)->type = ATA_PRIV_PM_RESUME;
 	rq->special = &rqpm;
 	rqpm.pm_step = IDE_PM_START_RESUME;
diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
index 2019e66..56d7bc2 100644
--- a/drivers/ide/ide-probe.c
+++ b/drivers/ide/ide-probe.c
@@ -796,8 +796,7 @@ static int ide_init_queue(ide_drive_t *drive)
 	 * This will be fixed once we teach pci_map_sg() about our boundary
 	 * requirements, hopefully soon. *FIXME*
 	 */
-	if (!PCI_DMA_BUS_IS_PHYS)
-		max_sg_entries >>= 1;
+	max_sg_entries >>= 1;
 #endif /* CONFIG_PCI */
 
 	blk_queue_max_segments(q, max_sg_entries);
@@ -805,9 +804,6 @@ static int ide_init_queue(ide_drive_t *drive)
 	/* assign drive queue */
 	drive->queue = q;
 
-	/* needs drive->queue to be set */
-	ide_toggle_bounce(drive, 1);
-
 	return 0;
 }
 
diff --git a/drivers/ide/ide-proc.c b/drivers/ide/ide-proc.c
index 863db44..45c9974 100644
--- a/drivers/ide/ide-proc.c
+++ b/drivers/ide/ide-proc.c
@@ -56,19 +56,6 @@ static int ide_imodel_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int ide_imodel_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, ide_imodel_proc_show, PDE_DATA(inode));
-}
-
-static const struct file_operations ide_imodel_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= ide_imodel_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int ide_mate_proc_show(struct seq_file *m, void *v)
 {
 	ide_hwif_t	*hwif = (ide_hwif_t *) m->private;
@@ -80,19 +67,6 @@ static int ide_mate_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int ide_mate_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, ide_mate_proc_show, PDE_DATA(inode));
-}
-
-static const struct file_operations ide_mate_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= ide_mate_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int ide_channel_proc_show(struct seq_file *m, void *v)
 {
 	ide_hwif_t	*hwif = (ide_hwif_t *) m->private;
@@ -101,19 +75,6 @@ static int ide_channel_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int ide_channel_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, ide_channel_proc_show, PDE_DATA(inode));
-}
-
-static const struct file_operations ide_channel_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= ide_channel_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int ide_identify_proc_show(struct seq_file *m, void *v)
 {
 	ide_drive_t *drive = (ide_drive_t *)m->private;
@@ -141,19 +102,6 @@ static int ide_identify_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int ide_identify_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, ide_identify_proc_show, PDE_DATA(inode));
-}
-
-static const struct file_operations ide_identify_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= ide_identify_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 /**
  *	ide_find_setting	-	find a specific setting
  *	@st: setting table pointer
@@ -441,27 +389,14 @@ static const struct file_operations ide_settings_proc_fops = {
 	.write		= ide_settings_proc_write,
 };
 
-static int ide_capacity_proc_show(struct seq_file *m, void *v)
+int ide_capacity_proc_show(struct seq_file *m, void *v)
 {
 	seq_printf(m, "%llu\n", (long long)0x7fffffff);
 	return 0;
 }
+EXPORT_SYMBOL_GPL(ide_capacity_proc_show);
 
-static int ide_capacity_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, ide_capacity_proc_show, NULL);
-}
-
-const struct file_operations ide_capacity_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= ide_capacity_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-EXPORT_SYMBOL_GPL(ide_capacity_proc_fops);
-
-static int ide_geometry_proc_show(struct seq_file *m, void *v)
+int ide_geometry_proc_show(struct seq_file *m, void *v)
 {
 	ide_drive_t	*drive = (ide_drive_t *) m->private;
 
@@ -471,20 +406,7 @@ static int ide_geometry_proc_show(struct seq_file *m, void *v)
 			drive->bios_cyl, drive->bios_head, drive->bios_sect);
 	return 0;
 }
-
-static int ide_geometry_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, ide_geometry_proc_show, PDE_DATA(inode));
-}
-
-const struct file_operations ide_geometry_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= ide_geometry_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-EXPORT_SYMBOL(ide_geometry_proc_fops);
+EXPORT_SYMBOL(ide_geometry_proc_show);
 
 static int ide_dmodel_proc_show(struct seq_file *seq, void *v)
 {
@@ -495,19 +417,6 @@ static int ide_dmodel_proc_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
-static int ide_dmodel_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, ide_dmodel_proc_show, PDE_DATA(inode));
-}
-
-static const struct file_operations ide_dmodel_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= ide_dmodel_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int ide_driver_proc_show(struct seq_file *m, void *v)
 {
 	ide_drive_t		*drive = (ide_drive_t *)m->private;
@@ -523,65 +432,6 @@ static int ide_driver_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int ide_driver_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, ide_driver_proc_show, PDE_DATA(inode));
-}
-
-static int ide_replace_subdriver(ide_drive_t *drive, const char *driver)
-{
-	struct device *dev = &drive->gendev;
-	int ret = 1;
-	int err;
-
-	device_release_driver(dev);
-	/* FIXME: device can still be in use by previous driver */
-	strlcpy(drive->driver_req, driver, sizeof(drive->driver_req));
-	err = device_attach(dev);
-	if (err < 0)
-		printk(KERN_WARNING "IDE: %s: device_attach error: %d\n",
-			__func__, err);
-	drive->driver_req[0] = 0;
-	if (dev->driver == NULL) {
-		err = device_attach(dev);
-		if (err < 0)
-			printk(KERN_WARNING
-				"IDE: %s: device_attach(2) error: %d\n",
-				__func__, err);
-	}
-	if (dev->driver && !strcmp(dev->driver->name, driver))
-		ret = 0;
-
-	return ret;
-}
-
-static ssize_t ide_driver_proc_write(struct file *file, const char __user *buffer,
-				     size_t count, loff_t *pos)
-{
-	ide_drive_t	*drive = PDE_DATA(file_inode(file));
-	char name[32];
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EACCES;
-	if (count > 31)
-		count = 31;
-	if (copy_from_user(name, buffer, count))
-		return -EFAULT;
-	name[count] = '\0';
-	if (ide_replace_subdriver(drive, name))
-		return -EINVAL;
-	return count;
-}
-
-static const struct file_operations ide_driver_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= ide_driver_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-	.write		= ide_driver_proc_write,
-};
-
 static int ide_media_proc_show(struct seq_file *m, void *v)
 {
 	ide_drive_t	*drive = (ide_drive_t *) m->private;
@@ -613,11 +463,10 @@ static const struct file_operations ide_media_proc_fops = {
 };
 
 static ide_proc_entry_t generic_drive_entries[] = {
-	{ "driver",	S_IFREG|S_IRUGO,	 &ide_driver_proc_fops	},
-	{ "identify",	S_IFREG|S_IRUSR,	 &ide_identify_proc_fops},
-	{ "media",	S_IFREG|S_IRUGO,	 &ide_media_proc_fops	},
-	{ "model",	S_IFREG|S_IRUGO,	 &ide_dmodel_proc_fops	},
-	{ "settings",	S_IFREG|S_IRUSR|S_IWUSR, &ide_settings_proc_fops},
+	{ "driver",	S_IFREG|S_IRUGO,	 ide_driver_proc_show	},
+	{ "identify",	S_IFREG|S_IRUSR,	 ide_identify_proc_show	},
+	{ "media",	S_IFREG|S_IRUGO,	 ide_media_proc_show	},
+	{ "model",	S_IFREG|S_IRUGO,	 ide_dmodel_proc_show	},
 	{}
 };
 
@@ -628,7 +477,7 @@ static void ide_add_proc_entries(struct proc_dir_entry *dir, ide_proc_entry_t *p
 	if (!dir || !p)
 		return;
 	while (p->name != NULL) {
-		ent = proc_create_data(p->name, p->mode, dir, p->proc_fops, data);
+		ent = proc_create_single_data(p->name, p->mode, dir, p->show, data);
 		if (!ent) return;
 		p++;
 	}
@@ -693,8 +542,12 @@ void ide_proc_port_register_devices(ide_hwif_t *hwif)
 			continue;
 
 		drive->proc = proc_mkdir(drive->name, parent);
-		if (drive->proc)
+		if (drive->proc) {
 			ide_add_proc_entries(drive->proc, generic_drive_entries, drive);
+			proc_create_data("setting", S_IFREG|S_IRUSR|S_IWUSR,
+					drive->proc, &ide_settings_proc_fops,
+					drive);
+		}
 		sprintf(name, "ide%d/%s", (drive->name[2]-'a')/2, drive->name);
 		ent = proc_symlink(drive->name, proc_ide_root, name);
 		if (!ent) return;
@@ -704,6 +557,7 @@ void ide_proc_port_register_devices(ide_hwif_t *hwif)
 void ide_proc_unregister_device(ide_drive_t *drive)
 {
 	if (drive->proc) {
+		remove_proc_entry("settings", drive->proc);
 		ide_remove_proc_entries(drive->proc, generic_drive_entries);
 		remove_proc_entry(drive->name, proc_ide_root);
 		remove_proc_entry(drive->name, drive->hwif->proc);
@@ -712,9 +566,9 @@ void ide_proc_unregister_device(ide_drive_t *drive)
 }
 
 static ide_proc_entry_t hwif_entries[] = {
-	{ "channel",	S_IFREG|S_IRUGO,	&ide_channel_proc_fops	},
-	{ "mate",	S_IFREG|S_IRUGO,	&ide_mate_proc_fops	},
-	{ "model",	S_IFREG|S_IRUGO,	&ide_imodel_proc_fops	},
+	{ "channel",	S_IFREG|S_IRUGO,	ide_channel_proc_show	},
+	{ "mate",	S_IFREG|S_IRUGO,	ide_mate_proc_show	},
+	{ "model",	S_IFREG|S_IRUGO,	ide_imodel_proc_show	},
 	{}
 };
 
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index fd57e8c..aee7b46 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -854,7 +854,7 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size)
 	BUG_ON(cmd != REQ_IDETAPE_READ && cmd != REQ_IDETAPE_WRITE);
 	BUG_ON(size < 0 || size % tape->blk_size);
 
-	rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM);
+	rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0);
 	ide_req(rq)->type = ATA_PRIV_MISC;
 	scsi_req(rq)->cmd[13] = cmd;
 	rq->rq_disk = tape->disk;
@@ -862,7 +862,7 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size)
 
 	if (size) {
 		ret = blk_rq_map_kern(drive->queue, rq, tape->buf, size,
-				      __GFP_RECLAIM);
+				      GFP_NOIO);
 		if (ret)
 			goto out_put;
 	}
@@ -1847,22 +1847,9 @@ static int idetape_name_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int idetape_name_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, idetape_name_proc_show, PDE_DATA(inode));
-}
-
-static const struct file_operations idetape_name_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= idetape_name_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static ide_proc_entry_t idetape_proc[] = {
-	{ "capacity",	S_IFREG|S_IRUGO,	&ide_capacity_proc_fops	},
-	{ "name",	S_IFREG|S_IRUGO,	&idetape_name_proc_fops	},
+	{ "capacity",	S_IFREG|S_IRUGO,	ide_capacity_proc_show	},
+	{ "name",	S_IFREG|S_IRUGO,	idetape_name_proc_show	},
 	{}
 };
 
diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
index abe0822..c034cd9 100644
--- a/drivers/ide/ide-taskfile.c
+++ b/drivers/ide/ide-taskfile.c
@@ -431,7 +431,7 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf,
 
 	rq = blk_get_request(drive->queue,
 		(cmd->tf_flags & IDE_TFLAG_WRITE) ?
-			REQ_OP_DRV_OUT : REQ_OP_DRV_IN, __GFP_RECLAIM);
+			REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
 	ide_req(rq)->type = ATA_PRIV_TASKFILE;
 
 	/*
@@ -442,7 +442,7 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf,
 	 */
 	if (nsect) {
 		error = blk_rq_map_kern(drive->queue, rq, buf,
-					nsect * SECTOR_SIZE, __GFP_RECLAIM);
+					nsect * SECTOR_SIZE, GFP_NOIO);
 		if (error)
 			goto put_req;
 	}
diff --git a/drivers/input/misc/hp_sdc_rtc.c b/drivers/input/misc/hp_sdc_rtc.c
index 49b34de0..47eb8ca 100644
--- a/drivers/input/misc/hp_sdc_rtc.c
+++ b/drivers/input/misc/hp_sdc_rtc.c
@@ -509,18 +509,6 @@ static int hp_sdc_rtc_proc_show(struct seq_file *m, void *v)
 #undef NY
 }
 
-static int hp_sdc_rtc_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, hp_sdc_rtc_proc_show, NULL);
-}
-
-static const struct file_operations hp_sdc_rtc_proc_fops = {
-	.open		= hp_sdc_rtc_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int hp_sdc_rtc_ioctl(struct file *file, 
 			    unsigned int cmd, unsigned long arg)
 {
@@ -713,7 +701,7 @@ static int __init hp_sdc_rtc_init(void)
 	if (misc_register(&hp_sdc_rtc_dev) != 0)
 		printk(KERN_INFO "Could not register misc. dev for i8042 rtc\n");
 
-        proc_create("driver/rtc", 0, NULL, &hp_sdc_rtc_proc_fops);
+        proc_create_single("driver/rtc", 0, NULL, hp_sdc_rtc_proc_show);
 
 	printk(KERN_INFO "HP i8042 SDC + MSM-58321 RTC support loaded "
 			 "(RTC v " RTC_VERSION ")\n");
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index df171cb..5b714a0 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -146,6 +146,7 @@ config INTEL_IOMMU
 	select DMA_DIRECT_OPS
 	select IOMMU_API
 	select IOMMU_IOVA
+	select NEED_DMA_MAP_STATE
 	select DMAR_TABLE
 	help
 	  DMA remapping (DMAR) devices support enables independent address
diff --git a/drivers/isdn/capi/capi.c b/drivers/isdn/capi/capi.c
index 19cd937..baa1ee2 100644
--- a/drivers/isdn/capi/capi.c
+++ b/drivers/isdn/capi/capi.c
@@ -1340,19 +1340,6 @@ static int capi20_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int capi20_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, capi20_proc_show, NULL);
-}
-
-static const struct file_operations capi20_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= capi20_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 /*
  * /proc/capi/capi20ncci:
  *  applid ncci
@@ -1373,23 +1360,10 @@ static int capi20ncci_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int capi20ncci_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, capi20ncci_proc_show, NULL);
-}
-
-static const struct file_operations capi20ncci_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= capi20ncci_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static void __init proc_init(void)
 {
-	proc_create("capi/capi20", 0, NULL, &capi20_proc_fops);
-	proc_create("capi/capi20ncci", 0, NULL, &capi20ncci_proc_fops);
+	proc_create_single("capi/capi20", 0, NULL, capi20_proc_show);
+	proc_create_single("capi/capi20ncci", 0, NULL, capi20ncci_proc_show);
 }
 
 static void __exit proc_exit(void)
diff --git a/drivers/isdn/capi/capidrv.c b/drivers/isdn/capi/capidrv.c
index 49fef08..7ac5179 100644
--- a/drivers/isdn/capi/capidrv.c
+++ b/drivers/isdn/capi/capidrv.c
@@ -2460,22 +2460,9 @@ static int capidrv_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int capidrv_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, capidrv_proc_show, NULL);
-}
-
-static const struct file_operations capidrv_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= capidrv_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static void __init proc_init(void)
 {
-	proc_create("capi/capidrv", 0, NULL, &capidrv_proc_fops);
+	proc_create_single("capi/capidrv", 0, NULL, capidrv_proc_show);
 }
 
 static void __exit proc_exit(void)
diff --git a/drivers/isdn/capi/kcapi.c b/drivers/isdn/capi/kcapi.c
index 46c189a..0ff517d 100644
--- a/drivers/isdn/capi/kcapi.c
+++ b/drivers/isdn/capi/kcapi.c
@@ -534,7 +534,8 @@ int attach_capi_ctr(struct capi_ctr *ctr)
 	init_waitqueue_head(&ctr->state_wait_queue);
 
 	sprintf(ctr->procfn, "capi/controllers/%d", ctr->cnr);
-	ctr->procent = proc_create_data(ctr->procfn, 0, NULL, ctr->proc_fops, ctr);
+	ctr->procent = proc_create_single_data(ctr->procfn, 0, NULL,
+			ctr->proc_show, ctr);
 
 	ncontrollers++;
 
diff --git a/drivers/isdn/capi/kcapi_proc.c b/drivers/isdn/capi/kcapi_proc.c
index 68db3c5..c94bd12 100644
--- a/drivers/isdn/capi/kcapi_proc.c
+++ b/drivers/isdn/capi/kcapi_proc.c
@@ -108,32 +108,6 @@ static const struct seq_operations seq_contrstats_ops = {
 	.show	= contrstats_show,
 };
 
-static int seq_controller_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &seq_controller_ops);
-}
-
-static int seq_contrstats_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &seq_contrstats_ops);
-}
-
-static const struct file_operations proc_controller_ops = {
-	.owner		= THIS_MODULE,
-	.open		= seq_controller_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
-static const struct file_operations proc_contrstats_ops = {
-	.owner		= THIS_MODULE,
-	.open		= seq_contrstats_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
 // /proc/capi/applications:
 //      applid l3cnt dblkcnt dblklen #ncci recvqueuelen
 // /proc/capi/applstats:
@@ -216,34 +190,6 @@ static const struct seq_operations seq_applstats_ops = {
 	.show	= applstats_show,
 };
 
-static int
-seq_applications_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &seq_applications_ops);
-}
-
-static int
-seq_applstats_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &seq_applstats_ops);
-}
-
-static const struct file_operations proc_applications_ops = {
-	.owner		= THIS_MODULE,
-	.open		= seq_applications_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
-static const struct file_operations proc_applstats_ops = {
-	.owner		= THIS_MODULE,
-	.open		= seq_applstats_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
 // ---------------------------------------------------------------------------
 
 static void *capi_driver_start(struct seq_file *seq, loff_t *pos)
@@ -279,22 +225,6 @@ static const struct seq_operations seq_capi_driver_ops = {
 	.show	= capi_driver_show,
 };
 
-static int
-seq_capi_driver_open(struct inode *inode, struct file *file)
-{
-	int err;
-	err = seq_open(file, &seq_capi_driver_ops);
-	return err;
-}
-
-static const struct file_operations proc_driver_ops = {
-	.owner		= THIS_MODULE,
-	.open		= seq_capi_driver_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
 // ---------------------------------------------------------------------------
 
 void __init
@@ -302,11 +232,11 @@ kcapi_proc_init(void)
 {
 	proc_mkdir("capi",             NULL);
 	proc_mkdir("capi/controllers", NULL);
-	proc_create("capi/controller",   0, NULL, &proc_controller_ops);
-	proc_create("capi/contrstats",   0, NULL, &proc_contrstats_ops);
-	proc_create("capi/applications", 0, NULL, &proc_applications_ops);
-	proc_create("capi/applstats",    0, NULL, &proc_applstats_ops);
-	proc_create("capi/driver",       0, NULL, &proc_driver_ops);
+	proc_create_seq("capi/controller",   0, NULL, &seq_controller_ops);
+	proc_create_seq("capi/contrstats",   0, NULL, &seq_contrstats_ops);
+	proc_create_seq("capi/applications", 0, NULL, &seq_applications_ops);
+	proc_create_seq("capi/applstats",    0, NULL, &seq_applstats_ops);
+	proc_create_seq("capi/driver",       0, NULL, &seq_capi_driver_ops);
 }
 
 void __exit
diff --git a/drivers/isdn/gigaset/capi.c b/drivers/isdn/gigaset/capi.c
index ccec777..56748af 100644
--- a/drivers/isdn/gigaset/capi.c
+++ b/drivers/isdn/gigaset/capi.c
@@ -2437,19 +2437,6 @@ static int gigaset_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int gigaset_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, gigaset_proc_show, PDE_DATA(inode));
-}
-
-static const struct file_operations gigaset_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= gigaset_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 /**
  * gigaset_isdn_regdev() - register device to LL
  * @cs:		device descriptor structure.
@@ -2479,7 +2466,7 @@ int gigaset_isdn_regdev(struct cardstate *cs, const char *isdnid)
 	iif->ctr.release_appl  = gigaset_release_appl;
 	iif->ctr.send_message  = gigaset_send_message;
 	iif->ctr.procinfo      = gigaset_procinfo;
-	iif->ctr.proc_fops = &gigaset_proc_fops;
+	iif->ctr.proc_show     = gigaset_proc_show,
 	INIT_LIST_HEAD(&iif->appls);
 	skb_queue_head_init(&iif->sendqueue);
 	atomic_set(&iif->sendqlen, 0);
diff --git a/drivers/isdn/hardware/avm/avmcard.h b/drivers/isdn/hardware/avm/avmcard.h
index c95712d..cdfa89c 100644
--- a/drivers/isdn/hardware/avm/avmcard.h
+++ b/drivers/isdn/hardware/avm/avmcard.h
@@ -556,7 +556,7 @@ u16  b1_send_message(struct capi_ctr *ctrl, struct sk_buff *skb);
 void b1_parse_version(avmctrl_info *card);
 irqreturn_t b1_interrupt(int interrupt, void *devptr);
 
-extern const struct file_operations b1ctl_proc_fops;
+int b1_proc_show(struct seq_file *m, void *v);
 
 avmcard_dmainfo *avmcard_dma_alloc(char *name, struct pci_dev *,
 				   long rsize, long ssize);
@@ -576,6 +576,6 @@ void b1dma_register_appl(struct capi_ctr *ctrl,
 			 capi_register_params *rp);
 void b1dma_release_appl(struct capi_ctr *ctrl, u16 appl);
 u16  b1dma_send_message(struct capi_ctr *ctrl, struct sk_buff *skb);
-extern const struct file_operations b1dmactl_proc_fops;
+int b1dma_proc_show(struct seq_file *m, void *v);
 
 #endif /* _AVMCARD_H_ */
diff --git a/drivers/isdn/hardware/avm/b1.c b/drivers/isdn/hardware/avm/b1.c
index b1833d0..5ee5489 100644
--- a/drivers/isdn/hardware/avm/b1.c
+++ b/drivers/isdn/hardware/avm/b1.c
@@ -637,7 +637,7 @@ irqreturn_t b1_interrupt(int interrupt, void *devptr)
 }
 
 /* ------------------------------------------------------------- */
-static int b1ctl_proc_show(struct seq_file *m, void *v)
+int b1_proc_show(struct seq_file *m, void *v)
 {
 	struct capi_ctr *ctrl = m->private;
 	avmctrl_info *cinfo = (avmctrl_info *)(ctrl->driverdata);
@@ -699,20 +699,7 @@ static int b1ctl_proc_show(struct seq_file *m, void *v)
 
 	return 0;
 }
-
-static int b1ctl_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, b1ctl_proc_show, PDE_DATA(inode));
-}
-
-const struct file_operations b1ctl_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= b1ctl_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-EXPORT_SYMBOL(b1ctl_proc_fops);
+EXPORT_SYMBOL(b1_proc_show);
 
 /* ------------------------------------------------------------- */
 
diff --git a/drivers/isdn/hardware/avm/b1dma.c b/drivers/isdn/hardware/avm/b1dma.c
index 9538a9e..6a3dc99 100644
--- a/drivers/isdn/hardware/avm/b1dma.c
+++ b/drivers/isdn/hardware/avm/b1dma.c
@@ -858,7 +858,7 @@ u16 b1dma_send_message(struct capi_ctr *ctrl, struct sk_buff *skb)
 
 /* ------------------------------------------------------------- */
 
-static int b1dmactl_proc_show(struct seq_file *m, void *v)
+int b1dma_proc_show(struct seq_file *m, void *v)
 {
 	struct capi_ctr *ctrl = m->private;
 	avmctrl_info *cinfo = (avmctrl_info *)(ctrl->driverdata);
@@ -941,20 +941,7 @@ static int b1dmactl_proc_show(struct seq_file *m, void *v)
 
 	return 0;
 }
-
-static int b1dmactl_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, b1dmactl_proc_show, PDE_DATA(inode));
-}
-
-const struct file_operations b1dmactl_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= b1dmactl_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-EXPORT_SYMBOL(b1dmactl_proc_fops);
+EXPORT_SYMBOL(b1dma_proc_show);
 
 /* ------------------------------------------------------------- */
 
diff --git a/drivers/isdn/hardware/avm/b1isa.c b/drivers/isdn/hardware/avm/b1isa.c
index 54e871a..cdfea72 100644
--- a/drivers/isdn/hardware/avm/b1isa.c
+++ b/drivers/isdn/hardware/avm/b1isa.c
@@ -121,7 +121,7 @@ static int b1isa_probe(struct pci_dev *pdev)
 	cinfo->capi_ctrl.load_firmware = b1_load_firmware;
 	cinfo->capi_ctrl.reset_ctr     = b1_reset_ctr;
 	cinfo->capi_ctrl.procinfo      = b1isa_procinfo;
-	cinfo->capi_ctrl.proc_fops = &b1ctl_proc_fops;
+	cinfo->capi_ctrl.proc_show     = b1_proc_show;
 	strcpy(cinfo->capi_ctrl.name, card->name);
 
 	retval = attach_capi_ctr(&cinfo->capi_ctrl);
diff --git a/drivers/isdn/hardware/avm/b1pci.c b/drivers/isdn/hardware/avm/b1pci.c
index ac4863c..b76b57a 100644
--- a/drivers/isdn/hardware/avm/b1pci.c
+++ b/drivers/isdn/hardware/avm/b1pci.c
@@ -112,7 +112,7 @@ static int b1pci_probe(struct capicardparams *p, struct pci_dev *pdev)
 	cinfo->capi_ctrl.load_firmware = b1_load_firmware;
 	cinfo->capi_ctrl.reset_ctr     = b1_reset_ctr;
 	cinfo->capi_ctrl.procinfo      = b1pci_procinfo;
-	cinfo->capi_ctrl.proc_fops = &b1ctl_proc_fops;
+	cinfo->capi_ctrl.proc_show     = b1_proc_show;
 	strcpy(cinfo->capi_ctrl.name, card->name);
 	cinfo->capi_ctrl.owner         = THIS_MODULE;
 
@@ -251,7 +251,7 @@ static int b1pciv4_probe(struct capicardparams *p, struct pci_dev *pdev)
 	cinfo->capi_ctrl.load_firmware = b1dma_load_firmware;
 	cinfo->capi_ctrl.reset_ctr     = b1dma_reset_ctr;
 	cinfo->capi_ctrl.procinfo      = b1pciv4_procinfo;
-	cinfo->capi_ctrl.proc_fops = &b1dmactl_proc_fops;
+	cinfo->capi_ctrl.proc_show     = b1dma_proc_show;
 	strcpy(cinfo->capi_ctrl.name, card->name);
 
 	retval = attach_capi_ctr(&cinfo->capi_ctrl);
diff --git a/drivers/isdn/hardware/avm/b1pcmcia.c b/drivers/isdn/hardware/avm/b1pcmcia.c
index 6b0d19d..3aca16e 100644
--- a/drivers/isdn/hardware/avm/b1pcmcia.c
+++ b/drivers/isdn/hardware/avm/b1pcmcia.c
@@ -108,7 +108,7 @@ static int b1pcmcia_add_card(unsigned int port, unsigned irq,
 	cinfo->capi_ctrl.load_firmware = b1_load_firmware;
 	cinfo->capi_ctrl.reset_ctr     = b1_reset_ctr;
 	cinfo->capi_ctrl.procinfo      = b1pcmcia_procinfo;
-	cinfo->capi_ctrl.proc_fops = &b1ctl_proc_fops;
+	cinfo->capi_ctrl.proc_show     = b1_proc_show;
 	strcpy(cinfo->capi_ctrl.name, card->name);
 
 	retval = attach_capi_ctr(&cinfo->capi_ctrl);
diff --git a/drivers/isdn/hardware/avm/c4.c b/drivers/isdn/hardware/avm/c4.c
index 034caba..ac72cd2 100644
--- a/drivers/isdn/hardware/avm/c4.c
+++ b/drivers/isdn/hardware/avm/c4.c
@@ -1127,19 +1127,6 @@ static int c4_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int c4_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, c4_proc_show, PDE_DATA(inode));
-}
-
-static const struct file_operations c4_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= c4_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 /* ------------------------------------------------------------- */
 
 static int c4_add_card(struct capicardparams *p, struct pci_dev *dev,
@@ -1211,7 +1198,7 @@ static int c4_add_card(struct capicardparams *p, struct pci_dev *dev,
 		cinfo->capi_ctrl.load_firmware = c4_load_firmware;
 		cinfo->capi_ctrl.reset_ctr     = c4_reset_ctr;
 		cinfo->capi_ctrl.procinfo      = c4_procinfo;
-		cinfo->capi_ctrl.proc_fops = &c4_proc_fops;
+		cinfo->capi_ctrl.proc_show     = c4_proc_show;
 		strcpy(cinfo->capi_ctrl.name, card->name);
 
 		retval = attach_capi_ctr(&cinfo->capi_ctrl);
diff --git a/drivers/isdn/hardware/avm/t1isa.c b/drivers/isdn/hardware/avm/t1isa.c
index 9f80d20..2153619 100644
--- a/drivers/isdn/hardware/avm/t1isa.c
+++ b/drivers/isdn/hardware/avm/t1isa.c
@@ -430,7 +430,7 @@ static int t1isa_probe(struct pci_dev *pdev, int cardnr)
 	cinfo->capi_ctrl.load_firmware = t1isa_load_firmware;
 	cinfo->capi_ctrl.reset_ctr     = t1isa_reset_ctr;
 	cinfo->capi_ctrl.procinfo      = t1isa_procinfo;
-	cinfo->capi_ctrl.proc_fops = &b1ctl_proc_fops;
+	cinfo->capi_ctrl.proc_show     = b1_proc_show;
 	strcpy(cinfo->capi_ctrl.name, card->name);
 
 	retval = attach_capi_ctr(&cinfo->capi_ctrl);
diff --git a/drivers/isdn/hardware/avm/t1pci.c b/drivers/isdn/hardware/avm/t1pci.c
index 2180b16..f5ed1d5 100644
--- a/drivers/isdn/hardware/avm/t1pci.c
+++ b/drivers/isdn/hardware/avm/t1pci.c
@@ -119,7 +119,7 @@ static int t1pci_add_card(struct capicardparams *p, struct pci_dev *pdev)
 	cinfo->capi_ctrl.load_firmware = b1dma_load_firmware;
 	cinfo->capi_ctrl.reset_ctr     = b1dma_reset_ctr;
 	cinfo->capi_ctrl.procinfo      = t1pci_procinfo;
-	cinfo->capi_ctrl.proc_fops = &b1dmactl_proc_fops;
+	cinfo->capi_ctrl.proc_show     = b1dma_proc_show;
 	strcpy(cinfo->capi_ctrl.name, card->name);
 
 	retval = attach_capi_ctr(&cinfo->capi_ctrl);
diff --git a/drivers/isdn/hardware/eicon/capimain.c b/drivers/isdn/hardware/eicon/capimain.c
index be36d82..f9244dc 100644
--- a/drivers/isdn/hardware/eicon/capimain.c
+++ b/drivers/isdn/hardware/eicon/capimain.c
@@ -90,19 +90,6 @@ static int diva_ctl_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int diva_ctl_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, diva_ctl_proc_show, NULL);
-}
-
-static const struct file_operations diva_ctl_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= diva_ctl_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 /*
  * set additional os settings in capi_ctr struct
  */
@@ -111,7 +98,7 @@ void diva_os_set_controller_struct(struct capi_ctr *ctrl)
 	ctrl->driver_name = DRIVERLNAME;
 	ctrl->load_firmware = NULL;
 	ctrl->reset_ctr = NULL;
-	ctrl->proc_fops = &diva_ctl_proc_fops;
+	ctrl->proc_show = diva_ctl_proc_show;
 	ctrl->owner = THIS_MODULE;
 }
 
diff --git a/drivers/isdn/hardware/eicon/diva_didd.c b/drivers/isdn/hardware/eicon/diva_didd.c
index fab6ccf..60e7925 100644
--- a/drivers/isdn/hardware/eicon/diva_didd.c
+++ b/drivers/isdn/hardware/eicon/diva_didd.c
@@ -78,26 +78,13 @@ static int divadidd_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int divadidd_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, divadidd_proc_show, NULL);
-}
-
-static const struct file_operations divadidd_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= divadidd_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int __init create_proc(void)
 {
 	proc_net_eicon = proc_mkdir("eicon", init_net.proc_net);
 
 	if (proc_net_eicon) {
-		proc_didd = proc_create(DRIVERLNAME, S_IRUGO, proc_net_eicon,
-					&divadidd_proc_fops);
+		proc_didd = proc_create_single(DRIVERLNAME, S_IRUGO,
+				proc_net_eicon, divadidd_proc_show);
 		return (1);
 	}
 	return (0);
diff --git a/drivers/isdn/hardware/eicon/divasi.c b/drivers/isdn/hardware/eicon/divasi.c
index 525518c..e7081e0 100644
--- a/drivers/isdn/hardware/eicon/divasi.c
+++ b/drivers/isdn/hardware/eicon/divasi.c
@@ -101,23 +101,10 @@ static int um_idi_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int um_idi_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, um_idi_proc_show, NULL);
-}
-
-static const struct file_operations um_idi_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= um_idi_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int __init create_um_idi_proc(void)
 {
-	um_idi_proc_entry = proc_create(DRIVERLNAME, S_IRUGO, proc_net_eicon,
-					&um_idi_proc_fops);
+	um_idi_proc_entry = proc_create_single(DRIVERLNAME, S_IRUGO,
+			proc_net_eicon, um_idi_proc_show);
 	if (!um_idi_proc_entry)
 		return (0);
 	return (1);
diff --git a/drivers/isdn/hysdn/hycapi.c b/drivers/isdn/hysdn/hycapi.c
index eac0f51..a2c15cd 100644
--- a/drivers/isdn/hysdn/hycapi.c
+++ b/drivers/isdn/hysdn/hycapi.c
@@ -467,19 +467,6 @@ static int hycapi_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int hycapi_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, hycapi_proc_show, PDE_DATA(inode));
-}
-
-static const struct file_operations hycapi_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= hycapi_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 /**************************************************************
 hycapi_load_firmware
 
@@ -774,7 +761,7 @@ hycapi_capi_create(hysdn_card *card)
 		ctrl->load_firmware = hycapi_load_firmware;
 		ctrl->reset_ctr     = hycapi_reset_ctr;
 		ctrl->procinfo      = hycapi_procinfo;
-		ctrl->proc_fops = &hycapi_proc_fops;
+		ctrl->proc_show     = hycapi_proc_show;
 		strcpy(ctrl->name, cinfo->cardname);
 		ctrl->owner = THIS_MODULE;
 
diff --git a/drivers/isdn/mISDN/socket.c b/drivers/isdn/mISDN/socket.c
index 1f8f489..98f90aa 100644
--- a/drivers/isdn/mISDN/socket.c
+++ b/drivers/isdn/mISDN/socket.c
@@ -588,7 +588,7 @@ static const struct proto_ops data_sock_ops = {
 	.getname	= data_sock_getname,
 	.sendmsg	= mISDN_sock_sendmsg,
 	.recvmsg	= mISDN_sock_recvmsg,
-	.poll		= datagram_poll,
+	.poll_mask	= datagram_poll_mask,
 	.listen		= sock_no_listen,
 	.shutdown	= sock_no_shutdown,
 	.setsockopt	= data_sock_setsockopt,
@@ -745,7 +745,6 @@ static const struct proto_ops base_sock_ops = {
 	.getname	= sock_no_getname,
 	.sendmsg	= sock_no_sendmsg,
 	.recvmsg	= sock_no_recvmsg,
-	.poll		= sock_no_poll,
 	.listen		= sock_no_listen,
 	.shutdown	= sock_no_shutdown,
 	.setsockopt	= sock_no_setsockopt,
diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index 63171cd..60aa7bc 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -431,7 +431,7 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
 	return 0;
 err_sysfs:
 	if (tt->exit)
-		tt->exit(targetdata);
+		tt->exit(targetdata, true);
 err_init:
 	blk_cleanup_queue(tqueue);
 	tdisk->queue = NULL;
@@ -446,7 +446,7 @@ err_reserve:
 	return ret;
 }
 
-static void __nvm_remove_target(struct nvm_target *t)
+static void __nvm_remove_target(struct nvm_target *t, bool graceful)
 {
 	struct nvm_tgt_type *tt = t->type;
 	struct gendisk *tdisk = t->disk;
@@ -459,7 +459,7 @@ static void __nvm_remove_target(struct nvm_target *t)
 		tt->sysfs_exit(tdisk);
 
 	if (tt->exit)
-		tt->exit(tdisk->private_data);
+		tt->exit(tdisk->private_data, graceful);
 
 	nvm_remove_tgt_dev(t->dev, 1);
 	put_disk(tdisk);
@@ -489,7 +489,7 @@ static int nvm_remove_tgt(struct nvm_dev *dev, struct nvm_ioctl_remove *remove)
 		mutex_unlock(&dev->mlock);
 		return 1;
 	}
-	__nvm_remove_target(t);
+	__nvm_remove_target(t, true);
 	mutex_unlock(&dev->mlock);
 
 	return 0;
@@ -963,7 +963,7 @@ void nvm_unregister(struct nvm_dev *dev)
 	list_for_each_entry_safe(t, tmp, &dev->targets, list) {
 		if (t->dev->parent != dev)
 			continue;
-		__nvm_remove_target(t);
+		__nvm_remove_target(t, false);
 	}
 	mutex_unlock(&dev->mlock);
 
diff --git a/drivers/lightnvm/pblk-cache.c b/drivers/lightnvm/pblk-cache.c
index 29a2311..b1c6d7e 100644
--- a/drivers/lightnvm/pblk-cache.c
+++ b/drivers/lightnvm/pblk-cache.c
@@ -44,13 +44,15 @@ retry:
 		goto out;
 	}
 
-	if (unlikely(!bio_has_data(bio)))
-		goto out;
-
 	pblk_ppa_set_empty(&w_ctx.ppa);
 	w_ctx.flags = flags;
-	if (bio->bi_opf & REQ_PREFLUSH)
+	if (bio->bi_opf & REQ_PREFLUSH) {
 		w_ctx.flags |= PBLK_FLUSH_ENTRY;
+		pblk_write_kick(pblk);
+	}
+
+	if (unlikely(!bio_has_data(bio)))
+		goto out;
 
 	for (i = 0; i < nr_entries; i++) {
 		void *data = bio_data(bio);
diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
index 94d5d97..ed9cc97 100644
--- a/drivers/lightnvm/pblk-core.c
+++ b/drivers/lightnvm/pblk-core.c
@@ -40,7 +40,7 @@ static void pblk_line_mark_bb(struct work_struct *work)
 	}
 
 	kfree(ppa);
-	mempool_free(line_ws, pblk->gen_ws_pool);
+	mempool_free(line_ws, &pblk->gen_ws_pool);
 }
 
 static void pblk_mark_bb(struct pblk *pblk, struct pblk_line *line,
@@ -102,7 +102,7 @@ static void pblk_end_io_erase(struct nvm_rq *rqd)
 	struct pblk *pblk = rqd->private;
 
 	__pblk_end_io_erase(pblk, rqd);
-	mempool_free(rqd, pblk->e_rq_pool);
+	mempool_free(rqd, &pblk->e_rq_pool);
 }
 
 /*
@@ -237,15 +237,15 @@ struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int type)
 	switch (type) {
 	case PBLK_WRITE:
 	case PBLK_WRITE_INT:
-		pool = pblk->w_rq_pool;
+		pool = &pblk->w_rq_pool;
 		rq_size = pblk_w_rq_size;
 		break;
 	case PBLK_READ:
-		pool = pblk->r_rq_pool;
+		pool = &pblk->r_rq_pool;
 		rq_size = pblk_g_rq_size;
 		break;
 	default:
-		pool = pblk->e_rq_pool;
+		pool = &pblk->e_rq_pool;
 		rq_size = pblk_g_rq_size;
 	}
 
@@ -265,20 +265,22 @@ void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int type)
 	case PBLK_WRITE:
 		kfree(((struct pblk_c_ctx *)nvm_rq_to_pdu(rqd))->lun_bitmap);
 	case PBLK_WRITE_INT:
-		pool = pblk->w_rq_pool;
+		pool = &pblk->w_rq_pool;
 		break;
 	case PBLK_READ:
-		pool = pblk->r_rq_pool;
+		pool = &pblk->r_rq_pool;
 		break;
 	case PBLK_ERASE:
-		pool = pblk->e_rq_pool;
+		pool = &pblk->e_rq_pool;
 		break;
 	default:
 		pr_err("pblk: trying to free unknown rqd type\n");
 		return;
 	}
 
-	nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list);
+	if (rqd->meta_list)
+		nvm_dev_dma_free(dev->parent, rqd->meta_list,
+				rqd->dma_meta_list);
 	mempool_free(rqd, pool);
 }
 
@@ -292,7 +294,7 @@ void pblk_bio_free_pages(struct pblk *pblk, struct bio *bio, int off,
 
 	for (i = off; i < nr_pages + off; i++) {
 		bv = bio->bi_io_vec[i];
-		mempool_free(bv.bv_page, pblk->page_bio_pool);
+		mempool_free(bv.bv_page, &pblk->page_bio_pool);
 	}
 }
 
@@ -304,23 +306,23 @@ int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags,
 	int i, ret;
 
 	for (i = 0; i < nr_pages; i++) {
-		page = mempool_alloc(pblk->page_bio_pool, flags);
+		page = mempool_alloc(&pblk->page_bio_pool, flags);
 
 		ret = bio_add_pc_page(q, bio, page, PBLK_EXPOSED_PAGE_SIZE, 0);
 		if (ret != PBLK_EXPOSED_PAGE_SIZE) {
 			pr_err("pblk: could not add page to bio\n");
-			mempool_free(page, pblk->page_bio_pool);
+			mempool_free(page, &pblk->page_bio_pool);
 			goto err;
 		}
 	}
 
 	return 0;
 err:
-	pblk_bio_free_pages(pblk, bio, 0, i - 1);
+	pblk_bio_free_pages(pblk, bio, (bio->bi_vcnt - i), i);
 	return -1;
 }
 
-static void pblk_write_kick(struct pblk *pblk)
+void pblk_write_kick(struct pblk *pblk)
 {
 	wake_up_process(pblk->writer_ts);
 	mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(1000));
@@ -342,13 +344,6 @@ void pblk_write_should_kick(struct pblk *pblk)
 		pblk_write_kick(pblk);
 }
 
-void pblk_end_io_sync(struct nvm_rq *rqd)
-{
-	struct completion *waiting = rqd->private;
-
-	complete(waiting);
-}
-
 static void pblk_wait_for_meta(struct pblk *pblk)
 {
 	do {
@@ -380,7 +375,13 @@ struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line)
 
 	lockdep_assert_held(&line->lock);
 
-	if (!vsc) {
+	if (line->w_err_gc->has_write_err) {
+		if (line->gc_group != PBLK_LINEGC_WERR) {
+			line->gc_group = PBLK_LINEGC_WERR;
+			move_list = &l_mg->gc_werr_list;
+			pblk_rl_werr_line_in(&pblk->rl);
+		}
+	} else if (!vsc) {
 		if (line->gc_group != PBLK_LINEGC_FULL) {
 			line->gc_group = PBLK_LINEGC_FULL;
 			move_list = &l_mg->gc_full_list;
@@ -467,16 +468,13 @@ int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd)
 {
 	struct nvm_tgt_dev *dev = pblk->dev;
 
-#ifdef CONFIG_NVM_DEBUG
-	int ret;
+	atomic_inc(&pblk->inflight_io);
 
-	ret = pblk_check_io(pblk, rqd);
-	if (ret)
-		return ret;
+#ifdef CONFIG_NVM_DEBUG
+	if (pblk_check_io(pblk, rqd))
+		return NVM_IO_ERR;
 #endif
 
-	atomic_inc(&pblk->inflight_io);
-
 	return nvm_submit_io(dev, rqd);
 }
 
@@ -484,16 +482,13 @@ int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd)
 {
 	struct nvm_tgt_dev *dev = pblk->dev;
 
-#ifdef CONFIG_NVM_DEBUG
-	int ret;
+	atomic_inc(&pblk->inflight_io);
 
-	ret = pblk_check_io(pblk, rqd);
-	if (ret)
-		return ret;
+#ifdef CONFIG_NVM_DEBUG
+	if (pblk_check_io(pblk, rqd))
+		return NVM_IO_ERR;
 #endif
 
-	atomic_inc(&pblk->inflight_io);
-
 	return nvm_submit_io_sync(dev, rqd);
 }
 
@@ -856,9 +851,10 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line,
 	atomic_dec(&pblk->inflight_io);
 
 	if (rqd.error) {
-		if (dir == PBLK_WRITE)
+		if (dir == PBLK_WRITE) {
 			pblk_log_write_err(pblk, &rqd);
-		else if (dir == PBLK_READ)
+			ret = 1;
+		} else if (dir == PBLK_READ)
 			pblk_log_read_err(pblk, &rqd);
 	}
 
@@ -1071,6 +1067,25 @@ static int pblk_line_init_metadata(struct pblk *pblk, struct pblk_line *line,
 	return 1;
 }
 
+static int pblk_line_alloc_bitmaps(struct pblk *pblk, struct pblk_line *line)
+{
+	struct pblk_line_meta *lm = &pblk->lm;
+
+	line->map_bitmap = kzalloc(lm->sec_bitmap_len, GFP_KERNEL);
+	if (!line->map_bitmap)
+		return -ENOMEM;
+
+	/* will be initialized using bb info from map_bitmap */
+	line->invalid_bitmap = kmalloc(lm->sec_bitmap_len, GFP_KERNEL);
+	if (!line->invalid_bitmap) {
+		kfree(line->map_bitmap);
+		line->map_bitmap = NULL;
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
 /* For now lines are always assumed full lines. Thus, smeta former and current
  * lun bitmaps are omitted.
  */
@@ -1108,7 +1123,7 @@ static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line,
 
 	if (init && pblk_line_submit_smeta_io(pblk, line, off, PBLK_WRITE)) {
 		pr_debug("pblk: line smeta I/O failed. Retry\n");
-		return 1;
+		return 0;
 	}
 
 	bitmap_copy(line->invalid_bitmap, line->map_bitmap, lm->sec_per_line);
@@ -1174,19 +1189,9 @@ static int pblk_prepare_new_line(struct pblk *pblk, struct pblk_line *line)
 static int pblk_line_prepare(struct pblk *pblk, struct pblk_line *line)
 {
 	struct pblk_line_meta *lm = &pblk->lm;
+	int blk_in_line = atomic_read(&line->blk_in_line);
 	int blk_to_erase;
 
-	line->map_bitmap = kzalloc(lm->sec_bitmap_len, GFP_ATOMIC);
-	if (!line->map_bitmap)
-		return -ENOMEM;
-
-	/* will be initialized using bb info from map_bitmap */
-	line->invalid_bitmap = kmalloc(lm->sec_bitmap_len, GFP_ATOMIC);
-	if (!line->invalid_bitmap) {
-		kfree(line->map_bitmap);
-		return -ENOMEM;
-	}
-
 	/* Bad blocks do not need to be erased */
 	bitmap_copy(line->erase_bitmap, line->blk_bitmap, lm->blk_per_line);
 
@@ -1199,16 +1204,19 @@ static int pblk_line_prepare(struct pblk *pblk, struct pblk_line *line)
 		blk_to_erase = pblk_prepare_new_line(pblk, line);
 		line->state = PBLK_LINESTATE_FREE;
 	} else {
-		blk_to_erase = atomic_read(&line->blk_in_line);
+		blk_to_erase = blk_in_line;
 	}
 
-	if (line->state != PBLK_LINESTATE_FREE) {
-		kfree(line->map_bitmap);
-		kfree(line->invalid_bitmap);
+	if (blk_in_line < lm->min_blk_line) {
 		spin_unlock(&line->lock);
+		return -EAGAIN;
+	}
+
+	if (line->state != PBLK_LINESTATE_FREE) {
 		WARN(1, "pblk: corrupted line %d, state %d\n",
 							line->id, line->state);
-		return -EAGAIN;
+		spin_unlock(&line->lock);
+		return -EINTR;
 	}
 
 	line->state = PBLK_LINESTATE_OPEN;
@@ -1241,13 +1249,16 @@ int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line)
 	}
 	spin_unlock(&l_mg->free_lock);
 
-	pblk_rl_free_lines_dec(&pblk->rl, line, true);
+	ret = pblk_line_alloc_bitmaps(pblk, line);
+	if (ret)
+		return ret;
 
 	if (!pblk_line_init_bb(pblk, line, 0)) {
 		list_add(&line->list, &l_mg->free_list);
 		return -EINTR;
 	}
 
+	pblk_rl_free_lines_dec(&pblk->rl, line, true);
 	return 0;
 }
 
@@ -1259,6 +1270,24 @@ void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line)
 	line->emeta = NULL;
 }
 
+static void pblk_line_reinit(struct pblk_line *line)
+{
+	*line->vsc = cpu_to_le32(EMPTY_ENTRY);
+
+	line->map_bitmap = NULL;
+	line->invalid_bitmap = NULL;
+	line->smeta = NULL;
+	line->emeta = NULL;
+}
+
+void pblk_line_free(struct pblk_line *line)
+{
+	kfree(line->map_bitmap);
+	kfree(line->invalid_bitmap);
+
+	pblk_line_reinit(line);
+}
+
 struct pblk_line *pblk_line_get(struct pblk *pblk)
 {
 	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
@@ -1292,10 +1321,14 @@ retry:
 
 	ret = pblk_line_prepare(pblk, line);
 	if (ret) {
-		if (ret == -EAGAIN) {
+		switch (ret) {
+		case -EAGAIN:
+			list_add(&line->list, &l_mg->bad_list);
+			goto retry;
+		case -EINTR:
 			list_add(&line->list, &l_mg->corrupt_list);
 			goto retry;
-		} else {
+		default:
 			pr_err("pblk: failed to prepare line %d\n", line->id);
 			list_add(&line->list, &l_mg->free_list);
 			l_mg->nr_free_lines++;
@@ -1321,11 +1354,14 @@ retry:
 		return NULL;
 	}
 
+	retry_line->map_bitmap = line->map_bitmap;
+	retry_line->invalid_bitmap = line->invalid_bitmap;
 	retry_line->smeta = line->smeta;
 	retry_line->emeta = line->emeta;
 	retry_line->meta_line = line->meta_line;
 
-	pblk_line_free(pblk, line);
+	pblk_line_reinit(line);
+
 	l_mg->data_line = retry_line;
 	spin_unlock(&l_mg->free_lock);
 
@@ -1378,6 +1414,9 @@ struct pblk_line *pblk_line_get_first_data(struct pblk *pblk)
 	}
 	spin_unlock(&l_mg->free_lock);
 
+	if (pblk_line_alloc_bitmaps(pblk, line))
+		return NULL;
+
 	if (pblk_line_erase(pblk, line)) {
 		line = pblk_line_retry(pblk, line);
 		if (!line)
@@ -1449,7 +1488,7 @@ static void pblk_line_close_meta_sync(struct pblk *pblk)
 	flush_workqueue(pblk->close_wq);
 }
 
-void pblk_pipeline_stop(struct pblk *pblk)
+void __pblk_pipeline_flush(struct pblk *pblk)
 {
 	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
 	int ret;
@@ -1474,6 +1513,11 @@ void pblk_pipeline_stop(struct pblk *pblk)
 
 	flush_workqueue(pblk->bb_wq);
 	pblk_line_close_meta_sync(pblk);
+}
+
+void __pblk_pipeline_stop(struct pblk *pblk)
+{
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
 
 	spin_lock(&l_mg->free_lock);
 	pblk->state = PBLK_STATE_STOPPED;
@@ -1482,6 +1526,12 @@ void pblk_pipeline_stop(struct pblk *pblk)
 	spin_unlock(&l_mg->free_lock);
 }
 
+void pblk_pipeline_stop(struct pblk *pblk)
+{
+	__pblk_pipeline_flush(pblk);
+	__pblk_pipeline_stop(pblk);
+}
+
 struct pblk_line *pblk_line_replace_data(struct pblk *pblk)
 {
 	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
@@ -1511,6 +1561,9 @@ retry_erase:
 		goto retry_erase;
 	}
 
+	if (pblk_line_alloc_bitmaps(pblk, new))
+		return NULL;
+
 retry_setup:
 	if (!pblk_line_init_metadata(pblk, new, cur)) {
 		new = pblk_line_retry(pblk, new);
@@ -1550,19 +1603,6 @@ out:
 	return new;
 }
 
-void pblk_line_free(struct pblk *pblk, struct pblk_line *line)
-{
-	kfree(line->map_bitmap);
-	kfree(line->invalid_bitmap);
-
-	*line->vsc = cpu_to_le32(EMPTY_ENTRY);
-
-	line->map_bitmap = NULL;
-	line->invalid_bitmap = NULL;
-	line->smeta = NULL;
-	line->emeta = NULL;
-}
-
 static void __pblk_line_put(struct pblk *pblk, struct pblk_line *line)
 {
 	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
@@ -1572,9 +1612,14 @@ static void __pblk_line_put(struct pblk *pblk, struct pblk_line *line)
 	WARN_ON(line->state != PBLK_LINESTATE_GC);
 	line->state = PBLK_LINESTATE_FREE;
 	line->gc_group = PBLK_LINEGC_NONE;
-	pblk_line_free(pblk, line);
-	spin_unlock(&line->lock);
+	pblk_line_free(line);
+
+	if (line->w_err_gc->has_write_err) {
+		pblk_rl_werr_line_out(&pblk->rl);
+		line->w_err_gc->has_write_err = 0;
+	}
 
+	spin_unlock(&line->lock);
 	atomic_dec(&gc->pipeline_gc);
 
 	spin_lock(&l_mg->free_lock);
@@ -1593,7 +1638,7 @@ static void pblk_line_put_ws(struct work_struct *work)
 	struct pblk_line *line = line_put_ws->line;
 
 	__pblk_line_put(pblk, line);
-	mempool_free(line_put_ws, pblk->gen_ws_pool);
+	mempool_free(line_put_ws, &pblk->gen_ws_pool);
 }
 
 void pblk_line_put(struct kref *ref)
@@ -1610,7 +1655,7 @@ void pblk_line_put_wq(struct kref *ref)
 	struct pblk *pblk = line->pblk;
 	struct pblk_line_ws *line_put_ws;
 
-	line_put_ws = mempool_alloc(pblk->gen_ws_pool, GFP_ATOMIC);
+	line_put_ws = mempool_alloc(&pblk->gen_ws_pool, GFP_ATOMIC);
 	if (!line_put_ws)
 		return;
 
@@ -1737,11 +1782,34 @@ void pblk_line_close_meta(struct pblk *pblk, struct pblk_line *line)
 
 	spin_lock(&l_mg->close_lock);
 	spin_lock(&line->lock);
+
+	/* Update the in-memory start address for emeta, in case it has
+	 * shifted due to write errors
+	 */
+	if (line->emeta_ssec != line->cur_sec)
+		line->emeta_ssec = line->cur_sec;
+
 	list_add_tail(&line->list, &l_mg->emeta_list);
 	spin_unlock(&line->lock);
 	spin_unlock(&l_mg->close_lock);
 
 	pblk_line_should_sync_meta(pblk);
+
+
+}
+
+static void pblk_save_lba_list(struct pblk *pblk, struct pblk_line *line)
+{
+	struct pblk_line_meta *lm = &pblk->lm;
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+	unsigned int lba_list_size = lm->emeta_len[2];
+	struct pblk_w_err_gc *w_err_gc = line->w_err_gc;
+	struct pblk_emeta *emeta = line->emeta;
+
+	w_err_gc->lba_list = pblk_malloc(lba_list_size,
+					 l_mg->emeta_alloc_type, GFP_KERNEL);
+	memcpy(w_err_gc->lba_list, emeta_to_lbas(pblk, emeta->buf),
+				lba_list_size);
 }
 
 void pblk_line_close_ws(struct work_struct *work)
@@ -1750,9 +1818,16 @@ void pblk_line_close_ws(struct work_struct *work)
 									ws);
 	struct pblk *pblk = line_ws->pblk;
 	struct pblk_line *line = line_ws->line;
+	struct pblk_w_err_gc *w_err_gc = line->w_err_gc;
+
+	/* Write errors makes the emeta start address stored in smeta invalid,
+	 * so keep a copy of the lba list until we've gc'd the line
+	 */
+	if (w_err_gc->has_write_err)
+		pblk_save_lba_list(pblk, line);
 
 	pblk_line_close(pblk, line);
-	mempool_free(line_ws, pblk->gen_ws_pool);
+	mempool_free(line_ws, &pblk->gen_ws_pool);
 }
 
 void pblk_gen_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv,
@@ -1761,7 +1836,7 @@ void pblk_gen_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv,
 {
 	struct pblk_line_ws *line_ws;
 
-	line_ws = mempool_alloc(pblk->gen_ws_pool, gfp_mask);
+	line_ws = mempool_alloc(&pblk->gen_ws_pool, gfp_mask);
 
 	line_ws->pblk = pblk;
 	line_ws->line = line;
diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c
index 6851a5c..df88f1b 100644
--- a/drivers/lightnvm/pblk-gc.c
+++ b/drivers/lightnvm/pblk-gc.c
@@ -129,6 +129,53 @@ out:
 	kfree(gc_rq_ws);
 }
 
+static __le64 *get_lba_list_from_emeta(struct pblk *pblk,
+				       struct pblk_line *line)
+{
+	struct line_emeta *emeta_buf;
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+	struct pblk_line_meta *lm = &pblk->lm;
+	unsigned int lba_list_size = lm->emeta_len[2];
+	__le64 *lba_list;
+	int ret;
+
+	emeta_buf = pblk_malloc(lm->emeta_len[0],
+				l_mg->emeta_alloc_type, GFP_KERNEL);
+	if (!emeta_buf)
+		return NULL;
+
+	ret = pblk_line_read_emeta(pblk, line, emeta_buf);
+	if (ret) {
+		pr_err("pblk: line %d read emeta failed (%d)\n",
+				line->id, ret);
+		pblk_mfree(emeta_buf, l_mg->emeta_alloc_type);
+		return NULL;
+	}
+
+	/* If this read fails, it means that emeta is corrupted.
+	 * For now, leave the line untouched.
+	 * TODO: Implement a recovery routine that scans and moves
+	 * all sectors on the line.
+	 */
+
+	ret = pblk_recov_check_emeta(pblk, emeta_buf);
+	if (ret) {
+		pr_err("pblk: inconsistent emeta (line %d)\n",
+				line->id);
+		pblk_mfree(emeta_buf, l_mg->emeta_alloc_type);
+		return NULL;
+	}
+
+	lba_list = pblk_malloc(lba_list_size,
+			       l_mg->emeta_alloc_type, GFP_KERNEL);
+	if (lba_list)
+		memcpy(lba_list, emeta_to_lbas(pblk, emeta_buf), lba_list_size);
+
+	pblk_mfree(emeta_buf, l_mg->emeta_alloc_type);
+
+	return lba_list;
+}
+
 static void pblk_gc_line_prepare_ws(struct work_struct *work)
 {
 	struct pblk_line_ws *line_ws = container_of(work, struct pblk_line_ws,
@@ -138,46 +185,26 @@ static void pblk_gc_line_prepare_ws(struct work_struct *work)
 	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
 	struct pblk_line_meta *lm = &pblk->lm;
 	struct pblk_gc *gc = &pblk->gc;
-	struct line_emeta *emeta_buf;
 	struct pblk_line_ws *gc_rq_ws;
 	struct pblk_gc_rq *gc_rq;
 	__le64 *lba_list;
 	unsigned long *invalid_bitmap;
 	int sec_left, nr_secs, bit;
-	int ret;
 
 	invalid_bitmap = kmalloc(lm->sec_bitmap_len, GFP_KERNEL);
 	if (!invalid_bitmap)
 		goto fail_free_ws;
 
-	emeta_buf = pblk_malloc(lm->emeta_len[0], l_mg->emeta_alloc_type,
-								GFP_KERNEL);
-	if (!emeta_buf) {
-		pr_err("pblk: cannot use GC emeta\n");
-		goto fail_free_bitmap;
-	}
-
-	ret = pblk_line_read_emeta(pblk, line, emeta_buf);
-	if (ret) {
-		pr_err("pblk: line %d read emeta failed (%d)\n", line->id, ret);
-		goto fail_free_emeta;
-	}
-
-	/* If this read fails, it means that emeta is corrupted. For now, leave
-	 * the line untouched. TODO: Implement a recovery routine that scans and
-	 * moves all sectors on the line.
-	 */
-
-	ret = pblk_recov_check_emeta(pblk, emeta_buf);
-	if (ret) {
-		pr_err("pblk: inconsistent emeta (line %d)\n", line->id);
-		goto fail_free_emeta;
-	}
-
-	lba_list = emeta_to_lbas(pblk, emeta_buf);
-	if (!lba_list) {
-		pr_err("pblk: could not interpret emeta (line %d)\n", line->id);
-		goto fail_free_emeta;
+	if (line->w_err_gc->has_write_err) {
+		lba_list = line->w_err_gc->lba_list;
+		line->w_err_gc->lba_list = NULL;
+	} else {
+		lba_list = get_lba_list_from_emeta(pblk, line);
+		if (!lba_list) {
+			pr_err("pblk: could not interpret emeta (line %d)\n",
+					line->id);
+			goto fail_free_ws;
+		}
 	}
 
 	spin_lock(&line->lock);
@@ -187,14 +214,14 @@ static void pblk_gc_line_prepare_ws(struct work_struct *work)
 
 	if (sec_left < 0) {
 		pr_err("pblk: corrupted GC line (%d)\n", line->id);
-		goto fail_free_emeta;
+		goto fail_free_lba_list;
 	}
 
 	bit = -1;
 next_rq:
 	gc_rq = kmalloc(sizeof(struct pblk_gc_rq), GFP_KERNEL);
 	if (!gc_rq)
-		goto fail_free_emeta;
+		goto fail_free_lba_list;
 
 	nr_secs = 0;
 	do {
@@ -240,7 +267,7 @@ next_rq:
 		goto next_rq;
 
 out:
-	pblk_mfree(emeta_buf, l_mg->emeta_alloc_type);
+	pblk_mfree(lba_list, l_mg->emeta_alloc_type);
 	kfree(line_ws);
 	kfree(invalid_bitmap);
 
@@ -251,9 +278,8 @@ out:
 
 fail_free_gc_rq:
 	kfree(gc_rq);
-fail_free_emeta:
-	pblk_mfree(emeta_buf, l_mg->emeta_alloc_type);
-fail_free_bitmap:
+fail_free_lba_list:
+	pblk_mfree(lba_list, l_mg->emeta_alloc_type);
 	kfree(invalid_bitmap);
 fail_free_ws:
 	kfree(line_ws);
@@ -349,12 +375,14 @@ static struct pblk_line *pblk_gc_get_victim_line(struct pblk *pblk,
 static bool pblk_gc_should_run(struct pblk_gc *gc, struct pblk_rl *rl)
 {
 	unsigned int nr_blocks_free, nr_blocks_need;
+	unsigned int werr_lines = atomic_read(&rl->werr_lines);
 
 	nr_blocks_need = pblk_rl_high_thrs(rl);
 	nr_blocks_free = pblk_rl_nr_free_blks(rl);
 
 	/* This is not critical, no need to take lock here */
-	return ((gc->gc_active) && (nr_blocks_need > nr_blocks_free));
+	return ((werr_lines > 0) ||
+		((gc->gc_active) && (nr_blocks_need > nr_blocks_free)));
 }
 
 void pblk_gc_free_full_lines(struct pblk *pblk)
@@ -649,7 +677,7 @@ fail_free_main_kthread:
 	return ret;
 }
 
-void pblk_gc_exit(struct pblk *pblk)
+void pblk_gc_exit(struct pblk *pblk, bool graceful)
 {
 	struct pblk_gc *gc = &pblk->gc;
 
@@ -663,10 +691,12 @@ void pblk_gc_exit(struct pblk *pblk)
 	if (gc->gc_reader_ts)
 		kthread_stop(gc->gc_reader_ts);
 
-	flush_workqueue(gc->gc_reader_wq);
-	destroy_workqueue(gc->gc_reader_wq);
+	if (graceful) {
+		flush_workqueue(gc->gc_reader_wq);
+		flush_workqueue(gc->gc_line_reader_wq);
+	}
 
-	flush_workqueue(gc->gc_line_reader_wq);
+	destroy_workqueue(gc->gc_reader_wq);
 	destroy_workqueue(gc->gc_line_reader_wq);
 
 	if (gc->gc_writer_ts)
diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
index 91a5bc2..ce561f5 100644
--- a/drivers/lightnvm/pblk-init.c
+++ b/drivers/lightnvm/pblk-init.c
@@ -20,10 +20,15 @@
 
 #include "pblk.h"
 
+unsigned int write_buffer_size;
+
+module_param(write_buffer_size, uint, 0644);
+MODULE_PARM_DESC(write_buffer_size, "number of entries in a write buffer");
+
 static struct kmem_cache *pblk_ws_cache, *pblk_rec_cache, *pblk_g_rq_cache,
 				*pblk_w_rq_cache;
 static DECLARE_RWSEM(pblk_lock);
-struct bio_set *pblk_bio_set;
+struct bio_set pblk_bio_set;
 
 static int pblk_rw_io(struct request_queue *q, struct pblk *pblk,
 			  struct bio *bio)
@@ -127,10 +132,8 @@ static int pblk_l2p_recover(struct pblk *pblk, bool factory_init)
 	if (!line) {
 		/* Configure next line for user data */
 		line = pblk_line_get_first_data(pblk);
-		if (!line) {
-			pr_err("pblk: line list corrupted\n");
+		if (!line)
 			return -EFAULT;
-		}
 	}
 
 	return 0;
@@ -141,6 +144,7 @@ static int pblk_l2p_init(struct pblk *pblk, bool factory_init)
 	sector_t i;
 	struct ppa_addr ppa;
 	size_t map_size;
+	int ret = 0;
 
 	map_size = pblk_trans_map_size(pblk);
 	pblk->trans_map = vmalloc(map_size);
@@ -152,7 +156,11 @@ static int pblk_l2p_init(struct pblk *pblk, bool factory_init)
 	for (i = 0; i < pblk->rl.nr_secs; i++)
 		pblk_trans_map_set(pblk, i, ppa);
 
-	return pblk_l2p_recover(pblk, factory_init);
+	ret = pblk_l2p_recover(pblk, factory_init);
+	if (ret)
+		vfree(pblk->trans_map);
+
+	return ret;
 }
 
 static void pblk_rwb_free(struct pblk *pblk)
@@ -169,10 +177,15 @@ static int pblk_rwb_init(struct pblk *pblk)
 	struct nvm_tgt_dev *dev = pblk->dev;
 	struct nvm_geo *geo = &dev->geo;
 	struct pblk_rb_entry *entries;
-	unsigned long nr_entries;
+	unsigned long nr_entries, buffer_size;
 	unsigned int power_size, power_seg_sz;
 
-	nr_entries = pblk_rb_calculate_size(pblk->pgs_in_buffer);
+	if (write_buffer_size && (write_buffer_size > pblk->pgs_in_buffer))
+		buffer_size = write_buffer_size;
+	else
+		buffer_size = pblk->pgs_in_buffer;
+
+	nr_entries = pblk_rb_calculate_size(buffer_size);
 
 	entries = vzalloc(nr_entries * sizeof(struct pblk_rb_entry));
 	if (!entries)
@@ -341,7 +354,7 @@ static int pblk_core_init(struct pblk *pblk)
 {
 	struct nvm_tgt_dev *dev = pblk->dev;
 	struct nvm_geo *geo = &dev->geo;
-	int max_write_ppas;
+	int ret, max_write_ppas;
 
 	atomic64_set(&pblk->user_wa, 0);
 	atomic64_set(&pblk->pad_wa, 0);
@@ -375,33 +388,33 @@ static int pblk_core_init(struct pblk *pblk)
 		goto fail_free_pad_dist;
 
 	/* Internal bios can be at most the sectors signaled by the device. */
-	pblk->page_bio_pool = mempool_create_page_pool(NVM_MAX_VLBA, 0);
-	if (!pblk->page_bio_pool)
+	ret = mempool_init_page_pool(&pblk->page_bio_pool, NVM_MAX_VLBA, 0);
+	if (ret)
 		goto free_global_caches;
 
-	pblk->gen_ws_pool = mempool_create_slab_pool(PBLK_GEN_WS_POOL_SIZE,
-							pblk_ws_cache);
-	if (!pblk->gen_ws_pool)
+	ret = mempool_init_slab_pool(&pblk->gen_ws_pool, PBLK_GEN_WS_POOL_SIZE,
+				     pblk_ws_cache);
+	if (ret)
 		goto free_page_bio_pool;
 
-	pblk->rec_pool = mempool_create_slab_pool(geo->all_luns,
-							pblk_rec_cache);
-	if (!pblk->rec_pool)
+	ret = mempool_init_slab_pool(&pblk->rec_pool, geo->all_luns,
+				     pblk_rec_cache);
+	if (ret)
 		goto free_gen_ws_pool;
 
-	pblk->r_rq_pool = mempool_create_slab_pool(geo->all_luns,
-							pblk_g_rq_cache);
-	if (!pblk->r_rq_pool)
+	ret = mempool_init_slab_pool(&pblk->r_rq_pool, geo->all_luns,
+				     pblk_g_rq_cache);
+	if (ret)
 		goto free_rec_pool;
 
-	pblk->e_rq_pool = mempool_create_slab_pool(geo->all_luns,
-							pblk_g_rq_cache);
-	if (!pblk->e_rq_pool)
+	ret = mempool_init_slab_pool(&pblk->e_rq_pool, geo->all_luns,
+				     pblk_g_rq_cache);
+	if (ret)
 		goto free_r_rq_pool;
 
-	pblk->w_rq_pool = mempool_create_slab_pool(geo->all_luns,
-							pblk_w_rq_cache);
-	if (!pblk->w_rq_pool)
+	ret = mempool_init_slab_pool(&pblk->w_rq_pool, geo->all_luns,
+				     pblk_w_rq_cache);
+	if (ret)
 		goto free_e_rq_pool;
 
 	pblk->close_wq = alloc_workqueue("pblk-close-wq",
@@ -423,6 +436,7 @@ static int pblk_core_init(struct pblk *pblk)
 		goto free_r_end_wq;
 
 	INIT_LIST_HEAD(&pblk->compl_list);
+	INIT_LIST_HEAD(&pblk->resubmit_list);
 
 	return 0;
 
@@ -433,17 +447,17 @@ free_bb_wq:
 free_close_wq:
 	destroy_workqueue(pblk->close_wq);
 free_w_rq_pool:
-	mempool_destroy(pblk->w_rq_pool);
+	mempool_exit(&pblk->w_rq_pool);
 free_e_rq_pool:
-	mempool_destroy(pblk->e_rq_pool);
+	mempool_exit(&pblk->e_rq_pool);
 free_r_rq_pool:
-	mempool_destroy(pblk->r_rq_pool);
+	mempool_exit(&pblk->r_rq_pool);
 free_rec_pool:
-	mempool_destroy(pblk->rec_pool);
+	mempool_exit(&pblk->rec_pool);
 free_gen_ws_pool:
-	mempool_destroy(pblk->gen_ws_pool);
+	mempool_exit(&pblk->gen_ws_pool);
 free_page_bio_pool:
-	mempool_destroy(pblk->page_bio_pool);
+	mempool_exit(&pblk->page_bio_pool);
 free_global_caches:
 	pblk_free_global_caches(pblk);
 fail_free_pad_dist:
@@ -462,12 +476,12 @@ static void pblk_core_free(struct pblk *pblk)
 	if (pblk->bb_wq)
 		destroy_workqueue(pblk->bb_wq);
 
-	mempool_destroy(pblk->page_bio_pool);
-	mempool_destroy(pblk->gen_ws_pool);
-	mempool_destroy(pblk->rec_pool);
-	mempool_destroy(pblk->r_rq_pool);
-	mempool_destroy(pblk->e_rq_pool);
-	mempool_destroy(pblk->w_rq_pool);
+	mempool_exit(&pblk->page_bio_pool);
+	mempool_exit(&pblk->gen_ws_pool);
+	mempool_exit(&pblk->rec_pool);
+	mempool_exit(&pblk->r_rq_pool);
+	mempool_exit(&pblk->e_rq_pool);
+	mempool_exit(&pblk->w_rq_pool);
 
 	pblk_free_global_caches(pblk);
 	kfree(pblk->pad_dist);
@@ -489,11 +503,17 @@ static void pblk_line_mg_free(struct pblk *pblk)
 	}
 }
 
-static void pblk_line_meta_free(struct pblk_line *line)
+static void pblk_line_meta_free(struct pblk_line_mgmt *l_mg,
+				struct pblk_line *line)
 {
+	struct pblk_w_err_gc *w_err_gc = line->w_err_gc;
+
 	kfree(line->blk_bitmap);
 	kfree(line->erase_bitmap);
 	kfree(line->chks);
+
+	pblk_mfree(w_err_gc->lba_list, l_mg->emeta_alloc_type);
+	kfree(w_err_gc);
 }
 
 static void pblk_lines_free(struct pblk *pblk)
@@ -506,8 +526,8 @@ static void pblk_lines_free(struct pblk *pblk)
 	for (i = 0; i < l_mg->nr_lines; i++) {
 		line = &pblk->lines[i];
 
-		pblk_line_free(pblk, line);
-		pblk_line_meta_free(line);
+		pblk_line_free(line);
+		pblk_line_meta_free(l_mg, line);
 	}
 	spin_unlock(&l_mg->free_lock);
 
@@ -748,14 +768,14 @@ static int pblk_setup_line_meta_20(struct pblk *pblk, struct pblk_line *line,
 		chunk->cnlb = chunk_meta->cnlb;
 		chunk->wp = chunk_meta->wp;
 
-		if (!(chunk->state & NVM_CHK_ST_OFFLINE))
-			continue;
-
 		if (chunk->type & NVM_CHK_TP_SZ_SPEC) {
 			WARN_ONCE(1, "pblk: custom-sized chunks unsupported\n");
 			continue;
 		}
 
+		if (!(chunk->state & NVM_CHK_ST_OFFLINE))
+			continue;
+
 		set_bit(pos, line->blk_bitmap);
 		nr_bad_chks++;
 	}
@@ -809,20 +829,28 @@ static int pblk_alloc_line_meta(struct pblk *pblk, struct pblk_line *line)
 		return -ENOMEM;
 
 	line->erase_bitmap = kzalloc(lm->blk_bitmap_len, GFP_KERNEL);
-	if (!line->erase_bitmap) {
-		kfree(line->blk_bitmap);
-		return -ENOMEM;
-	}
+	if (!line->erase_bitmap)
+		goto free_blk_bitmap;
+
 
 	line->chks = kmalloc(lm->blk_per_line * sizeof(struct nvm_chk_meta),
 								GFP_KERNEL);
-	if (!line->chks) {
-		kfree(line->erase_bitmap);
-		kfree(line->blk_bitmap);
-		return -ENOMEM;
-	}
+	if (!line->chks)
+		goto free_erase_bitmap;
+
+	line->w_err_gc = kzalloc(sizeof(struct pblk_w_err_gc), GFP_KERNEL);
+	if (!line->w_err_gc)
+		goto free_chks;
 
 	return 0;
+
+free_chks:
+	kfree(line->chks);
+free_erase_bitmap:
+	kfree(line->erase_bitmap);
+free_blk_bitmap:
+	kfree(line->blk_bitmap);
+	return -ENOMEM;
 }
 
 static int pblk_line_mg_init(struct pblk *pblk)
@@ -847,12 +875,14 @@ static int pblk_line_mg_init(struct pblk *pblk)
 	INIT_LIST_HEAD(&l_mg->gc_mid_list);
 	INIT_LIST_HEAD(&l_mg->gc_low_list);
 	INIT_LIST_HEAD(&l_mg->gc_empty_list);
+	INIT_LIST_HEAD(&l_mg->gc_werr_list);
 
 	INIT_LIST_HEAD(&l_mg->emeta_list);
 
-	l_mg->gc_lists[0] = &l_mg->gc_high_list;
-	l_mg->gc_lists[1] = &l_mg->gc_mid_list;
-	l_mg->gc_lists[2] = &l_mg->gc_low_list;
+	l_mg->gc_lists[0] = &l_mg->gc_werr_list;
+	l_mg->gc_lists[1] = &l_mg->gc_high_list;
+	l_mg->gc_lists[2] = &l_mg->gc_mid_list;
+	l_mg->gc_lists[3] = &l_mg->gc_low_list;
 
 	spin_lock_init(&l_mg->free_lock);
 	spin_lock_init(&l_mg->close_lock);
@@ -1047,6 +1077,11 @@ static int pblk_lines_init(struct pblk *pblk)
 		nr_free_chks += pblk_setup_line_meta(pblk, line, chunk_meta, i);
 	}
 
+	if (!nr_free_chks) {
+		pr_err("pblk: too many bad blocks prevent for sane instance\n");
+		return -EINTR;
+	}
+
 	pblk_set_provision(pblk, nr_free_chks);
 
 	kfree(chunk_meta);
@@ -1054,7 +1089,7 @@ static int pblk_lines_init(struct pblk *pblk)
 
 fail_free_lines:
 	while (--i >= 0)
-		pblk_line_meta_free(&pblk->lines[i]);
+		pblk_line_meta_free(l_mg, &pblk->lines[i]);
 	kfree(pblk->lines);
 fail_free_chunk_meta:
 	kfree(chunk_meta);
@@ -1110,23 +1145,25 @@ static void pblk_free(struct pblk *pblk)
 	kfree(pblk);
 }
 
-static void pblk_tear_down(struct pblk *pblk)
+static void pblk_tear_down(struct pblk *pblk, bool graceful)
 {
-	pblk_pipeline_stop(pblk);
+	if (graceful)
+		__pblk_pipeline_flush(pblk);
+	__pblk_pipeline_stop(pblk);
 	pblk_writer_stop(pblk);
 	pblk_rb_sync_l2p(&pblk->rwb);
 	pblk_rl_free(&pblk->rl);
 
-	pr_debug("pblk: consistent tear down\n");
+	pr_debug("pblk: consistent tear down (graceful:%d)\n", graceful);
 }
 
-static void pblk_exit(void *private)
+static void pblk_exit(void *private, bool graceful)
 {
 	struct pblk *pblk = private;
 
 	down_write(&pblk_lock);
-	pblk_gc_exit(pblk);
-	pblk_tear_down(pblk);
+	pblk_gc_exit(pblk, graceful);
+	pblk_tear_down(pblk, graceful);
 
 #ifdef CONFIG_NVM_DEBUG
 	pr_info("pblk exit: L2P CRC: %x\n", pblk_l2p_crc(pblk));
@@ -1175,6 +1212,7 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
 	pblk->state = PBLK_STATE_RUNNING;
 	pblk->gc.gc_enabled = 0;
 
+	spin_lock_init(&pblk->resubmit_lock);
 	spin_lock_init(&pblk->trans_lock);
 	spin_lock_init(&pblk->lock);
 
@@ -1297,18 +1335,18 @@ static int __init pblk_module_init(void)
 {
 	int ret;
 
-	pblk_bio_set = bioset_create(BIO_POOL_SIZE, 0, 0);
-	if (!pblk_bio_set)
-		return -ENOMEM;
+	ret = bioset_init(&pblk_bio_set, BIO_POOL_SIZE, 0, 0);
+	if (ret)
+		return ret;
 	ret = nvm_register_tgt_type(&tt_pblk);
 	if (ret)
-		bioset_free(pblk_bio_set);
+		bioset_exit(&pblk_bio_set);
 	return ret;
 }
 
 static void pblk_module_exit(void)
 {
-	bioset_free(pblk_bio_set);
+	bioset_exit(&pblk_bio_set);
 	nvm_unregister_tgt_type(&tt_pblk);
 }
 
diff --git a/drivers/lightnvm/pblk-map.c b/drivers/lightnvm/pblk-map.c
index 20dbaa8..953ca31 100644
--- a/drivers/lightnvm/pblk-map.c
+++ b/drivers/lightnvm/pblk-map.c
@@ -18,11 +18,11 @@
 
 #include "pblk.h"
 
-static void pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
-			       struct ppa_addr *ppa_list,
-			       unsigned long *lun_bitmap,
-			       struct pblk_sec_meta *meta_list,
-			       unsigned int valid_secs)
+static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
+			      struct ppa_addr *ppa_list,
+			      unsigned long *lun_bitmap,
+			      struct pblk_sec_meta *meta_list,
+			      unsigned int valid_secs)
 {
 	struct pblk_line *line = pblk_line_get_data(pblk);
 	struct pblk_emeta *emeta;
@@ -35,8 +35,14 @@ static void pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
 	if (pblk_line_is_full(line)) {
 		struct pblk_line *prev_line = line;
 
+		/* If we cannot allocate a new line, make sure to store metadata
+		 * on current line and then fail
+		 */
 		line = pblk_line_replace_data(pblk);
 		pblk_line_close_meta(pblk, prev_line);
+
+		if (!line)
+			return -EINTR;
 	}
 
 	emeta = line->emeta;
@@ -74,6 +80,7 @@ static void pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
 	}
 
 	pblk_down_rq(pblk, ppa_list, nr_secs, lun_bitmap);
+	return 0;
 }
 
 void pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry,
@@ -87,8 +94,12 @@ void pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry,
 
 	for (i = off; i < rqd->nr_ppas; i += min) {
 		map_secs = (i + min > valid_secs) ? (valid_secs % min) : min;
-		pblk_map_page_data(pblk, sentry + i, &rqd->ppa_list[i],
-					lun_bitmap, &meta_list[i], map_secs);
+		if (pblk_map_page_data(pblk, sentry + i, &rqd->ppa_list[i],
+					lun_bitmap, &meta_list[i], map_secs)) {
+			bio_put(rqd->bio);
+			pblk_free_rqd(pblk, rqd, PBLK_WRITE);
+			pblk_pipeline_stop(pblk);
+		}
 	}
 }
 
@@ -108,8 +119,12 @@ void pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
 
 	for (i = 0; i < rqd->nr_ppas; i += min) {
 		map_secs = (i + min > valid_secs) ? (valid_secs % min) : min;
-		pblk_map_page_data(pblk, sentry + i, &rqd->ppa_list[i],
-					lun_bitmap, &meta_list[i], map_secs);
+		if (pblk_map_page_data(pblk, sentry + i, &rqd->ppa_list[i],
+					lun_bitmap, &meta_list[i], map_secs)) {
+			bio_put(rqd->bio);
+			pblk_free_rqd(pblk, rqd, PBLK_WRITE);
+			pblk_pipeline_stop(pblk);
+		}
 
 		erase_lun = pblk_ppa_to_pos(geo, rqd->ppa_list[i]);
 
diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c
index 52fdd85..00cd1f2 100644
--- a/drivers/lightnvm/pblk-rb.c
+++ b/drivers/lightnvm/pblk-rb.c
@@ -142,10 +142,9 @@ static void clean_wctx(struct pblk_w_ctx *w_ctx)
 {
 	int flags;
 
-try:
 	flags = READ_ONCE(w_ctx->flags);
-	if (!(flags & PBLK_SUBMITTED_ENTRY))
-		goto try;
+	WARN_ONCE(!(flags & PBLK_SUBMITTED_ENTRY),
+			"pblk: overwriting unsubmitted data\n");
 
 	/* Release flags on context. Protect from writes and reads */
 	smp_store_release(&w_ctx->flags, PBLK_WRITABLE_ENTRY);
@@ -350,7 +349,7 @@ void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data,
 }
 
 static int pblk_rb_flush_point_set(struct pblk_rb *rb, struct bio *bio,
-				  unsigned int pos)
+				   unsigned int pos)
 {
 	struct pblk_rb_entry *entry;
 	unsigned int sync, flush_point;
@@ -420,7 +419,7 @@ void pblk_rb_flush(struct pblk_rb *rb)
 	if (pblk_rb_flush_point_set(rb, NULL, mem))
 		return;
 
-	pblk_write_should_kick(pblk);
+	pblk_write_kick(pblk);
 }
 
 static int pblk_rb_may_write_flush(struct pblk_rb *rb, unsigned int nr_entries,
@@ -504,45 +503,6 @@ int pblk_rb_may_write_gc(struct pblk_rb *rb, unsigned int nr_entries,
 }
 
 /*
- * The caller of this function must ensure that the backpointer will not
- * overwrite the entries passed on the list.
- */
-unsigned int pblk_rb_read_to_bio_list(struct pblk_rb *rb, struct bio *bio,
-				      struct list_head *list,
-				      unsigned int max)
-{
-	struct pblk_rb_entry *entry, *tentry;
-	struct page *page;
-	unsigned int read = 0;
-	int ret;
-
-	list_for_each_entry_safe(entry, tentry, list, index) {
-		if (read > max) {
-			pr_err("pblk: too many entries on list\n");
-			goto out;
-		}
-
-		page = virt_to_page(entry->data);
-		if (!page) {
-			pr_err("pblk: could not allocate write bio page\n");
-			goto out;
-		}
-
-		ret = bio_add_page(bio, page, rb->seg_size, 0);
-		if (ret != rb->seg_size) {
-			pr_err("pblk: could not add page to write bio\n");
-			goto out;
-		}
-
-		list_del(&entry->index);
-		read++;
-	}
-
-out:
-	return read;
-}
-
-/*
  * Read available entries on rb and add them to the given bio. To avoid a memory
  * copy, a page reference to the write buffer is used to be added to the bio.
  *
diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c
index 9eee10f..1869469 100644
--- a/drivers/lightnvm/pblk-read.c
+++ b/drivers/lightnvm/pblk-read.c
@@ -39,10 +39,10 @@ static int pblk_read_from_cache(struct pblk *pblk, struct bio *bio,
 }
 
 static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd,
-				 sector_t blba, unsigned long *read_bitmap)
+				 struct bio *bio, sector_t blba,
+				 unsigned long *read_bitmap)
 {
 	struct pblk_sec_meta *meta_list = rqd->meta_list;
-	struct bio *bio = rqd->bio;
 	struct ppa_addr ppas[PBLK_MAX_REQ_ADDRS];
 	int nr_secs = rqd->nr_ppas;
 	bool advanced_bio = false;
@@ -102,32 +102,69 @@ next:
 #endif
 }
 
-static int pblk_submit_read_io(struct pblk *pblk, struct nvm_rq *rqd)
+
+static void pblk_read_check_seq(struct pblk *pblk, struct nvm_rq *rqd,
+				sector_t blba)
 {
-	int err;
+	struct pblk_sec_meta *meta_lba_list = rqd->meta_list;
+	int nr_lbas = rqd->nr_ppas;
+	int i;
 
-	err = pblk_submit_io(pblk, rqd);
-	if (err)
-		return NVM_IO_ERR;
+	for (i = 0; i < nr_lbas; i++) {
+		u64 lba = le64_to_cpu(meta_lba_list[i].lba);
+
+		if (lba == ADDR_EMPTY)
+			continue;
+
+		if (lba != blba + i) {
+#ifdef CONFIG_NVM_DEBUG
+			struct ppa_addr *p;
 
-	return NVM_IO_OK;
+			p = (nr_lbas == 1) ? &rqd->ppa_list[i] : &rqd->ppa_addr;
+			print_ppa(&pblk->dev->geo, p, "seq", i);
+#endif
+			pr_err("pblk: corrupted read LBA (%llu/%llu)\n",
+							lba, (u64)blba + i);
+			WARN_ON(1);
+		}
+	}
 }
 
-static void pblk_read_check(struct pblk *pblk, struct nvm_rq *rqd,
-			   sector_t blba)
+/*
+ * There can be holes in the lba list.
+ */
+static void pblk_read_check_rand(struct pblk *pblk, struct nvm_rq *rqd,
+				 u64 *lba_list, int nr_lbas)
 {
-	struct pblk_sec_meta *meta_list = rqd->meta_list;
-	int nr_lbas = rqd->nr_ppas;
-	int i;
+	struct pblk_sec_meta *meta_lba_list = rqd->meta_list;
+	int i, j;
 
-	for (i = 0; i < nr_lbas; i++) {
-		u64 lba = le64_to_cpu(meta_list[i].lba);
+	for (i = 0, j = 0; i < nr_lbas; i++) {
+		u64 lba = lba_list[i];
+		u64 meta_lba;
 
 		if (lba == ADDR_EMPTY)
 			continue;
 
-		WARN(lba != blba + i, "pblk: corrupted read LBA\n");
+		meta_lba = le64_to_cpu(meta_lba_list[j].lba);
+
+		if (lba != meta_lba) {
+#ifdef CONFIG_NVM_DEBUG
+			struct ppa_addr *p;
+			int nr_ppas = rqd->nr_ppas;
+
+			p = (nr_ppas == 1) ? &rqd->ppa_list[j] : &rqd->ppa_addr;
+			print_ppa(&pblk->dev->geo, p, "seq", j);
+#endif
+			pr_err("pblk: corrupted read LBA (%llu/%llu)\n",
+								lba, meta_lba);
+			WARN_ON(1);
+		}
+
+		j++;
 	}
+
+	WARN_ONCE(j != rqd->nr_ppas, "pblk: corrupted random request\n");
 }
 
 static void pblk_read_put_rqd_kref(struct pblk *pblk, struct nvm_rq *rqd)
@@ -152,7 +189,6 @@ static void pblk_end_user_read(struct bio *bio)
 	WARN_ONCE(bio->bi_status, "pblk: corrupted read bio\n");
 #endif
 	bio_endio(bio);
-	bio_put(bio);
 }
 
 static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd,
@@ -160,23 +196,18 @@ static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd,
 {
 	struct nvm_tgt_dev *dev = pblk->dev;
 	struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd);
-	struct bio *bio = rqd->bio;
+	struct bio *int_bio = rqd->bio;
 	unsigned long start_time = r_ctx->start_time;
 
 	generic_end_io_acct(dev->q, READ, &pblk->disk->part0, start_time);
 
 	if (rqd->error)
 		pblk_log_read_err(pblk, rqd);
-#ifdef CONFIG_NVM_DEBUG
-	else
-		WARN_ONCE(bio->bi_status, "pblk: corrupted read error\n");
-#endif
 
-	pblk_read_check(pblk, rqd, r_ctx->lba);
+	pblk_read_check_seq(pblk, rqd, r_ctx->lba);
 
-	bio_put(bio);
-	if (r_ctx->private)
-		pblk_end_user_read((struct bio *)r_ctx->private);
+	if (int_bio)
+		bio_put(int_bio);
 
 	if (put_line)
 		pblk_read_put_rqd_kref(pblk, rqd);
@@ -193,16 +224,19 @@ static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd,
 static void pblk_end_io_read(struct nvm_rq *rqd)
 {
 	struct pblk *pblk = rqd->private;
+	struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd);
+	struct bio *bio = (struct bio *)r_ctx->private;
 
+	pblk_end_user_read(bio);
 	__pblk_end_io_read(pblk, rqd, true);
 }
 
-static int pblk_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
-				 unsigned int bio_init_idx,
-				 unsigned long *read_bitmap)
+static int pblk_partial_read(struct pblk *pblk, struct nvm_rq *rqd,
+			     struct bio *orig_bio, unsigned int bio_init_idx,
+			     unsigned long *read_bitmap)
 {
-	struct bio *new_bio, *bio = rqd->bio;
 	struct pblk_sec_meta *meta_list = rqd->meta_list;
+	struct bio *new_bio;
 	struct bio_vec src_bv, dst_bv;
 	void *ppa_ptr = NULL;
 	void *src_p, *dst_p;
@@ -219,11 +253,11 @@ static int pblk_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
 	new_bio = bio_alloc(GFP_KERNEL, nr_holes);
 
 	if (pblk_bio_add_pages(pblk, new_bio, GFP_KERNEL, nr_holes))
-		goto err;
+		goto fail_add_pages;
 
 	if (nr_holes != new_bio->bi_vcnt) {
 		pr_err("pblk: malformed bio\n");
-		goto err;
+		goto fail;
 	}
 
 	for (i = 0; i < nr_secs; i++)
@@ -246,7 +280,7 @@ static int pblk_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
 	if (ret) {
 		bio_put(rqd->bio);
 		pr_err("pblk: sync read IO submission failed\n");
-		goto err;
+		goto fail;
 	}
 
 	if (rqd->error) {
@@ -282,7 +316,7 @@ static int pblk_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
 		meta_list[hole].lba = lba_list_media[i];
 
 		src_bv = new_bio->bi_io_vec[i++];
-		dst_bv = bio->bi_io_vec[bio_init_idx + hole];
+		dst_bv = orig_bio->bi_io_vec[bio_init_idx + hole];
 
 		src_p = kmap_atomic(src_bv.bv_page);
 		dst_p = kmap_atomic(dst_bv.bv_page);
@@ -294,35 +328,33 @@ static int pblk_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
 		kunmap_atomic(src_p);
 		kunmap_atomic(dst_p);
 
-		mempool_free(src_bv.bv_page, pblk->page_bio_pool);
+		mempool_free(src_bv.bv_page, &pblk->page_bio_pool);
 
 		hole = find_next_zero_bit(read_bitmap, nr_secs, hole + 1);
 	} while (hole < nr_secs);
 
 	bio_put(new_bio);
 
-	/* Complete the original bio and associated request */
-	bio_endio(bio);
-	rqd->bio = bio;
+	/* restore original request */
+	rqd->bio = NULL;
 	rqd->nr_ppas = nr_secs;
 
 	__pblk_end_io_read(pblk, rqd, false);
-	return NVM_IO_OK;
-
-err:
-	pr_err("pblk: failed to perform partial read\n");
+	return NVM_IO_DONE;
 
+fail:
 	/* Free allocated pages in new bio */
-	pblk_bio_free_pages(pblk, bio, 0, new_bio->bi_vcnt);
+	pblk_bio_free_pages(pblk, new_bio, 0, new_bio->bi_vcnt);
+fail_add_pages:
+	pr_err("pblk: failed to perform partial read\n");
 	__pblk_end_io_read(pblk, rqd, false);
 	return NVM_IO_ERR;
 }
 
-static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd,
+static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, struct bio *bio,
 			 sector_t lba, unsigned long *read_bitmap)
 {
 	struct pblk_sec_meta *meta_list = rqd->meta_list;
-	struct bio *bio = rqd->bio;
 	struct ppa_addr ppa;
 
 	pblk_lookup_l2p_seq(pblk, &ppa, lba, 1);
@@ -386,14 +418,15 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
 	rqd = pblk_alloc_rqd(pblk, PBLK_READ);
 
 	rqd->opcode = NVM_OP_PREAD;
-	rqd->bio = bio;
 	rqd->nr_ppas = nr_secs;
+	rqd->bio = NULL; /* cloned bio if needed */
 	rqd->private = pblk;
 	rqd->end_io = pblk_end_io_read;
 
 	r_ctx = nvm_rq_to_pdu(rqd);
 	r_ctx->start_time = jiffies;
 	r_ctx->lba = blba;
+	r_ctx->private = bio; /* original bio */
 
 	/* Save the index for this bio's start. This is needed in case
 	 * we need to fill a partial read.
@@ -411,17 +444,15 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
 		rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size;
 		rqd->dma_ppa_list = rqd->dma_meta_list + pblk_dma_meta_size;
 
-		pblk_read_ppalist_rq(pblk, rqd, blba, &read_bitmap);
+		pblk_read_ppalist_rq(pblk, rqd, bio, blba, &read_bitmap);
 	} else {
-		pblk_read_rq(pblk, rqd, blba, &read_bitmap);
+		pblk_read_rq(pblk, rqd, bio, blba, &read_bitmap);
 	}
 
-	bio_get(bio);
 	if (bitmap_full(&read_bitmap, nr_secs)) {
-		bio_endio(bio);
 		atomic_inc(&pblk->inflight_io);
 		__pblk_end_io_read(pblk, rqd, false);
-		return NVM_IO_OK;
+		return NVM_IO_DONE;
 	}
 
 	/* All sectors are to be read from the device */
@@ -429,20 +460,17 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
 		struct bio *int_bio = NULL;
 
 		/* Clone read bio to deal with read errors internally */
-		int_bio = bio_clone_fast(bio, GFP_KERNEL, pblk_bio_set);
+		int_bio = bio_clone_fast(bio, GFP_KERNEL, &pblk_bio_set);
 		if (!int_bio) {
 			pr_err("pblk: could not clone read bio\n");
 			goto fail_end_io;
 		}
 
 		rqd->bio = int_bio;
-		r_ctx->private = bio;
 
-		ret = pblk_submit_read_io(pblk, rqd);
-		if (ret) {
+		if (pblk_submit_io(pblk, rqd)) {
 			pr_err("pblk: read IO submission failed\n");
-			if (int_bio)
-				bio_put(int_bio);
+			ret = NVM_IO_ERR;
 			goto fail_end_io;
 		}
 
@@ -452,7 +480,7 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
 	/* The read bio request could be partially filled by the write buffer,
 	 * but there are some holes that need to be read from the drive.
 	 */
-	return pblk_partial_read_bio(pblk, rqd, bio_init_idx, &read_bitmap);
+	return pblk_partial_read(pblk, rqd, bio, bio_init_idx, &read_bitmap);
 
 fail_rqd_free:
 	pblk_free_rqd(pblk, rqd, PBLK_READ);
@@ -585,6 +613,8 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq)
 		goto err_free_bio;
 	}
 
+	pblk_read_check_rand(pblk, &rqd, gc_rq->lba_list, gc_rq->nr_secs);
+
 	atomic_dec(&pblk->inflight_io);
 
 	if (rqd.error) {
diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c
index 3e079c2..5983428 100644
--- a/drivers/lightnvm/pblk-recovery.c
+++ b/drivers/lightnvm/pblk-recovery.c
@@ -16,97 +16,6 @@
 
 #include "pblk.h"
 
-void pblk_submit_rec(struct work_struct *work)
-{
-	struct pblk_rec_ctx *recovery =
-			container_of(work, struct pblk_rec_ctx, ws_rec);
-	struct pblk *pblk = recovery->pblk;
-	struct nvm_rq *rqd = recovery->rqd;
-	struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
-	struct bio *bio;
-	unsigned int nr_rec_secs;
-	unsigned int pgs_read;
-	int ret;
-
-	nr_rec_secs = bitmap_weight((unsigned long int *)&rqd->ppa_status,
-								NVM_MAX_VLBA);
-
-	bio = bio_alloc(GFP_KERNEL, nr_rec_secs);
-
-	bio->bi_iter.bi_sector = 0;
-	bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
-	rqd->bio = bio;
-	rqd->nr_ppas = nr_rec_secs;
-
-	pgs_read = pblk_rb_read_to_bio_list(&pblk->rwb, bio, &recovery->failed,
-								nr_rec_secs);
-	if (pgs_read != nr_rec_secs) {
-		pr_err("pblk: could not read recovery entries\n");
-		goto err;
-	}
-
-	if (pblk_setup_w_rec_rq(pblk, rqd, c_ctx)) {
-		pr_err("pblk: could not setup recovery request\n");
-		goto err;
-	}
-
-#ifdef CONFIG_NVM_DEBUG
-	atomic_long_add(nr_rec_secs, &pblk->recov_writes);
-#endif
-
-	ret = pblk_submit_io(pblk, rqd);
-	if (ret) {
-		pr_err("pblk: I/O submission failed: %d\n", ret);
-		goto err;
-	}
-
-	mempool_free(recovery, pblk->rec_pool);
-	return;
-
-err:
-	bio_put(bio);
-	pblk_free_rqd(pblk, rqd, PBLK_WRITE);
-}
-
-int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx,
-			struct pblk_rec_ctx *recovery, u64 *comp_bits,
-			unsigned int comp)
-{
-	struct nvm_rq *rec_rqd;
-	struct pblk_c_ctx *rec_ctx;
-	int nr_entries = c_ctx->nr_valid + c_ctx->nr_padded;
-
-	rec_rqd = pblk_alloc_rqd(pblk, PBLK_WRITE);
-	rec_ctx = nvm_rq_to_pdu(rec_rqd);
-
-	/* Copy completion bitmap, but exclude the first X completed entries */
-	bitmap_shift_right((unsigned long int *)&rec_rqd->ppa_status,
-				(unsigned long int *)comp_bits,
-				comp, NVM_MAX_VLBA);
-
-	/* Save the context for the entries that need to be re-written and
-	 * update current context with the completed entries.
-	 */
-	rec_ctx->sentry = pblk_rb_wrap_pos(&pblk->rwb, c_ctx->sentry + comp);
-	if (comp >= c_ctx->nr_valid) {
-		rec_ctx->nr_valid = 0;
-		rec_ctx->nr_padded = nr_entries - comp;
-
-		c_ctx->nr_padded = comp - c_ctx->nr_valid;
-	} else {
-		rec_ctx->nr_valid = c_ctx->nr_valid - comp;
-		rec_ctx->nr_padded = c_ctx->nr_padded;
-
-		c_ctx->nr_valid = comp;
-		c_ctx->nr_padded = 0;
-	}
-
-	recovery->rqd = rec_rqd;
-	recovery->pblk = pblk;
-
-	return 0;
-}
-
 int pblk_recov_check_emeta(struct pblk *pblk, struct line_emeta *emeta_buf)
 {
 	u32 crc;
@@ -865,18 +774,30 @@ static void pblk_recov_wa_counters(struct pblk *pblk,
 }
 
 static int pblk_line_was_written(struct pblk_line *line,
-			    struct pblk_line_meta *lm)
+			    struct pblk *pblk)
 {
 
-	int i;
-	int state_mask = NVM_CHK_ST_OFFLINE | NVM_CHK_ST_FREE;
+	struct pblk_line_meta *lm = &pblk->lm;
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	struct nvm_chk_meta *chunk;
+	struct ppa_addr bppa;
+	int smeta_blk;
 
-	for (i = 0; i < lm->blk_per_line; i++) {
-		if (!(line->chks[i].state & state_mask))
-			return 1;
-	}
+	if (line->state == PBLK_LINESTATE_BAD)
+		return 0;
 
-	return 0;
+	smeta_blk = find_first_zero_bit(line->blk_bitmap, lm->blk_per_line);
+	if (smeta_blk >= lm->blk_per_line)
+		return 0;
+
+	bppa = pblk->luns[smeta_blk].bppa;
+	chunk = &line->chks[pblk_ppa_to_pos(geo, bppa)];
+
+	if (chunk->state & NVM_CHK_ST_FREE)
+		return 0;
+
+	return 1;
 }
 
 struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
@@ -915,7 +836,7 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
 		line->lun_bitmap = ((void *)(smeta_buf)) +
 						sizeof(struct line_smeta);
 
-		if (!pblk_line_was_written(line, lm))
+		if (!pblk_line_was_written(line, pblk))
 			continue;
 
 		/* Lines that cannot be read are assumed as not written here */
diff --git a/drivers/lightnvm/pblk-rl.c b/drivers/lightnvm/pblk-rl.c
index 883a711..6a0616a 100644
--- a/drivers/lightnvm/pblk-rl.c
+++ b/drivers/lightnvm/pblk-rl.c
@@ -73,6 +73,16 @@ void pblk_rl_user_in(struct pblk_rl *rl, int nr_entries)
 	pblk_rl_kick_u_timer(rl);
 }
 
+void pblk_rl_werr_line_in(struct pblk_rl *rl)
+{
+	atomic_inc(&rl->werr_lines);
+}
+
+void pblk_rl_werr_line_out(struct pblk_rl *rl)
+{
+	atomic_dec(&rl->werr_lines);
+}
+
 void pblk_rl_gc_in(struct pblk_rl *rl, int nr_entries)
 {
 	atomic_add(nr_entries, &rl->rb_gc_cnt);
@@ -99,11 +109,21 @@ static void __pblk_rl_update_rates(struct pblk_rl *rl,
 {
 	struct pblk *pblk = container_of(rl, struct pblk, rl);
 	int max = rl->rb_budget;
+	int werr_gc_needed = atomic_read(&rl->werr_lines);
 
 	if (free_blocks >= rl->high) {
-		rl->rb_user_max = max;
-		rl->rb_gc_max = 0;
-		rl->rb_state = PBLK_RL_HIGH;
+		if (werr_gc_needed) {
+			/* Allocate a small budget for recovering
+			 * lines with write errors
+			 */
+			rl->rb_gc_max = 1 << rl->rb_windows_pw;
+			rl->rb_user_max = max - rl->rb_gc_max;
+			rl->rb_state = PBLK_RL_WERR;
+		} else {
+			rl->rb_user_max = max;
+			rl->rb_gc_max = 0;
+			rl->rb_state = PBLK_RL_OFF;
+		}
 	} else if (free_blocks < rl->high) {
 		int shift = rl->high_pw - rl->rb_windows_pw;
 		int user_windows = free_blocks >> shift;
@@ -124,7 +144,7 @@ static void __pblk_rl_update_rates(struct pblk_rl *rl,
 		rl->rb_state = PBLK_RL_LOW;
 	}
 
-	if (rl->rb_state == (PBLK_RL_MID | PBLK_RL_LOW))
+	if (rl->rb_state != PBLK_RL_OFF)
 		pblk_gc_should_start(pblk);
 	else
 		pblk_gc_should_stop(pblk);
@@ -221,6 +241,7 @@ void pblk_rl_init(struct pblk_rl *rl, int budget)
 	atomic_set(&rl->rb_user_cnt, 0);
 	atomic_set(&rl->rb_gc_cnt, 0);
 	atomic_set(&rl->rb_space, -1);
+	atomic_set(&rl->werr_lines, 0);
 
 	timer_setup(&rl->u_timer, pblk_rl_u_timer, 0);
 
diff --git a/drivers/lightnvm/pblk-sysfs.c b/drivers/lightnvm/pblk-sysfs.c
index e61909a..88a0a7c 100644
--- a/drivers/lightnvm/pblk-sysfs.c
+++ b/drivers/lightnvm/pblk-sysfs.c
@@ -173,6 +173,8 @@ static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page)
 	int free_line_cnt = 0, closed_line_cnt = 0, emeta_line_cnt = 0;
 	int d_line_cnt = 0, l_line_cnt = 0;
 	int gc_full = 0, gc_high = 0, gc_mid = 0, gc_low = 0, gc_empty = 0;
+	int gc_werr = 0;
+
 	int bad = 0, cor = 0;
 	int msecs = 0, cur_sec = 0, vsc = 0, sec_in_line = 0;
 	int map_weight = 0, meta_weight = 0;
@@ -237,6 +239,15 @@ static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page)
 		gc_empty++;
 	}
 
+	list_for_each_entry(line, &l_mg->gc_werr_list, list) {
+		if (line->type == PBLK_LINETYPE_DATA)
+			d_line_cnt++;
+		else if (line->type == PBLK_LINETYPE_LOG)
+			l_line_cnt++;
+		closed_line_cnt++;
+		gc_werr++;
+	}
+
 	list_for_each_entry(line, &l_mg->bad_list, list)
 		bad++;
 	list_for_each_entry(line, &l_mg->corrupt_list, list)
@@ -275,8 +286,8 @@ static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page)
 					l_mg->nr_lines);
 
 	sz += snprintf(page + sz, PAGE_SIZE - sz,
-		"GC: full:%d, high:%d, mid:%d, low:%d, empty:%d, queue:%d\n",
-			gc_full, gc_high, gc_mid, gc_low, gc_empty,
+		"GC: full:%d, high:%d, mid:%d, low:%d, empty:%d, werr: %d, queue:%d\n",
+			gc_full, gc_high, gc_mid, gc_low, gc_empty, gc_werr,
 			atomic_read(&pblk->gc.read_inflight_gc));
 
 	sz += snprintf(page + sz, PAGE_SIZE - sz,
diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c
index 3e6f1eb..f353e52 100644
--- a/drivers/lightnvm/pblk-write.c
+++ b/drivers/lightnvm/pblk-write.c
@@ -103,68 +103,150 @@ retry:
 	pblk_rb_sync_end(&pblk->rwb, &flags);
 }
 
-/* When a write fails, we are not sure whether the block has grown bad or a page
- * range is more susceptible to write errors. If a high number of pages fail, we
- * assume that the block is bad and we mark it accordingly. In all cases, we
- * remap and resubmit the failed entries as fast as possible; if a flush is
- * waiting on a completion, the whole stack would stall otherwise.
- */
-static void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd)
+/* Map remaining sectors in chunk, starting from ppa */
+static void pblk_map_remaining(struct pblk *pblk, struct ppa_addr *ppa)
 {
-	void *comp_bits = &rqd->ppa_status;
-	struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
-	struct pblk_rec_ctx *recovery;
-	struct ppa_addr *ppa_list = rqd->ppa_list;
-	int nr_ppas = rqd->nr_ppas;
-	unsigned int c_entries;
-	int bit, ret;
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	struct pblk_line *line;
+	struct ppa_addr map_ppa = *ppa;
+	u64 paddr;
+	int done = 0;
 
-	if (unlikely(nr_ppas == 1))
-		ppa_list = &rqd->ppa_addr;
+	line = &pblk->lines[pblk_ppa_to_line(*ppa)];
+	spin_lock(&line->lock);
 
-	recovery = mempool_alloc(pblk->rec_pool, GFP_ATOMIC);
+	while (!done)  {
+		paddr = pblk_dev_ppa_to_line_addr(pblk, map_ppa);
 
-	INIT_LIST_HEAD(&recovery->failed);
+		if (!test_and_set_bit(paddr, line->map_bitmap))
+			line->left_msecs--;
 
-	bit = -1;
-	while ((bit = find_next_bit(comp_bits, nr_ppas, bit + 1)) < nr_ppas) {
-		struct pblk_rb_entry *entry;
-		struct ppa_addr ppa;
+		if (!test_and_set_bit(paddr, line->invalid_bitmap))
+			le32_add_cpu(line->vsc, -1);
 
-		/* Logic error */
-		if (bit > c_ctx->nr_valid) {
-			WARN_ONCE(1, "pblk: corrupted write request\n");
-			mempool_free(recovery, pblk->rec_pool);
-			goto out;
+		if (geo->version == NVM_OCSSD_SPEC_12) {
+			map_ppa.ppa++;
+			if (map_ppa.g.pg == geo->num_pg)
+				done = 1;
+		} else {
+			map_ppa.m.sec++;
+			if (map_ppa.m.sec == geo->clba)
+				done = 1;
 		}
+	}
 
-		ppa = ppa_list[bit];
-		entry = pblk_rb_sync_scan_entry(&pblk->rwb, &ppa);
-		if (!entry) {
-			pr_err("pblk: could not scan entry on write failure\n");
-			mempool_free(recovery, pblk->rec_pool);
-			goto out;
-		}
+	line->w_err_gc->has_write_err = 1;
+	spin_unlock(&line->lock);
+}
 
-		/* The list is filled first and emptied afterwards. No need for
-		 * protecting it with a lock
+static void pblk_prepare_resubmit(struct pblk *pblk, unsigned int sentry,
+				  unsigned int nr_entries)
+{
+	struct pblk_rb *rb = &pblk->rwb;
+	struct pblk_rb_entry *entry;
+	struct pblk_line *line;
+	struct pblk_w_ctx *w_ctx;
+	struct ppa_addr ppa_l2p;
+	int flags;
+	unsigned int pos, i;
+
+	spin_lock(&pblk->trans_lock);
+	pos = sentry;
+	for (i = 0; i < nr_entries; i++) {
+		entry = &rb->entries[pos];
+		w_ctx = &entry->w_ctx;
+
+		/* Check if the lba has been overwritten */
+		ppa_l2p = pblk_trans_map_get(pblk, w_ctx->lba);
+		if (!pblk_ppa_comp(ppa_l2p, entry->cacheline))
+			w_ctx->lba = ADDR_EMPTY;
+
+		/* Mark up the entry as submittable again */
+		flags = READ_ONCE(w_ctx->flags);
+		flags |= PBLK_WRITTEN_DATA;
+		/* Release flags on write context. Protect from writes */
+		smp_store_release(&w_ctx->flags, flags);
+
+		/* Decrese the reference count to the line as we will
+		 * re-map these entries
 		 */
-		list_add_tail(&entry->index, &recovery->failed);
+		line = &pblk->lines[pblk_ppa_to_line(w_ctx->ppa)];
+		kref_put(&line->ref, pblk_line_put);
+
+		pos = (pos + 1) & (rb->nr_entries - 1);
 	}
+	spin_unlock(&pblk->trans_lock);
+}
 
-	c_entries = find_first_bit(comp_bits, nr_ppas);
-	ret = pblk_recov_setup_rq(pblk, c_ctx, recovery, comp_bits, c_entries);
-	if (ret) {
-		pr_err("pblk: could not recover from write failure\n");
-		mempool_free(recovery, pblk->rec_pool);
-		goto out;
+static void pblk_queue_resubmit(struct pblk *pblk, struct pblk_c_ctx *c_ctx)
+{
+	struct pblk_c_ctx *r_ctx;
+
+	r_ctx = kzalloc(sizeof(struct pblk_c_ctx), GFP_KERNEL);
+	if (!r_ctx)
+		return;
+
+	r_ctx->lun_bitmap = NULL;
+	r_ctx->sentry = c_ctx->sentry;
+	r_ctx->nr_valid = c_ctx->nr_valid;
+	r_ctx->nr_padded = c_ctx->nr_padded;
+
+	spin_lock(&pblk->resubmit_lock);
+	list_add_tail(&r_ctx->list, &pblk->resubmit_list);
+	spin_unlock(&pblk->resubmit_lock);
+
+#ifdef CONFIG_NVM_DEBUG
+	atomic_long_add(c_ctx->nr_valid, &pblk->recov_writes);
+#endif
+}
+
+static void pblk_submit_rec(struct work_struct *work)
+{
+	struct pblk_rec_ctx *recovery =
+			container_of(work, struct pblk_rec_ctx, ws_rec);
+	struct pblk *pblk = recovery->pblk;
+	struct nvm_rq *rqd = recovery->rqd;
+	struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
+	struct ppa_addr *ppa_list;
+
+	pblk_log_write_err(pblk, rqd);
+
+	if (rqd->nr_ppas == 1)
+		ppa_list = &rqd->ppa_addr;
+	else
+		ppa_list = rqd->ppa_list;
+
+	pblk_map_remaining(pblk, ppa_list);
+	pblk_queue_resubmit(pblk, c_ctx);
+
+	pblk_up_rq(pblk, rqd->ppa_list, rqd->nr_ppas, c_ctx->lun_bitmap);
+	if (c_ctx->nr_padded)
+		pblk_bio_free_pages(pblk, rqd->bio, c_ctx->nr_valid,
+							c_ctx->nr_padded);
+	bio_put(rqd->bio);
+	pblk_free_rqd(pblk, rqd, PBLK_WRITE);
+	mempool_free(recovery, &pblk->rec_pool);
+
+	atomic_dec(&pblk->inflight_io);
+}
+
+
+static void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd)
+{
+	struct pblk_rec_ctx *recovery;
+
+	recovery = mempool_alloc(&pblk->rec_pool, GFP_ATOMIC);
+	if (!recovery) {
+		pr_err("pblk: could not allocate recovery work\n");
+		return;
 	}
 
+	recovery->pblk = pblk;
+	recovery->rqd = rqd;
+
 	INIT_WORK(&recovery->ws_rec, pblk_submit_rec);
 	queue_work(pblk->close_wq, &recovery->ws_rec);
-
-out:
-	pblk_complete_write(pblk, rqd, c_ctx);
 }
 
 static void pblk_end_io_write(struct nvm_rq *rqd)
@@ -173,8 +255,8 @@ static void pblk_end_io_write(struct nvm_rq *rqd)
 	struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
 
 	if (rqd->error) {
-		pblk_log_write_err(pblk, rqd);
-		return pblk_end_w_fail(pblk, rqd);
+		pblk_end_w_fail(pblk, rqd);
+		return;
 	}
 #ifdef CONFIG_NVM_DEBUG
 	else
@@ -198,6 +280,7 @@ static void pblk_end_io_write_meta(struct nvm_rq *rqd)
 	if (rqd->error) {
 		pblk_log_write_err(pblk, rqd);
 		pr_err("pblk: metadata I/O failed. Line %d\n", line->id);
+		line->w_err_gc->has_write_err = 1;
 	}
 
 	sync = atomic_add_return(rqd->nr_ppas, &emeta->sync);
@@ -266,31 +349,6 @@ static int pblk_setup_w_rq(struct pblk *pblk, struct nvm_rq *rqd,
 	return 0;
 }
 
-int pblk_setup_w_rec_rq(struct pblk *pblk, struct nvm_rq *rqd,
-			struct pblk_c_ctx *c_ctx)
-{
-	struct pblk_line_meta *lm = &pblk->lm;
-	unsigned long *lun_bitmap;
-	int ret;
-
-	lun_bitmap = kzalloc(lm->lun_bitmap_len, GFP_KERNEL);
-	if (!lun_bitmap)
-		return -ENOMEM;
-
-	c_ctx->lun_bitmap = lun_bitmap;
-
-	ret = pblk_alloc_w_rq(pblk, rqd, rqd->nr_ppas, pblk_end_io_write);
-	if (ret)
-		return ret;
-
-	pblk_map_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, c_ctx->nr_valid, 0);
-
-	rqd->ppa_status = (u64)0;
-	rqd->flags = pblk_set_progr_mode(pblk, PBLK_WRITE);
-
-	return ret;
-}
-
 static int pblk_calc_secs_to_sync(struct pblk *pblk, unsigned int secs_avail,
 				  unsigned int secs_to_flush)
 {
@@ -339,6 +397,7 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line)
 	bio = pblk_bio_map_addr(pblk, data, rq_ppas, rq_len,
 					l_mg->emeta_alloc_type, GFP_KERNEL);
 	if (IS_ERR(bio)) {
+		pr_err("pblk: failed to map emeta io");
 		ret = PTR_ERR(bio);
 		goto fail_free_rqd;
 	}
@@ -515,26 +574,54 @@ static int pblk_submit_write(struct pblk *pblk)
 	unsigned int secs_avail, secs_to_sync, secs_to_com;
 	unsigned int secs_to_flush;
 	unsigned long pos;
+	unsigned int resubmit;
 
-	/* If there are no sectors in the cache, flushes (bios without data)
-	 * will be cleared on the cache threads
-	 */
-	secs_avail = pblk_rb_read_count(&pblk->rwb);
-	if (!secs_avail)
-		return 1;
-
-	secs_to_flush = pblk_rb_flush_point_count(&pblk->rwb);
-	if (!secs_to_flush && secs_avail < pblk->min_write_pgs)
-		return 1;
-
-	secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, secs_to_flush);
-	if (secs_to_sync > pblk->max_write_pgs) {
-		pr_err("pblk: bad buffer sync calculation\n");
-		return 1;
-	}
+	spin_lock(&pblk->resubmit_lock);
+	resubmit = !list_empty(&pblk->resubmit_list);
+	spin_unlock(&pblk->resubmit_lock);
+
+	/* Resubmit failed writes first */
+	if (resubmit) {
+		struct pblk_c_ctx *r_ctx;
+
+		spin_lock(&pblk->resubmit_lock);
+		r_ctx = list_first_entry(&pblk->resubmit_list,
+					struct pblk_c_ctx, list);
+		list_del(&r_ctx->list);
+		spin_unlock(&pblk->resubmit_lock);
+
+		secs_avail = r_ctx->nr_valid;
+		pos = r_ctx->sentry;
+
+		pblk_prepare_resubmit(pblk, pos, secs_avail);
+		secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail,
+				secs_avail);
 
-	secs_to_com = (secs_to_sync > secs_avail) ? secs_avail : secs_to_sync;
-	pos = pblk_rb_read_commit(&pblk->rwb, secs_to_com);
+		kfree(r_ctx);
+	} else {
+		/* If there are no sectors in the cache,
+		 * flushes (bios without data) will be cleared on
+		 * the cache threads
+		 */
+		secs_avail = pblk_rb_read_count(&pblk->rwb);
+		if (!secs_avail)
+			return 1;
+
+		secs_to_flush = pblk_rb_flush_point_count(&pblk->rwb);
+		if (!secs_to_flush && secs_avail < pblk->min_write_pgs)
+			return 1;
+
+		secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail,
+					secs_to_flush);
+		if (secs_to_sync > pblk->max_write_pgs) {
+			pr_err("pblk: bad buffer sync calculation\n");
+			return 1;
+		}
+
+		secs_to_com = (secs_to_sync > secs_avail) ?
+			secs_avail : secs_to_sync;
+		pos = pblk_rb_read_commit(&pblk->rwb, secs_to_com);
+	}
 
 	bio = bio_alloc(GFP_KERNEL, secs_to_sync);
 
diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
index 9c682ac..34cc1d6 100644
--- a/drivers/lightnvm/pblk.h
+++ b/drivers/lightnvm/pblk.h
@@ -89,12 +89,14 @@ struct pblk_sec_meta {
 /* The number of GC lists and the rate-limiter states go together. This way the
  * rate-limiter can dictate how much GC is needed based on resource utilization.
  */
-#define PBLK_GC_NR_LISTS 3
+#define PBLK_GC_NR_LISTS 4
 
 enum {
-	PBLK_RL_HIGH = 1,
-	PBLK_RL_MID = 2,
-	PBLK_RL_LOW = 3,
+	PBLK_RL_OFF = 0,
+	PBLK_RL_WERR = 1,
+	PBLK_RL_HIGH = 2,
+	PBLK_RL_MID = 3,
+	PBLK_RL_LOW = 4
 };
 
 #define pblk_dma_meta_size (sizeof(struct pblk_sec_meta) * PBLK_MAX_REQ_ADDRS)
@@ -128,7 +130,6 @@ struct pblk_pad_rq {
 struct pblk_rec_ctx {
 	struct pblk *pblk;
 	struct nvm_rq *rqd;
-	struct list_head failed;
 	struct work_struct ws_rec;
 };
 
@@ -279,6 +280,8 @@ struct pblk_rl {
 	int rb_user_active;
 	int rb_gc_active;
 
+	atomic_t werr_lines;	/* Number of write error lines that needs gc */
+
 	struct timer_list u_timer;
 
 	unsigned long long nr_secs;
@@ -312,6 +315,7 @@ enum {
 	PBLK_LINEGC_MID = 23,
 	PBLK_LINEGC_HIGH = 24,
 	PBLK_LINEGC_FULL = 25,
+	PBLK_LINEGC_WERR = 26
 };
 
 #define PBLK_MAGIC 0x70626c6b /*pblk*/
@@ -413,6 +417,11 @@ struct pblk_smeta {
 	struct line_smeta *buf;		/* smeta buffer in persistent format */
 };
 
+struct pblk_w_err_gc {
+	int has_write_err;
+	__le64 *lba_list;
+};
+
 struct pblk_line {
 	struct pblk *pblk;
 	unsigned int id;		/* Line number corresponds to the
@@ -458,6 +467,8 @@ struct pblk_line {
 
 	struct kref ref;		/* Write buffer L2P references */
 
+	struct pblk_w_err_gc *w_err_gc;	/* Write error gc recovery metadata */
+
 	spinlock_t lock;		/* Necessary for invalid_bitmap only */
 };
 
@@ -489,6 +500,8 @@ struct pblk_line_mgmt {
 	struct list_head gc_mid_list;	/* Full lines ready to GC, mid isc */
 	struct list_head gc_low_list;	/* Full lines ready to GC, low isc */
 
+	struct list_head gc_werr_list;  /* Write err recovery list */
+
 	struct list_head gc_full_list;	/* Full lines ready to GC, no valid */
 	struct list_head gc_empty_list;	/* Full lines close, all valid */
 
@@ -664,12 +677,15 @@ struct pblk {
 
 	struct list_head compl_list;
 
-	mempool_t *page_bio_pool;
-	mempool_t *gen_ws_pool;
-	mempool_t *rec_pool;
-	mempool_t *r_rq_pool;
-	mempool_t *w_rq_pool;
-	mempool_t *e_rq_pool;
+	spinlock_t resubmit_lock;	 /* Resubmit list lock */
+	struct list_head resubmit_list; /* Resubmit list for failed writes*/
+
+	mempool_t page_bio_pool;
+	mempool_t gen_ws_pool;
+	mempool_t rec_pool;
+	mempool_t r_rq_pool;
+	mempool_t w_rq_pool;
+	mempool_t e_rq_pool;
 
 	struct workqueue_struct *close_wq;
 	struct workqueue_struct *bb_wq;
@@ -713,9 +729,6 @@ void pblk_rb_sync_l2p(struct pblk_rb *rb);
 unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd,
 				 unsigned int pos, unsigned int nr_entries,
 				 unsigned int count);
-unsigned int pblk_rb_read_to_bio_list(struct pblk_rb *rb, struct bio *bio,
-				      struct list_head *list,
-				      unsigned int max);
 int pblk_rb_copy_to_bio(struct pblk_rb *rb, struct bio *bio, sector_t lba,
 			struct ppa_addr ppa, int bio_iter, bool advanced_bio);
 unsigned int pblk_rb_read_commit(struct pblk_rb *rb, unsigned int entries);
@@ -766,11 +779,13 @@ struct pblk_line *pblk_line_get_data(struct pblk *pblk);
 struct pblk_line *pblk_line_get_erase(struct pblk *pblk);
 int pblk_line_erase(struct pblk *pblk, struct pblk_line *line);
 int pblk_line_is_full(struct pblk_line *line);
-void pblk_line_free(struct pblk *pblk, struct pblk_line *line);
+void pblk_line_free(struct pblk_line *line);
 void pblk_line_close_meta(struct pblk *pblk, struct pblk_line *line);
 void pblk_line_close(struct pblk *pblk, struct pblk_line *line);
 void pblk_line_close_ws(struct work_struct *work);
 void pblk_pipeline_stop(struct pblk *pblk);
+void __pblk_pipeline_stop(struct pblk *pblk);
+void __pblk_pipeline_flush(struct pblk *pblk);
 void pblk_gen_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv,
 		     void (*work)(struct work_struct *), gfp_t gfp_mask,
 		     struct workqueue_struct *wq);
@@ -794,7 +809,6 @@ void pblk_down_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
 void pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas);
 void pblk_up_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
 		unsigned long *lun_bitmap);
-void pblk_end_io_sync(struct nvm_rq *rqd);
 int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags,
 		       int nr_pages);
 void pblk_bio_free_pages(struct pblk *pblk, struct bio *bio, int off,
@@ -837,23 +851,20 @@ void pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry,
 int pblk_write_ts(void *data);
 void pblk_write_timer_fn(struct timer_list *t);
 void pblk_write_should_kick(struct pblk *pblk);
+void pblk_write_kick(struct pblk *pblk);
 
 /*
  * pblk read path
  */
-extern struct bio_set *pblk_bio_set;
+extern struct bio_set pblk_bio_set;
 int pblk_submit_read(struct pblk *pblk, struct bio *bio);
 int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq);
 /*
  * pblk recovery
  */
-void pblk_submit_rec(struct work_struct *work);
 struct pblk_line *pblk_recov_l2p(struct pblk *pblk);
 int pblk_recov_pad(struct pblk *pblk);
 int pblk_recov_check_emeta(struct pblk *pblk, struct line_emeta *emeta);
-int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx,
-			struct pblk_rec_ctx *recovery, u64 *comp_bits,
-			unsigned int comp);
 
 /*
  * pblk gc
@@ -864,7 +875,7 @@ int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx,
 #define PBLK_GC_RSV_LINE 1	/* Reserved lines for GC */
 
 int pblk_gc_init(struct pblk *pblk);
-void pblk_gc_exit(struct pblk *pblk);
+void pblk_gc_exit(struct pblk *pblk, bool graceful);
 void pblk_gc_should_start(struct pblk *pblk);
 void pblk_gc_should_stop(struct pblk *pblk);
 void pblk_gc_should_kick(struct pblk *pblk);
@@ -894,6 +905,9 @@ void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line,
 			    bool used);
 int pblk_rl_is_limit(struct pblk_rl *rl);
 
+void pblk_rl_werr_line_in(struct pblk_rl *rl);
+void pblk_rl_werr_line_out(struct pblk_rl *rl);
+
 /*
  * pblk sysfs
  */
diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c
index 433dbed..6663893 100644
--- a/drivers/macintosh/via-pmu.c
+++ b/drivers/macintosh/via-pmu.c
@@ -191,10 +191,10 @@ static int init_pmu(void);
 static void pmu_start(void);
 static irqreturn_t via_pmu_interrupt(int irq, void *arg);
 static irqreturn_t gpio1_interrupt(int irq, void *arg);
-static const struct file_operations pmu_info_proc_fops;
-static const struct file_operations pmu_irqstats_proc_fops;
+static int pmu_info_proc_show(struct seq_file *m, void *v);
+static int pmu_irqstats_proc_show(struct seq_file *m, void *v);
+static int pmu_battery_proc_show(struct seq_file *m, void *v);
 static void pmu_pass_intr(unsigned char *data, int len);
-static const struct file_operations pmu_battery_proc_fops;
 static const struct file_operations pmu_options_proc_fops;
 
 #ifdef CONFIG_ADB
@@ -511,13 +511,15 @@ static int __init via_pmu_dev_init(void)
 		for (i=0; i<pmu_battery_count; i++) {
 			char title[16];
 			sprintf(title, "battery_%ld", i);
-			proc_pmu_batt[i] = proc_create_data(title, 0, proc_pmu_root,
-					&pmu_battery_proc_fops, (void *)i);
+			proc_pmu_batt[i] = proc_create_single_data(title, 0,
+					proc_pmu_root, pmu_battery_proc_show,
+					(void *)i);
 		}
 
-		proc_pmu_info = proc_create("info", 0, proc_pmu_root, &pmu_info_proc_fops);
-		proc_pmu_irqstats = proc_create("interrupts", 0, proc_pmu_root,
-						&pmu_irqstats_proc_fops);
+		proc_pmu_info = proc_create_single("info", 0, proc_pmu_root,
+				pmu_info_proc_show);
+		proc_pmu_irqstats = proc_create_single("interrupts", 0,
+				proc_pmu_root, pmu_irqstats_proc_show);
 		proc_pmu_options = proc_create("options", 0600, proc_pmu_root,
 						&pmu_options_proc_fops);
 	}
@@ -811,19 +813,6 @@ static int pmu_info_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int pmu_info_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, pmu_info_proc_show, NULL);
-}
-
-static const struct file_operations pmu_info_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= pmu_info_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int pmu_irqstats_proc_show(struct seq_file *m, void *v)
 {
 	int i;
@@ -848,19 +837,6 @@ static int pmu_irqstats_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int pmu_irqstats_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, pmu_irqstats_proc_show, NULL);
-}
-
-static const struct file_operations pmu_irqstats_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= pmu_irqstats_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int pmu_battery_proc_show(struct seq_file *m, void *v)
 {
 	long batnum = (long)m->private;
@@ -875,19 +851,6 @@ static int pmu_battery_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int pmu_battery_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, pmu_battery_proc_show, PDE_DATA(inode));
-}
-
-static const struct file_operations pmu_battery_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= pmu_battery_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int pmu_options_proc_show(struct seq_file *m, void *v)
 {
 #if defined(CONFIG_SUSPEND) && defined(CONFIG_PPC32)
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 3a0cfb2..d6bf294f 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -269,7 +269,7 @@ struct bcache_device {
 	atomic_t		*stripe_sectors_dirty;
 	unsigned long		*full_dirty_stripes;
 
-	struct bio_set		*bio_split;
+	struct bio_set		bio_split;
 
 	unsigned		data_csum:1;
 
@@ -345,6 +345,7 @@ struct cached_dev {
 
 	struct keybuf		writeback_keys;
 
+	struct task_struct	*status_update_thread;
 	/*
 	 * Order the write-half of writeback operations strongly in dispatch
 	 * order.  (Maintain LBA order; don't allow reads completing out of
@@ -392,6 +393,7 @@ struct cached_dev {
 #define DEFAULT_CACHED_DEV_ERROR_LIMIT	64
 	atomic_t		io_errors;
 	unsigned		error_limit;
+	unsigned		offline_seconds;
 
 	char			backing_dev_name[BDEVNAME_SIZE];
 };
@@ -528,9 +530,9 @@ struct cache_set {
 	struct closure		sb_write;
 	struct semaphore	sb_write_mutex;
 
-	mempool_t		*search;
-	mempool_t		*bio_meta;
-	struct bio_set		*bio_split;
+	mempool_t		search;
+	mempool_t		bio_meta;
+	struct bio_set		bio_split;
 
 	/* For the btree cache */
 	struct shrinker		shrink;
@@ -655,7 +657,7 @@ struct cache_set {
 	 * A btree node on disk could have too many bsets for an iterator to fit
 	 * on the stack - have to dynamically allocate them
 	 */
-	mempool_t		*fill_iter;
+	mempool_t		fill_iter;
 
 	struct bset_sort_state	sort;
 
@@ -956,8 +958,6 @@ void bch_prio_write(struct cache *);
 void bch_write_bdev_super(struct cached_dev *, struct closure *);
 
 extern struct workqueue_struct *bcache_wq;
-extern const char * const bch_cache_modes[];
-extern const char * const bch_stop_on_failure_modes[];
 extern struct mutex bch_register_lock;
 extern struct list_head bch_cache_sets;
 
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 579c696..f3403b4 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -1118,8 +1118,7 @@ struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter,
 
 void bch_bset_sort_state_free(struct bset_sort_state *state)
 {
-	if (state->pool)
-		mempool_destroy(state->pool);
+	mempool_exit(&state->pool);
 }
 
 int bch_bset_sort_state_init(struct bset_sort_state *state, unsigned page_order)
@@ -1129,11 +1128,7 @@ int bch_bset_sort_state_init(struct bset_sort_state *state, unsigned page_order)
 	state->page_order = page_order;
 	state->crit_factor = int_sqrt(1 << page_order);
 
-	state->pool = mempool_create_page_pool(1, page_order);
-	if (!state->pool)
-		return -ENOMEM;
-
-	return 0;
+	return mempool_init_page_pool(&state->pool, 1, page_order);
 }
 EXPORT_SYMBOL(bch_bset_sort_state_init);
 
@@ -1191,7 +1186,7 @@ static void __btree_sort(struct btree_keys *b, struct btree_iter *iter,
 
 		BUG_ON(order > state->page_order);
 
-		outp = mempool_alloc(state->pool, GFP_NOIO);
+		outp = mempool_alloc(&state->pool, GFP_NOIO);
 		out = page_address(outp);
 		used_mempool = true;
 		order = state->page_order;
@@ -1220,7 +1215,7 @@ static void __btree_sort(struct btree_keys *b, struct btree_iter *iter,
 	}
 
 	if (used_mempool)
-		mempool_free(virt_to_page(out), state->pool);
+		mempool_free(virt_to_page(out), &state->pool);
 	else
 		free_pages((unsigned long) out, order);
 
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index 0c24280..b867f22 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -347,7 +347,7 @@ static inline struct bkey *bch_bset_search(struct btree_keys *b,
 /* Sorting */
 
 struct bset_sort_state {
-	mempool_t		*pool;
+	mempool_t		pool;
 
 	unsigned		page_order;
 	unsigned		crit_factor;
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 17936b2..2a0968c0 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -204,7 +204,7 @@ void bch_btree_node_read_done(struct btree *b)
 	struct bset *i = btree_bset_first(b);
 	struct btree_iter *iter;
 
-	iter = mempool_alloc(b->c->fill_iter, GFP_NOIO);
+	iter = mempool_alloc(&b->c->fill_iter, GFP_NOIO);
 	iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
 	iter->used = 0;
 
@@ -271,7 +271,7 @@ void bch_btree_node_read_done(struct btree *b)
 		bch_bset_init_next(&b->keys, write_block(b),
 				   bset_magic(&b->c->sb));
 out:
-	mempool_free(iter, b->c->fill_iter);
+	mempool_free(iter, &b->c->fill_iter);
 	return;
 err:
 	set_btree_node_io_error(b);
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index 2ddf851..9612873 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -17,12 +17,12 @@
 void bch_bbio_free(struct bio *bio, struct cache_set *c)
 {
 	struct bbio *b = container_of(bio, struct bbio, bio);
-	mempool_free(b, c->bio_meta);
+	mempool_free(b, &c->bio_meta);
 }
 
 struct bio *bch_bbio_alloc(struct cache_set *c)
 {
-	struct bbio *b = mempool_alloc(c->bio_meta, GFP_NOIO);
+	struct bbio *b = mempool_alloc(&c->bio_meta, GFP_NOIO);
 	struct bio *bio = &b->bio;
 
 	bio_init(bio, bio->bi_inline_vecs, bucket_pages(c));
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 8e3e865..ae67f5f 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -213,7 +213,7 @@ static void bch_data_insert_start(struct closure *cl)
 	do {
 		unsigned i;
 		struct bkey *k;
-		struct bio_set *split = op->c->bio_split;
+		struct bio_set *split = &op->c->bio_split;
 
 		/* 1 for the device pointer and 1 for the chksum */
 		if (bch_keylist_realloc(&op->insert_keys,
@@ -548,7 +548,7 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k)
 
 	n = bio_next_split(bio, min_t(uint64_t, INT_MAX,
 				      KEY_OFFSET(k) - bio->bi_iter.bi_sector),
-			   GFP_NOIO, s->d->bio_split);
+			   GFP_NOIO, &s->d->bio_split);
 
 	bio_key = &container_of(n, struct bbio, bio)->key;
 	bch_bkey_copy_single_ptr(bio_key, k, ptr);
@@ -707,7 +707,7 @@ static void search_free(struct closure *cl)
 
 	bio_complete(s);
 	closure_debug_destroy(cl);
-	mempool_free(s, s->d->c->search);
+	mempool_free(s, &s->d->c->search);
 }
 
 static inline struct search *search_alloc(struct bio *bio,
@@ -715,7 +715,7 @@ static inline struct search *search_alloc(struct bio *bio,
 {
 	struct search *s;
 
-	s = mempool_alloc(d->c->search, GFP_NOIO);
+	s = mempool_alloc(&d->c->search, GFP_NOIO);
 
 	closure_init(&s->cl, NULL);
 	do_bio_hook(s, bio, request_endio);
@@ -864,7 +864,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
 	s->cache_missed = 1;
 
 	if (s->cache_miss || s->iop.bypass) {
-		miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split);
+		miss = bio_next_split(bio, sectors, GFP_NOIO, &s->d->bio_split);
 		ret = miss == bio ? MAP_DONE : MAP_CONTINUE;
 		goto out_submit;
 	}
@@ -887,14 +887,14 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
 
 	s->iop.replace = true;
 
-	miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split);
+	miss = bio_next_split(bio, sectors, GFP_NOIO, &s->d->bio_split);
 
 	/* btree_search_recurse()'s btree iterator is no good anymore */
 	ret = miss == bio ? MAP_DONE : -EINTR;
 
 	cache_bio = bio_alloc_bioset(GFP_NOWAIT,
 			DIV_ROUND_UP(s->insert_bio_sectors, PAGE_SECTORS),
-			dc->disk.bio_split);
+			&dc->disk.bio_split);
 	if (!cache_bio)
 		goto out_submit;
 
@@ -1008,7 +1008,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s)
 			struct bio *flush;
 
 			flush = bio_alloc_bioset(GFP_NOIO, 0,
-						 dc->disk.bio_split);
+						 &dc->disk.bio_split);
 			if (!flush) {
 				s->iop.status = BLK_STS_RESOURCE;
 				goto insert_data;
@@ -1021,7 +1021,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s)
 			closure_bio_submit(s->iop.c, flush, cl);
 		}
 	} else {
-		s->iop.bio = bio_clone_fast(bio, GFP_NOIO, dc->disk.bio_split);
+		s->iop.bio = bio_clone_fast(bio, GFP_NOIO, &dc->disk.bio_split);
 		/* I/O request sent to backing device */
 		bio->bi_end_io = backing_request_endio;
 		closure_bio_submit(s->iop.c, bio, cl);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 3dea06b..a31e55b 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -37,24 +37,6 @@ static const char invalid_uuid[] = {
 	0xc8, 0x50, 0xfc, 0x5e, 0xcb, 0x16, 0xcd, 0x99
 };
 
-/* Default is -1; we skip past it for struct cached_dev's cache mode */
-const char * const bch_cache_modes[] = {
-	"default",
-	"writethrough",
-	"writeback",
-	"writearound",
-	"none",
-	NULL
-};
-
-/* Default is -1; we skip past it for stop_when_cache_set_failed */
-const char * const bch_stop_on_failure_modes[] = {
-	"default",
-	"auto",
-	"always",
-	NULL
-};
-
 static struct kobject *bcache_kobj;
 struct mutex bch_register_lock;
 LIST_HEAD(bch_cache_sets);
@@ -654,6 +636,11 @@ static int ioctl_dev(struct block_device *b, fmode_t mode,
 		     unsigned int cmd, unsigned long arg)
 {
 	struct bcache_device *d = b->bd_disk->private_data;
+	struct cached_dev *dc = container_of(d, struct cached_dev, disk);
+
+	if (dc->io_disable)
+		return -EIO;
+
 	return d->ioctl(d, mode, cmd, arg);
 }
 
@@ -766,8 +753,7 @@ static void bcache_device_free(struct bcache_device *d)
 		put_disk(d->disk);
 	}
 
-	if (d->bio_split)
-		bioset_free(d->bio_split);
+	bioset_exit(&d->bio_split);
 	kvfree(d->full_dirty_stripes);
 	kvfree(d->stripe_sectors_dirty);
 
@@ -809,9 +795,8 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
 	if (idx < 0)
 		return idx;
 
-	if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio),
-					   BIOSET_NEED_BVECS |
-					   BIOSET_NEED_RESCUER)) ||
+	if (bioset_init(&d->bio_split, 4, offsetof(struct bbio, bio),
+			BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER) ||
 	    !(d->disk = alloc_disk(BCACHE_MINORS))) {
 		ida_simple_remove(&bcache_device_idx, idx);
 		return -ENOMEM;
@@ -864,6 +849,44 @@ static void calc_cached_dev_sectors(struct cache_set *c)
 	c->cached_dev_sectors = sectors;
 }
 
+#define BACKING_DEV_OFFLINE_TIMEOUT 5
+static int cached_dev_status_update(void *arg)
+{
+	struct cached_dev *dc = arg;
+	struct request_queue *q;
+
+	/*
+	 * If this delayed worker is stopping outside, directly quit here.
+	 * dc->io_disable might be set via sysfs interface, so check it
+	 * here too.
+	 */
+	while (!kthread_should_stop() && !dc->io_disable) {
+		q = bdev_get_queue(dc->bdev);
+		if (blk_queue_dying(q))
+			dc->offline_seconds++;
+		else
+			dc->offline_seconds = 0;
+
+		if (dc->offline_seconds >= BACKING_DEV_OFFLINE_TIMEOUT) {
+			pr_err("%s: device offline for %d seconds",
+			       dc->backing_dev_name,
+			       BACKING_DEV_OFFLINE_TIMEOUT);
+			pr_err("%s: disable I/O request due to backing "
+			       "device offline", dc->disk.name);
+			dc->io_disable = true;
+			/* let others know earlier that io_disable is true */
+			smp_mb();
+			bcache_device_stop(&dc->disk);
+			break;
+		}
+		schedule_timeout_interruptible(HZ);
+	}
+
+	wait_for_kthread_stop();
+	return 0;
+}
+
+
 void bch_cached_dev_run(struct cached_dev *dc)
 {
 	struct bcache_device *d = &dc->disk;
@@ -906,6 +929,14 @@ void bch_cached_dev_run(struct cached_dev *dc)
 	if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
 	    sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache"))
 		pr_debug("error creating sysfs link");
+
+	dc->status_update_thread = kthread_run(cached_dev_status_update,
+					       dc, "bcache_status_update");
+	if (IS_ERR(dc->status_update_thread)) {
+		pr_warn("failed to create bcache_status_update kthread, "
+			"continue to run without monitoring backing "
+			"device status");
+	}
 }
 
 /*
@@ -1139,6 +1170,8 @@ static void cached_dev_free(struct closure *cl)
 		kthread_stop(dc->writeback_thread);
 	if (dc->writeback_write_wq)
 		destroy_workqueue(dc->writeback_write_wq);
+	if (!IS_ERR_OR_NULL(dc->status_update_thread))
+		kthread_stop(dc->status_update_thread);
 
 	if (atomic_read(&dc->running))
 		bd_unlink_disk_holder(dc->bdev, dc->disk.disk);
@@ -1465,14 +1498,10 @@ static void cache_set_free(struct closure *cl)
 
 	if (c->moving_gc_wq)
 		destroy_workqueue(c->moving_gc_wq);
-	if (c->bio_split)
-		bioset_free(c->bio_split);
-	if (c->fill_iter)
-		mempool_destroy(c->fill_iter);
-	if (c->bio_meta)
-		mempool_destroy(c->bio_meta);
-	if (c->search)
-		mempool_destroy(c->search);
+	bioset_exit(&c->bio_split);
+	mempool_exit(&c->fill_iter);
+	mempool_exit(&c->bio_meta);
+	mempool_exit(&c->search);
 	kfree(c->devices);
 
 	mutex_lock(&bch_register_lock);
@@ -1683,21 +1712,17 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
 	INIT_LIST_HEAD(&c->btree_cache_freed);
 	INIT_LIST_HEAD(&c->data_buckets);
 
-	c->search = mempool_create_slab_pool(32, bch_search_cache);
-	if (!c->search)
-		goto err;
-
 	iter_size = (sb->bucket_size / sb->block_size + 1) *
 		sizeof(struct btree_iter_set);
 
 	if (!(c->devices = kzalloc(c->nr_uuids * sizeof(void *), GFP_KERNEL)) ||
-	    !(c->bio_meta = mempool_create_kmalloc_pool(2,
-				sizeof(struct bbio) + sizeof(struct bio_vec) *
-				bucket_pages(c))) ||
-	    !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) ||
-	    !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio),
-					   BIOSET_NEED_BVECS |
-					   BIOSET_NEED_RESCUER)) ||
+	    mempool_init_slab_pool(&c->search, 32, bch_search_cache) ||
+	    mempool_init_kmalloc_pool(&c->bio_meta, 2,
+				      sizeof(struct bbio) + sizeof(struct bio_vec) *
+				      bucket_pages(c)) ||
+	    mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
+	    bioset_init(&c->bio_split, 4, offsetof(struct bbio, bio),
+			BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER) ||
 	    !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
 	    !(c->moving_gc_wq = alloc_workqueue("bcache_gc",
 						WQ_MEM_RECLAIM, 0)) ||
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index dfeef58..8ccbc8f3 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -16,6 +16,22 @@
 #include <linux/sort.h>
 #include <linux/sched/clock.h>
 
+/* Default is -1; we skip past it for struct cached_dev's cache mode */
+static const char * const bch_cache_modes[] = {
+	"writethrough",
+	"writeback",
+	"writearound",
+	"none",
+	NULL
+};
+
+/* Default is -1; we skip past it for stop_when_cache_set_failed */
+static const char * const bch_stop_on_failure_modes[] = {
+	"auto",
+	"always",
+	NULL
+};
+
 static const char * const cache_replacement_policies[] = {
 	"lru",
 	"fifo",
@@ -114,6 +130,20 @@ rw_attribute(btree_shrinker_disabled);
 rw_attribute(copy_gc_enabled);
 rw_attribute(size);
 
+static ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[],
+			    size_t selected)
+{
+	char *out = buf;
+	size_t i;
+
+	for (i = 0; list[i]; i++)
+		out += snprintf(out, buf + size - out,
+				i == selected ? "[%s] " : "%s ", list[i]);
+
+	out[-1] = '\n';
+	return out - buf;
+}
+
 SHOW(__bch_cached_dev)
 {
 	struct cached_dev *dc = container_of(kobj, struct cached_dev,
@@ -124,12 +154,12 @@ SHOW(__bch_cached_dev)
 
 	if (attr == &sysfs_cache_mode)
 		return bch_snprint_string_list(buf, PAGE_SIZE,
-					       bch_cache_modes + 1,
+					       bch_cache_modes,
 					       BDEV_CACHE_MODE(&dc->sb));
 
 	if (attr == &sysfs_stop_when_cache_set_failed)
 		return bch_snprint_string_list(buf, PAGE_SIZE,
-					       bch_stop_on_failure_modes + 1,
+					       bch_stop_on_failure_modes,
 					       dc->stop_when_cache_set_failed);
 
 
@@ -253,8 +283,7 @@ STORE(__cached_dev)
 		bch_cached_dev_run(dc);
 
 	if (attr == &sysfs_cache_mode) {
-		v = bch_read_string_list(buf, bch_cache_modes + 1);
-
+		v = __sysfs_match_string(bch_cache_modes, -1, buf);
 		if (v < 0)
 			return v;
 
@@ -265,8 +294,7 @@ STORE(__cached_dev)
 	}
 
 	if (attr == &sysfs_stop_when_cache_set_failed) {
-		v = bch_read_string_list(buf, bch_stop_on_failure_modes + 1);
-
+		v = __sysfs_match_string(bch_stop_on_failure_modes, -1, buf);
 		if (v < 0)
 			return v;
 
@@ -635,6 +663,7 @@ SHOW_LOCKED(bch_cache_set)
 STORE(__bch_cache_set)
 {
 	struct cache_set *c = container_of(kobj, struct cache_set, kobj);
+	ssize_t v;
 
 	if (attr == &sysfs_unregister)
 		bch_cache_set_unregister(c);
@@ -698,8 +727,7 @@ STORE(__bch_cache_set)
 		      c->congested_write_threshold_us);
 
 	if (attr == &sysfs_errors) {
-		ssize_t v = bch_read_string_list(buf, error_actions);
-
+		v = __sysfs_match_string(error_actions, -1, buf);
 		if (v < 0)
 			return v;
 
@@ -714,8 +742,7 @@ STORE(__bch_cache_set)
 		c->error_decay = strtoul_or_return(buf) / 88;
 
 	if (attr == &sysfs_io_disable) {
-		int v = strtoul_or_return(buf);
-
+		v = strtoul_or_return(buf);
 		if (v) {
 			if (test_and_set_bit(CACHE_SET_IO_DISABLE,
 					     &c->flags))
@@ -929,6 +956,7 @@ SHOW_LOCKED(bch_cache)
 STORE(__bch_cache)
 {
 	struct cache *ca = container_of(kobj, struct cache, kobj);
+	ssize_t v;
 
 	if (attr == &sysfs_discard) {
 		bool v = strtoul_or_return(buf);
@@ -943,8 +971,7 @@ STORE(__bch_cache)
 	}
 
 	if (attr == &sysfs_cache_replacement_policy) {
-		ssize_t v = bch_read_string_list(buf, cache_replacement_policies);
-
+		v = __sysfs_match_string(cache_replacement_policies, -1, buf);
 		if (v < 0)
 			return v;
 
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index 74febd5..fc479b0 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -120,41 +120,6 @@ ssize_t bch_hprint(char *buf, int64_t v)
 		return sprintf(buf, "%llu.%i%c", q, t * 10 / 1024, units[u]);
 }
 
-ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[],
-			    size_t selected)
-{
-	char *out = buf;
-	size_t i;
-
-	for (i = 0; list[i]; i++)
-		out += snprintf(out, buf + size - out,
-				i == selected ? "[%s] " : "%s ", list[i]);
-
-	out[-1] = '\n';
-	return out - buf;
-}
-
-ssize_t bch_read_string_list(const char *buf, const char * const list[])
-{
-	size_t i;
-	char *s, *d = kstrndup(buf, PAGE_SIZE - 1, GFP_KERNEL);
-	if (!d)
-		return -ENOMEM;
-
-	s = strim(d);
-
-	for (i = 0; list[i]; i++)
-		if (!strcmp(list[i], s))
-			break;
-
-	kfree(d);
-
-	if (!list[i])
-		return -EINVAL;
-
-	return i;
-}
-
 bool bch_is_zero(const char *p, size_t n)
 {
 	size_t i;
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index 2680245..cced87f 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -365,11 +365,6 @@ ssize_t bch_hprint(char *buf, int64_t v);
 bool bch_is_zero(const char *p, size_t n);
 int bch_parse_uuid(const char *s, char *uuid);
 
-ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[],
-			    size_t selected);
-
-ssize_t bch_read_string_list(const char *buf, const char * const list[]);
-
 struct time_stats {
 	spinlock_t	lock;
 	/*
diff --git a/drivers/md/dm-bio-prison-v1.c b/drivers/md/dm-bio-prison-v1.c
index 874841f..8e33a38 100644
--- a/drivers/md/dm-bio-prison-v1.c
+++ b/drivers/md/dm-bio-prison-v1.c
@@ -19,7 +19,7 @@
 
 struct dm_bio_prison {
 	spinlock_t lock;
-	mempool_t *cell_pool;
+	mempool_t cell_pool;
 	struct rb_root cells;
 };
 
@@ -34,14 +34,15 @@ static struct kmem_cache *_cell_cache;
 struct dm_bio_prison *dm_bio_prison_create(void)
 {
 	struct dm_bio_prison *prison = kmalloc(sizeof(*prison), GFP_KERNEL);
+	int ret;
 
 	if (!prison)
 		return NULL;
 
 	spin_lock_init(&prison->lock);
 
-	prison->cell_pool = mempool_create_slab_pool(MIN_CELLS, _cell_cache);
-	if (!prison->cell_pool) {
+	ret = mempool_init_slab_pool(&prison->cell_pool, MIN_CELLS, _cell_cache);
+	if (ret) {
 		kfree(prison);
 		return NULL;
 	}
@@ -54,21 +55,21 @@ EXPORT_SYMBOL_GPL(dm_bio_prison_create);
 
 void dm_bio_prison_destroy(struct dm_bio_prison *prison)
 {
-	mempool_destroy(prison->cell_pool);
+	mempool_exit(&prison->cell_pool);
 	kfree(prison);
 }
 EXPORT_SYMBOL_GPL(dm_bio_prison_destroy);
 
 struct dm_bio_prison_cell *dm_bio_prison_alloc_cell(struct dm_bio_prison *prison, gfp_t gfp)
 {
-	return mempool_alloc(prison->cell_pool, gfp);
+	return mempool_alloc(&prison->cell_pool, gfp);
 }
 EXPORT_SYMBOL_GPL(dm_bio_prison_alloc_cell);
 
 void dm_bio_prison_free_cell(struct dm_bio_prison *prison,
 			     struct dm_bio_prison_cell *cell)
 {
-	mempool_free(cell, prison->cell_pool);
+	mempool_free(cell, &prison->cell_pool);
 }
 EXPORT_SYMBOL_GPL(dm_bio_prison_free_cell);
 
diff --git a/drivers/md/dm-bio-prison-v2.c b/drivers/md/dm-bio-prison-v2.c
index 8ce3a1a..601b156 100644
--- a/drivers/md/dm-bio-prison-v2.c
+++ b/drivers/md/dm-bio-prison-v2.c
@@ -21,7 +21,7 @@ struct dm_bio_prison_v2 {
 	struct workqueue_struct *wq;
 
 	spinlock_t lock;
-	mempool_t *cell_pool;
+	mempool_t cell_pool;
 	struct rb_root cells;
 };
 
@@ -36,6 +36,7 @@ static struct kmem_cache *_cell_cache;
 struct dm_bio_prison_v2 *dm_bio_prison_create_v2(struct workqueue_struct *wq)
 {
 	struct dm_bio_prison_v2 *prison = kmalloc(sizeof(*prison), GFP_KERNEL);
+	int ret;
 
 	if (!prison)
 		return NULL;
@@ -43,8 +44,8 @@ struct dm_bio_prison_v2 *dm_bio_prison_create_v2(struct workqueue_struct *wq)
 	prison->wq = wq;
 	spin_lock_init(&prison->lock);
 
-	prison->cell_pool = mempool_create_slab_pool(MIN_CELLS, _cell_cache);
-	if (!prison->cell_pool) {
+	ret = mempool_init_slab_pool(&prison->cell_pool, MIN_CELLS, _cell_cache);
+	if (ret) {
 		kfree(prison);
 		return NULL;
 	}
@@ -57,21 +58,21 @@ EXPORT_SYMBOL_GPL(dm_bio_prison_create_v2);
 
 void dm_bio_prison_destroy_v2(struct dm_bio_prison_v2 *prison)
 {
-	mempool_destroy(prison->cell_pool);
+	mempool_exit(&prison->cell_pool);
 	kfree(prison);
 }
 EXPORT_SYMBOL_GPL(dm_bio_prison_destroy_v2);
 
 struct dm_bio_prison_cell_v2 *dm_bio_prison_alloc_cell_v2(struct dm_bio_prison_v2 *prison, gfp_t gfp)
 {
-	return mempool_alloc(prison->cell_pool, gfp);
+	return mempool_alloc(&prison->cell_pool, gfp);
 }
 EXPORT_SYMBOL_GPL(dm_bio_prison_alloc_cell_v2);
 
 void dm_bio_prison_free_cell_v2(struct dm_bio_prison_v2 *prison,
 				struct dm_bio_prison_cell_v2 *cell)
 {
-	mempool_free(cell, prison->cell_pool);
+	mempool_free(cell, &prison->cell_pool);
 }
 EXPORT_SYMBOL_GPL(dm_bio_prison_free_cell_v2);
 
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index da20863..001c712 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -447,9 +447,9 @@ struct cache {
 	struct work_struct migration_worker;
 	struct delayed_work waker;
 	struct dm_bio_prison_v2 *prison;
-	struct bio_set *bs;
+	struct bio_set bs;
 
-	mempool_t *migration_pool;
+	mempool_t migration_pool;
 
 	struct dm_cache_policy *policy;
 	unsigned policy_nr_args;
@@ -550,7 +550,7 @@ static struct dm_cache_migration *alloc_migration(struct cache *cache)
 {
 	struct dm_cache_migration *mg;
 
-	mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT);
+	mg = mempool_alloc(&cache->migration_pool, GFP_NOWAIT);
 	if (!mg)
 		return NULL;
 
@@ -569,7 +569,7 @@ static void free_migration(struct dm_cache_migration *mg)
 	if (atomic_dec_and_test(&cache->nr_allocated_migrations))
 		wake_up(&cache->migration_wait);
 
-	mempool_free(mg, cache->migration_pool);
+	mempool_free(mg, &cache->migration_pool);
 }
 
 /*----------------------------------------------------------------*/
@@ -924,7 +924,7 @@ static void issue_op(struct bio *bio, void *context)
 static void remap_to_origin_and_cache(struct cache *cache, struct bio *bio,
 				      dm_oblock_t oblock, dm_cblock_t cblock)
 {
-	struct bio *origin_bio = bio_clone_fast(bio, GFP_NOIO, cache->bs);
+	struct bio *origin_bio = bio_clone_fast(bio, GFP_NOIO, &cache->bs);
 
 	BUG_ON(!origin_bio);
 
@@ -2011,7 +2011,7 @@ static void destroy(struct cache *cache)
 {
 	unsigned i;
 
-	mempool_destroy(cache->migration_pool);
+	mempool_exit(&cache->migration_pool);
 
 	if (cache->prison)
 		dm_bio_prison_destroy_v2(cache->prison);
@@ -2047,8 +2047,7 @@ static void destroy(struct cache *cache)
 		kfree(cache->ctr_args[i]);
 	kfree(cache->ctr_args);
 
-	if (cache->bs)
-		bioset_free(cache->bs);
+	bioset_exit(&cache->bs);
 
 	kfree(cache);
 }
@@ -2498,8 +2497,8 @@ static int cache_create(struct cache_args *ca, struct cache **result)
 	cache->features = ca->features;
 	if (writethrough_mode(cache)) {
 		/* Create bioset for writethrough bios issued to origin */
-		cache->bs = bioset_create(BIO_POOL_SIZE, 0, 0);
-		if (!cache->bs)
+		r = bioset_init(&cache->bs, BIO_POOL_SIZE, 0, 0);
+		if (r)
 			goto bad;
 	}
 
@@ -2630,9 +2629,9 @@ static int cache_create(struct cache_args *ca, struct cache **result)
 		goto bad;
 	}
 
-	cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE,
-							 migration_cache);
-	if (!cache->migration_pool) {
+	r = mempool_init_slab_pool(&cache->migration_pool, MIGRATION_POOL_SIZE,
+				   migration_cache);
+	if (r) {
 		*error = "Error creating cache's migration mempool";
 		goto bad;
 	}
diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index 3222e21..f21c5d2 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -91,8 +91,8 @@ struct mapped_device {
 	/*
 	 * io objects are allocated from here.
 	 */
-	struct bio_set *io_bs;
-	struct bio_set *bs;
+	struct bio_set io_bs;
+	struct bio_set bs;
 
 	/*
 	 * freeze/thaw support require holding onto a super block
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 44ff473..da02f4d 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -143,14 +143,14 @@ struct crypt_config {
 	 * pool for per bio private data, crypto requests,
 	 * encryption requeusts/buffer pages and integrity tags
 	 */
-	mempool_t *req_pool;
-	mempool_t *page_pool;
-	mempool_t *tag_pool;
+	mempool_t req_pool;
+	mempool_t page_pool;
+	mempool_t tag_pool;
 	unsigned tag_pool_max_sectors;
 
 	struct percpu_counter n_allocated_pages;
 
-	struct bio_set *bs;
+	struct bio_set bs;
 	struct mutex bio_alloc_lock;
 
 	struct workqueue_struct *io_queue;
@@ -1245,7 +1245,7 @@ static void crypt_alloc_req_skcipher(struct crypt_config *cc,
 	unsigned key_index = ctx->cc_sector & (cc->tfms_count - 1);
 
 	if (!ctx->r.req)
-		ctx->r.req = mempool_alloc(cc->req_pool, GFP_NOIO);
+		ctx->r.req = mempool_alloc(&cc->req_pool, GFP_NOIO);
 
 	skcipher_request_set_tfm(ctx->r.req, cc->cipher_tfm.tfms[key_index]);
 
@@ -1262,7 +1262,7 @@ static void crypt_alloc_req_aead(struct crypt_config *cc,
 				 struct convert_context *ctx)
 {
 	if (!ctx->r.req_aead)
-		ctx->r.req_aead = mempool_alloc(cc->req_pool, GFP_NOIO);
+		ctx->r.req_aead = mempool_alloc(&cc->req_pool, GFP_NOIO);
 
 	aead_request_set_tfm(ctx->r.req_aead, cc->cipher_tfm.tfms_aead[0]);
 
@@ -1290,7 +1290,7 @@ static void crypt_free_req_skcipher(struct crypt_config *cc,
 	struct dm_crypt_io *io = dm_per_bio_data(base_bio, cc->per_bio_data_size);
 
 	if ((struct skcipher_request *)(io + 1) != req)
-		mempool_free(req, cc->req_pool);
+		mempool_free(req, &cc->req_pool);
 }
 
 static void crypt_free_req_aead(struct crypt_config *cc,
@@ -1299,7 +1299,7 @@ static void crypt_free_req_aead(struct crypt_config *cc,
 	struct dm_crypt_io *io = dm_per_bio_data(base_bio, cc->per_bio_data_size);
 
 	if ((struct aead_request *)(io + 1) != req)
-		mempool_free(req, cc->req_pool);
+		mempool_free(req, &cc->req_pool);
 }
 
 static void crypt_free_req(struct crypt_config *cc, void *req, struct bio *base_bio)
@@ -1409,7 +1409,7 @@ retry:
 	if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM))
 		mutex_lock(&cc->bio_alloc_lock);
 
-	clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs);
+	clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, &cc->bs);
 	if (!clone)
 		goto out;
 
@@ -1418,7 +1418,7 @@ retry:
 	remaining_size = size;
 
 	for (i = 0; i < nr_iovecs; i++) {
-		page = mempool_alloc(cc->page_pool, gfp_mask);
+		page = mempool_alloc(&cc->page_pool, gfp_mask);
 		if (!page) {
 			crypt_free_buffer_pages(cc, clone);
 			bio_put(clone);
@@ -1453,7 +1453,7 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone)
 
 	bio_for_each_segment_all(bv, clone, i) {
 		BUG_ON(!bv->bv_page);
-		mempool_free(bv->bv_page, cc->page_pool);
+		mempool_free(bv->bv_page, &cc->page_pool);
 	}
 }
 
@@ -1492,7 +1492,7 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
 		crypt_free_req(cc, io->ctx.r.req, base_bio);
 
 	if (unlikely(io->integrity_metadata_from_pool))
-		mempool_free(io->integrity_metadata, io->cc->tag_pool);
+		mempool_free(io->integrity_metadata, &io->cc->tag_pool);
 	else
 		kfree(io->integrity_metadata);
 
@@ -1565,7 +1565,7 @@ static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
 	 * biovecs we don't need to worry about the block layer
 	 * modifying the biovec array; so leverage bio_clone_fast().
 	 */
-	clone = bio_clone_fast(io->base_bio, gfp, cc->bs);
+	clone = bio_clone_fast(io->base_bio, gfp, &cc->bs);
 	if (!clone)
 		return 1;
 
@@ -2219,15 +2219,13 @@ static void crypt_dtr(struct dm_target *ti)
 
 	crypt_free_tfms(cc);
 
-	if (cc->bs)
-		bioset_free(cc->bs);
+	bioset_exit(&cc->bs);
 
-	mempool_destroy(cc->page_pool);
-	mempool_destroy(cc->req_pool);
-	mempool_destroy(cc->tag_pool);
+	mempool_exit(&cc->page_pool);
+	mempool_exit(&cc->req_pool);
+	mempool_exit(&cc->tag_pool);
 
-	if (cc->page_pool)
-		WARN_ON(percpu_counter_sum(&cc->n_allocated_pages) != 0);
+	WARN_ON(percpu_counter_sum(&cc->n_allocated_pages) != 0);
 	percpu_counter_destroy(&cc->n_allocated_pages);
 
 	if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
@@ -2743,8 +2741,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		iv_size_padding = align_mask;
 	}
 
-	ret = -ENOMEM;
-
 	/*  ...| IV + padding | original IV | original sec. number | bio tag offset | */
 	additional_req_size = sizeof(struct dm_crypt_request) +
 		iv_size_padding + cc->iv_size +
@@ -2752,8 +2748,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		sizeof(uint64_t) +
 		sizeof(unsigned int);
 
-	cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start + additional_req_size);
-	if (!cc->req_pool) {
+	ret = mempool_init_kmalloc_pool(&cc->req_pool, MIN_IOS, cc->dmreq_start + additional_req_size);
+	if (ret) {
 		ti->error = "Cannot allocate crypt request mempool";
 		goto bad;
 	}
@@ -2762,14 +2758,14 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		ALIGN(sizeof(struct dm_crypt_io) + cc->dmreq_start + additional_req_size,
 		      ARCH_KMALLOC_MINALIGN);
 
-	cc->page_pool = mempool_create(BIO_MAX_PAGES, crypt_page_alloc, crypt_page_free, cc);
-	if (!cc->page_pool) {
+	ret = mempool_init(&cc->page_pool, BIO_MAX_PAGES, crypt_page_alloc, crypt_page_free, cc);
+	if (ret) {
 		ti->error = "Cannot allocate page mempool";
 		goto bad;
 	}
 
-	cc->bs = bioset_create(MIN_IOS, 0, BIOSET_NEED_BVECS);
-	if (!cc->bs) {
+	ret = bioset_init(&cc->bs, MIN_IOS, 0, BIOSET_NEED_BVECS);
+	if (ret) {
 		ti->error = "Cannot allocate crypt bioset";
 		goto bad;
 	}
@@ -2806,11 +2802,10 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		if (!cc->tag_pool_max_sectors)
 			cc->tag_pool_max_sectors = 1;
 
-		cc->tag_pool = mempool_create_kmalloc_pool(MIN_IOS,
+		ret = mempool_init_kmalloc_pool(&cc->tag_pool, MIN_IOS,
 			cc->tag_pool_max_sectors * cc->on_disk_tag_size);
-		if (!cc->tag_pool) {
+		if (ret) {
 			ti->error = "Cannot allocate integrity tags mempool";
-			ret = -ENOMEM;
 			goto bad;
 		}
 
@@ -2903,7 +2898,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio)
 				GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN)))) {
 			if (bio_sectors(bio) > cc->tag_pool_max_sectors)
 				dm_accept_partial_bio(bio, cc->tag_pool_max_sectors);
-			io->integrity_metadata = mempool_alloc(cc->tag_pool, GFP_NOIO);
+			io->integrity_metadata = mempool_alloc(&cc->tag_pool, GFP_NOIO);
 			io->integrity_metadata_from_pool = true;
 		}
 	}
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index 514fb4a..fc68c7a 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -142,7 +142,7 @@ struct dm_integrity_c {
 	unsigned tag_size;
 	__s8 log2_tag_size;
 	sector_t start;
-	mempool_t *journal_io_mempool;
+	mempool_t journal_io_mempool;
 	struct dm_io_client *io;
 	struct dm_bufio_client *bufio;
 	struct workqueue_struct *metadata_wq;
@@ -1817,7 +1817,7 @@ static void complete_copy_from_journal(unsigned long error, void *context)
 	struct journal_completion *comp = io->comp;
 	struct dm_integrity_c *ic = comp->ic;
 	remove_range(ic, &io->range);
-	mempool_free(io, ic->journal_io_mempool);
+	mempool_free(io, &ic->journal_io_mempool);
 	if (unlikely(error != 0))
 		dm_integrity_io_error(ic, "copying from journal", -EIO);
 	complete_journal_op(comp);
@@ -1886,7 +1886,7 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start,
 			}
 			next_loop = k - 1;
 
-			io = mempool_alloc(ic->journal_io_mempool, GFP_NOIO);
+			io = mempool_alloc(&ic->journal_io_mempool, GFP_NOIO);
 			io->comp = &comp;
 			io->range.logical_sector = sec;
 			io->range.n_sectors = (k - j) << ic->sb->log2_sectors_per_block;
@@ -1918,7 +1918,7 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start,
 				if (j == k) {
 					remove_range_unlocked(ic, &io->range);
 					spin_unlock_irq(&ic->endio_wait.lock);
-					mempool_free(io, ic->journal_io_mempool);
+					mempool_free(io, &ic->journal_io_mempool);
 					goto skip_io;
 				}
 				for (l = j; l < k; l++) {
@@ -2980,9 +2980,8 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		goto bad;
 	}
 
-	ic->journal_io_mempool = mempool_create_slab_pool(JOURNAL_IO_MEMPOOL, journal_io_cache);
-	if (!ic->journal_io_mempool) {
-		r = -ENOMEM;
+	r = mempool_init_slab_pool(&ic->journal_io_mempool, JOURNAL_IO_MEMPOOL, journal_io_cache);
+	if (r) {
 		ti->error = "Cannot allocate mempool";
 		goto bad;
 	}
@@ -3196,7 +3195,7 @@ static void dm_integrity_dtr(struct dm_target *ti)
 		destroy_workqueue(ic->writer_wq);
 	if (ic->bufio)
 		dm_bufio_client_destroy(ic->bufio);
-	mempool_destroy(ic->journal_io_mempool);
+	mempool_exit(&ic->journal_io_mempool);
 	if (ic->io)
 		dm_io_client_destroy(ic->io);
 	if (ic->dev)
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index a8d914d..53c6ed0 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -22,8 +22,8 @@
 #define DM_IO_MAX_REGIONS	BITS_PER_LONG
 
 struct dm_io_client {
-	mempool_t *pool;
-	struct bio_set *bios;
+	mempool_t pool;
+	struct bio_set bios;
 };
 
 /*
@@ -49,32 +49,33 @@ struct dm_io_client *dm_io_client_create(void)
 {
 	struct dm_io_client *client;
 	unsigned min_ios = dm_get_reserved_bio_based_ios();
+	int ret;
 
 	client = kmalloc(sizeof(*client), GFP_KERNEL);
 	if (!client)
 		return ERR_PTR(-ENOMEM);
 
-	client->pool = mempool_create_slab_pool(min_ios, _dm_io_cache);
-	if (!client->pool)
+	ret = mempool_init_slab_pool(&client->pool, min_ios, _dm_io_cache);
+	if (ret)
 		goto bad;
 
-	client->bios = bioset_create(min_ios, 0, BIOSET_NEED_BVECS);
-	if (!client->bios)
+	ret = bioset_init(&client->bios, min_ios, 0, BIOSET_NEED_BVECS);
+	if (ret)
 		goto bad;
 
 	return client;
 
    bad:
-	mempool_destroy(client->pool);
+	mempool_exit(&client->pool);
 	kfree(client);
-	return ERR_PTR(-ENOMEM);
+	return ERR_PTR(ret);
 }
 EXPORT_SYMBOL(dm_io_client_create);
 
 void dm_io_client_destroy(struct dm_io_client *client)
 {
-	mempool_destroy(client->pool);
-	bioset_free(client->bios);
+	mempool_exit(&client->pool);
+	bioset_exit(&client->bios);
 	kfree(client);
 }
 EXPORT_SYMBOL(dm_io_client_destroy);
@@ -120,7 +121,7 @@ static void complete_io(struct io *io)
 		invalidate_kernel_vmap_range(io->vma_invalidate_address,
 					     io->vma_invalidate_size);
 
-	mempool_free(io, io->client->pool);
+	mempool_free(io, &io->client->pool);
 	fn(error_bits, context);
 }
 
@@ -344,7 +345,7 @@ static void do_region(int op, int op_flags, unsigned region,
 					  dm_sector_div_up(remaining, (PAGE_SIZE >> SECTOR_SHIFT)));
 		}
 
-		bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios);
+		bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, &io->client->bios);
 		bio->bi_iter.bi_sector = where->sector + (where->count - remaining);
 		bio_set_dev(bio, where->bdev);
 		bio->bi_end_io = endio;
@@ -442,7 +443,7 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
 
 	init_completion(&sio.wait);
 
-	io = mempool_alloc(client->pool, GFP_NOIO);
+	io = mempool_alloc(&client->pool, GFP_NOIO);
 	io->error_bits = 0;
 	atomic_set(&io->count, 1); /* see dispatch_io() */
 	io->client = client;
@@ -474,7 +475,7 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions,
 		return -EIO;
 	}
 
-	io = mempool_alloc(client->pool, GFP_NOIO);
+	io = mempool_alloc(&client->pool, GFP_NOIO);
 	io->error_bits = 0;
 	atomic_set(&io->count, 1); /* see dispatch_io() */
 	io->client = client;
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index e6e7c68..c89a675 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -47,7 +47,7 @@ struct dm_kcopyd_client {
 	wait_queue_head_t destroyq;
 	atomic_t nr_jobs;
 
-	mempool_t *job_pool;
+	mempool_t job_pool;
 
 	struct workqueue_struct *kcopyd_wq;
 	struct work_struct kcopyd_work;
@@ -479,7 +479,7 @@ static int run_complete_job(struct kcopyd_job *job)
 	 */
 	if (job->master_job == job) {
 		mutex_destroy(&job->lock);
-		mempool_free(job, kc->job_pool);
+		mempool_free(job, &kc->job_pool);
 	}
 	fn(read_err, write_err, context);
 
@@ -751,7 +751,7 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
 	 * Allocate an array of jobs consisting of one master job
 	 * followed by SPLIT_COUNT sub jobs.
 	 */
-	job = mempool_alloc(kc->job_pool, GFP_NOIO);
+	job = mempool_alloc(&kc->job_pool, GFP_NOIO);
 	mutex_init(&job->lock);
 
 	/*
@@ -835,7 +835,7 @@ void *dm_kcopyd_prepare_callback(struct dm_kcopyd_client *kc,
 {
 	struct kcopyd_job *job;
 
-	job = mempool_alloc(kc->job_pool, GFP_NOIO);
+	job = mempool_alloc(&kc->job_pool, GFP_NOIO);
 
 	memset(job, 0, sizeof(struct kcopyd_job));
 	job->kc = kc;
@@ -879,7 +879,7 @@ int kcopyd_cancel(struct kcopyd_job *job, int block)
  *---------------------------------------------------------------*/
 struct dm_kcopyd_client *dm_kcopyd_client_create(struct dm_kcopyd_throttle *throttle)
 {
-	int r = -ENOMEM;
+	int r;
 	struct dm_kcopyd_client *kc;
 
 	kc = kmalloc(sizeof(*kc), GFP_KERNEL);
@@ -892,14 +892,16 @@ struct dm_kcopyd_client *dm_kcopyd_client_create(struct dm_kcopyd_throttle *thro
 	INIT_LIST_HEAD(&kc->pages_jobs);
 	kc->throttle = throttle;
 
-	kc->job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache);
-	if (!kc->job_pool)
+	r = mempool_init_slab_pool(&kc->job_pool, MIN_JOBS, _job_cache);
+	if (r)
 		goto bad_slab;
 
 	INIT_WORK(&kc->kcopyd_work, do_work);
 	kc->kcopyd_wq = alloc_workqueue("kcopyd", WQ_MEM_RECLAIM, 0);
-	if (!kc->kcopyd_wq)
+	if (!kc->kcopyd_wq) {
+		r = -ENOMEM;
 		goto bad_workqueue;
+	}
 
 	kc->pages = NULL;
 	kc->nr_reserved_pages = kc->nr_free_pages = 0;
@@ -923,7 +925,7 @@ bad_io_client:
 bad_client_pages:
 	destroy_workqueue(kc->kcopyd_wq);
 bad_workqueue:
-	mempool_destroy(kc->job_pool);
+	mempool_exit(&kc->job_pool);
 bad_slab:
 	kfree(kc);
 
@@ -942,7 +944,7 @@ void dm_kcopyd_client_destroy(struct dm_kcopyd_client *kc)
 	destroy_workqueue(kc->kcopyd_wq);
 	dm_io_client_destroy(kc->io_client);
 	client_free_pages(kc);
-	mempool_destroy(kc->job_pool);
+	mempool_exit(&kc->job_pool);
 	kfree(kc);
 }
 EXPORT_SYMBOL(dm_kcopyd_client_destroy);
diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
index 53b7b06d..52090be 100644
--- a/drivers/md/dm-log-userspace-base.c
+++ b/drivers/md/dm-log-userspace-base.c
@@ -76,7 +76,7 @@ struct log_c {
 	 */
 	uint32_t integrated_flush;
 
-	mempool_t *flush_entry_pool;
+	mempool_t flush_entry_pool;
 };
 
 static struct kmem_cache *_flush_entry_cache;
@@ -249,11 +249,10 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
 		goto out;
 	}
 
-	lc->flush_entry_pool = mempool_create_slab_pool(FLUSH_ENTRY_POOL_SIZE,
-							_flush_entry_cache);
-	if (!lc->flush_entry_pool) {
+	r = mempool_init_slab_pool(&lc->flush_entry_pool, FLUSH_ENTRY_POOL_SIZE,
+				   _flush_entry_cache);
+	if (r) {
 		DMERR("Failed to create flush_entry_pool");
-		r = -ENOMEM;
 		goto out;
 	}
 
@@ -313,7 +312,7 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
 out:
 	kfree(devices_rdata);
 	if (r) {
-		mempool_destroy(lc->flush_entry_pool);
+		mempool_exit(&lc->flush_entry_pool);
 		kfree(lc);
 		kfree(ctr_str);
 	} else {
@@ -342,7 +341,7 @@ static void userspace_dtr(struct dm_dirty_log *log)
 	if (lc->log_dev)
 		dm_put_device(lc->ti, lc->log_dev);
 
-	mempool_destroy(lc->flush_entry_pool);
+	mempool_exit(&lc->flush_entry_pool);
 
 	kfree(lc->usr_argv_str);
 	kfree(lc);
@@ -570,7 +569,7 @@ static int userspace_flush(struct dm_dirty_log *log)
 	int mark_list_is_empty;
 	int clear_list_is_empty;
 	struct dm_dirty_log_flush_entry *fe, *tmp_fe;
-	mempool_t *flush_entry_pool = lc->flush_entry_pool;
+	mempool_t *flush_entry_pool = &lc->flush_entry_pool;
 
 	spin_lock_irqsave(&lc->flush_lock, flags);
 	list_splice_init(&lc->mark_list, &mark_list);
@@ -653,7 +652,7 @@ static void userspace_mark_region(struct dm_dirty_log *log, region_t region)
 	struct dm_dirty_log_flush_entry *fe;
 
 	/* Wait for an allocation, but _never_ fail */
-	fe = mempool_alloc(lc->flush_entry_pool, GFP_NOIO);
+	fe = mempool_alloc(&lc->flush_entry_pool, GFP_NOIO);
 	BUG_ON(!fe);
 
 	spin_lock_irqsave(&lc->flush_lock, flags);
@@ -687,7 +686,7 @@ static void userspace_clear_region(struct dm_dirty_log *log, region_t region)
 	 * to cause the region to be resync'ed when the
 	 * device is activated next time.
 	 */
-	fe = mempool_alloc(lc->flush_entry_pool, GFP_ATOMIC);
+	fe = mempool_alloc(&lc->flush_entry_pool, GFP_ATOMIC);
 	if (!fe) {
 		DMERR("Failed to allocate memory to clear region.");
 		return;
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 203a041..d94ba6f 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -520,7 +520,8 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
 
 	bdev = pgpath->path.dev->bdev;
 	q = bdev_get_queue(bdev);
-	clone = blk_get_request(q, rq->cmd_flags | REQ_NOMERGE, GFP_ATOMIC);
+	clone = blk_get_request(q, rq->cmd_flags | REQ_NOMERGE,
+			BLK_MQ_REQ_NOWAIT);
 	if (IS_ERR(clone)) {
 		/* EBUSY, ENODEV or EWOULDBLOCK: requeue */
 		if (blk_queue_dying(q)) {
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
index 85c32b2..43149eb 100644
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -63,7 +63,7 @@ struct dm_region_hash {
 
 	/* hash table */
 	rwlock_t hash_lock;
-	mempool_t *region_pool;
+	mempool_t region_pool;
 	unsigned mask;
 	unsigned nr_buckets;
 	unsigned prime;
@@ -169,6 +169,7 @@ struct dm_region_hash *dm_region_hash_create(
 	struct dm_region_hash *rh;
 	unsigned nr_buckets, max_buckets;
 	size_t i;
+	int ret;
 
 	/*
 	 * Calculate a suitable number of buckets for our hash
@@ -220,9 +221,9 @@ struct dm_region_hash *dm_region_hash_create(
 	INIT_LIST_HEAD(&rh->failed_recovered_regions);
 	rh->flush_failure = 0;
 
-	rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS,
-						      sizeof(struct dm_region));
-	if (!rh->region_pool) {
+	ret = mempool_init_kmalloc_pool(&rh->region_pool, MIN_REGIONS,
+					sizeof(struct dm_region));
+	if (ret) {
 		vfree(rh->buckets);
 		kfree(rh);
 		rh = ERR_PTR(-ENOMEM);
@@ -242,14 +243,14 @@ void dm_region_hash_destroy(struct dm_region_hash *rh)
 		list_for_each_entry_safe(reg, nreg, rh->buckets + h,
 					 hash_list) {
 			BUG_ON(atomic_read(&reg->pending));
-			mempool_free(reg, rh->region_pool);
+			mempool_free(reg, &rh->region_pool);
 		}
 	}
 
 	if (rh->log)
 		dm_dirty_log_destroy(rh->log);
 
-	mempool_destroy(rh->region_pool);
+	mempool_exit(&rh->region_pool);
 	vfree(rh->buckets);
 	kfree(rh);
 }
@@ -287,7 +288,7 @@ static struct dm_region *__rh_alloc(struct dm_region_hash *rh, region_t region)
 {
 	struct dm_region *reg, *nreg;
 
-	nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC);
+	nreg = mempool_alloc(&rh->region_pool, GFP_ATOMIC);
 	if (unlikely(!nreg))
 		nreg = kmalloc(sizeof(*nreg), GFP_NOIO | __GFP_NOFAIL);
 
@@ -303,7 +304,7 @@ static struct dm_region *__rh_alloc(struct dm_region_hash *rh, region_t region)
 	reg = __rh_lookup(rh, region);
 	if (reg)
 		/* We lost the race. */
-		mempool_free(nreg, rh->region_pool);
+		mempool_free(nreg, &rh->region_pool);
 	else {
 		__rh_insert(rh, nreg);
 		if (nreg->state == DM_RH_CLEAN) {
@@ -481,17 +482,17 @@ void dm_rh_update_states(struct dm_region_hash *rh, int errors_handled)
 	list_for_each_entry_safe(reg, next, &recovered, list) {
 		rh->log->type->clear_region(rh->log, reg->key);
 		complete_resync_work(reg, 1);
-		mempool_free(reg, rh->region_pool);
+		mempool_free(reg, &rh->region_pool);
 	}
 
 	list_for_each_entry_safe(reg, next, &failed_recovered, list) {
 		complete_resync_work(reg, errors_handled ? 0 : 1);
-		mempool_free(reg, rh->region_pool);
+		mempool_free(reg, &rh->region_pool);
 	}
 
 	list_for_each_entry_safe(reg, next, &clean, list) {
 		rh->log->type->clear_region(rh->log, reg->key);
-		mempool_free(reg, rh->region_pool);
+		mempool_free(reg, &rh->region_pool);
 	}
 
 	rh->log->type->flush(rh->log);
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index bf0b840..6e547b8 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -406,7 +406,7 @@ static blk_status_t dm_dispatch_clone_request(struct request *clone, struct requ
 	if (blk_queue_io_stat(clone->q))
 		clone->rq_flags |= RQF_IO_STAT;
 
-	clone->start_time = jiffies;
+	clone->start_time_ns = ktime_get_ns();
 	r = blk_insert_cloned_request(clone->q, clone);
 	if (r != BLK_STS_OK && r != BLK_STS_RESOURCE && r != BLK_STS_DEV_RESOURCE)
 		/* must complete clone in terms of original request */
@@ -433,7 +433,7 @@ static int setup_clone(struct request *clone, struct request *rq,
 {
 	int r;
 
-	r = blk_rq_prep_clone(clone, rq, tio->md->bs, gfp_mask,
+	r = blk_rq_prep_clone(clone, rq, &tio->md->bs, gfp_mask,
 			      dm_rq_bio_constructor, tio);
 	if (r)
 		return r;
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 216035b..b11ddc5 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -87,7 +87,7 @@ struct dm_snapshot {
 	 */
 	struct list_head out_of_order_list;
 
-	mempool_t *pending_pool;
+	mempool_t pending_pool;
 
 	struct dm_exception_table pending;
 	struct dm_exception_table complete;
@@ -682,7 +682,7 @@ static void free_completed_exception(struct dm_exception *e)
 
 static struct dm_snap_pending_exception *alloc_pending_exception(struct dm_snapshot *s)
 {
-	struct dm_snap_pending_exception *pe = mempool_alloc(s->pending_pool,
+	struct dm_snap_pending_exception *pe = mempool_alloc(&s->pending_pool,
 							     GFP_NOIO);
 
 	atomic_inc(&s->pending_exceptions_count);
@@ -695,7 +695,7 @@ static void free_pending_exception(struct dm_snap_pending_exception *pe)
 {
 	struct dm_snapshot *s = pe->snap;
 
-	mempool_free(pe, s->pending_pool);
+	mempool_free(pe, &s->pending_pool);
 	smp_mb__before_atomic();
 	atomic_dec(&s->pending_exceptions_count);
 }
@@ -1196,10 +1196,9 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad_kcopyd;
 	}
 
-	s->pending_pool = mempool_create_slab_pool(MIN_IOS, pending_cache);
-	if (!s->pending_pool) {
+	r = mempool_init_slab_pool(&s->pending_pool, MIN_IOS, pending_cache);
+	if (r) {
 		ti->error = "Could not allocate mempool for pending exceptions";
-		r = -ENOMEM;
 		goto bad_pending_pool;
 	}
 
@@ -1259,7 +1258,7 @@ bad_read_metadata:
 	unregister_snapshot(s);
 
 bad_load_and_register:
-	mempool_destroy(s->pending_pool);
+	mempool_exit(&s->pending_pool);
 
 bad_pending_pool:
 	dm_kcopyd_client_destroy(s->kcopyd_client);
@@ -1355,7 +1354,7 @@ static void snapshot_dtr(struct dm_target *ti)
 	while (atomic_read(&s->pending_exceptions_count))
 		msleep(1);
 	/*
-	 * Ensure instructions in mempool_destroy aren't reordered
+	 * Ensure instructions in mempool_exit aren't reordered
 	 * before atomic_read.
 	 */
 	smp_mb();
@@ -1367,7 +1366,7 @@ static void snapshot_dtr(struct dm_target *ti)
 
 	__free_exceptions(s);
 
-	mempool_destroy(s->pending_pool);
+	mempool_exit(&s->pending_pool);
 
 	dm_exception_store_destroy(s->store);
 
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index b111074..6c92382 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -260,7 +260,7 @@ struct pool {
 	struct dm_deferred_set *all_io_ds;
 
 	struct dm_thin_new_mapping *next_mapping;
-	mempool_t *mapping_pool;
+	mempool_t mapping_pool;
 
 	process_bio_fn process_bio;
 	process_bio_fn process_discard;
@@ -917,7 +917,7 @@ static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
 {
 	cell_error(m->tc->pool, m->cell);
 	list_del(&m->list);
-	mempool_free(m, m->tc->pool->mapping_pool);
+	mempool_free(m, &m->tc->pool->mapping_pool);
 }
 
 static void process_prepared_mapping(struct dm_thin_new_mapping *m)
@@ -961,7 +961,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
 
 out:
 	list_del(&m->list);
-	mempool_free(m, pool->mapping_pool);
+	mempool_free(m, &pool->mapping_pool);
 }
 
 /*----------------------------------------------------------------*/
@@ -971,7 +971,7 @@ static void free_discard_mapping(struct dm_thin_new_mapping *m)
 	struct thin_c *tc = m->tc;
 	if (m->cell)
 		cell_defer_no_holder(tc, m->cell);
-	mempool_free(m, tc->pool->mapping_pool);
+	mempool_free(m, &tc->pool->mapping_pool);
 }
 
 static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
@@ -999,7 +999,7 @@ static void process_prepared_discard_no_passdown(struct dm_thin_new_mapping *m)
 		bio_endio(m->bio);
 
 	cell_defer_no_holder(tc, m->cell);
-	mempool_free(m, tc->pool->mapping_pool);
+	mempool_free(m, &tc->pool->mapping_pool);
 }
 
 /*----------------------------------------------------------------*/
@@ -1092,7 +1092,7 @@ static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m)
 		metadata_operation_failed(pool, "dm_thin_remove_range", r);
 		bio_io_error(m->bio);
 		cell_defer_no_holder(tc, m->cell);
-		mempool_free(m, pool->mapping_pool);
+		mempool_free(m, &pool->mapping_pool);
 		return;
 	}
 
@@ -1105,7 +1105,7 @@ static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m)
 		metadata_operation_failed(pool, "dm_pool_inc_data_range", r);
 		bio_io_error(m->bio);
 		cell_defer_no_holder(tc, m->cell);
-		mempool_free(m, pool->mapping_pool);
+		mempool_free(m, &pool->mapping_pool);
 		return;
 	}
 
@@ -1150,7 +1150,7 @@ static void process_prepared_discard_passdown_pt2(struct dm_thin_new_mapping *m)
 		bio_endio(m->bio);
 
 	cell_defer_no_holder(tc, m->cell);
-	mempool_free(m, pool->mapping_pool);
+	mempool_free(m, &pool->mapping_pool);
 }
 
 static void process_prepared(struct pool *pool, struct list_head *head,
@@ -1196,7 +1196,7 @@ static int ensure_next_mapping(struct pool *pool)
 	if (pool->next_mapping)
 		return 0;
 
-	pool->next_mapping = mempool_alloc(pool->mapping_pool, GFP_ATOMIC);
+	pool->next_mapping = mempool_alloc(&pool->mapping_pool, GFP_ATOMIC);
 
 	return pool->next_mapping ? 0 : -ENOMEM;
 }
@@ -2835,8 +2835,8 @@ static void __pool_destroy(struct pool *pool)
 		destroy_workqueue(pool->wq);
 
 	if (pool->next_mapping)
-		mempool_free(pool->next_mapping, pool->mapping_pool);
-	mempool_destroy(pool->mapping_pool);
+		mempool_free(pool->next_mapping, &pool->mapping_pool);
+	mempool_exit(&pool->mapping_pool);
 	dm_deferred_set_destroy(pool->shared_read_ds);
 	dm_deferred_set_destroy(pool->all_io_ds);
 	kfree(pool);
@@ -2931,11 +2931,11 @@ static struct pool *pool_create(struct mapped_device *pool_md,
 	}
 
 	pool->next_mapping = NULL;
-	pool->mapping_pool = mempool_create_slab_pool(MAPPING_POOL_SIZE,
-						      _new_mapping_cache);
-	if (!pool->mapping_pool) {
+	r = mempool_init_slab_pool(&pool->mapping_pool, MAPPING_POOL_SIZE,
+				   _new_mapping_cache);
+	if (r) {
 		*error = "Error creating pool's mapping mempool";
-		err_p = ERR_PTR(-ENOMEM);
+		err_p = ERR_PTR(r);
 		goto bad_mapping_pool;
 	}
 
@@ -2955,7 +2955,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
 	return pool;
 
 bad_sort_array:
-	mempool_destroy(pool->mapping_pool);
+	mempool_exit(&pool->mapping_pool);
 bad_mapping_pool:
 	dm_deferred_set_destroy(pool->all_io_ds);
 bad_all_io_ds:
diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c
index e13f908..8640586 100644
--- a/drivers/md/dm-verity-fec.c
+++ b/drivers/md/dm-verity-fec.c
@@ -309,13 +309,13 @@ static int fec_alloc_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio)
 	unsigned n;
 
 	if (!fio->rs)
-		fio->rs = mempool_alloc(v->fec->rs_pool, GFP_NOIO);
+		fio->rs = mempool_alloc(&v->fec->rs_pool, GFP_NOIO);
 
 	fec_for_each_prealloc_buffer(n) {
 		if (fio->bufs[n])
 			continue;
 
-		fio->bufs[n] = mempool_alloc(v->fec->prealloc_pool, GFP_NOWAIT);
+		fio->bufs[n] = mempool_alloc(&v->fec->prealloc_pool, GFP_NOWAIT);
 		if (unlikely(!fio->bufs[n])) {
 			DMERR("failed to allocate FEC buffer");
 			return -ENOMEM;
@@ -327,7 +327,7 @@ static int fec_alloc_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio)
 		if (fio->bufs[n])
 			continue;
 
-		fio->bufs[n] = mempool_alloc(v->fec->extra_pool, GFP_NOWAIT);
+		fio->bufs[n] = mempool_alloc(&v->fec->extra_pool, GFP_NOWAIT);
 		/* we can manage with even one buffer if necessary */
 		if (unlikely(!fio->bufs[n]))
 			break;
@@ -335,7 +335,7 @@ static int fec_alloc_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio)
 	fio->nbufs = n;
 
 	if (!fio->output)
-		fio->output = mempool_alloc(v->fec->output_pool, GFP_NOIO);
+		fio->output = mempool_alloc(&v->fec->output_pool, GFP_NOIO);
 
 	return 0;
 }
@@ -493,15 +493,15 @@ void verity_fec_finish_io(struct dm_verity_io *io)
 	if (!verity_fec_is_enabled(io->v))
 		return;
 
-	mempool_free(fio->rs, f->rs_pool);
+	mempool_free(fio->rs, &f->rs_pool);
 
 	fec_for_each_prealloc_buffer(n)
-		mempool_free(fio->bufs[n], f->prealloc_pool);
+		mempool_free(fio->bufs[n], &f->prealloc_pool);
 
 	fec_for_each_extra_buffer(fio, n)
-		mempool_free(fio->bufs[n], f->extra_pool);
+		mempool_free(fio->bufs[n], &f->extra_pool);
 
-	mempool_free(fio->output, f->output_pool);
+	mempool_free(fio->output, &f->output_pool);
 }
 
 /*
@@ -549,9 +549,9 @@ void verity_fec_dtr(struct dm_verity *v)
 	if (!verity_fec_is_enabled(v))
 		goto out;
 
-	mempool_destroy(f->rs_pool);
-	mempool_destroy(f->prealloc_pool);
-	mempool_destroy(f->extra_pool);
+	mempool_exit(&f->rs_pool);
+	mempool_exit(&f->prealloc_pool);
+	mempool_exit(&f->extra_pool);
 	kmem_cache_destroy(f->cache);
 
 	if (f->data_bufio)
@@ -675,6 +675,7 @@ int verity_fec_ctr(struct dm_verity *v)
 	struct dm_verity_fec *f = v->fec;
 	struct dm_target *ti = v->ti;
 	u64 hash_blocks;
+	int ret;
 
 	if (!verity_fec_is_enabled(v)) {
 		verity_fec_dtr(v);
@@ -770,11 +771,11 @@ int verity_fec_ctr(struct dm_verity *v)
 	}
 
 	/* Preallocate an rs_control structure for each worker thread */
-	f->rs_pool = mempool_create(num_online_cpus(), fec_rs_alloc,
-				    fec_rs_free, (void *) v);
-	if (!f->rs_pool) {
+	ret = mempool_init(&f->rs_pool, num_online_cpus(), fec_rs_alloc,
+			   fec_rs_free, (void *) v);
+	if (ret) {
 		ti->error = "Cannot allocate RS pool";
-		return -ENOMEM;
+		return ret;
 	}
 
 	f->cache = kmem_cache_create("dm_verity_fec_buffers",
@@ -786,26 +787,26 @@ int verity_fec_ctr(struct dm_verity *v)
 	}
 
 	/* Preallocate DM_VERITY_FEC_BUF_PREALLOC buffers for each thread */
-	f->prealloc_pool = mempool_create_slab_pool(num_online_cpus() *
-						    DM_VERITY_FEC_BUF_PREALLOC,
-						    f->cache);
-	if (!f->prealloc_pool) {
+	ret = mempool_init_slab_pool(&f->prealloc_pool, num_online_cpus() *
+				     DM_VERITY_FEC_BUF_PREALLOC,
+				     f->cache);
+	if (ret) {
 		ti->error = "Cannot allocate FEC buffer prealloc pool";
-		return -ENOMEM;
+		return ret;
 	}
 
-	f->extra_pool = mempool_create_slab_pool(0, f->cache);
-	if (!f->extra_pool) {
+	ret = mempool_init_slab_pool(&f->extra_pool, 0, f->cache);
+	if (ret) {
 		ti->error = "Cannot allocate FEC buffer extra pool";
-		return -ENOMEM;
+		return ret;
 	}
 
 	/* Preallocate an output buffer for each thread */
-	f->output_pool = mempool_create_kmalloc_pool(num_online_cpus(),
-						     1 << v->data_dev_block_bits);
-	if (!f->output_pool) {
+	ret = mempool_init_kmalloc_pool(&f->output_pool, num_online_cpus(),
+					1 << v->data_dev_block_bits);
+	if (ret) {
 		ti->error = "Cannot allocate FEC output pool";
-		return -ENOMEM;
+		return ret;
 	}
 
 	/* Reserve space for our per-bio data */
diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h
index bb31ce8..6ad803b 100644
--- a/drivers/md/dm-verity-fec.h
+++ b/drivers/md/dm-verity-fec.h
@@ -46,10 +46,10 @@ struct dm_verity_fec {
 	sector_t hash_blocks;	/* blocks covered after v->hash_start */
 	unsigned char roots;	/* number of parity bytes, M-N of RS(M, N) */
 	unsigned char rsn;	/* N of RS(M, N) */
-	mempool_t *rs_pool;	/* mempool for fio->rs */
-	mempool_t *prealloc_pool;	/* mempool for preallocated buffers */
-	mempool_t *extra_pool;	/* mempool for extra buffers */
-	mempool_t *output_pool;	/* mempool for output */
+	mempool_t rs_pool;	/* mempool for fio->rs */
+	mempool_t prealloc_pool;	/* mempool for preallocated buffers */
+	mempool_t extra_pool;	/* mempool for extra buffers */
+	mempool_t output_pool;	/* mempool for output */
 	struct kmem_cache *cache;	/* cache for buffers */
 };
 
diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
index e73b077..30602d1 100644
--- a/drivers/md/dm-zoned-target.c
+++ b/drivers/md/dm-zoned-target.c
@@ -57,7 +57,7 @@ struct dmz_target {
 	struct workqueue_struct *chunk_wq;
 
 	/* For cloned BIOs to zones */
-	struct bio_set		*bio_set;
+	struct bio_set		bio_set;
 
 	/* For flush */
 	spinlock_t		flush_lock;
@@ -121,7 +121,7 @@ static int dmz_submit_read_bio(struct dmz_target *dmz, struct dm_zone *zone,
 	}
 
 	/* Partial BIO: we need to clone the BIO */
-	clone = bio_clone_fast(bio, GFP_NOIO, dmz->bio_set);
+	clone = bio_clone_fast(bio, GFP_NOIO, &dmz->bio_set);
 	if (!clone)
 		return -ENOMEM;
 
@@ -779,10 +779,9 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	ti->len = (sector_t)dmz_nr_chunks(dmz->metadata) << dev->zone_nr_sectors_shift;
 
 	/* Zone BIO */
-	dmz->bio_set = bioset_create(DMZ_MIN_BIOS, 0, 0);
-	if (!dmz->bio_set) {
+	ret = bioset_init(&dmz->bio_set, DMZ_MIN_BIOS, 0, 0);
+	if (ret) {
 		ti->error = "Create BIO set failed";
-		ret = -ENOMEM;
 		goto err_meta;
 	}
 
@@ -828,7 +827,7 @@ err_cwq:
 	destroy_workqueue(dmz->chunk_wq);
 err_bio:
 	mutex_destroy(&dmz->chunk_lock);
-	bioset_free(dmz->bio_set);
+	bioset_exit(&dmz->bio_set);
 err_meta:
 	dmz_dtr_metadata(dmz->metadata);
 err_dev:
@@ -858,7 +857,7 @@ static void dmz_dtr(struct dm_target *ti)
 
 	dmz_dtr_metadata(dmz->metadata);
 
-	bioset_free(dmz->bio_set);
+	bioset_exit(&dmz->bio_set);
 
 	dmz_put_zoned_device(ti);
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 0a7b010..98dff36 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -148,8 +148,8 @@ static int dm_numa_node = DM_NUMA_NODE;
  * For mempools pre-allocation at the table loading time.
  */
 struct dm_md_mempools {
-	struct bio_set *bs;
-	struct bio_set *io_bs;
+	struct bio_set bs;
+	struct bio_set io_bs;
 };
 
 struct table_device {
@@ -537,7 +537,7 @@ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
 	struct dm_target_io *tio;
 	struct bio *clone;
 
-	clone = bio_alloc_bioset(GFP_NOIO, 0, md->io_bs);
+	clone = bio_alloc_bioset(GFP_NOIO, 0, &md->io_bs);
 	if (!clone)
 		return NULL;
 
@@ -572,7 +572,7 @@ static struct dm_target_io *alloc_tio(struct clone_info *ci, struct dm_target *t
 		/* the dm_target_io embedded in ci->io is available */
 		tio = &ci->io->tio;
 	} else {
-		struct bio *clone = bio_alloc_bioset(gfp_mask, 0, ci->io->md->bs);
+		struct bio *clone = bio_alloc_bioset(gfp_mask, 0, &ci->io->md->bs);
 		if (!clone)
 			return NULL;
 
@@ -1583,7 +1583,7 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md,
 				 * won't be affected by this reassignment.
 				 */
 				struct bio *b = bio_clone_bioset(bio, GFP_NOIO,
-								 md->queue->bio_split);
+								 &md->queue->bio_split);
 				ci.io->orig_bio = b;
 				bio_advance(bio, (bio_sectors(bio) - ci.sector_count) << 9);
 				bio_chain(b, bio);
@@ -1785,10 +1785,8 @@ static void cleanup_mapped_device(struct mapped_device *md)
 		destroy_workqueue(md->wq);
 	if (md->kworker_task)
 		kthread_stop(md->kworker_task);
-	if (md->bs)
-		bioset_free(md->bs);
-	if (md->io_bs)
-		bioset_free(md->io_bs);
+	bioset_exit(&md->bs);
+	bioset_exit(&md->io_bs);
 
 	if (md->dax_dev) {
 		kill_dax(md->dax_dev);
@@ -1965,16 +1963,10 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
 		 * If so, reload bioset because front_pad may have changed
 		 * because a different table was loaded.
 		 */
-		if (md->bs) {
-			bioset_free(md->bs);
-			md->bs = NULL;
-		}
-		if (md->io_bs) {
-			bioset_free(md->io_bs);
-			md->io_bs = NULL;
-		}
+		bioset_exit(&md->bs);
+		bioset_exit(&md->io_bs);
 
-	} else if (md->bs) {
+	} else if (bioset_initialized(&md->bs)) {
 		/*
 		 * There's no need to reload with request-based dm
 		 * because the size of front_pad doesn't change.
@@ -1986,12 +1978,14 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
 		goto out;
 	}
 
-	BUG_ON(!p || md->bs || md->io_bs);
+	BUG_ON(!p ||
+	       bioset_initialized(&md->bs) ||
+	       bioset_initialized(&md->io_bs));
 
 	md->bs = p->bs;
-	p->bs = NULL;
+	memset(&p->bs, 0, sizeof(p->bs));
 	md->io_bs = p->io_bs;
-	p->io_bs = NULL;
+	memset(&p->io_bs, 0, sizeof(p->io_bs));
 out:
 	/* mempool bind completed, no longer need any mempools in the table */
 	dm_table_free_md_mempools(t);
@@ -2905,6 +2899,7 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_qu
 	struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id);
 	unsigned int pool_size = 0;
 	unsigned int front_pad, io_front_pad;
+	int ret;
 
 	if (!pools)
 		return NULL;
@@ -2916,10 +2911,10 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_qu
 		pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size);
 		front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
 		io_front_pad = roundup(front_pad,  __alignof__(struct dm_io)) + offsetof(struct dm_io, tio);
-		pools->io_bs = bioset_create(pool_size, io_front_pad, 0);
-		if (!pools->io_bs)
+		ret = bioset_init(&pools->io_bs, pool_size, io_front_pad, 0);
+		if (ret)
 			goto out;
-		if (integrity && bioset_integrity_create(pools->io_bs, pool_size))
+		if (integrity && bioset_integrity_create(&pools->io_bs, pool_size))
 			goto out;
 		break;
 	case DM_TYPE_REQUEST_BASED:
@@ -2932,11 +2927,11 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_qu
 		BUG();
 	}
 
-	pools->bs = bioset_create(pool_size, front_pad, 0);
-	if (!pools->bs)
+	ret = bioset_init(&pools->bs, pool_size, front_pad, 0);
+	if (ret)
 		goto out;
 
-	if (integrity && bioset_integrity_create(pools->bs, pool_size))
+	if (integrity && bioset_integrity_create(&pools->bs, pool_size))
 		goto out;
 
 	return pools;
@@ -2952,10 +2947,8 @@ void dm_free_md_mempools(struct dm_md_mempools *pools)
 	if (!pools)
 		return;
 
-	if (pools->bs)
-		bioset_free(pools->bs);
-	if (pools->io_bs)
-		bioset_free(pools->io_bs);
+	bioset_exit(&pools->bs);
+	bioset_exit(&pools->io_bs);
 
 	kfree(pools);
 }
diff --git a/drivers/md/md-faulty.c b/drivers/md/md-faulty.c
index 38264b3..c2fdf89 100644
--- a/drivers/md/md-faulty.c
+++ b/drivers/md/md-faulty.c
@@ -214,7 +214,7 @@ static bool faulty_make_request(struct mddev *mddev, struct bio *bio)
 		}
 	}
 	if (failit) {
-		struct bio *b = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
+		struct bio *b = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
 
 		bio_set_dev(b, conf->rdev->bdev);
 		b->bi_private = bio;
diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c
index 4964323..d45c697 100644
--- a/drivers/md/md-linear.c
+++ b/drivers/md/md-linear.c
@@ -269,7 +269,7 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio)
 	if (unlikely(bio_end_sector(bio) > end_sector)) {
 		/* This bio crosses a device boundary, so we have to split it */
 		struct bio *split = bio_split(bio, end_sector - bio_sector,
-					      GFP_NOIO, mddev->bio_set);
+					      GFP_NOIO, &mddev->bio_set);
 		bio_chain(split, bio);
 		generic_make_request(bio);
 		bio = split;
diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c
index 0a7e99d..f71fcdb 100644
--- a/drivers/md/md-multipath.c
+++ b/drivers/md/md-multipath.c
@@ -80,7 +80,7 @@ static void multipath_end_bh_io(struct multipath_bh *mp_bh, blk_status_t status)
 
 	bio->bi_status = status;
 	bio_endio(bio);
-	mempool_free(mp_bh, conf->pool);
+	mempool_free(mp_bh, &conf->pool);
 }
 
 static void multipath_end_request(struct bio *bio)
@@ -117,7 +117,7 @@ static bool multipath_make_request(struct mddev *mddev, struct bio * bio)
 		return true;
 	}
 
-	mp_bh = mempool_alloc(conf->pool, GFP_NOIO);
+	mp_bh = mempool_alloc(&conf->pool, GFP_NOIO);
 
 	mp_bh->master_bio = bio;
 	mp_bh->mddev = mddev;
@@ -125,7 +125,7 @@ static bool multipath_make_request(struct mddev *mddev, struct bio * bio)
 	mp_bh->path = multipath_map(conf);
 	if (mp_bh->path < 0) {
 		bio_io_error(bio);
-		mempool_free(mp_bh, conf->pool);
+		mempool_free(mp_bh, &conf->pool);
 		return true;
 	}
 	multipath = conf->multipaths + mp_bh->path;
@@ -378,6 +378,7 @@ static int multipath_run (struct mddev *mddev)
 	struct multipath_info *disk;
 	struct md_rdev *rdev;
 	int working_disks;
+	int ret;
 
 	if (md_check_no_bitmap(mddev))
 		return -EINVAL;
@@ -431,9 +432,9 @@ static int multipath_run (struct mddev *mddev)
 	}
 	mddev->degraded = conf->raid_disks - working_disks;
 
-	conf->pool = mempool_create_kmalloc_pool(NR_RESERVED_BUFS,
-						 sizeof(struct multipath_bh));
-	if (conf->pool == NULL)
+	ret = mempool_init_kmalloc_pool(&conf->pool, NR_RESERVED_BUFS,
+					sizeof(struct multipath_bh));
+	if (ret)
 		goto out_free_conf;
 
 	mddev->thread = md_register_thread(multipathd, mddev,
@@ -455,7 +456,7 @@ static int multipath_run (struct mddev *mddev)
 	return 0;
 
 out_free_conf:
-	mempool_destroy(conf->pool);
+	mempool_exit(&conf->pool);
 	kfree(conf->multipaths);
 	kfree(conf);
 	mddev->private = NULL;
@@ -467,7 +468,7 @@ static void multipath_free(struct mddev *mddev, void *priv)
 {
 	struct mpconf *conf = priv;
 
-	mempool_destroy(conf->pool);
+	mempool_exit(&conf->pool);
 	kfree(conf->multipaths);
 	kfree(conf);
 }
diff --git a/drivers/md/md-multipath.h b/drivers/md/md-multipath.h
index 0adb941..b3099e5 100644
--- a/drivers/md/md-multipath.h
+++ b/drivers/md/md-multipath.h
@@ -13,7 +13,7 @@ struct mpconf {
 	spinlock_t		device_lock;
 	struct list_head	retry_list;
 
-	mempool_t		*pool;
+	mempool_t		pool;
 };
 
 /*
diff --git a/drivers/md/md.c b/drivers/md/md.c
index c208c01..fc692b7 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -193,10 +193,10 @@ struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
 {
 	struct bio *b;
 
-	if (!mddev || !mddev->bio_set)
+	if (!mddev || !bioset_initialized(&mddev->bio_set))
 		return bio_alloc(gfp_mask, nr_iovecs);
 
-	b = bio_alloc_bioset(gfp_mask, nr_iovecs, mddev->bio_set);
+	b = bio_alloc_bioset(gfp_mask, nr_iovecs, &mddev->bio_set);
 	if (!b)
 		return NULL;
 	return b;
@@ -205,10 +205,10 @@ EXPORT_SYMBOL_GPL(bio_alloc_mddev);
 
 static struct bio *md_bio_alloc_sync(struct mddev *mddev)
 {
-	if (!mddev || !mddev->sync_set)
+	if (!mddev || !bioset_initialized(&mddev->sync_set))
 		return bio_alloc(GFP_NOIO, 1);
 
-	return bio_alloc_bioset(GFP_NOIO, 1, mddev->sync_set);
+	return bio_alloc_bioset(GFP_NOIO, 1, &mddev->sync_set);
 }
 
 /*
@@ -510,7 +510,10 @@ static void mddev_delayed_delete(struct work_struct *ws);
 
 static void mddev_put(struct mddev *mddev)
 {
-	struct bio_set *bs = NULL, *sync_bs = NULL;
+	struct bio_set bs, sync_bs;
+
+	memset(&bs, 0, sizeof(bs));
+	memset(&sync_bs, 0, sizeof(sync_bs));
 
 	if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
 		return;
@@ -521,8 +524,8 @@ static void mddev_put(struct mddev *mddev)
 		list_del_init(&mddev->all_mddevs);
 		bs = mddev->bio_set;
 		sync_bs = mddev->sync_set;
-		mddev->bio_set = NULL;
-		mddev->sync_set = NULL;
+		memset(&mddev->bio_set, 0, sizeof(mddev->bio_set));
+		memset(&mddev->sync_set, 0, sizeof(mddev->sync_set));
 		if (mddev->gendisk) {
 			/* We did a probe so need to clean up.  Call
 			 * queue_work inside the spinlock so that
@@ -535,10 +538,8 @@ static void mddev_put(struct mddev *mddev)
 			kfree(mddev);
 	}
 	spin_unlock(&all_mddevs_lock);
-	if (bs)
-		bioset_free(bs);
-	if (sync_bs)
-		bioset_free(sync_bs);
+	bioset_exit(&bs);
+	bioset_exit(&sync_bs);
 }
 
 static void md_safemode_timeout(struct timer_list *t);
@@ -2123,7 +2124,7 @@ int md_integrity_register(struct mddev *mddev)
 			       bdev_get_integrity(reference->bdev));
 
 	pr_debug("md: data integrity enabled on %s\n", mdname(mddev));
-	if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) {
+	if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE)) {
 		pr_err("md: failed to create integrity pool for %s\n",
 		       mdname(mddev));
 		return -EINVAL;
@@ -5497,17 +5498,15 @@ int md_run(struct mddev *mddev)
 		sysfs_notify_dirent_safe(rdev->sysfs_state);
 	}
 
-	if (mddev->bio_set == NULL) {
-		mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
-		if (!mddev->bio_set)
-			return -ENOMEM;
+	if (!bioset_initialized(&mddev->bio_set)) {
+		err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
+		if (err)
+			return err;
 	}
-	if (mddev->sync_set == NULL) {
-		mddev->sync_set = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
-		if (!mddev->sync_set) {
-			err = -ENOMEM;
+	if (!bioset_initialized(&mddev->sync_set)) {
+		err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
+		if (err)
 			goto abort;
-		}
 	}
 
 	spin_lock(&pers_lock);
@@ -5668,14 +5667,8 @@ int md_run(struct mddev *mddev)
 	return 0;
 
 abort:
-	if (mddev->bio_set) {
-		bioset_free(mddev->bio_set);
-		mddev->bio_set = NULL;
-	}
-	if (mddev->sync_set) {
-		bioset_free(mddev->sync_set);
-		mddev->sync_set = NULL;
-	}
+	bioset_exit(&mddev->bio_set);
+	bioset_exit(&mddev->sync_set);
 
 	return err;
 }
@@ -5888,14 +5881,8 @@ void md_stop(struct mddev *mddev)
 	 * This is called from dm-raid
 	 */
 	__md_stop(mddev);
-	if (mddev->bio_set) {
-		bioset_free(mddev->bio_set);
-		mddev->bio_set = NULL;
-	}
-	if (mddev->sync_set) {
-		bioset_free(mddev->sync_set);
-		mddev->sync_set = NULL;
-	}
+	bioset_exit(&mddev->bio_set);
+	bioset_exit(&mddev->sync_set);
 }
 
 EXPORT_SYMBOL_GPL(md_stop);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index fbc925c..3507cab2 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -452,8 +452,8 @@ struct mddev {
 
 	struct attribute_group		*to_remove;
 
-	struct bio_set			*bio_set;
-	struct bio_set			*sync_set; /* for sync operations like
+	struct bio_set			bio_set;
+	struct bio_set			sync_set; /* for sync operations like
 						   * metadata and bitmap writes
 						   */
 
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 584c103..65ae47a 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -479,7 +479,7 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
 	if (bio_end_sector(bio) > zone->zone_end) {
 		struct bio *split = bio_split(bio,
 			zone->zone_end - bio->bi_iter.bi_sector, GFP_NOIO,
-			mddev->bio_set);
+			&mddev->bio_set);
 		bio_chain(split, bio);
 		generic_make_request(bio);
 		bio = split;
@@ -582,7 +582,8 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
 	sector = bio_sector;
 
 	if (sectors < bio_sectors(bio)) {
-		struct bio *split = bio_split(bio, sectors, GFP_NOIO, mddev->bio_set);
+		struct bio *split = bio_split(bio, sectors, GFP_NOIO,
+					      &mddev->bio_set);
 		bio_chain(split, bio);
 		generic_make_request(bio);
 		bio = split;
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index e9e3308..bad2852 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -221,7 +221,7 @@ static void free_r1bio(struct r1bio *r1_bio)
 	struct r1conf *conf = r1_bio->mddev->private;
 
 	put_all_bios(conf, r1_bio);
-	mempool_free(r1_bio, conf->r1bio_pool);
+	mempool_free(r1_bio, &conf->r1bio_pool);
 }
 
 static void put_buf(struct r1bio *r1_bio)
@@ -236,7 +236,7 @@ static void put_buf(struct r1bio *r1_bio)
 			rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev);
 	}
 
-	mempool_free(r1_bio, conf->r1buf_pool);
+	mempool_free(r1_bio, &conf->r1buf_pool);
 
 	lower_barrier(conf, sect);
 }
@@ -1178,7 +1178,7 @@ alloc_r1bio(struct mddev *mddev, struct bio *bio)
 	struct r1conf *conf = mddev->private;
 	struct r1bio *r1_bio;
 
-	r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
+	r1_bio = mempool_alloc(&conf->r1bio_pool, GFP_NOIO);
 	/* Ensure no bio records IO_BLOCKED */
 	memset(r1_bio->bios, 0, conf->raid_disks * sizeof(r1_bio->bios[0]));
 	init_r1bio(r1_bio, mddev, bio);
@@ -1268,7 +1268,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
 
 	if (max_sectors < bio_sectors(bio)) {
 		struct bio *split = bio_split(bio, max_sectors,
-					      gfp, conf->bio_split);
+					      gfp, &conf->bio_split);
 		bio_chain(split, bio);
 		generic_make_request(bio);
 		bio = split;
@@ -1278,7 +1278,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
 
 	r1_bio->read_disk = rdisk;
 
-	read_bio = bio_clone_fast(bio, gfp, mddev->bio_set);
+	read_bio = bio_clone_fast(bio, gfp, &mddev->bio_set);
 
 	r1_bio->bios[rdisk] = read_bio;
 
@@ -1439,7 +1439,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 
 	if (max_sectors < bio_sectors(bio)) {
 		struct bio *split = bio_split(bio, max_sectors,
-					      GFP_NOIO, conf->bio_split);
+					      GFP_NOIO, &conf->bio_split);
 		bio_chain(split, bio);
 		generic_make_request(bio);
 		bio = split;
@@ -1479,9 +1479,9 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 
 		if (r1_bio->behind_master_bio)
 			mbio = bio_clone_fast(r1_bio->behind_master_bio,
-					      GFP_NOIO, mddev->bio_set);
+					      GFP_NOIO, &mddev->bio_set);
 		else
-			mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
+			mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
 
 		if (r1_bio->behind_master_bio) {
 			if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
@@ -1657,8 +1657,7 @@ static void close_sync(struct r1conf *conf)
 		_allow_barrier(conf, idx);
 	}
 
-	mempool_destroy(conf->r1buf_pool);
-	conf->r1buf_pool = NULL;
+	mempool_exit(&conf->r1buf_pool);
 }
 
 static int raid1_spare_active(struct mddev *mddev)
@@ -2348,10 +2347,10 @@ static int narrow_write_error(struct r1bio *r1_bio, int i)
 		if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
 			wbio = bio_clone_fast(r1_bio->behind_master_bio,
 					      GFP_NOIO,
-					      mddev->bio_set);
+					      &mddev->bio_set);
 		} else {
 			wbio = bio_clone_fast(r1_bio->master_bio, GFP_NOIO,
-					      mddev->bio_set);
+					      &mddev->bio_set);
 		}
 
 		bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
@@ -2564,17 +2563,15 @@ static int init_resync(struct r1conf *conf)
 	int buffs;
 
 	buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
-	BUG_ON(conf->r1buf_pool);
-	conf->r1buf_pool = mempool_create(buffs, r1buf_pool_alloc, r1buf_pool_free,
-					  conf->poolinfo);
-	if (!conf->r1buf_pool)
-		return -ENOMEM;
-	return 0;
+	BUG_ON(mempool_initialized(&conf->r1buf_pool));
+
+	return mempool_init(&conf->r1buf_pool, buffs, r1buf_pool_alloc,
+			    r1buf_pool_free, conf->poolinfo);
 }
 
 static struct r1bio *raid1_alloc_init_r1buf(struct r1conf *conf)
 {
-	struct r1bio *r1bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
+	struct r1bio *r1bio = mempool_alloc(&conf->r1buf_pool, GFP_NOIO);
 	struct resync_pages *rps;
 	struct bio *bio;
 	int i;
@@ -2617,7 +2614,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
 	int idx = sector_to_idx(sector_nr);
 	int page_idx = 0;
 
-	if (!conf->r1buf_pool)
+	if (!mempool_initialized(&conf->r1buf_pool))
 		if (init_resync(conf))
 			return 0;
 
@@ -2953,14 +2950,13 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 	if (!conf->poolinfo)
 		goto abort;
 	conf->poolinfo->raid_disks = mddev->raid_disks * 2;
-	conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
-					  r1bio_pool_free,
-					  conf->poolinfo);
-	if (!conf->r1bio_pool)
+	err = mempool_init(&conf->r1bio_pool, NR_RAID1_BIOS, r1bio_pool_alloc,
+			   r1bio_pool_free, conf->poolinfo);
+	if (err)
 		goto abort;
 
-	conf->bio_split = bioset_create(BIO_POOL_SIZE, 0, 0);
-	if (!conf->bio_split)
+	err = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0);
+	if (err)
 		goto abort;
 
 	conf->poolinfo->mddev = mddev;
@@ -3033,7 +3029,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 
  abort:
 	if (conf) {
-		mempool_destroy(conf->r1bio_pool);
+		mempool_exit(&conf->r1bio_pool);
 		kfree(conf->mirrors);
 		safe_put_page(conf->tmppage);
 		kfree(conf->poolinfo);
@@ -3041,8 +3037,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 		kfree(conf->nr_waiting);
 		kfree(conf->nr_queued);
 		kfree(conf->barrier);
-		if (conf->bio_split)
-			bioset_free(conf->bio_split);
+		bioset_exit(&conf->bio_split);
 		kfree(conf);
 	}
 	return ERR_PTR(err);
@@ -3144,7 +3139,7 @@ static void raid1_free(struct mddev *mddev, void *priv)
 {
 	struct r1conf *conf = priv;
 
-	mempool_destroy(conf->r1bio_pool);
+	mempool_exit(&conf->r1bio_pool);
 	kfree(conf->mirrors);
 	safe_put_page(conf->tmppage);
 	kfree(conf->poolinfo);
@@ -3152,8 +3147,7 @@ static void raid1_free(struct mddev *mddev, void *priv)
 	kfree(conf->nr_waiting);
 	kfree(conf->nr_queued);
 	kfree(conf->barrier);
-	if (conf->bio_split)
-		bioset_free(conf->bio_split);
+	bioset_exit(&conf->bio_split);
 	kfree(conf);
 }
 
@@ -3199,13 +3193,17 @@ static int raid1_reshape(struct mddev *mddev)
 	 * At the same time, we "pack" the devices so that all the missing
 	 * devices have the higher raid_disk numbers.
 	 */
-	mempool_t *newpool, *oldpool;
+	mempool_t newpool, oldpool;
 	struct pool_info *newpoolinfo;
 	struct raid1_info *newmirrors;
 	struct r1conf *conf = mddev->private;
 	int cnt, raid_disks;
 	unsigned long flags;
 	int d, d2;
+	int ret;
+
+	memset(&newpool, 0, sizeof(newpool));
+	memset(&oldpool, 0, sizeof(oldpool));
 
 	/* Cannot change chunk_size, layout, or level */
 	if (mddev->chunk_sectors != mddev->new_chunk_sectors ||
@@ -3237,17 +3235,17 @@ static int raid1_reshape(struct mddev *mddev)
 	newpoolinfo->mddev = mddev;
 	newpoolinfo->raid_disks = raid_disks * 2;
 
-	newpool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
-				 r1bio_pool_free, newpoolinfo);
-	if (!newpool) {
+	ret = mempool_init(&newpool, NR_RAID1_BIOS, r1bio_pool_alloc,
+			   r1bio_pool_free, newpoolinfo);
+	if (ret) {
 		kfree(newpoolinfo);
-		return -ENOMEM;
+		return ret;
 	}
 	newmirrors = kzalloc(sizeof(struct raid1_info) * raid_disks * 2,
 			     GFP_KERNEL);
 	if (!newmirrors) {
 		kfree(newpoolinfo);
-		mempool_destroy(newpool);
+		mempool_exit(&newpool);
 		return -ENOMEM;
 	}
 
@@ -3287,7 +3285,7 @@ static int raid1_reshape(struct mddev *mddev)
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	md_wakeup_thread(mddev->thread);
 
-	mempool_destroy(oldpool);
+	mempool_exit(&oldpool);
 	return 0;
 }
 
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index eb84bc6..e7ccad89 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -118,10 +118,10 @@ struct r1conf {
 	 * mempools - it changes when the array grows or shrinks
 	 */
 	struct pool_info	*poolinfo;
-	mempool_t		*r1bio_pool;
-	mempool_t		*r1buf_pool;
+	mempool_t		r1bio_pool;
+	mempool_t		r1buf_pool;
 
-	struct bio_set		*bio_split;
+	struct bio_set		bio_split;
 
 	/* temporary buffer to synchronous IO when attempting to repair
 	 * a read error.
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 3c60774..37d4b23 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -291,14 +291,14 @@ static void free_r10bio(struct r10bio *r10_bio)
 	struct r10conf *conf = r10_bio->mddev->private;
 
 	put_all_bios(conf, r10_bio);
-	mempool_free(r10_bio, conf->r10bio_pool);
+	mempool_free(r10_bio, &conf->r10bio_pool);
 }
 
 static void put_buf(struct r10bio *r10_bio)
 {
 	struct r10conf *conf = r10_bio->mddev->private;
 
-	mempool_free(r10_bio, conf->r10buf_pool);
+	mempool_free(r10_bio, &conf->r10buf_pool);
 
 	lower_barrier(conf);
 }
@@ -1204,7 +1204,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
 				   (unsigned long long)r10_bio->sector);
 	if (max_sectors < bio_sectors(bio)) {
 		struct bio *split = bio_split(bio, max_sectors,
-					      gfp, conf->bio_split);
+					      gfp, &conf->bio_split);
 		bio_chain(split, bio);
 		generic_make_request(bio);
 		bio = split;
@@ -1213,7 +1213,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
 	}
 	slot = r10_bio->read_slot;
 
-	read_bio = bio_clone_fast(bio, gfp, mddev->bio_set);
+	read_bio = bio_clone_fast(bio, gfp, &mddev->bio_set);
 
 	r10_bio->devs[slot].bio = read_bio;
 	r10_bio->devs[slot].rdev = rdev;
@@ -1261,7 +1261,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
 	} else
 		rdev = conf->mirrors[devnum].rdev;
 
-	mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
+	mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
 	if (replacement)
 		r10_bio->devs[n_copy].repl_bio = mbio;
 	else
@@ -1509,7 +1509,7 @@ retry_write:
 
 	if (r10_bio->sectors < bio_sectors(bio)) {
 		struct bio *split = bio_split(bio, r10_bio->sectors,
-					      GFP_NOIO, conf->bio_split);
+					      GFP_NOIO, &conf->bio_split);
 		bio_chain(split, bio);
 		generic_make_request(bio);
 		bio = split;
@@ -1533,7 +1533,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
 	struct r10conf *conf = mddev->private;
 	struct r10bio *r10_bio;
 
-	r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
+	r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO);
 
 	r10_bio->master_bio = bio;
 	r10_bio->sectors = sectors;
@@ -1732,8 +1732,7 @@ static void close_sync(struct r10conf *conf)
 	wait_barrier(conf);
 	allow_barrier(conf);
 
-	mempool_destroy(conf->r10buf_pool);
-	conf->r10buf_pool = NULL;
+	mempool_exit(&conf->r10buf_pool);
 }
 
 static int raid10_spare_active(struct mddev *mddev)
@@ -2583,7 +2582,7 @@ static int narrow_write_error(struct r10bio *r10_bio, int i)
 		if (sectors > sect_to_write)
 			sectors = sect_to_write;
 		/* Write at 'sector' for 'sectors' */
-		wbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
+		wbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
 		bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors);
 		wsector = r10_bio->devs[i].addr + (sector - r10_bio->sector);
 		wbio->bi_iter.bi_sector = wsector +
@@ -2816,25 +2815,25 @@ static void raid10d(struct md_thread *thread)
 
 static int init_resync(struct r10conf *conf)
 {
-	int buffs;
-	int i;
+	int ret, buffs, i;
 
 	buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
-	BUG_ON(conf->r10buf_pool);
+	BUG_ON(mempool_initialized(&conf->r10buf_pool));
 	conf->have_replacement = 0;
 	for (i = 0; i < conf->geo.raid_disks; i++)
 		if (conf->mirrors[i].replacement)
 			conf->have_replacement = 1;
-	conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);
-	if (!conf->r10buf_pool)
-		return -ENOMEM;
+	ret = mempool_init(&conf->r10buf_pool, buffs,
+			   r10buf_pool_alloc, r10buf_pool_free, conf);
+	if (ret)
+		return ret;
 	conf->next_resync = 0;
 	return 0;
 }
 
 static struct r10bio *raid10_alloc_init_r10buf(struct r10conf *conf)
 {
-	struct r10bio *r10bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
+	struct r10bio *r10bio = mempool_alloc(&conf->r10buf_pool, GFP_NOIO);
 	struct rsync_pages *rp;
 	struct bio *bio;
 	int nalloc;
@@ -2945,7 +2944,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 	sector_t chunk_mask = conf->geo.chunk_mask;
 	int page_idx = 0;
 
-	if (!conf->r10buf_pool)
+	if (!mempool_initialized(&conf->r10buf_pool))
 		if (init_resync(conf))
 			return 0;
 
@@ -3699,13 +3698,13 @@ static struct r10conf *setup_conf(struct mddev *mddev)
 
 	conf->geo = geo;
 	conf->copies = copies;
-	conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
-					   r10bio_pool_free, conf);
-	if (!conf->r10bio_pool)
+	err = mempool_init(&conf->r10bio_pool, NR_RAID10_BIOS, r10bio_pool_alloc,
+			   r10bio_pool_free, conf);
+	if (err)
 		goto out;
 
-	conf->bio_split = bioset_create(BIO_POOL_SIZE, 0, 0);
-	if (!conf->bio_split)
+	err = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0);
+	if (err)
 		goto out;
 
 	calc_sectors(conf, mddev->dev_sectors);
@@ -3733,6 +3732,7 @@ static struct r10conf *setup_conf(struct mddev *mddev)
 	init_waitqueue_head(&conf->wait_barrier);
 	atomic_set(&conf->nr_pending, 0);
 
+	err = -ENOMEM;
 	conf->thread = md_register_thread(raid10d, mddev, "raid10");
 	if (!conf->thread)
 		goto out;
@@ -3742,11 +3742,10 @@ static struct r10conf *setup_conf(struct mddev *mddev)
 
  out:
 	if (conf) {
-		mempool_destroy(conf->r10bio_pool);
+		mempool_exit(&conf->r10bio_pool);
 		kfree(conf->mirrors);
 		safe_put_page(conf->tmppage);
-		if (conf->bio_split)
-			bioset_free(conf->bio_split);
+		bioset_exit(&conf->bio_split);
 		kfree(conf);
 	}
 	return ERR_PTR(err);
@@ -3953,7 +3952,7 @@ static int raid10_run(struct mddev *mddev)
 
 out_free_conf:
 	md_unregister_thread(&mddev->thread);
-	mempool_destroy(conf->r10bio_pool);
+	mempool_exit(&conf->r10bio_pool);
 	safe_put_page(conf->tmppage);
 	kfree(conf->mirrors);
 	kfree(conf);
@@ -3966,13 +3965,12 @@ static void raid10_free(struct mddev *mddev, void *priv)
 {
 	struct r10conf *conf = priv;
 
-	mempool_destroy(conf->r10bio_pool);
+	mempool_exit(&conf->r10bio_pool);
 	safe_put_page(conf->tmppage);
 	kfree(conf->mirrors);
 	kfree(conf->mirrors_old);
 	kfree(conf->mirrors_new);
-	if (conf->bio_split)
-		bioset_free(conf->bio_split);
+	bioset_exit(&conf->bio_split);
 	kfree(conf);
 }
 
@@ -4543,7 +4541,7 @@ read_more:
 		 * on all the target devices.
 		 */
 		// FIXME
-		mempool_free(r10_bio, conf->r10buf_pool);
+		mempool_free(r10_bio, &conf->r10buf_pool);
 		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 		return sectors_done;
 	}
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index e2e8840..d3eaaf3 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -93,10 +93,10 @@ struct r10conf {
 						   */
 	wait_queue_head_t	wait_barrier;
 
-	mempool_t		*r10bio_pool;
-	mempool_t		*r10buf_pool;
+	mempool_t		r10bio_pool;
+	mempool_t		r10buf_pool;
 	struct page		*tmppage;
-	struct bio_set		*bio_split;
+	struct bio_set		bio_split;
 
 	/* When taking over an array from a different personality, we store
 	 * the new thread here until we fully activate the array.
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 3c65f52..2b775ab 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -125,9 +125,9 @@ struct r5l_log {
 	struct list_head no_mem_stripes;   /* pending stripes, -ENOMEM */
 
 	struct kmem_cache *io_kc;
-	mempool_t *io_pool;
-	struct bio_set *bs;
-	mempool_t *meta_pool;
+	mempool_t io_pool;
+	struct bio_set bs;
+	mempool_t meta_pool;
 
 	struct md_thread *reclaim_thread;
 	unsigned long reclaim_target;	/* number of space that need to be
@@ -579,7 +579,7 @@ static void r5l_log_endio(struct bio *bio)
 		md_error(log->rdev->mddev, log->rdev);
 
 	bio_put(bio);
-	mempool_free(io->meta_page, log->meta_pool);
+	mempool_free(io->meta_page, &log->meta_pool);
 
 	spin_lock_irqsave(&log->io_list_lock, flags);
 	__r5l_set_io_unit_state(io, IO_UNIT_IO_END);
@@ -748,7 +748,7 @@ static void r5l_submit_current_io(struct r5l_log *log)
 
 static struct bio *r5l_bio_alloc(struct r5l_log *log)
 {
-	struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs);
+	struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, &log->bs);
 
 	bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
 	bio_set_dev(bio, log->rdev->bdev);
@@ -780,7 +780,7 @@ static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
 	struct r5l_io_unit *io;
 	struct r5l_meta_block *block;
 
-	io = mempool_alloc(log->io_pool, GFP_ATOMIC);
+	io = mempool_alloc(&log->io_pool, GFP_ATOMIC);
 	if (!io)
 		return NULL;
 	memset(io, 0, sizeof(*io));
@@ -791,7 +791,7 @@ static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
 	bio_list_init(&io->flush_barriers);
 	io->state = IO_UNIT_RUNNING;
 
-	io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO);
+	io->meta_page = mempool_alloc(&log->meta_pool, GFP_NOIO);
 	block = page_address(io->meta_page);
 	clear_page(block);
 	block->magic = cpu_to_le32(R5LOG_MAGIC);
@@ -1223,7 +1223,7 @@ static bool r5l_complete_finished_ios(struct r5l_log *log)
 		log->next_checkpoint = io->log_start;
 
 		list_del(&io->log_sibling);
-		mempool_free(io, log->io_pool);
+		mempool_free(io, &log->io_pool);
 		r5l_run_no_mem_stripe(log);
 
 		found = true;
@@ -1647,7 +1647,7 @@ static int r5l_recovery_allocate_ra_pool(struct r5l_log *log,
 {
 	struct page *page;
 
-	ctx->ra_bio = bio_alloc_bioset(GFP_KERNEL, BIO_MAX_PAGES, log->bs);
+	ctx->ra_bio = bio_alloc_bioset(GFP_KERNEL, BIO_MAX_PAGES, &log->bs);
 	if (!ctx->ra_bio)
 		return -ENOMEM;
 
@@ -3066,6 +3066,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 	struct request_queue *q = bdev_get_queue(rdev->bdev);
 	struct r5l_log *log;
 	char b[BDEVNAME_SIZE];
+	int ret;
 
 	pr_debug("md/raid:%s: using device %s as journal\n",
 		 mdname(conf->mddev), bdevname(rdev->bdev, b));
@@ -3111,16 +3112,16 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 	if (!log->io_kc)
 		goto io_kc;
 
-	log->io_pool = mempool_create_slab_pool(R5L_POOL_SIZE, log->io_kc);
-	if (!log->io_pool)
+	ret = mempool_init_slab_pool(&log->io_pool, R5L_POOL_SIZE, log->io_kc);
+	if (ret)
 		goto io_pool;
 
-	log->bs = bioset_create(R5L_POOL_SIZE, 0, BIOSET_NEED_BVECS);
-	if (!log->bs)
+	ret = bioset_init(&log->bs, R5L_POOL_SIZE, 0, BIOSET_NEED_BVECS);
+	if (ret)
 		goto io_bs;
 
-	log->meta_pool = mempool_create_page_pool(R5L_POOL_SIZE, 0);
-	if (!log->meta_pool)
+	ret = mempool_init_page_pool(&log->meta_pool, R5L_POOL_SIZE, 0);
+	if (ret)
 		goto out_mempool;
 
 	spin_lock_init(&log->tree_lock);
@@ -3155,11 +3156,11 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 	rcu_assign_pointer(conf->log, NULL);
 	md_unregister_thread(&log->reclaim_thread);
 reclaim_thread:
-	mempool_destroy(log->meta_pool);
+	mempool_exit(&log->meta_pool);
 out_mempool:
-	bioset_free(log->bs);
+	bioset_exit(&log->bs);
 io_bs:
-	mempool_destroy(log->io_pool);
+	mempool_exit(&log->io_pool);
 io_pool:
 	kmem_cache_destroy(log->io_kc);
 io_kc:
@@ -3178,9 +3179,9 @@ void r5l_exit_log(struct r5conf *conf)
 	wake_up(&conf->mddev->sb_wait);
 	flush_work(&log->disable_writeback_work);
 	md_unregister_thread(&log->reclaim_thread);
-	mempool_destroy(log->meta_pool);
-	bioset_free(log->bs);
-	mempool_destroy(log->io_pool);
+	mempool_exit(&log->meta_pool);
+	bioset_exit(&log->bs);
+	mempool_exit(&log->io_pool);
 	kmem_cache_destroy(log->io_kc);
 	kfree(log);
 }
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index 42890a08..3a7c363 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -105,9 +105,9 @@ struct ppl_conf {
 	atomic64_t seq;		/* current log write sequence number */
 
 	struct kmem_cache *io_kc;
-	mempool_t *io_pool;
-	struct bio_set *bs;
-	struct bio_set *flush_bs;
+	mempool_t io_pool;
+	struct bio_set bs;
+	struct bio_set flush_bs;
 
 	/* used only for recovery */
 	int recovered_entries;
@@ -244,7 +244,7 @@ static struct ppl_io_unit *ppl_new_iounit(struct ppl_log *log,
 	struct ppl_header *pplhdr;
 	struct page *header_page;
 
-	io = mempool_alloc(ppl_conf->io_pool, GFP_NOWAIT);
+	io = mempool_alloc(&ppl_conf->io_pool, GFP_NOWAIT);
 	if (!io)
 		return NULL;
 
@@ -503,7 +503,7 @@ static void ppl_submit_iounit(struct ppl_io_unit *io)
 			struct bio *prev = bio;
 
 			bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES,
-					       ppl_conf->bs);
+					       &ppl_conf->bs);
 			bio->bi_opf = prev->bi_opf;
 			bio_copy_dev(bio, prev);
 			bio->bi_iter.bi_sector = bio_end_sector(prev);
@@ -570,7 +570,7 @@ static void ppl_io_unit_finished(struct ppl_io_unit *io)
 	list_del(&io->log_sibling);
 	spin_unlock(&log->io_list_lock);
 
-	mempool_free(io, ppl_conf->io_pool);
+	mempool_free(io, &ppl_conf->io_pool);
 
 	spin_lock(&ppl_conf->no_mem_stripes_lock);
 	if (!list_empty(&ppl_conf->no_mem_stripes)) {
@@ -642,7 +642,7 @@ static void ppl_do_flush(struct ppl_io_unit *io)
 			struct bio *bio;
 			char b[BDEVNAME_SIZE];
 
-			bio = bio_alloc_bioset(GFP_NOIO, 0, ppl_conf->flush_bs);
+			bio = bio_alloc_bioset(GFP_NOIO, 0, &ppl_conf->flush_bs);
 			bio_set_dev(bio, bdev);
 			bio->bi_private = io;
 			bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
@@ -1246,11 +1246,9 @@ static void __ppl_exit_log(struct ppl_conf *ppl_conf)
 
 	kfree(ppl_conf->child_logs);
 
-	if (ppl_conf->bs)
-		bioset_free(ppl_conf->bs);
-	if (ppl_conf->flush_bs)
-		bioset_free(ppl_conf->flush_bs);
-	mempool_destroy(ppl_conf->io_pool);
+	bioset_exit(&ppl_conf->bs);
+	bioset_exit(&ppl_conf->flush_bs);
+	mempool_exit(&ppl_conf->io_pool);
 	kmem_cache_destroy(ppl_conf->io_kc);
 
 	kfree(ppl_conf);
@@ -1387,24 +1385,18 @@ int ppl_init_log(struct r5conf *conf)
 		goto err;
 	}
 
-	ppl_conf->io_pool = mempool_create(conf->raid_disks, ppl_io_pool_alloc,
-					   ppl_io_pool_free, ppl_conf->io_kc);
-	if (!ppl_conf->io_pool) {
-		ret = -ENOMEM;
+	ret = mempool_init(&ppl_conf->io_pool, conf->raid_disks, ppl_io_pool_alloc,
+			   ppl_io_pool_free, ppl_conf->io_kc);
+	if (ret)
 		goto err;
-	}
 
-	ppl_conf->bs = bioset_create(conf->raid_disks, 0, BIOSET_NEED_BVECS);
-	if (!ppl_conf->bs) {
-		ret = -ENOMEM;
+	ret = bioset_init(&ppl_conf->bs, conf->raid_disks, 0, BIOSET_NEED_BVECS);
+	if (ret)
 		goto err;
-	}
 
-	ppl_conf->flush_bs = bioset_create(conf->raid_disks, 0, 0);
-	if (!ppl_conf->flush_bs) {
-		ret = -ENOMEM;
+	ret = bioset_init(&ppl_conf->flush_bs, conf->raid_disks, 0, 0);
+	if (ret)
 		goto err;
-	}
 
 	ppl_conf->count = conf->raid_disks;
 	ppl_conf->child_logs = kcalloc(ppl_conf->count, sizeof(struct ppl_log),
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index be117d0..a2e6498 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5192,7 +5192,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
 	/*
 	 * use bio_clone_fast to make a copy of the bio
 	 */
-	align_bi = bio_clone_fast(raid_bio, GFP_NOIO, mddev->bio_set);
+	align_bi = bio_clone_fast(raid_bio, GFP_NOIO, &mddev->bio_set);
 	if (!align_bi)
 		return 0;
 	/*
@@ -5277,7 +5277,7 @@ static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
 
 	if (sectors < bio_sectors(raid_bio)) {
 		struct r5conf *conf = mddev->private;
-		split = bio_split(raid_bio, sectors, GFP_NOIO, conf->bio_split);
+		split = bio_split(raid_bio, sectors, GFP_NOIO, &conf->bio_split);
 		bio_chain(split, raid_bio);
 		generic_make_request(raid_bio);
 		raid_bio = split;
@@ -6773,8 +6773,7 @@ static void free_conf(struct r5conf *conf)
 		if (conf->disks[i].extra_page)
 			put_page(conf->disks[i].extra_page);
 	kfree(conf->disks);
-	if (conf->bio_split)
-		bioset_free(conf->bio_split);
+	bioset_exit(&conf->bio_split);
 	kfree(conf->stripe_hashtbl);
 	kfree(conf->pending_data);
 	kfree(conf);
@@ -6853,6 +6852,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	int i;
 	int group_cnt, worker_cnt_per_group;
 	struct r5worker_group *new_group;
+	int ret;
 
 	if (mddev->new_level != 5
 	    && mddev->new_level != 4
@@ -6950,8 +6950,8 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 			goto abort;
 	}
 
-	conf->bio_split = bioset_create(BIO_POOL_SIZE, 0, 0);
-	if (!conf->bio_split)
+	ret = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0);
+	if (ret)
 		goto abort;
 	conf->mddev = mddev;
 
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 3f8da26..72e75ba 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -669,7 +669,7 @@ struct r5conf {
 	int			pool_size; /* number of disks in stripeheads in pool */
 	spinlock_t		device_lock;
 	struct disk_info	*disks;
-	struct bio_set		*bio_split;
+	struct bio_set		bio_split;
 
 	/* When taking over an array from a different personality, we store
 	 * the new thread here until we fully activate the array.
diff --git a/drivers/media/pci/saa7164/saa7164-core.c b/drivers/media/pci/saa7164/saa7164-core.c
index fca36a4..d697e1a 100644
--- a/drivers/media/pci/saa7164/saa7164-core.c
+++ b/drivers/media/pci/saa7164/saa7164-core.c
@@ -1122,23 +1122,11 @@ static int saa7164_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int saa7164_proc_open(struct inode *inode, struct file *filp)
-{
-	return single_open(filp, saa7164_proc_show, NULL);
-}
-
-static const struct file_operations saa7164_proc_fops = {
-	.open		= saa7164_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int saa7164_proc_create(void)
 {
 	struct proc_dir_entry *pe;
 
-	pe = proc_create("saa7164", S_IRUGO, NULL, &saa7164_proc_fops);
+	pe = proc_create_single("saa7164", S_IRUGO, NULL, saa7164_proc_show);
 	if (!pe)
 		return -ENOMEM;
 
diff --git a/drivers/media/pci/zoran/videocodec.c b/drivers/media/pci/zoran/videocodec.c
index 5ff23ef..4427ae7 100644
--- a/drivers/media/pci/zoran/videocodec.c
+++ b/drivers/media/pci/zoran/videocodec.c
@@ -344,19 +344,6 @@ static int proc_videocodecs_show(struct seq_file *m, void *v)
 
 	return 0;
 }
-
-static int proc_videocodecs_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, proc_videocodecs_show, NULL);
-}
-
-static const struct file_operations videocodecs_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= proc_videocodecs_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
 #endif
 
 /* ===================== */
@@ -373,7 +360,8 @@ videocodec_init (void)
 	       VIDEOCODEC_VERSION);
 
 #ifdef CONFIG_PROC_FS
-	videocodec_proc_entry = proc_create("videocodecs", 0, NULL, &videocodecs_proc_fops);
+	videocodec_proc_entry = proc_create_single("videocodecs", 0, NULL,
+			proc_videocodecs_show);
 	if (!videocodec_proc_entry) {
 		dprintk(1, KERN_ERR "videocodec: can't init procfs.\n");
 	}
diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c
index 57b13df..a15181f 100644
--- a/drivers/memstick/core/ms_block.c
+++ b/drivers/memstick/core/ms_block.c
@@ -2094,14 +2094,9 @@ static const struct block_device_operations msb_bdops = {
 static int msb_init_disk(struct memstick_dev *card)
 {
 	struct msb_data *msb = memstick_get_drvdata(card);
-	struct memstick_host *host = card->host;
 	int rc;
-	u64 limit = BLK_BOUNCE_HIGH;
 	unsigned long capacity;
 
-	if (host->dev.dma_mask && *(host->dev.dma_mask))
-		limit = *(host->dev.dma_mask);
-
 	mutex_lock(&msb_disk_lock);
 	msb->disk_id = idr_alloc(&msb_disk_idr, card, 0, 256, GFP_KERNEL);
 	mutex_unlock(&msb_disk_lock);
@@ -2123,7 +2118,6 @@ static int msb_init_disk(struct memstick_dev *card)
 
 	msb->queue->queuedata = card;
 
-	blk_queue_bounce_limit(msb->queue, limit);
 	blk_queue_max_hw_sectors(msb->queue, MS_BLOCK_MAX_PAGES);
 	blk_queue_max_segments(msb->queue, MS_BLOCK_MAX_SEGS);
 	blk_queue_max_segment_size(msb->queue,
diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c
index 8897962..5ee9326 100644
--- a/drivers/memstick/core/mspro_block.c
+++ b/drivers/memstick/core/mspro_block.c
@@ -1170,17 +1170,12 @@ static int mspro_block_init_card(struct memstick_dev *card)
 static int mspro_block_init_disk(struct memstick_dev *card)
 {
 	struct mspro_block_data *msb = memstick_get_drvdata(card);
-	struct memstick_host *host = card->host;
 	struct mspro_devinfo *dev_info = NULL;
 	struct mspro_sys_info *sys_info = NULL;
 	struct mspro_sys_attr *s_attr = NULL;
 	int rc, disk_id;
-	u64 limit = BLK_BOUNCE_HIGH;
 	unsigned long capacity;
 
-	if (host->dev.dma_mask && *(host->dev.dma_mask))
-		limit = *(host->dev.dma_mask);
-
 	for (rc = 0; msb->attr_group.attrs[rc]; ++rc) {
 		s_attr = mspro_from_sysfs_attr(msb->attr_group.attrs[rc]);
 
@@ -1219,7 +1214,6 @@ static int mspro_block_init_disk(struct memstick_dev *card)
 
 	msb->queue->queuedata = card;
 
-	blk_queue_bounce_limit(msb->queue, limit);
 	blk_queue_max_hw_sectors(msb->queue, MSPRO_BLOCK_MAX_PAGES);
 	blk_queue_max_segments(msb->queue, MSPRO_BLOCK_MAX_SEGS);
 	blk_queue_max_segment_size(msb->queue,
diff --git a/drivers/message/fusion/mptbase.c b/drivers/message/fusion/mptbase.c
index 51eb1b0..a746ccd 100644
--- a/drivers/message/fusion/mptbase.c
+++ b/drivers/message/fusion/mptbase.c
@@ -197,9 +197,9 @@ static int	mpt_host_page_access_control(MPT_ADAPTER *ioc, u8 access_control_valu
 static int	mpt_host_page_alloc(MPT_ADAPTER *ioc, pIOCInit_t ioc_init);
 
 #ifdef CONFIG_PROC_FS
-static const struct file_operations mpt_summary_proc_fops;
-static const struct file_operations mpt_version_proc_fops;
-static const struct file_operations mpt_iocinfo_proc_fops;
+static int mpt_summary_proc_show(struct seq_file *m, void *v);
+static int mpt_version_proc_show(struct seq_file *m, void *v);
+static int mpt_iocinfo_proc_show(struct seq_file *m, void *v);
 #endif
 static void	mpt_get_fw_exp_ver(char *buf, MPT_ADAPTER *ioc);
 
@@ -2040,8 +2040,10 @@ mpt_attach(struct pci_dev *pdev, const struct pci_device_id *id)
 	 */
 	dent = proc_mkdir(ioc->name, mpt_proc_root_dir);
 	if (dent) {
-		proc_create_data("info", S_IRUGO, dent, &mpt_iocinfo_proc_fops, ioc);
-		proc_create_data("summary", S_IRUGO, dent, &mpt_summary_proc_fops, ioc);
+		proc_create_single_data("info", S_IRUGO, dent,
+				mpt_iocinfo_proc_show, ioc);
+		proc_create_single_data("summary", S_IRUGO, dent,
+				mpt_summary_proc_show, ioc);
 	}
 #endif
 
@@ -6606,8 +6608,10 @@ procmpt_create(void)
 	if (mpt_proc_root_dir == NULL)
 		return -ENOTDIR;
 
-	proc_create("summary", S_IRUGO, mpt_proc_root_dir, &mpt_summary_proc_fops);
-	proc_create("version", S_IRUGO, mpt_proc_root_dir, &mpt_version_proc_fops);
+	proc_create_single("summary", S_IRUGO, mpt_proc_root_dir,
+			mpt_summary_proc_show);
+	proc_create_single("version", S_IRUGO, mpt_proc_root_dir,
+			mpt_version_proc_show);
 	return 0;
 }
 
@@ -6646,19 +6650,6 @@ static int mpt_summary_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int mpt_summary_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, mpt_summary_proc_show, PDE_DATA(inode));
-}
-
-static const struct file_operations mpt_summary_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= mpt_summary_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int mpt_version_proc_show(struct seq_file *m, void *v)
 {
 	u8	 cb_idx;
@@ -6701,19 +6692,6 @@ static int mpt_version_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int mpt_version_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, mpt_version_proc_show, NULL);
-}
-
-static const struct file_operations mpt_version_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= mpt_version_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int mpt_iocinfo_proc_show(struct seq_file *m, void *v)
 {
 	MPT_ADAPTER	*ioc = m->private;
@@ -6793,19 +6771,6 @@ static int mpt_iocinfo_proc_show(struct seq_file *m, void *v)
 
 	return 0;
 }
-
-static int mpt_iocinfo_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, mpt_iocinfo_proc_show, PDE_DATA(inode));
-}
-
-static const struct file_operations mpt_iocinfo_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= mpt_iocinfo_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
 #endif		/* CONFIG_PROC_FS } */
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
diff --git a/drivers/message/fusion/mptsas.c b/drivers/message/fusion/mptsas.c
index 86503f6..19a5aa7 100644
--- a/drivers/message/fusion/mptsas.c
+++ b/drivers/message/fusion/mptsas.c
@@ -1929,7 +1929,7 @@ static enum blk_eh_timer_return mptsas_eh_timed_out(struct scsi_cmnd *sc)
 	MPT_SCSI_HOST *hd;
 	MPT_ADAPTER   *ioc;
 	VirtDevice    *vdevice;
-	enum blk_eh_timer_return rc = BLK_EH_NOT_HANDLED;
+	enum blk_eh_timer_return rc = BLK_EH_DONE;
 
 	hd = shost_priv(sc->device->host);
 	if (hd == NULL) {
diff --git a/drivers/mfd/mc13xxx-core.c b/drivers/mfd/mc13xxx-core.c
index d7f54e4..c63e331 100644
--- a/drivers/mfd/mc13xxx-core.c
+++ b/drivers/mfd/mc13xxx-core.c
@@ -279,8 +279,21 @@ int mc13xxx_adc_do_conversion(struct mc13xxx *mc13xxx, unsigned int mode,
 	adc0 = MC13XXX_ADC0_ADINC1 | MC13XXX_ADC0_ADINC2;
 	adc1 = MC13XXX_ADC1_ADEN | MC13XXX_ADC1_ADTRIGIGN | MC13XXX_ADC1_ASC;
 
-	if (channel > 7)
+	/*
+	 * Channels mapped through ADIN7:
+	 * 7  - General purpose ADIN7
+	 * 16 - UID
+	 * 17 - Die temperature
+	 */
+	if (channel > 7 && channel < 16) {
 		adc1 |= MC13XXX_ADC1_ADSEL;
+	} else if (channel == 16) {
+		adc0 |= MC13XXX_ADC0_ADIN7SEL_UID;
+		channel = 7;
+	} else if (channel == 17) {
+		adc0 |= MC13XXX_ADC0_ADIN7SEL_DIE;
+		channel = 7;
+	}
 
 	switch (mode) {
 	case MC13XXX_ADC_MODE_TS:
diff --git a/drivers/misc/sgi-gru/gruprocfs.c b/drivers/misc/sgi-gru/gruprocfs.c
index 4f76359..42ea2ec 100644
--- a/drivers/misc/sgi-gru/gruprocfs.c
+++ b/drivers/misc/sgi-gru/gruprocfs.c
@@ -270,16 +270,6 @@ static int options_open(struct inode *inode, struct file *file)
 	return single_open(file, options_show, NULL);
 }
 
-static int cch_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &cch_seq_ops);
-}
-
-static int gru_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &gru_seq_ops);
-}
-
 /* *INDENT-OFF* */
 static const struct file_operations statistics_fops = {
 	.open 		= statistics_open,
@@ -305,73 +295,30 @@ static const struct file_operations options_fops = {
 	.release 	= single_release,
 };
 
-static const struct file_operations cch_fops = {
-	.open 		= cch_open,
-	.read 		= seq_read,
-	.llseek 	= seq_lseek,
-	.release 	= seq_release,
-};
-static const struct file_operations gru_fops = {
-	.open 		= gru_open,
-	.read 		= seq_read,
-	.llseek 	= seq_lseek,
-	.release 	= seq_release,
-};
-
-static struct proc_entry {
-	char *name;
-	umode_t mode;
-	const struct file_operations *fops;
-	struct proc_dir_entry *entry;
-} proc_files[] = {
-	{"statistics", 0644, &statistics_fops},
-	{"mcs_statistics", 0644, &mcs_statistics_fops},
-	{"debug_options", 0644, &options_fops},
-	{"cch_status", 0444, &cch_fops},
-	{"gru_status", 0444, &gru_fops},
-	{NULL}
-};
-/* *INDENT-ON* */
-
 static struct proc_dir_entry *proc_gru __read_mostly;
 
-static int create_proc_file(struct proc_entry *p)
-{
-	p->entry = proc_create(p->name, p->mode, proc_gru, p->fops);
-	if (!p->entry)
-		return -1;
-	return 0;
-}
-
-static void delete_proc_files(void)
-{
-	struct proc_entry *p;
-
-	if (proc_gru) {
-		for (p = proc_files; p->name; p++)
-			if (p->entry)
-				remove_proc_entry(p->name, proc_gru);
-		proc_remove(proc_gru);
-	}
-}
-
 int gru_proc_init(void)
 {
-	struct proc_entry *p;
-
 	proc_gru = proc_mkdir("sgi_uv/gru", NULL);
-
-	for (p = proc_files; p->name; p++)
-		if (create_proc_file(p))
-			goto err;
+	if (!proc_gru)
+		return -1;
+	if (!proc_create("statistics", 0644, proc_gru, &statistics_fops))
+		goto err;
+	if (!proc_create("mcs_statistics", 0644, proc_gru, &mcs_statistics_fops))
+		goto err;
+	if (!proc_create("debug_options", 0644, proc_gru, &options_fops))
+		goto err;
+	if (!proc_create_seq("cch_status", 0444, proc_gru, &cch_seq_ops))
+		goto err;
+	if (!proc_create_seq("gru_status", 0444, proc_gru, &gru_seq_ops))
+		goto err;
 	return 0;
-
 err:
-	delete_proc_files();
+	remove_proc_subtree("sgi_uv/gru", NULL);
 	return -1;
 }
 
 void gru_proc_exit(void)
 {
-	delete_proc_files();
+	remove_proc_subtree("sgi_uv/gru", NULL);
 }
diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c
index 38a7586..d89e178 100644
--- a/drivers/mmc/core/block.c
+++ b/drivers/mmc/core/block.c
@@ -244,7 +244,7 @@ static ssize_t power_ro_lock_store(struct device *dev,
 	mq = &md->queue;
 
 	/* Dispatch locking to the block layer */
-	req = blk_get_request(mq->queue, REQ_OP_DRV_OUT, __GFP_RECLAIM);
+	req = blk_get_request(mq->queue, REQ_OP_DRV_OUT, 0);
 	if (IS_ERR(req)) {
 		count = PTR_ERR(req);
 		goto out_put;
@@ -650,8 +650,7 @@ static int mmc_blk_ioctl_cmd(struct mmc_blk_data *md,
 	 */
 	mq = &md->queue;
 	req = blk_get_request(mq->queue,
-		idata->ic.write_flag ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN,
-		__GFP_RECLAIM);
+		idata->ic.write_flag ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
 	if (IS_ERR(req)) {
 		err = PTR_ERR(req);
 		goto cmd_done;
@@ -721,8 +720,7 @@ static int mmc_blk_ioctl_multi_cmd(struct mmc_blk_data *md,
 	 */
 	mq = &md->queue;
 	req = blk_get_request(mq->queue,
-		idata[0]->ic.write_flag ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN,
-		__GFP_RECLAIM);
+		idata[0]->ic.write_flag ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
 	if (IS_ERR(req)) {
 		err = PTR_ERR(req);
 		goto cmd_err;
@@ -2750,7 +2748,7 @@ static int mmc_dbg_card_status_get(void *data, u64 *val)
 	int ret;
 
 	/* Ask the block layer about the card status */
-	req = blk_get_request(mq->queue, REQ_OP_DRV_IN, __GFP_RECLAIM);
+	req = blk_get_request(mq->queue, REQ_OP_DRV_IN, 0);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 	req_to_mmc_queue_req(req)->drv_op = MMC_DRV_OP_GET_CARD_STATUS;
@@ -2786,7 +2784,7 @@ static int mmc_ext_csd_open(struct inode *inode, struct file *filp)
 		return -ENOMEM;
 
 	/* Ask the block layer for the EXT CSD */
-	req = blk_get_request(mq->queue, REQ_OP_DRV_IN, __GFP_RECLAIM);
+	req = blk_get_request(mq->queue, REQ_OP_DRV_IN, 0);
 	if (IS_ERR(req)) {
 		err = PTR_ERR(req);
 		goto out_free;
diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c
index 56e9a80..648eb67 100644
--- a/drivers/mmc/core/queue.c
+++ b/drivers/mmc/core/queue.c
@@ -111,8 +111,9 @@ static enum blk_eh_timer_return mmc_cqe_timed_out(struct request *req)
 				__mmc_cqe_recovery_notifier(mq);
 			return BLK_EH_RESET_TIMER;
 		}
-		/* No timeout */
-		return BLK_EH_HANDLED;
+		/* No timeout (XXX: huh? comment doesn't make much sense) */
+		blk_mq_complete_request(req);
+		return BLK_EH_DONE;
 	default:
 		/* Timeout is handled by mmc core */
 		return BLK_EH_RESET_TIMER;
diff --git a/drivers/mmc/core/sdio_uart.c b/drivers/mmc/core/sdio_uart.c
index d3c91f4..25e1130 100644
--- a/drivers/mmc/core/sdio_uart.c
+++ b/drivers/mmc/core/sdio_uart.c
@@ -1008,19 +1008,6 @@ static int sdio_uart_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int sdio_uart_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, sdio_uart_proc_show, NULL);
-}
-
-static const struct file_operations sdio_uart_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= sdio_uart_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static const struct tty_port_operations sdio_uart_port_ops = {
 	.dtr_rts = uart_dtr_rts,
 	.carrier_raised = uart_carrier_raised,
@@ -1045,7 +1032,7 @@ static const struct tty_operations sdio_uart_ops = {
 	.tiocmset		= sdio_uart_tiocmset,
 	.install		= sdio_uart_install,
 	.cleanup		= sdio_uart_cleanup,
-	.proc_fops		= &sdio_uart_proc_fops,
+	.proc_show		= sdio_uart_proc_show,
 };
 
 static struct tty_driver *sdio_uart_tty_driver;
diff --git a/drivers/mtd/devices/Kconfig b/drivers/mtd/devices/Kconfig
index 6def544..57b02c4b 100644
--- a/drivers/mtd/devices/Kconfig
+++ b/drivers/mtd/devices/Kconfig
@@ -81,6 +81,7 @@ config MTD_DATAFLASH_OTP
 config MTD_M25P80
 	tristate "Support most SPI Flash chips (AT26DF, M25P, W25X, ...)"
 	depends on SPI_MASTER && MTD_SPI_NOR
+	select SPI_MEM
 	help
 	  This enables access to most modern SPI flash chips, used for
 	  program and data storage.   Series supported include Atmel AT26DF,
diff --git a/drivers/mtd/devices/m25p80.c b/drivers/mtd/devices/m25p80.c
index a4e18f6..e84563d 100644
--- a/drivers/mtd/devices/m25p80.c
+++ b/drivers/mtd/devices/m25p80.c
@@ -24,12 +24,13 @@
 #include <linux/mtd/partitions.h>
 
 #include <linux/spi/spi.h>
+#include <linux/spi/spi-mem.h>
 #include <linux/spi/flash.h>
 #include <linux/mtd/spi-nor.h>
 
 #define	MAX_CMD_SIZE		6
 struct m25p {
-	struct spi_device	*spi;
+	struct spi_mem		*spimem;
 	struct spi_nor		spi_nor;
 	u8			command[MAX_CMD_SIZE];
 };
@@ -37,97 +38,68 @@ struct m25p {
 static int m25p80_read_reg(struct spi_nor *nor, u8 code, u8 *val, int len)
 {
 	struct m25p *flash = nor->priv;
-	struct spi_device *spi = flash->spi;
+	struct spi_mem_op op = SPI_MEM_OP(SPI_MEM_OP_CMD(code, 1),
+					  SPI_MEM_OP_NO_ADDR,
+					  SPI_MEM_OP_NO_DUMMY,
+					  SPI_MEM_OP_DATA_IN(len, val, 1));
 	int ret;
 
-	ret = spi_write_then_read(spi, &code, 1, val, len);
+	ret = spi_mem_exec_op(flash->spimem, &op);
 	if (ret < 0)
-		dev_err(&spi->dev, "error %d reading %x\n", ret, code);
+		dev_err(&flash->spimem->spi->dev, "error %d reading %x\n", ret,
+			code);
 
 	return ret;
 }
 
-static void m25p_addr2cmd(struct spi_nor *nor, unsigned int addr, u8 *cmd)
-{
-	/* opcode is in cmd[0] */
-	cmd[1] = addr >> (nor->addr_width * 8 -  8);
-	cmd[2] = addr >> (nor->addr_width * 8 - 16);
-	cmd[3] = addr >> (nor->addr_width * 8 - 24);
-	cmd[4] = addr >> (nor->addr_width * 8 - 32);
-}
-
-static int m25p_cmdsz(struct spi_nor *nor)
-{
-	return 1 + nor->addr_width;
-}
-
 static int m25p80_write_reg(struct spi_nor *nor, u8 opcode, u8 *buf, int len)
 {
 	struct m25p *flash = nor->priv;
-	struct spi_device *spi = flash->spi;
-
-	flash->command[0] = opcode;
-	if (buf)
-		memcpy(&flash->command[1], buf, len);
+	struct spi_mem_op op = SPI_MEM_OP(SPI_MEM_OP_CMD(opcode, 1),
+					  SPI_MEM_OP_NO_ADDR,
+					  SPI_MEM_OP_NO_DUMMY,
+					  SPI_MEM_OP_DATA_OUT(len, buf, 1));
 
-	return spi_write(spi, flash->command, len + 1);
+	return spi_mem_exec_op(flash->spimem, &op);
 }
 
 static ssize_t m25p80_write(struct spi_nor *nor, loff_t to, size_t len,
 			    const u_char *buf)
 {
 	struct m25p *flash = nor->priv;
-	struct spi_device *spi = flash->spi;
-	unsigned int inst_nbits, addr_nbits, data_nbits, data_idx;
-	struct spi_transfer t[3] = {};
-	struct spi_message m;
-	int cmd_sz = m25p_cmdsz(nor);
-	ssize_t ret;
+	struct spi_mem_op op =
+			SPI_MEM_OP(SPI_MEM_OP_CMD(nor->program_opcode, 1),
+				   SPI_MEM_OP_ADDR(nor->addr_width, to, 1),
+				   SPI_MEM_OP_DUMMY(0, 1),
+				   SPI_MEM_OP_DATA_OUT(len, buf, 1));
+	size_t remaining = len;
+	int ret;
 
 	/* get transfer protocols. */
-	inst_nbits = spi_nor_get_protocol_inst_nbits(nor->write_proto);
-	addr_nbits = spi_nor_get_protocol_addr_nbits(nor->write_proto);
-	data_nbits = spi_nor_get_protocol_data_nbits(nor->write_proto);
-
-	spi_message_init(&m);
+	op.cmd.buswidth = spi_nor_get_protocol_inst_nbits(nor->write_proto);
+	op.addr.buswidth = spi_nor_get_protocol_addr_nbits(nor->write_proto);
+	op.dummy.buswidth = op.addr.buswidth;
+	op.data.buswidth = spi_nor_get_protocol_data_nbits(nor->write_proto);
 
 	if (nor->program_opcode == SPINOR_OP_AAI_WP && nor->sst_write_second)
-		cmd_sz = 1;
-
-	flash->command[0] = nor->program_opcode;
-	m25p_addr2cmd(nor, to, flash->command);
+		op.addr.nbytes = 0;
 
-	t[0].tx_buf = flash->command;
-	t[0].tx_nbits = inst_nbits;
-	t[0].len = cmd_sz;
-	spi_message_add_tail(&t[0], &m);
-
-	/* split the op code and address bytes into two transfers if needed. */
-	data_idx = 1;
-	if (addr_nbits != inst_nbits) {
-		t[0].len = 1;
+	while (remaining) {
+		op.data.nbytes = remaining < UINT_MAX ? remaining : UINT_MAX;
+		ret = spi_mem_adjust_op_size(flash->spimem, &op);
+		if (ret)
+			return ret;
 
-		t[1].tx_buf = &flash->command[1];
-		t[1].tx_nbits = addr_nbits;
-		t[1].len = cmd_sz - 1;
-		spi_message_add_tail(&t[1], &m);
+		ret = spi_mem_exec_op(flash->spimem, &op);
+		if (ret)
+			return ret;
 
-		data_idx = 2;
+		op.addr.val += op.data.nbytes;
+		remaining -= op.data.nbytes;
+		op.data.buf.out += op.data.nbytes;
 	}
 
-	t[data_idx].tx_buf = buf;
-	t[data_idx].tx_nbits = data_nbits;
-	t[data_idx].len = len;
-	spi_message_add_tail(&t[data_idx], &m);
-
-	ret = spi_sync(spi, &m);
-	if (ret)
-		return ret;
-
-	ret = m.actual_length - cmd_sz;
-	if (ret < 0)
-		return -EIO;
-	return ret;
+	return len;
 }
 
 /*
@@ -138,92 +110,39 @@ static ssize_t m25p80_read(struct spi_nor *nor, loff_t from, size_t len,
 			   u_char *buf)
 {
 	struct m25p *flash = nor->priv;
-	struct spi_device *spi = flash->spi;
-	unsigned int inst_nbits, addr_nbits, data_nbits, data_idx;
-	struct spi_transfer t[3];
-	struct spi_message m;
-	unsigned int dummy = nor->read_dummy;
-	ssize_t ret;
-	int cmd_sz;
+	struct spi_mem_op op =
+			SPI_MEM_OP(SPI_MEM_OP_CMD(nor->read_opcode, 1),
+				   SPI_MEM_OP_ADDR(nor->addr_width, from, 1),
+				   SPI_MEM_OP_DUMMY(nor->read_dummy, 1),
+				   SPI_MEM_OP_DATA_IN(len, buf, 1));
+	size_t remaining = len;
+	int ret;
 
 	/* get transfer protocols. */
-	inst_nbits = spi_nor_get_protocol_inst_nbits(nor->read_proto);
-	addr_nbits = spi_nor_get_protocol_addr_nbits(nor->read_proto);
-	data_nbits = spi_nor_get_protocol_data_nbits(nor->read_proto);
+	op.cmd.buswidth = spi_nor_get_protocol_inst_nbits(nor->read_proto);
+	op.addr.buswidth = spi_nor_get_protocol_addr_nbits(nor->read_proto);
+	op.dummy.buswidth = op.addr.buswidth;
+	op.data.buswidth = spi_nor_get_protocol_data_nbits(nor->read_proto);
 
 	/* convert the dummy cycles to the number of bytes */
-	dummy = (dummy * addr_nbits) / 8;
-
-	if (spi_flash_read_supported(spi)) {
-		struct spi_flash_read_message msg;
-
-		memset(&msg, 0, sizeof(msg));
+	op.dummy.nbytes = (nor->read_dummy * op.dummy.buswidth) / 8;
 
-		msg.buf = buf;
-		msg.from = from;
-		msg.len = len;
-		msg.read_opcode = nor->read_opcode;
-		msg.addr_width = nor->addr_width;
-		msg.dummy_bytes = dummy;
-		msg.opcode_nbits = inst_nbits;
-		msg.addr_nbits = addr_nbits;
-		msg.data_nbits = data_nbits;
-
-		ret = spi_flash_read(spi, &msg);
-		if (ret < 0)
+	while (remaining) {
+		op.data.nbytes = remaining < UINT_MAX ? remaining : UINT_MAX;
+		ret = spi_mem_adjust_op_size(flash->spimem, &op);
+		if (ret)
 			return ret;
-		return msg.retlen;
-	}
 
-	spi_message_init(&m);
-	memset(t, 0, (sizeof t));
-
-	flash->command[0] = nor->read_opcode;
-	m25p_addr2cmd(nor, from, flash->command);
-
-	t[0].tx_buf = flash->command;
-	t[0].tx_nbits = inst_nbits;
-	t[0].len = m25p_cmdsz(nor) + dummy;
-	spi_message_add_tail(&t[0], &m);
-
-	/*
-	 * Set all dummy/mode cycle bits to avoid sending some manufacturer
-	 * specific pattern, which might make the memory enter its Continuous
-	 * Read mode by mistake.
-	 * Based on the different mode cycle bit patterns listed and described
-	 * in the JESD216B specification, the 0xff value works for all memories
-	 * and all manufacturers.
-	 */
-	cmd_sz = t[0].len;
-	memset(flash->command + cmd_sz - dummy, 0xff, dummy);
-
-	/* split the op code and address bytes into two transfers if needed. */
-	data_idx = 1;
-	if (addr_nbits != inst_nbits) {
-		t[0].len = 1;
-
-		t[1].tx_buf = &flash->command[1];
-		t[1].tx_nbits = addr_nbits;
-		t[1].len = cmd_sz - 1;
-		spi_message_add_tail(&t[1], &m);
+		ret = spi_mem_exec_op(flash->spimem, &op);
+		if (ret)
+			return ret;
 
-		data_idx = 2;
+		op.addr.val += op.data.nbytes;
+		remaining -= op.data.nbytes;
+		op.data.buf.in += op.data.nbytes;
 	}
 
-	t[data_idx].rx_buf = buf;
-	t[data_idx].rx_nbits = data_nbits;
-	t[data_idx].len = min3(len, spi_max_transfer_size(spi),
-			       spi_max_message_size(spi) - cmd_sz);
-	spi_message_add_tail(&t[data_idx], &m);
-
-	ret = spi_sync(spi, &m);
-	if (ret)
-		return ret;
-
-	ret = m.actual_length - cmd_sz;
-	if (ret < 0)
-		return -EIO;
-	return ret;
+	return len;
 }
 
 /*
@@ -231,8 +150,9 @@ static ssize_t m25p80_read(struct spi_nor *nor, loff_t from, size_t len,
  * matches what the READ command supports, at least until this driver
  * understands FAST_READ (for clocks over 25 MHz).
  */
-static int m25p_probe(struct spi_device *spi)
+static int m25p_probe(struct spi_mem *spimem)
 {
+	struct spi_device *spi = spimem->spi;
 	struct flash_platform_data	*data;
 	struct m25p *flash;
 	struct spi_nor *nor;
@@ -244,9 +164,9 @@ static int m25p_probe(struct spi_device *spi)
 	char *flash_name;
 	int ret;
 
-	data = dev_get_platdata(&spi->dev);
+	data = dev_get_platdata(&spimem->spi->dev);
 
-	flash = devm_kzalloc(&spi->dev, sizeof(*flash), GFP_KERNEL);
+	flash = devm_kzalloc(&spimem->spi->dev, sizeof(*flash), GFP_KERNEL);
 	if (!flash)
 		return -ENOMEM;
 
@@ -258,12 +178,12 @@ static int m25p_probe(struct spi_device *spi)
 	nor->write_reg = m25p80_write_reg;
 	nor->read_reg = m25p80_read_reg;
 
-	nor->dev = &spi->dev;
+	nor->dev = &spimem->spi->dev;
 	spi_nor_set_flash_node(nor, spi->dev.of_node);
 	nor->priv = flash;
 
-	spi_set_drvdata(spi, flash);
-	flash->spi = spi;
+	spi_mem_set_drvdata(spimem, flash);
+	flash->spimem = spimem;
 
 	if (spi->mode & SPI_RX_QUAD) {
 		hwcaps.mask |= SNOR_HWCAPS_READ_1_1_4;
@@ -303,9 +223,9 @@ static int m25p_probe(struct spi_device *spi)
 }
 
 
-static int m25p_remove(struct spi_device *spi)
+static int m25p_remove(struct spi_mem *spimem)
 {
-	struct m25p	*flash = spi_get_drvdata(spi);
+	struct m25p	*flash = spi_mem_get_drvdata(spimem);
 
 	spi_nor_restore(&flash->spi_nor);
 
@@ -313,9 +233,9 @@ static int m25p_remove(struct spi_device *spi)
 	return mtd_device_unregister(&flash->spi_nor.mtd);
 }
 
-static void m25p_shutdown(struct spi_device *spi)
+static void m25p_shutdown(struct spi_mem *spimem)
 {
-	struct m25p *flash = spi_get_drvdata(spi);
+	struct m25p *flash = spi_mem_get_drvdata(spimem);
 
 	spi_nor_restore(&flash->spi_nor);
 }
@@ -386,12 +306,14 @@ static const struct of_device_id m25p_of_table[] = {
 };
 MODULE_DEVICE_TABLE(of, m25p_of_table);
 
-static struct spi_driver m25p80_driver = {
-	.driver = {
-		.name	= "m25p80",
-		.of_match_table = m25p_of_table,
+static struct spi_mem_driver m25p80_driver = {
+	.spidrv = {
+		.driver = {
+			.name	= "m25p80",
+			.of_match_table = m25p_of_table,
+		},
+		.id_table	= m25p_ids,
 	},
-	.id_table	= m25p_ids,
 	.probe	= m25p_probe,
 	.remove	= m25p_remove,
 	.shutdown	= m25p_shutdown,
@@ -402,7 +324,7 @@ static struct spi_driver m25p80_driver = {
 	 */
 };
 
-module_spi_driver(m25p80_driver);
+module_spi_mem_driver(m25p80_driver);
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Mike Lavender");
diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c
index 16ae4ae..29c0bfd 100644
--- a/drivers/mtd/mtd_blkdevs.c
+++ b/drivers/mtd/mtd_blkdevs.c
@@ -82,7 +82,6 @@ static blk_status_t do_blktrans_request(struct mtd_blktrans_ops *tr,
 
 	block = blk_rq_pos(req) << 9 >> tr->blkshift;
 	nsect = blk_rq_cur_bytes(req) >> tr->blkshift;
-	buf = bio_data(req->bio);
 
 	if (req_op(req) == REQ_OP_FLUSH) {
 		if (tr->flush(dev))
@@ -100,9 +99,14 @@ static blk_status_t do_blktrans_request(struct mtd_blktrans_ops *tr,
 			return BLK_STS_IOERR;
 		return BLK_STS_OK;
 	case REQ_OP_READ:
-		for (; nsect > 0; nsect--, block++, buf += tr->blksize)
-			if (tr->readsect(dev, block, buf))
+		buf = kmap(bio_page(req->bio)) + bio_offset(req->bio);
+		for (; nsect > 0; nsect--, block++, buf += tr->blksize) {
+			if (tr->readsect(dev, block, buf)) {
+				kunmap(bio_page(req->bio));
 				return BLK_STS_IOERR;
+			}
+		}
+		kunmap(bio_page(req->bio));
 		rq_flush_dcache_pages(req);
 		return BLK_STS_OK;
 	case REQ_OP_WRITE:
@@ -110,9 +114,14 @@ static blk_status_t do_blktrans_request(struct mtd_blktrans_ops *tr,
 			return BLK_STS_IOERR;
 
 		rq_flush_dcache_pages(req);
-		for (; nsect > 0; nsect--, block++, buf += tr->blksize)
-			if (tr->writesect(dev, block, buf))
+		buf = kmap(bio_page(req->bio)) + bio_offset(req->bio);
+		for (; nsect > 0; nsect--, block++, buf += tr->blksize) {
+			if (tr->writesect(dev, block, buf)) {
+				kunmap(bio_page(req->bio));
 				return BLK_STS_IOERR;
+			}
+		}
+		kunmap(bio_page(req->bio));
 		return BLK_STS_OK;
 	default:
 		return BLK_STS_IOERR;
@@ -418,7 +427,6 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new)
 	new->rq->queuedata = new;
 	blk_queue_logical_block_size(new->rq, tr->blksize);
 
-	blk_queue_bounce_limit(new->rq, BLK_BOUNCE_HIGH);
 	blk_queue_flag_set(QUEUE_FLAG_NONROT, new->rq);
 	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, new->rq);
 
diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c
index 807d17d..64a1fca 100644
--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -1829,18 +1829,6 @@ static int mtd_proc_show(struct seq_file *m, void *v)
 	mutex_unlock(&mtd_table_mutex);
 	return 0;
 }
-
-static int mtd_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, mtd_proc_show, NULL);
-}
-
-static const struct file_operations mtd_proc_ops = {
-	.open		= mtd_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
 #endif /* CONFIG_PROC_FS */
 
 /*====================================================================*/
@@ -1883,7 +1871,7 @@ static int __init init_mtd(void)
 		goto err_bdi;
 	}
 
-	proc_mtd = proc_create("mtd", 0, NULL, &mtd_proc_ops);
+	proc_mtd = proc_create_single("mtd", 0, NULL, mtd_proc_show);
 
 	ret = init_mtdchar();
 	if (ret)
diff --git a/drivers/net/bonding/bond_procfs.c b/drivers/net/bonding/bond_procfs.c
index 01059f1..9f7d83e 100644
--- a/drivers/net/bonding/bond_procfs.c
+++ b/drivers/net/bonding/bond_procfs.c
@@ -10,7 +10,7 @@
 static void *bond_info_seq_start(struct seq_file *seq, loff_t *pos)
 	__acquires(RCU)
 {
-	struct bonding *bond = seq->private;
+	struct bonding *bond = PDE_DATA(file_inode(seq->file));
 	struct list_head *iter;
 	struct slave *slave;
 	loff_t off = 0;
@@ -29,7 +29,7 @@ static void *bond_info_seq_start(struct seq_file *seq, loff_t *pos)
 
 static void *bond_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
-	struct bonding *bond = seq->private;
+	struct bonding *bond = PDE_DATA(file_inode(seq->file));
 	struct list_head *iter;
 	struct slave *slave;
 	bool found = false;
@@ -56,7 +56,7 @@ static void bond_info_seq_stop(struct seq_file *seq, void *v)
 
 static void bond_info_show_master(struct seq_file *seq)
 {
-	struct bonding *bond = seq->private;
+	struct bonding *bond = PDE_DATA(file_inode(seq->file));
 	const struct bond_opt_value *optval;
 	struct slave *curr, *primary;
 	int i;
@@ -167,7 +167,7 @@ static void bond_info_show_master(struct seq_file *seq)
 static void bond_info_show_slave(struct seq_file *seq,
 				 const struct slave *slave)
 {
-	struct bonding *bond = seq->private;
+	struct bonding *bond = PDE_DATA(file_inode(seq->file));
 
 	seq_printf(seq, "\nSlave Interface: %s\n", slave->dev->name);
 	seq_printf(seq, "MII Status: %s\n", bond_slave_link_status(slave->link));
@@ -257,38 +257,14 @@ static const struct seq_operations bond_info_seq_ops = {
 	.show  = bond_info_seq_show,
 };
 
-static int bond_info_open(struct inode *inode, struct file *file)
-{
-	struct seq_file *seq;
-	int res;
-
-	res = seq_open(file, &bond_info_seq_ops);
-	if (!res) {
-		/* recover the pointer buried in proc_dir_entry data */
-		seq = file->private_data;
-		seq->private = PDE_DATA(inode);
-	}
-
-	return res;
-}
-
-static const struct file_operations bond_info_fops = {
-	.owner   = THIS_MODULE,
-	.open    = bond_info_open,
-	.read    = seq_read,
-	.llseek  = seq_lseek,
-	.release = seq_release,
-};
-
 void bond_create_proc_entry(struct bonding *bond)
 {
 	struct net_device *bond_dev = bond->dev;
 	struct bond_net *bn = net_generic(dev_net(bond_dev), bond_net_id);
 
 	if (bn->proc_dir) {
-		bond->proc_entry = proc_create_data(bond_dev->name,
-						    0444, bn->proc_dir,
-						    &bond_info_fops, bond);
+		bond->proc_entry = proc_create_seq_data(bond_dev->name, 0444,
+				bn->proc_dir, &bond_info_seq_ops, bond);
 		if (bond->proc_entry == NULL)
 			netdev_warn(bond_dev, "Cannot create /proc/net/%s/%s\n",
 				    DRV_NAME, bond_dev->name);
diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index a4ebd87..661828e 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -1289,9 +1289,8 @@ static int efx_init_io(struct efx_nic *efx)
 
 	pci_set_master(pci_dev);
 
-	/* Set the PCI DMA mask.  Try all possibilities from our
-	 * genuine mask down to 32 bits, because some architectures
-	 * (e.g. x86_64 with iommu_sac_force set) will allow 40 bit
+	/* Set the PCI DMA mask.  Try all possibilities from our genuine mask
+	 * down to 32 bits, because some architectures will allow 40 bit
 	 * masks event though they reject 46 bit masks.
 	 */
 	while (dma_mask > 0x7fffffffUL) {
diff --git a/drivers/net/ethernet/sfc/falcon/efx.c b/drivers/net/ethernet/sfc/falcon/efx.c
index 3d6c91e..dd5530a 100644
--- a/drivers/net/ethernet/sfc/falcon/efx.c
+++ b/drivers/net/ethernet/sfc/falcon/efx.c
@@ -1242,9 +1242,8 @@ static int ef4_init_io(struct ef4_nic *efx)
 
 	pci_set_master(pci_dev);
 
-	/* Set the PCI DMA mask.  Try all possibilities from our
-	 * genuine mask down to 32 bits, because some architectures
-	 * (e.g. x86_64 with iommu_sac_force set) will allow 40 bit
+	/* Set the PCI DMA mask.  Try all possibilities from our genuine mask
+	 * down to 32 bits, because some architectures will allow 40 bit
 	 * masks event though they reject 46 bit masks.
 	 */
 	while (dma_mask > 0x7fffffffUL) {
diff --git a/drivers/net/hamradio/bpqether.c b/drivers/net/hamradio/bpqether.c
index dfabbae..f347fd9 100644
--- a/drivers/net/hamradio/bpqether.c
+++ b/drivers/net/hamradio/bpqether.c
@@ -449,20 +449,6 @@ static const struct seq_operations bpq_seqops = {
 	.show = bpq_seq_show,
 };
 
-static int bpq_info_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &bpq_seqops);
-}
-
-static const struct file_operations bpq_info_fops = {
-	.owner = THIS_MODULE,
-	.open = bpq_info_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = seq_release,
-};
-
-
 /* ------------------------------------------------------------------------ */
 
 static const struct net_device_ops bpq_netdev_ops = {
@@ -590,7 +576,7 @@ static int bpq_device_event(struct notifier_block *this,
 static int __init bpq_init_driver(void)
 {
 #ifdef CONFIG_PROC_FS
-	if (!proc_create("bpqether", 0444, init_net.proc_net, &bpq_info_fops)) {
+	if (!proc_create_seq("bpqether", 0444, init_net.proc_net, &bpq_seqops)) {
 		printk(KERN_ERR
 			"bpq: cannot create /proc/net/bpqether entry.\n");
 		return -ENOENT;
diff --git a/drivers/net/hamradio/scc.c b/drivers/net/hamradio/scc.c
index 3de2729..6c03932 100644
--- a/drivers/net/hamradio/scc.c
+++ b/drivers/net/hamradio/scc.c
@@ -2084,21 +2084,6 @@ static const struct seq_operations scc_net_seq_ops = {
 	.stop   = scc_net_seq_stop,
 	.show   = scc_net_seq_show,
 };
-
-
-static int scc_net_seq_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &scc_net_seq_ops);
-}
-
-static const struct file_operations scc_net_seq_fops = {
-	.owner	 = THIS_MODULE,
-	.open	 = scc_net_seq_open,
-	.read	 = seq_read,
-	.llseek	 = seq_lseek,
-	.release = seq_release_private,
-};
-
 #endif /* CONFIG_PROC_FS */
 
  
@@ -2122,7 +2107,7 @@ static int __init scc_init_driver (void)
 	}
 	rtnl_unlock();
 
-	proc_create("z8530drv", 0, init_net.proc_net, &scc_net_seq_fops);
+	proc_create_seq("z8530drv", 0, init_net.proc_net, &scc_net_seq_ops);
 
 	return 0;
 }
diff --git a/drivers/net/hamradio/yam.c b/drivers/net/hamradio/yam.c
index 83034eb..16ec7af 100644
--- a/drivers/net/hamradio/yam.c
+++ b/drivers/net/hamradio/yam.c
@@ -841,20 +841,6 @@ static const struct seq_operations yam_seqops = {
 	.stop = yam_seq_stop,
 	.show = yam_seq_show,
 };
-
-static int yam_info_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &yam_seqops);
-}
-
-static const struct file_operations yam_info_fops = {
-	.owner = THIS_MODULE,
-	.open = yam_info_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = seq_release,
-};
-
 #endif
 
 
@@ -1168,7 +1154,7 @@ static int __init yam_init_driver(void)
 	yam_timer.expires = jiffies + HZ / 100;
 	add_timer(&yam_timer);
 
-	proc_create("yam", 0444, init_net.proc_net, &yam_info_fops);
+	proc_create_seq("yam", 0444, init_net.proc_net, &yam_seqops);
 	return 0;
  error:
 	while (--i >= 0) {
diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c
index 7df0733..de51e8f70f 100644
--- a/drivers/net/ppp/pppoe.c
+++ b/drivers/net/ppp/pppoe.c
@@ -1096,21 +1096,6 @@ static const struct seq_operations pppoe_seq_ops = {
 	.stop		= pppoe_seq_stop,
 	.show		= pppoe_seq_show,
 };
-
-static int pppoe_seq_open(struct inode *inode, struct file *file)
-{
-	return seq_open_net(inode, file, &pppoe_seq_ops,
-			sizeof(struct seq_net_private));
-}
-
-static const struct file_operations pppoe_seq_fops = {
-	.owner		= THIS_MODULE,
-	.open		= pppoe_seq_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release_net,
-};
-
 #endif /* CONFIG_PROC_FS */
 
 static const struct proto_ops pppoe_ops = {
@@ -1122,7 +1107,7 @@ static const struct proto_ops pppoe_ops = {
 	.socketpair	= sock_no_socketpair,
 	.accept		= sock_no_accept,
 	.getname	= pppoe_getname,
-	.poll		= datagram_poll,
+	.poll_mask	= datagram_poll_mask,
 	.listen		= sock_no_listen,
 	.shutdown	= sock_no_shutdown,
 	.setsockopt	= sock_no_setsockopt,
@@ -1146,7 +1131,8 @@ static __net_init int pppoe_init_net(struct net *net)
 
 	rwlock_init(&pn->hash_lock);
 
-	pde = proc_create("pppoe", 0444, net->proc_net, &pppoe_seq_fops);
+	pde = proc_create_net("pppoe", 0444, net->proc_net,
+			&pppoe_seq_ops, sizeof(struct seq_net_private));
 #ifdef CONFIG_PROC_FS
 	if (!pde)
 		return -ENOMEM;
diff --git a/drivers/net/ppp/pptp.c b/drivers/net/ppp/pptp.c
index c4267ec..157b67c 100644
--- a/drivers/net/ppp/pptp.c
+++ b/drivers/net/ppp/pptp.c
@@ -624,7 +624,6 @@ static const struct proto_ops pptp_ops = {
 	.socketpair = sock_no_socketpair,
 	.accept     = sock_no_accept,
 	.getname    = pptp_getname,
-	.poll       = sock_no_poll,
 	.listen     = sock_no_listen,
 	.shutdown   = sock_no_shutdown,
 	.setsockopt = sock_no_setsockopt,
diff --git a/drivers/net/wireless/atmel/atmel.c b/drivers/net/wireless/atmel/atmel.c
index d122386..b01dc34 100644
--- a/drivers/net/wireless/atmel/atmel.c
+++ b/drivers/net/wireless/atmel/atmel.c
@@ -1482,18 +1482,6 @@ static int atmel_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int atmel_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, atmel_proc_show, PDE_DATA(inode));
-}
-
-static const struct file_operations atmel_proc_fops = {
-	.open		= atmel_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static const struct net_device_ops atmel_netdev_ops = {
 	.ndo_open 		= atmel_open,
 	.ndo_stop		= atmel_close,
@@ -1614,7 +1602,8 @@ struct net_device *init_atmel_card(unsigned short irq, unsigned long port,
 
 	netif_carrier_off(dev);
 
-	if (!proc_create_data("driver/atmel", 0, NULL, &atmel_proc_fops, priv))
+	if (!proc_create_single_data("driver/atmel", 0, NULL, atmel_proc_show,
+			priv))
 		printk(KERN_WARNING "atmel: unable to create /proc entry.\n");
 
 	printk(KERN_INFO "%s: Atmel at76c50x. Version %d.%d. MAC %pM\n",
diff --git a/drivers/net/wireless/intersil/hostap/hostap_ap.c b/drivers/net/wireless/intersil/hostap/hostap_ap.c
index b4dfe189..d1884b8 100644
--- a/drivers/net/wireless/intersil/hostap/hostap_ap.c
+++ b/drivers/net/wireless/intersil/hostap/hostap_ap.c
@@ -69,7 +69,7 @@ static void prism2_send_mgmt(struct net_device *dev,
 #ifndef PRISM2_NO_PROCFS_DEBUG
 static int ap_debug_proc_show(struct seq_file *m, void *v)
 {
-	struct ap_data *ap = m->private;
+	struct ap_data *ap = PDE_DATA(file_inode(m->file));
 
 	seq_printf(m, "BridgedUnicastFrames=%u\n", ap->bridged_unicast);
 	seq_printf(m, "BridgedMulticastFrames=%u\n", ap->bridged_multicast);
@@ -81,18 +81,6 @@ static int ap_debug_proc_show(struct seq_file *m, void *v)
 	seq_printf(m, "tx_drop_nonassoc=%u\n", ap->tx_drop_nonassoc);
 	return 0;
 }
-
-static int ap_debug_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, ap_debug_proc_show, PDE_DATA(inode));
-}
-
-static const struct file_operations ap_debug_proc_fops = {
-	.open		= ap_debug_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
 #endif /* PRISM2_NO_PROCFS_DEBUG */
 
 
@@ -333,7 +321,7 @@ void hostap_deauth_all_stas(struct net_device *dev, struct ap_data *ap,
 
 static int ap_control_proc_show(struct seq_file *m, void *v)
 {
-	struct ap_data *ap = m->private;
+	struct ap_data *ap = PDE_DATA(file_inode(m->file));
 	char *policy_txt;
 	struct mac_entry *entry;
 
@@ -365,20 +353,20 @@ static int ap_control_proc_show(struct seq_file *m, void *v)
 
 static void *ap_control_proc_start(struct seq_file *m, loff_t *_pos)
 {
-	struct ap_data *ap = m->private;
+	struct ap_data *ap = PDE_DATA(file_inode(m->file));
 	spin_lock_bh(&ap->mac_restrictions.lock);
 	return seq_list_start_head(&ap->mac_restrictions.mac_list, *_pos);
 }
 
 static void *ap_control_proc_next(struct seq_file *m, void *v, loff_t *_pos)
 {
-	struct ap_data *ap = m->private;
+	struct ap_data *ap = PDE_DATA(file_inode(m->file));
 	return seq_list_next(v, &ap->mac_restrictions.mac_list, _pos);
 }
 
 static void ap_control_proc_stop(struct seq_file *m, void *v)
 {
-	struct ap_data *ap = m->private;
+	struct ap_data *ap = PDE_DATA(file_inode(m->file));
 	spin_unlock_bh(&ap->mac_restrictions.lock);
 }
 
@@ -389,24 +377,6 @@ static const struct seq_operations ap_control_proc_seqops = {
 	.show	= ap_control_proc_show,
 };
 
-static int ap_control_proc_open(struct inode *inode, struct file *file)
-{
-	int ret = seq_open(file, &ap_control_proc_seqops);
-	if (ret == 0) {
-		struct seq_file *m = file->private_data;
-		m->private = PDE_DATA(inode);
-	}
-	return ret;
-}
-
-static const struct file_operations ap_control_proc_fops = {
-	.open		= ap_control_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
-
 int ap_control_add_mac(struct mac_restrictions *mac_restrictions, u8 *mac)
 {
 	struct mac_entry *entry;
@@ -585,20 +555,20 @@ static int prism2_ap_proc_show(struct seq_file *m, void *v)
 
 static void *prism2_ap_proc_start(struct seq_file *m, loff_t *_pos)
 {
-	struct ap_data *ap = m->private;
+	struct ap_data *ap = PDE_DATA(file_inode(m->file));
 	spin_lock_bh(&ap->sta_table_lock);
 	return seq_list_start_head(&ap->sta_list, *_pos);
 }
 
 static void *prism2_ap_proc_next(struct seq_file *m, void *v, loff_t *_pos)
 {
-	struct ap_data *ap = m->private;
+	struct ap_data *ap = PDE_DATA(file_inode(m->file));
 	return seq_list_next(v, &ap->sta_list, _pos);
 }
 
 static void prism2_ap_proc_stop(struct seq_file *m, void *v)
 {
-	struct ap_data *ap = m->private;
+	struct ap_data *ap = PDE_DATA(file_inode(m->file));
 	spin_unlock_bh(&ap->sta_table_lock);
 }
 
@@ -608,23 +578,6 @@ static const struct seq_operations prism2_ap_proc_seqops = {
 	.stop	= prism2_ap_proc_stop,
 	.show	= prism2_ap_proc_show,
 };
-
-static int prism2_ap_proc_open(struct inode *inode, struct file *file)
-{
-	int ret = seq_open(file, &prism2_ap_proc_seqops);
-	if (ret == 0) {
-		struct seq_file *m = file->private_data;
-		m->private = PDE_DATA(inode);
-	}
-	return ret;
-}
-
-static const struct file_operations prism2_ap_proc_fops = {
-	.open		= prism2_ap_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
 #endif /* PRISM2_NO_KERNEL_IEEE80211_MGMT */
 
 
@@ -896,12 +849,13 @@ void hostap_init_ap_proc(local_info_t *local)
 		return;
 
 #ifndef PRISM2_NO_PROCFS_DEBUG
-	proc_create_data("ap_debug", 0, ap->proc, &ap_debug_proc_fops, ap);
+	proc_create_single_data("ap_debug", 0, ap->proc, ap_debug_proc_show, ap);
 #endif /* PRISM2_NO_PROCFS_DEBUG */
 
 #ifndef PRISM2_NO_KERNEL_IEEE80211_MGMT
-	proc_create_data("ap_control", 0, ap->proc, &ap_control_proc_fops, ap);
-	proc_create_data("ap", 0, ap->proc, &prism2_ap_proc_fops, ap);
+	proc_create_seq_data("ap_control", 0, ap->proc, &ap_control_proc_seqops,
+			ap);
+	proc_create_seq_data("ap", 0, ap->proc, &prism2_ap_proc_seqops, ap);
 #endif /* PRISM2_NO_KERNEL_IEEE80211_MGMT */
 
 }
@@ -1106,18 +1060,6 @@ static int prism2_sta_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int prism2_sta_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, prism2_sta_proc_show, PDE_DATA(inode));
-}
-
-static const struct file_operations prism2_sta_proc_fops = {
-	.open		= prism2_sta_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static void handle_add_proc_queue(struct work_struct *work)
 {
 	struct ap_data *ap = container_of(work, struct ap_data,
@@ -1138,9 +1080,9 @@ static void handle_add_proc_queue(struct work_struct *work)
 
 		if (sta) {
 			sprintf(name, "%pM", sta->addr);
-			sta->proc = proc_create_data(
+			sta->proc = proc_create_single_data(
 				name, 0, ap->proc,
-				&prism2_sta_proc_fops, sta);
+				prism2_sta_proc_show, sta);
 
 			atomic_dec(&sta->users);
 		}
diff --git a/drivers/net/wireless/intersil/hostap/hostap_hw.c b/drivers/net/wireless/intersil/hostap/hostap_hw.c
index 5c4a17a..2720aa3 100644
--- a/drivers/net/wireless/intersil/hostap/hostap_hw.c
+++ b/drivers/net/wireless/intersil/hostap/hostap_hw.c
@@ -2951,19 +2951,6 @@ static int prism2_registers_proc_show(struct seq_file *m, void *v)
 
 	return 0;
 }
-
-static int prism2_registers_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, prism2_registers_proc_show, PDE_DATA(inode));
-}
-
-static const struct file_operations prism2_registers_proc_fops = {
-	.open		= prism2_registers_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 #endif /* PRISM2_NO_PROCFS_DEBUG */
 
 
@@ -3279,8 +3266,8 @@ static int hostap_hw_ready(struct net_device *dev)
 		}
 		hostap_init_proc(local);
 #ifndef PRISM2_NO_PROCFS_DEBUG
-		proc_create_data("registers", 0, local->proc,
-				 &prism2_registers_proc_fops, local);
+		proc_create_single_data("registers", 0, local->proc,
+				 prism2_registers_proc_show, local);
 #endif /* PRISM2_NO_PROCFS_DEBUG */
 		hostap_init_ap_proc(local);
 		return 0;
diff --git a/drivers/net/wireless/intersil/hostap/hostap_proc.c b/drivers/net/wireless/intersil/hostap/hostap_proc.c
index d234231..5b33cca 100644
--- a/drivers/net/wireless/intersil/hostap/hostap_proc.c
+++ b/drivers/net/wireless/intersil/hostap/hostap_proc.c
@@ -43,18 +43,6 @@ static int prism2_debug_proc_show(struct seq_file *m, void *v)
 
 	return 0;
 }
-
-static int prism2_debug_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, prism2_debug_proc_show, PDE_DATA(inode));
-}
-
-static const struct file_operations prism2_debug_proc_fops = {
-	.open		= prism2_debug_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
 #endif /* PRISM2_NO_PROCFS_DEBUG */
 
 
@@ -95,19 +83,6 @@ static int prism2_stats_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int prism2_stats_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, prism2_stats_proc_show, PDE_DATA(inode));
-}
-
-static const struct file_operations prism2_stats_proc_fops = {
-	.open		= prism2_stats_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
-
 static int prism2_wds_proc_show(struct seq_file *m, void *v)
 {
 	struct list_head *ptr = v;
@@ -122,20 +97,20 @@ static int prism2_wds_proc_show(struct seq_file *m, void *v)
 
 static void *prism2_wds_proc_start(struct seq_file *m, loff_t *_pos)
 {
-	local_info_t *local = m->private;
+	local_info_t *local = PDE_DATA(file_inode(m->file));
 	read_lock_bh(&local->iface_lock);
 	return seq_list_start(&local->hostap_interfaces, *_pos);
 }
 
 static void *prism2_wds_proc_next(struct seq_file *m, void *v, loff_t *_pos)
 {
-	local_info_t *local = m->private;
+	local_info_t *local = PDE_DATA(file_inode(m->file));
 	return seq_list_next(v, &local->hostap_interfaces, _pos);
 }
 
 static void prism2_wds_proc_stop(struct seq_file *m, void *v)
 {
-	local_info_t *local = m->private;
+	local_info_t *local = PDE_DATA(file_inode(m->file));
 	read_unlock_bh(&local->iface_lock);
 }
 
@@ -146,27 +121,9 @@ static const struct seq_operations prism2_wds_proc_seqops = {
 	.show	= prism2_wds_proc_show,
 };
 
-static int prism2_wds_proc_open(struct inode *inode, struct file *file)
-{
-	int ret = seq_open(file, &prism2_wds_proc_seqops);
-	if (ret == 0) {
-		struct seq_file *m = file->private_data;
-		m->private = PDE_DATA(inode);
-	}
-	return ret;
-}
-
-static const struct file_operations prism2_wds_proc_fops = {
-	.open		= prism2_wds_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
-
 static int prism2_bss_list_proc_show(struct seq_file *m, void *v)
 {
-	local_info_t *local = m->private;
+	local_info_t *local = PDE_DATA(file_inode(m->file));
 	struct list_head *ptr = v;
 	struct hostap_bss_info *bss;
 
@@ -193,20 +150,20 @@ static int prism2_bss_list_proc_show(struct seq_file *m, void *v)
 
 static void *prism2_bss_list_proc_start(struct seq_file *m, loff_t *_pos)
 {
-	local_info_t *local = m->private;
+	local_info_t *local = PDE_DATA(file_inode(m->file));
 	spin_lock_bh(&local->lock);
 	return seq_list_start_head(&local->bss_list, *_pos);
 }
 
 static void *prism2_bss_list_proc_next(struct seq_file *m, void *v, loff_t *_pos)
 {
-	local_info_t *local = m->private;
+	local_info_t *local = PDE_DATA(file_inode(m->file));
 	return seq_list_next(v, &local->bss_list, _pos);
 }
 
 static void prism2_bss_list_proc_stop(struct seq_file *m, void *v)
 {
-	local_info_t *local = m->private;
+	local_info_t *local = PDE_DATA(file_inode(m->file));
 	spin_unlock_bh(&local->lock);
 }
 
@@ -217,24 +174,6 @@ static const struct seq_operations prism2_bss_list_proc_seqops = {
 	.show	= prism2_bss_list_proc_show,
 };
 
-static int prism2_bss_list_proc_open(struct inode *inode, struct file *file)
-{
-	int ret = seq_open(file, &prism2_bss_list_proc_seqops);
-	if (ret == 0) {
-		struct seq_file *m = file->private_data;
-		m->private = PDE_DATA(inode);
-	}
-	return ret;
-}
-
-static const struct file_operations prism2_bss_list_proc_fops = {
-	.open		= prism2_bss_list_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
-
 static int prism2_crypt_proc_show(struct seq_file *m, void *v)
 {
 	local_info_t *local = m->private;
@@ -252,19 +191,6 @@ static int prism2_crypt_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int prism2_crypt_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, prism2_crypt_proc_show, PDE_DATA(inode));
-}
-
-static const struct file_operations prism2_crypt_proc_fops = {
-	.open		= prism2_crypt_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
-
 static ssize_t prism2_pda_proc_read(struct file *file, char __user *buf,
 				    size_t count, loff_t *_pos)
 {
@@ -342,7 +268,7 @@ static int prism2_io_debug_proc_read(char *page, char **start, off_t off,
 #ifndef PRISM2_NO_STATION_MODES
 static int prism2_scan_results_proc_show(struct seq_file *m, void *v)
 {
-	local_info_t *local = m->private;
+	local_info_t *local = PDE_DATA(file_inode(m->file));
 	unsigned long entry;
 	int i, len;
 	struct hfa384x_hostscan_result *scanres;
@@ -392,7 +318,7 @@ static int prism2_scan_results_proc_show(struct seq_file *m, void *v)
 
 static void *prism2_scan_results_proc_start(struct seq_file *m, loff_t *_pos)
 {
-	local_info_t *local = m->private;
+	local_info_t *local = PDE_DATA(file_inode(m->file));
 	spin_lock_bh(&local->lock);
 
 	/* We have a header (pos 0) + N results to show (pos 1...N) */
@@ -403,7 +329,7 @@ static void *prism2_scan_results_proc_start(struct seq_file *m, loff_t *_pos)
 
 static void *prism2_scan_results_proc_next(struct seq_file *m, void *v, loff_t *_pos)
 {
-	local_info_t *local = m->private;
+	local_info_t *local = PDE_DATA(file_inode(m->file));
 
 	++*_pos;
 	if (*_pos > local->last_scan_results_count)
@@ -413,7 +339,7 @@ static void *prism2_scan_results_proc_next(struct seq_file *m, void *v, loff_t *
 
 static void prism2_scan_results_proc_stop(struct seq_file *m, void *v)
 {
-	local_info_t *local = m->private;
+	local_info_t *local = PDE_DATA(file_inode(m->file));
 	spin_unlock_bh(&local->lock);
 }
 
@@ -423,25 +349,6 @@ static const struct seq_operations prism2_scan_results_proc_seqops = {
 	.stop	= prism2_scan_results_proc_stop,
 	.show	= prism2_scan_results_proc_show,
 };
-
-static int prism2_scan_results_proc_open(struct inode *inode, struct file *file)
-{
-	int ret = seq_open(file, &prism2_scan_results_proc_seqops);
-	if (ret == 0) {
-		struct seq_file *m = file->private_data;
-		m->private = PDE_DATA(inode);
-	}
-	return ret;
-}
-
-static const struct file_operations prism2_scan_results_proc_fops = {
-	.open		= prism2_scan_results_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
-
 #endif /* PRISM2_NO_STATION_MODES */
 
 
@@ -463,29 +370,29 @@ void hostap_init_proc(local_info_t *local)
 	}
 
 #ifndef PRISM2_NO_PROCFS_DEBUG
-	proc_create_data("debug", 0, local->proc,
-			 &prism2_debug_proc_fops, local);
+	proc_create_single_data("debug", 0, local->proc,
+			prism2_debug_proc_show, local);
 #endif /* PRISM2_NO_PROCFS_DEBUG */
-	proc_create_data("stats", 0, local->proc,
-			 &prism2_stats_proc_fops, local);
-	proc_create_data("wds", 0, local->proc,
-			 &prism2_wds_proc_fops, local);
+	proc_create_single_data("stats", 0, local->proc, prism2_stats_proc_show,
+			local);
+	proc_create_seq_data("wds", 0, local->proc,
+			&prism2_wds_proc_seqops, local);
 	proc_create_data("pda", 0, local->proc,
 			 &prism2_pda_proc_fops, local);
 	proc_create_data("aux_dump", 0, local->proc,
 			 local->func->read_aux_fops ?: &prism2_aux_dump_proc_fops,
 			 local);
-	proc_create_data("bss_list", 0, local->proc,
-			 &prism2_bss_list_proc_fops, local);
-	proc_create_data("crypt", 0, local->proc,
-			 &prism2_crypt_proc_fops, local);
+	proc_create_seq_data("bss_list", 0, local->proc,
+			&prism2_bss_list_proc_seqops, local);
+	proc_create_single_data("crypt", 0, local->proc, prism2_crypt_proc_show,
+		local);
 #ifdef PRISM2_IO_DEBUG
-	proc_create_data("io_debug", 0, local->proc,
-			 &prism2_io_debug_proc_fops, local);
+	proc_create_single_data("io_debug", 0, local->proc,
+			prism2_debug_proc_show, local);
 #endif /* PRISM2_IO_DEBUG */
 #ifndef PRISM2_NO_STATION_MODES
-	proc_create_data("scan_results", 0, local->proc,
-			 &prism2_scan_results_proc_fops, local);
+	proc_create_seq_data("scan_results", 0, local->proc,
+			&prism2_scan_results_proc_seqops, local);
 #endif /* PRISM2_NO_STATION_MODES */
 }
 
diff --git a/drivers/net/wireless/ray_cs.c b/drivers/net/wireless/ray_cs.c
index 7f9b16b..a7e0a17 100644
--- a/drivers/net/wireless/ray_cs.c
+++ b/drivers/net/wireless/ray_cs.c
@@ -2663,19 +2663,6 @@ static int ray_cs_proc_show(struct seq_file *m, void *v)
 	}
 	return 0;
 }
-
-static int ray_cs_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, ray_cs_proc_show, NULL);
-}
-
-static const struct file_operations ray_cs_proc_fops = {
-	.owner = THIS_MODULE,
-	.open = ray_cs_proc_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
 #endif
 /*===========================================================================*/
 static int build_auth_frame(ray_dev_t *local, UCHAR *dest, int auth_type)
@@ -2814,7 +2801,7 @@ static int __init init_ray_cs(void)
 #ifdef CONFIG_PROC_FS
 	proc_mkdir("driver/ray_cs", NULL);
 
-	proc_create("driver/ray_cs/ray_cs", 0, NULL, &ray_cs_proc_fops);
+	proc_create_single("driver/ray_cs/ray_cs", 0, NULL, ray_cs_proc_show);
 	proc_create("driver/ray_cs/essid", 0200, NULL, &ray_cs_essid_proc_fops);
 	proc_create_data("driver/ray_cs/net_type", 0200, NULL, &int_proc_fops,
 			 &net_type);
diff --git a/drivers/nubus/proc.c b/drivers/nubus/proc.c
index c2e5a7e..88e1f9a 100644
--- a/drivers/nubus/proc.c
+++ b/drivers/nubus/proc.c
@@ -45,18 +45,6 @@ nubus_devices_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int nubus_devices_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, nubus_devices_proc_show, NULL);
-}
-
-static const struct file_operations nubus_devices_proc_fops = {
-	.open		= nubus_devices_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static struct proc_dir_entry *proc_bus_nubus_dir;
 
 /*
@@ -149,18 +137,6 @@ static int nubus_proc_rsrc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int nubus_proc_rsrc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, nubus_proc_rsrc_show, inode);
-}
-
-static const struct file_operations nubus_proc_rsrc_fops = {
-	.open		= nubus_proc_rsrc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 void nubus_proc_add_rsrc_mem(struct proc_dir_entry *procdir,
 			     const struct nubus_dirent *ent,
 			     unsigned int size)
@@ -176,8 +152,8 @@ void nubus_proc_add_rsrc_mem(struct proc_dir_entry *procdir,
 		pde_data = nubus_proc_alloc_pde_data(nubus_dirptr(ent), size);
 	else
 		pde_data = NULL;
-	proc_create_data(name, S_IFREG | 0444, procdir,
-			 &nubus_proc_rsrc_fops, pde_data);
+	proc_create_single_data(name, S_IFREG | 0444, procdir,
+			nubus_proc_rsrc_show, pde_data);
 }
 
 void nubus_proc_add_rsrc(struct proc_dir_entry *procdir,
@@ -190,32 +166,21 @@ void nubus_proc_add_rsrc(struct proc_dir_entry *procdir,
 		return;
 
 	snprintf(name, sizeof(name), "%x", ent->type);
-	proc_create_data(name, S_IFREG | 0444, procdir,
-			 &nubus_proc_rsrc_fops,
-			 nubus_proc_alloc_pde_data(data, 0));
+	proc_create_single_data(name, S_IFREG | 0444, procdir,
+			nubus_proc_rsrc_show,
+			nubus_proc_alloc_pde_data(data, 0));
 }
 
 /*
  * /proc/nubus stuff
  */
 
-static int nubus_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, nubus_proc_show, NULL);
-}
-
-static const struct file_operations nubus_proc_fops = {
-	.open		= nubus_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 void __init nubus_proc_init(void)
 {
-	proc_create("nubus", 0, NULL, &nubus_proc_fops);
+	proc_create_single("nubus", 0, NULL, nubus_proc_show);
 	proc_bus_nubus_dir = proc_mkdir("bus/nubus", NULL);
 	if (!proc_bus_nubus_dir)
 		return;
-	proc_create("devices", 0, proc_bus_nubus_dir, &nubus_devices_proc_fops);
+	proc_create_single("devices", 0, proc_bus_nubus_dir,
+			nubus_devices_proc_show);
 }
diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c
index 3085227..2e96b34 100644
--- a/drivers/nvdimm/claim.c
+++ b/drivers/nvdimm/claim.c
@@ -276,7 +276,8 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns,
 	if (rw == READ) {
 		if (unlikely(is_bad_pmem(&nsio->bb, sector, sz_align)))
 			return -EIO;
-		return memcpy_mcsafe(buf, nsio->addr + offset, size);
+		if (memcpy_mcsafe(buf, nsio->addr + offset, size) != 0)
+			return -EIO;
 	}
 
 	if (unlikely(is_bad_pmem(&nsio->bb, sector, sz_align))) {
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 9d71492..e023d6a 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -101,15 +101,15 @@ static blk_status_t read_pmem(struct page *page, unsigned int off,
 		void *pmem_addr, unsigned int len)
 {
 	unsigned int chunk;
-	int rc;
+	unsigned long rem;
 	void *mem;
 
 	while (len) {
 		mem = kmap_atomic(page);
 		chunk = min_t(unsigned int, len, PAGE_SIZE);
-		rc = memcpy_mcsafe(mem + off, pmem_addr, chunk);
+		rem = memcpy_mcsafe(mem + off, pmem_addr, chunk);
 		kunmap_atomic(mem);
-		if (rc)
+		if (rem)
 			return BLK_STS_IOERR;
 		len -= chunk;
 		off = 0;
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index b9ca782..c8b3006 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -101,6 +101,15 @@ static void nvme_ns_remove(struct nvme_ns *ns);
 static int nvme_revalidate_disk(struct gendisk *disk);
 static void nvme_put_subsystem(struct nvme_subsystem *subsys);
 
+static void nvme_queue_scan(struct nvme_ctrl *ctrl)
+{
+	/*
+	 * Only new queue scan work when admin and IO queues are both alive
+	 */
+	if (ctrl->state == NVME_CTRL_LIVE)
+		queue_work(nvme_wq, &ctrl->scan_work);
+}
+
 int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
 {
 	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
@@ -244,9 +253,6 @@ EXPORT_SYMBOL_GPL(nvme_complete_rq);
 
 void nvme_cancel_request(struct request *req, void *data, bool reserved)
 {
-	if (!blk_mq_request_started(req))
-		return;
-
 	dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device,
 				"Cancelling I/O %d", req->tag);
 
@@ -351,7 +357,7 @@ static void nvme_free_ns_head(struct kref *ref)
 	nvme_mpath_remove_disk(head);
 	ida_simple_remove(&head->subsys->ns_ida, head->instance);
 	list_del_init(&head->entry);
-	cleanup_srcu_struct(&head->srcu);
+	cleanup_srcu_struct_quiesced(&head->srcu);
 	nvme_put_subsystem(head->subsys);
 	kfree(head);
 }
@@ -1033,6 +1039,21 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
 }
 EXPORT_SYMBOL_GPL(nvme_set_queue_count);
 
+#define NVME_AEN_SUPPORTED \
+	(NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT)
+
+static void nvme_enable_aen(struct nvme_ctrl *ctrl)
+{
+	u32 result;
+	int status;
+
+	status = nvme_set_features(ctrl, NVME_FEAT_ASYNC_EVENT,
+			ctrl->oaes & NVME_AEN_SUPPORTED, NULL, 0, &result);
+	if (status)
+		dev_warn(ctrl->device, "Failed to configure AEN (cfg %x)\n",
+			 ctrl->oaes & NVME_AEN_SUPPORTED);
+}
+
 static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
 {
 	struct nvme_user_io io;
@@ -1351,13 +1372,19 @@ static void nvme_set_chunk_size(struct nvme_ns *ns)
 	blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(chunk_size));
 }
 
-static void nvme_config_discard(struct nvme_ctrl *ctrl,
-		unsigned stream_alignment, struct request_queue *queue)
+static void nvme_config_discard(struct nvme_ns *ns)
 {
+	struct nvme_ctrl *ctrl = ns->ctrl;
+	struct request_queue *queue = ns->queue;
 	u32 size = queue_logical_block_size(queue);
 
-	if (stream_alignment)
-		size *= stream_alignment;
+	if (!(ctrl->oncs & NVME_CTRL_ONCS_DSM)) {
+		blk_queue_flag_clear(QUEUE_FLAG_DISCARD, queue);
+		return;
+	}
+
+	if (ctrl->nr_streams && ns->sws && ns->sgs)
+		size *= ns->sws * ns->sgs;
 
 	BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
 			NVME_DSM_MAX_RANGES);
@@ -1365,9 +1392,12 @@ static void nvme_config_discard(struct nvme_ctrl *ctrl,
 	queue->limits.discard_alignment = 0;
 	queue->limits.discard_granularity = size;
 
+	/* If discard is already enabled, don't reset queue limits */
+	if (blk_queue_flag_test_and_set(QUEUE_FLAG_DISCARD, queue))
+		return;
+
 	blk_queue_max_discard_sectors(queue, UINT_MAX);
 	blk_queue_max_discard_segments(queue, NVME_DSM_MAX_RANGES);
-	blk_queue_flag_set(QUEUE_FLAG_DISCARD, queue);
 
 	if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
 		blk_queue_max_write_zeroes_sectors(queue, UINT_MAX);
@@ -1411,10 +1441,6 @@ static void nvme_update_disk_info(struct gendisk *disk,
 {
 	sector_t capacity = le64_to_cpup(&id->nsze) << (ns->lba_shift - 9);
 	unsigned short bs = 1 << ns->lba_shift;
-	unsigned stream_alignment = 0;
-
-	if (ns->ctrl->nr_streams && ns->sws && ns->sgs)
-		stream_alignment = ns->sws * ns->sgs;
 
 	blk_mq_freeze_queue(disk->queue);
 	blk_integrity_unregister(disk);
@@ -1428,10 +1454,9 @@ static void nvme_update_disk_info(struct gendisk *disk,
 		nvme_init_integrity(disk, ns->ms, ns->pi_type);
 	if (ns->ms && !nvme_ns_has_pi(ns) && !blk_get_integrity(disk))
 		capacity = 0;
-	set_capacity(disk, capacity);
 
-	if (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM)
-		nvme_config_discard(ns->ctrl, stream_alignment, disk->queue);
+	set_capacity(disk, capacity);
+	nvme_config_discard(ns);
 	blk_mq_unfreeze_queue(disk->queue);
 }
 
@@ -1577,7 +1602,7 @@ static int nvme_pr_reserve(struct block_device *bdev, u64 key,
 static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new,
 		enum pr_type type, bool abort)
 {
-	u32 cdw10 = nvme_pr_type(type) << 8 | abort ? 2 : 1;
+	u32 cdw10 = nvme_pr_type(type) << 8 | (abort ? 2 : 1);
 	return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire);
 }
 
@@ -1589,7 +1614,7 @@ static int nvme_pr_clear(struct block_device *bdev, u64 key)
 
 static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
 {
-	u32 cdw10 = nvme_pr_type(type) << 8 | key ? 1 << 3 : 0;
+	u32 cdw10 = nvme_pr_type(type) << 8 | (key ? 1 << 3 : 0);
 	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
 }
 
@@ -2183,7 +2208,8 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
 		 * Verify that the subsystem actually supports multiple
 		 * controllers, else bail out.
 		 */
-		if (nvme_active_ctrls(found) && !(id->cmic & (1 << 1))) {
+		if (!ctrl->opts->discovery_nqn &&
+		    nvme_active_ctrls(found) && !(id->cmic & (1 << 1))) {
 			dev_err(ctrl->device,
 				"ignoring ctrl due to duplicate subnqn (%s).\n",
 				found->subnqn);
@@ -2314,7 +2340,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
 	if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) {
 		ret = nvme_get_effects_log(ctrl);
 		if (ret < 0)
-			return ret;
+			goto out_free;
 	}
 
 	if (!ctrl->identified) {
@@ -2345,6 +2371,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
 
 	ctrl->oacs = le16_to_cpu(id->oacs);
 	ctrl->oncs = le16_to_cpup(&id->oncs);
+	ctrl->oaes = le32_to_cpu(id->oaes);
 	atomic_set(&ctrl->abort_limit, id->acl + 1);
 	ctrl->vwc = id->vwc;
 	ctrl->cntlid = le16_to_cpup(&id->cntlid);
@@ -3170,6 +3197,42 @@ static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl, unsigned nn)
 	nvme_remove_invalid_namespaces(ctrl, nn);
 }
 
+static bool nvme_scan_changed_ns_log(struct nvme_ctrl *ctrl)
+{
+	size_t log_size = NVME_MAX_CHANGED_NAMESPACES * sizeof(__le32);
+	__le32 *log;
+	int error, i;
+	bool ret = false;
+
+	log = kzalloc(log_size, GFP_KERNEL);
+	if (!log)
+		return false;
+
+	error = nvme_get_log(ctrl, NVME_LOG_CHANGED_NS, log, log_size);
+	if (error) {
+		dev_warn(ctrl->device,
+			"reading changed ns log failed: %d\n", error);
+		goto out_free_log;
+	}
+
+	if (log[0] == cpu_to_le32(0xffffffff))
+		goto out_free_log;
+
+	for (i = 0; i < NVME_MAX_CHANGED_NAMESPACES; i++) {
+		u32 nsid = le32_to_cpu(log[i]);
+
+		if (nsid == 0)
+			break;
+		dev_info(ctrl->device, "rescanning namespace %d.\n", nsid);
+		nvme_validate_ns(ctrl, nsid);
+	}
+	ret = true;
+
+out_free_log:
+	kfree(log);
+	return ret;
+}
+
 static void nvme_scan_work(struct work_struct *work)
 {
 	struct nvme_ctrl *ctrl =
@@ -3182,6 +3245,12 @@ static void nvme_scan_work(struct work_struct *work)
 
 	WARN_ON_ONCE(!ctrl->tagset);
 
+	if (test_and_clear_bit(EVENT_NS_CHANGED, &ctrl->events)) {
+		if (nvme_scan_changed_ns_log(ctrl))
+			goto out_sort_namespaces;
+		dev_info(ctrl->device, "rescanning namespaces.\n");
+	}
+
 	if (nvme_identify_ctrl(ctrl, &id))
 		return;
 
@@ -3189,26 +3258,17 @@ static void nvme_scan_work(struct work_struct *work)
 	if (ctrl->vs >= NVME_VS(1, 1, 0) &&
 	    !(ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)) {
 		if (!nvme_scan_ns_list(ctrl, nn))
-			goto done;
+			goto out_free_id;
 	}
 	nvme_scan_ns_sequential(ctrl, nn);
- done:
+out_free_id:
+	kfree(id);
+out_sort_namespaces:
 	down_write(&ctrl->namespaces_rwsem);
 	list_sort(NULL, &ctrl->namespaces, ns_cmp);
 	up_write(&ctrl->namespaces_rwsem);
-	kfree(id);
 }
 
-void nvme_queue_scan(struct nvme_ctrl *ctrl)
-{
-	/*
-	 * Only new queue scan work when admin and IO queues are both alive
-	 */
-	if (ctrl->state == NVME_CTRL_LIVE)
-		queue_work(nvme_wq, &ctrl->scan_work);
-}
-EXPORT_SYMBOL_GPL(nvme_queue_scan);
-
 /*
  * This function iterates the namespace list unlocked to allow recovery from
  * controller failure. It is up to the caller to ensure the namespace list is
@@ -3322,8 +3382,23 @@ static void nvme_fw_act_work(struct work_struct *work)
 	nvme_get_fw_slot_info(ctrl);
 }
 
+static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
+{
+	switch ((result & 0xff00) >> 8) {
+	case NVME_AER_NOTICE_NS_CHANGED:
+		set_bit(EVENT_NS_CHANGED, &ctrl->events);
+		nvme_queue_scan(ctrl);
+		break;
+	case NVME_AER_NOTICE_FW_ACT_STARTING:
+		queue_work(nvme_wq, &ctrl->fw_act_work);
+		break;
+	default:
+		dev_warn(ctrl->device, "async event result %08x\n", result);
+	}
+}
+
 void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
-		union nvme_result *res)
+		volatile union nvme_result *res)
 {
 	u32 result = le32_to_cpu(res->u32);
 
@@ -3331,6 +3406,9 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
 		return;
 
 	switch (result & 0x7) {
+	case NVME_AER_NOTICE:
+		nvme_handle_aen_notice(ctrl, result);
+		break;
 	case NVME_AER_ERROR:
 	case NVME_AER_SMART:
 	case NVME_AER_CSS:
@@ -3340,18 +3418,6 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
 	default:
 		break;
 	}
-
-	switch (result & 0xff07) {
-	case NVME_AER_NOTICE_NS_CHANGED:
-		dev_info(ctrl->device, "rescanning\n");
-		nvme_queue_scan(ctrl);
-		break;
-	case NVME_AER_NOTICE_FW_ACT_STARTING:
-		queue_work(nvme_wq, &ctrl->fw_act_work);
-		break;
-	default:
-		dev_warn(ctrl->device, "async event result %08x\n", result);
-	}
 	queue_work(nvme_wq, &ctrl->async_event_work);
 }
 EXPORT_SYMBOL_GPL(nvme_complete_async_event);
@@ -3374,6 +3440,7 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl)
 
 	if (ctrl->queue_count > 1) {
 		nvme_queue_scan(ctrl);
+		nvme_enable_aen(ctrl);
 		queue_work(nvme_wq, &ctrl->async_event_work);
 		nvme_start_queues(ctrl);
 	}
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 7ae732a..5f5f706 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -57,7 +57,7 @@ static struct nvmf_host *nvmf_host_add(const char *hostnqn)
 		goto out_unlock;
 
 	kref_init(&host->ref);
-	memcpy(host->nqn, hostnqn, NVMF_NQN_SIZE);
+	strlcpy(host->nqn, hostnqn, NVMF_NQN_SIZE);
 
 	list_add_tail(&host->list, &nvmf_hosts);
 out_unlock:
@@ -545,71 +545,54 @@ blk_status_t nvmf_check_if_ready(struct nvme_ctrl *ctrl, struct request *rq,
 		return BLK_STS_OK;
 
 	switch (ctrl->state) {
-	case NVME_CTRL_DELETING:
-		goto reject_io;
-
 	case NVME_CTRL_NEW:
 	case NVME_CTRL_CONNECTING:
+	case NVME_CTRL_DELETING:
+		/*
+		 * This is the case of starting a new or deleting an association
+		 * but connectivity was lost before it was fully created or torn
+		 * down. We need to error the commands used to initialize the
+		 * controller so the reconnect can go into a retry attempt.  The
+		 * commands should all be marked REQ_FAILFAST_DRIVER, which will
+		 * hit the reject path below. Anything else will be queued while
+		 * the state settles.
+		 */
 		if (!is_connected)
-			/*
-			 * This is the case of starting a new
-			 * association but connectivity was lost
-			 * before it was fully created. We need to
-			 * error the commands used to initialize the
-			 * controller so the reconnect can go into a
-			 * retry attempt. The commands should all be
-			 * marked REQ_FAILFAST_DRIVER, which will hit
-			 * the reject path below. Anything else will
-			 * be queued while the state settles.
-			 */
-			goto reject_or_queue_io;
-
-		if ((queue_live &&
-		     !(nvme_req(rq)->flags & NVME_REQ_USERCMD)) ||
-		    (!queue_live && blk_rq_is_passthrough(rq) &&
-		     cmd->common.opcode == nvme_fabrics_command &&
-		     cmd->fabrics.fctype == nvme_fabrics_type_connect))
-			/*
-			 * If queue is live, allow only commands that
-			 * are internally generated pass through. These
-			 * are commands on the admin queue to initialize
-			 * the controller. This will reject any ioctl
-			 * admin cmds received while initializing.
-			 *
-			 * If the queue is not live, allow only a
-			 * connect command. This will reject any ioctl
-			 * admin cmd as well as initialization commands
-			 * if the controller reverted the queue to non-live.
-			 */
+			break;
+
+		/*
+		 * If queue is live, allow only commands that are internally
+		 * generated pass through.  These are commands on the admin
+		 * queue to initialize the controller. This will reject any
+		 * ioctl admin cmds received while initializing.
+		 */
+		if (queue_live && !(nvme_req(rq)->flags & NVME_REQ_USERCMD))
 			return BLK_STS_OK;
 
 		/*
-		 * fall-thru to the reject_or_queue_io clause
+		 * If the queue is not live, allow only a connect command.  This
+		 * will reject any ioctl admin cmd as well as initialization
+		 * commands if the controller reverted the queue to non-live.
 		 */
+		if (!queue_live && blk_rq_is_passthrough(rq) &&
+		     cmd->common.opcode == nvme_fabrics_command &&
+		     cmd->fabrics.fctype == nvme_fabrics_type_connect)
+			return BLK_STS_OK;
 		break;
-
-	/* these cases fall-thru
-	 * case NVME_CTRL_LIVE:
-	 * case NVME_CTRL_RESETTING:
-	 */
 	default:
 		break;
 	}
 
-reject_or_queue_io:
 	/*
-	 * Any other new io is something we're not in a state to send
-	 * to the device. Default action is to busy it and retry it
-	 * after the controller state is recovered. However, anything
-	 * marked for failfast or nvme multipath is immediately failed.
-	 * Note: commands used to initialize the controller will be
-	 *  marked for failfast.
+	 * Any other new io is something we're not in a state to send to the
+	 * device.  Default action is to busy it and retry it after the
+	 * controller state is recovered. However, anything marked for failfast
+	 * or nvme multipath is immediately failed.  Note: commands used to
+	 * initialize the controller will be marked for failfast.
 	 * Note: nvme cli/ioctl commands are marked for failfast.
 	 */
 	if (!blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH))
 		return BLK_STS_RESOURCE;
-
-reject_io:
 	nvme_req(rq)->status = NVME_SC_ABORT_REQ;
 	return BLK_STS_IOERR;
 }
@@ -689,10 +672,6 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
 			opts->discovery_nqn =
 				!(strcmp(opts->subsysnqn,
 					 NVME_DISC_SUBSYS_NAME));
-			if (opts->discovery_nqn) {
-				opts->kato = 0;
-				opts->nr_io_queues = 0;
-			}
 			break;
 		case NVMF_OPT_TRADDR:
 			p = match_strdup(args);
@@ -851,6 +830,11 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
 		}
 	}
 
+	if (opts->discovery_nqn) {
+		opts->kato = 0;
+		opts->nr_io_queues = 0;
+		opts->duplicate_connect = true;
+	}
 	if (ctrl_loss_tmo < 0)
 		opts->max_reconnects = -1;
 	else
@@ -983,16 +967,6 @@ nvmf_create_ctrl(struct device *dev, const char *buf, size_t count)
 		goto out_module_put;
 	}
 
-	if (strcmp(ctrl->subsys->subnqn, opts->subsysnqn)) {
-		dev_warn(ctrl->device,
-			"controller returned incorrect NQN: \"%s\".\n",
-			ctrl->subsys->subnqn);
-		module_put(ops->module);
-		up_read(&nvmf_transports_rwsem);
-		nvme_delete_ctrl_sync(ctrl);
-		return ERR_PTR(-EINVAL);
-	}
-
 	module_put(ops->module);
 	up_read(&nvmf_transports_rwsem);
 	return ctrl;
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index ef46c91..0cf0460 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -139,7 +139,9 @@ static inline bool
 nvmf_ctlr_matches_baseopts(struct nvme_ctrl *ctrl,
 			struct nvmf_ctrl_options *opts)
 {
-	if (strcmp(opts->subsysnqn, ctrl->opts->subsysnqn) ||
+	if (ctrl->state == NVME_CTRL_DELETING ||
+	    ctrl->state == NVME_CTRL_DEAD ||
+	    strcmp(opts->subsysnqn, ctrl->opts->subsysnqn) ||
 	    strcmp(opts->host->nqn, ctrl->opts->host->nqn) ||
 	    memcmp(&opts->host->id, &ctrl->opts->host->id, sizeof(uuid_t)))
 		return false;
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 6cb26bc..0bad658 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -1686,16 +1686,6 @@ done:
 		goto check_error;
 	}
 
-	/*
-	 * Force failures of commands if we're killing the controller
-	 * or have an error on a command used to create an new association
-	 */
-	if (status &&
-	    (blk_queue_dying(rq->q) ||
-	     ctrl->ctrl.state == NVME_CTRL_NEW ||
-	     ctrl->ctrl.state == NVME_CTRL_CONNECTING))
-		status |= cpu_to_le16(NVME_SC_DNR << 1);
-
 	__nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate);
 	nvme_end_request(rq, status, result);
 
@@ -2403,9 +2393,6 @@ nvme_fc_terminate_exchange(struct request *req, void *data, bool reserved)
 	struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl);
 	struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(req);
 
-	if (!blk_mq_request_started(req))
-		return;
-
 	__nvme_fc_abort_op(ctrl, op);
 }
 
@@ -3284,6 +3271,8 @@ nvme_fc_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts)
 	}
 	spin_unlock_irqrestore(&nvme_fc_lock, flags);
 
+	pr_warn("%s: %s - %s combination not found\n",
+		__func__, opts->traddr, opts->host_traddr);
 	return ERR_PTR(-ENOENT);
 }
 
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 17d2f7c..de24fe7 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -22,6 +22,7 @@
 #include <linux/lightnvm.h>
 #include <linux/sed-opal.h>
 #include <linux/fault-inject.h>
+#include <linux/rcupdate.h>
 
 extern unsigned int nvme_io_timeout;
 #define NVME_IO_TIMEOUT	(nvme_io_timeout * HZ)
@@ -180,6 +181,7 @@ struct nvme_ctrl {
 	u16 kas;
 	u8 npss;
 	u8 apsta;
+	u32 oaes;
 	u32 aen_result;
 	unsigned int shutdown_timeout;
 	unsigned int kato;
@@ -192,6 +194,8 @@ struct nvme_ctrl {
 	struct delayed_work ka_work;
 	struct nvme_command ka_cmd;
 	struct work_struct fw_act_work;
+#define EVENT_NS_CHANGED		(1 << 0)
+	unsigned long events;
 
 	/* Power saving configuration */
 	u64 ps_max_latency_us;
@@ -398,14 +402,13 @@ void nvme_stop_ctrl(struct nvme_ctrl *ctrl);
 void nvme_put_ctrl(struct nvme_ctrl *ctrl);
 int nvme_init_identify(struct nvme_ctrl *ctrl);
 
-void nvme_queue_scan(struct nvme_ctrl *ctrl);
 void nvme_remove_namespaces(struct nvme_ctrl *ctrl);
 
 int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
 		bool send);
 
 void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
-		union nvme_result *res);
+		volatile union nvme_result *res);
 
 void nvme_stop_queues(struct nvme_ctrl *ctrl);
 void nvme_start_queues(struct nvme_ctrl *ctrl);
@@ -454,7 +457,7 @@ static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns)
 {
 	struct nvme_ns_head *head = ns->head;
 
-	if (head && ns == srcu_dereference(head->current_path, &head->srcu))
+	if (head && ns == rcu_access_pointer(head->current_path))
 		rcu_assign_pointer(head->current_path, NULL);
 }
 struct nvme_ns *nvme_find_path(struct nvme_ns_head *head);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 17a0190..e526437 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -13,6 +13,7 @@
  */
 
 #include <linux/aer.h>
+#include <linux/async.h>
 #include <linux/blkdev.h>
 #include <linux/blk-mq.h>
 #include <linux/blk-mq-pci.h>
@@ -68,7 +69,6 @@ MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2");
 struct nvme_dev;
 struct nvme_queue;
 
-static void nvme_process_cq(struct nvme_queue *nvmeq);
 static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
 
 /*
@@ -147,9 +147,10 @@ static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
 struct nvme_queue {
 	struct device *q_dmadev;
 	struct nvme_dev *dev;
-	spinlock_t q_lock;
+	spinlock_t sq_lock;
 	struct nvme_command *sq_cmds;
 	struct nvme_command __iomem *sq_cmds_io;
+	spinlock_t cq_lock ____cacheline_aligned_in_smp;
 	volatile struct nvme_completion *cqes;
 	struct blk_mq_tags **tags;
 	dma_addr_t sq_dma_addr;
@@ -159,9 +160,9 @@ struct nvme_queue {
 	s16 cq_vector;
 	u16 sq_tail;
 	u16 cq_head;
+	u16 last_cq_head;
 	u16 qid;
 	u8 cq_phase;
-	u8 cqe_seen;
 	u32 *dbbuf_sq_db;
 	u32 *dbbuf_cq_db;
 	u32 *dbbuf_sq_ei;
@@ -420,28 +421,25 @@ static int nvme_pci_map_queues(struct blk_mq_tag_set *set)
 }
 
 /**
- * __nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
+ * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
  * @nvmeq: The queue to use
  * @cmd: The command to send
- *
- * Safe to use from interrupt context
  */
-static void __nvme_submit_cmd(struct nvme_queue *nvmeq,
-						struct nvme_command *cmd)
+static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
 {
-	u16 tail = nvmeq->sq_tail;
-
+	spin_lock(&nvmeq->sq_lock);
 	if (nvmeq->sq_cmds_io)
-		memcpy_toio(&nvmeq->sq_cmds_io[tail], cmd, sizeof(*cmd));
+		memcpy_toio(&nvmeq->sq_cmds_io[nvmeq->sq_tail], cmd,
+				sizeof(*cmd));
 	else
-		memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
+		memcpy(&nvmeq->sq_cmds[nvmeq->sq_tail], cmd, sizeof(*cmd));
 
-	if (++tail == nvmeq->q_depth)
-		tail = 0;
-	if (nvme_dbbuf_update_and_check_event(tail, nvmeq->dbbuf_sq_db,
-					      nvmeq->dbbuf_sq_ei))
-		writel(tail, nvmeq->q_db);
-	nvmeq->sq_tail = tail;
+	if (++nvmeq->sq_tail == nvmeq->q_depth)
+		nvmeq->sq_tail = 0;
+	if (nvme_dbbuf_update_and_check_event(nvmeq->sq_tail,
+			nvmeq->dbbuf_sq_db, nvmeq->dbbuf_sq_ei))
+		writel(nvmeq->sq_tail, nvmeq->q_db);
+	spin_unlock(&nvmeq->sq_lock);
 }
 
 static void **nvme_pci_iod_list(struct request *req)
@@ -872,6 +870,13 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	struct nvme_command cmnd;
 	blk_status_t ret;
 
+	/*
+	 * We should not need to do this, but we're still using this to
+	 * ensure we can drain requests on a dying queue.
+	 */
+	if (unlikely(nvmeq->cq_vector < 0))
+		return BLK_STS_IOERR;
+
 	ret = nvme_setup_cmd(ns, req, &cmnd);
 	if (ret)
 		return ret;
@@ -887,16 +892,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	}
 
 	blk_mq_start_request(req);
-
-	spin_lock_irq(&nvmeq->q_lock);
-	if (unlikely(nvmeq->cq_vector < 0)) {
-		ret = BLK_STS_IOERR;
-		spin_unlock_irq(&nvmeq->q_lock);
-		goto out_cleanup_iod;
-	}
-	__nvme_submit_cmd(nvmeq, &cmnd);
-	nvme_process_cq(nvmeq);
-	spin_unlock_irq(&nvmeq->q_lock);
+	nvme_submit_cmd(nvmeq, &cmnd);
 	return BLK_STS_OK;
 out_cleanup_iod:
 	nvme_free_iod(dev, req);
@@ -914,10 +910,10 @@ static void nvme_pci_complete_rq(struct request *req)
 }
 
 /* We read the CQE phase first to check if the rest of the entry is valid */
-static inline bool nvme_cqe_valid(struct nvme_queue *nvmeq, u16 head,
-		u16 phase)
+static inline bool nvme_cqe_pending(struct nvme_queue *nvmeq)
 {
-	return (le16_to_cpu(nvmeq->cqes[head].status) & 1) == phase;
+	return (le16_to_cpu(nvmeq->cqes[nvmeq->cq_head].status) & 1) ==
+			nvmeq->cq_phase;
 }
 
 static inline void nvme_ring_cq_doorbell(struct nvme_queue *nvmeq)
@@ -931,9 +927,9 @@ static inline void nvme_ring_cq_doorbell(struct nvme_queue *nvmeq)
 	}
 }
 
-static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
-		struct nvme_completion *cqe)
+static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
 {
+	volatile struct nvme_completion *cqe = &nvmeq->cqes[idx];
 	struct request *req;
 
 	if (unlikely(cqe->command_id >= nvmeq->q_depth)) {
@@ -956,83 +952,87 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
 		return;
 	}
 
-	nvmeq->cqe_seen = 1;
 	req = blk_mq_tag_to_rq(*nvmeq->tags, cqe->command_id);
 	nvme_end_request(req, cqe->status, cqe->result);
 }
 
-static inline bool nvme_read_cqe(struct nvme_queue *nvmeq,
-		struct nvme_completion *cqe)
+static void nvme_complete_cqes(struct nvme_queue *nvmeq, u16 start, u16 end)
 {
-	if (nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) {
-		*cqe = nvmeq->cqes[nvmeq->cq_head];
+	while (start != end) {
+		nvme_handle_cqe(nvmeq, start);
+		if (++start == nvmeq->q_depth)
+			start = 0;
+	}
+}
 
-		if (++nvmeq->cq_head == nvmeq->q_depth) {
-			nvmeq->cq_head = 0;
-			nvmeq->cq_phase = !nvmeq->cq_phase;
-		}
-		return true;
+static inline void nvme_update_cq_head(struct nvme_queue *nvmeq)
+{
+	if (++nvmeq->cq_head == nvmeq->q_depth) {
+		nvmeq->cq_head = 0;
+		nvmeq->cq_phase = !nvmeq->cq_phase;
 	}
-	return false;
 }
 
-static void nvme_process_cq(struct nvme_queue *nvmeq)
+static inline bool nvme_process_cq(struct nvme_queue *nvmeq, u16 *start,
+		u16 *end, int tag)
 {
-	struct nvme_completion cqe;
-	int consumed = 0;
+	bool found = false;
 
-	while (nvme_read_cqe(nvmeq, &cqe)) {
-		nvme_handle_cqe(nvmeq, &cqe);
-		consumed++;
+	*start = nvmeq->cq_head;
+	while (!found && nvme_cqe_pending(nvmeq)) {
+		if (nvmeq->cqes[nvmeq->cq_head].command_id == tag)
+			found = true;
+		nvme_update_cq_head(nvmeq);
 	}
+	*end = nvmeq->cq_head;
 
-	if (consumed)
+	if (*start != *end)
 		nvme_ring_cq_doorbell(nvmeq);
+	return found;
 }
 
 static irqreturn_t nvme_irq(int irq, void *data)
 {
-	irqreturn_t result;
 	struct nvme_queue *nvmeq = data;
-	spin_lock(&nvmeq->q_lock);
-	nvme_process_cq(nvmeq);
-	result = nvmeq->cqe_seen ? IRQ_HANDLED : IRQ_NONE;
-	nvmeq->cqe_seen = 0;
-	spin_unlock(&nvmeq->q_lock);
-	return result;
+	irqreturn_t ret = IRQ_NONE;
+	u16 start, end;
+
+	spin_lock(&nvmeq->cq_lock);
+	if (nvmeq->cq_head != nvmeq->last_cq_head)
+		ret = IRQ_HANDLED;
+	nvme_process_cq(nvmeq, &start, &end, -1);
+	nvmeq->last_cq_head = nvmeq->cq_head;
+	spin_unlock(&nvmeq->cq_lock);
+
+	if (start != end) {
+		nvme_complete_cqes(nvmeq, start, end);
+		return IRQ_HANDLED;
+	}
+
+	return ret;
 }
 
 static irqreturn_t nvme_irq_check(int irq, void *data)
 {
 	struct nvme_queue *nvmeq = data;
-	if (nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase))
+	if (nvme_cqe_pending(nvmeq))
 		return IRQ_WAKE_THREAD;
 	return IRQ_NONE;
 }
 
 static int __nvme_poll(struct nvme_queue *nvmeq, unsigned int tag)
 {
-	struct nvme_completion cqe;
-	int found = 0, consumed = 0;
+	u16 start, end;
+	bool found;
 
-	if (!nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase))
+	if (!nvme_cqe_pending(nvmeq))
 		return 0;
 
-	spin_lock_irq(&nvmeq->q_lock);
-	while (nvme_read_cqe(nvmeq, &cqe)) {
-		nvme_handle_cqe(nvmeq, &cqe);
-		consumed++;
-
-		if (tag == cqe.command_id) {
-			found = 1;
-			break;
-		}
-       }
-
-	if (consumed)
-		nvme_ring_cq_doorbell(nvmeq);
-	spin_unlock_irq(&nvmeq->q_lock);
+	spin_lock_irq(&nvmeq->cq_lock);
+	found = nvme_process_cq(nvmeq, &start, &end, tag);
+	spin_unlock_irq(&nvmeq->cq_lock);
 
+	nvme_complete_cqes(nvmeq, start, end);
 	return found;
 }
 
@@ -1052,10 +1052,7 @@ static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
 	memset(&c, 0, sizeof(c));
 	c.common.opcode = nvme_admin_async_event;
 	c.common.command_id = NVME_AQ_BLK_MQ_DEPTH;
-
-	spin_lock_irq(&nvmeq->q_lock);
-	__nvme_submit_cmd(nvmeq, &c);
-	spin_unlock_irq(&nvmeq->q_lock);
+	nvme_submit_cmd(nvmeq, &c);
 }
 
 static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
@@ -1070,7 +1067,7 @@ static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
 }
 
 static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
-						struct nvme_queue *nvmeq)
+		struct nvme_queue *nvmeq, s16 vector)
 {
 	struct nvme_command c;
 	int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;
@@ -1085,7 +1082,7 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
 	c.create_cq.cqid = cpu_to_le16(qid);
 	c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
 	c.create_cq.cq_flags = cpu_to_le16(flags);
-	c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector);
+	c.create_cq.irq_vector = cpu_to_le16(vector);
 
 	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
 }
@@ -1208,7 +1205,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 		nvme_warn_reset(dev, csts);
 		nvme_dev_disable(dev, false);
 		nvme_reset_ctrl(&dev->ctrl);
-		return BLK_EH_HANDLED;
+		return BLK_EH_DONE;
 	}
 
 	/*
@@ -1218,24 +1215,24 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 		dev_warn(dev->ctrl.device,
 			 "I/O %d QID %d timeout, completion polled\n",
 			 req->tag, nvmeq->qid);
-		return BLK_EH_HANDLED;
+		return BLK_EH_DONE;
 	}
 
 	/*
 	 * Shutdown immediately if controller times out while starting. The
 	 * reset work will see the pci device disabled when it gets the forced
 	 * cancellation error. All outstanding requests are completed on
-	 * shutdown, so we return BLK_EH_HANDLED.
+	 * shutdown, so we return BLK_EH_DONE.
 	 */
 	switch (dev->ctrl.state) {
 	case NVME_CTRL_CONNECTING:
 	case NVME_CTRL_RESETTING:
-		dev_warn(dev->ctrl.device,
+		dev_warn_ratelimited(dev->ctrl.device,
 			 "I/O %d QID %d timeout, disable controller\n",
 			 req->tag, nvmeq->qid);
 		nvme_dev_disable(dev, false);
 		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
-		return BLK_EH_HANDLED;
+		return BLK_EH_DONE;
 	default:
 		break;
 	}
@@ -1252,12 +1249,8 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 		nvme_dev_disable(dev, false);
 		nvme_reset_ctrl(&dev->ctrl);
 
-		/*
-		 * Mark the request as handled, since the inline shutdown
-		 * forces all outstanding requests to complete.
-		 */
 		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
-		return BLK_EH_HANDLED;
+		return BLK_EH_DONE;
 	}
 
 	if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) {
@@ -1321,15 +1314,21 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq)
 {
 	int vector;
 
-	spin_lock_irq(&nvmeq->q_lock);
+	spin_lock_irq(&nvmeq->cq_lock);
 	if (nvmeq->cq_vector == -1) {
-		spin_unlock_irq(&nvmeq->q_lock);
+		spin_unlock_irq(&nvmeq->cq_lock);
 		return 1;
 	}
 	vector = nvmeq->cq_vector;
 	nvmeq->dev->online_queues--;
 	nvmeq->cq_vector = -1;
-	spin_unlock_irq(&nvmeq->q_lock);
+	spin_unlock_irq(&nvmeq->cq_lock);
+
+	/*
+	 * Ensure that nvme_queue_rq() sees it ->cq_vector == -1 without
+	 * having to grab the lock.
+	 */
+	mb();
 
 	if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q)
 		blk_mq_quiesce_queue(nvmeq->dev->ctrl.admin_q);
@@ -1342,15 +1341,18 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq)
 static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
 {
 	struct nvme_queue *nvmeq = &dev->queues[0];
+	u16 start, end;
 
 	if (shutdown)
 		nvme_shutdown_ctrl(&dev->ctrl);
 	else
 		nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap);
 
-	spin_lock_irq(&nvmeq->q_lock);
-	nvme_process_cq(nvmeq);
-	spin_unlock_irq(&nvmeq->q_lock);
+	spin_lock_irq(&nvmeq->cq_lock);
+	nvme_process_cq(nvmeq, &start, &end, -1);
+	spin_unlock_irq(&nvmeq->cq_lock);
+
+	nvme_complete_cqes(nvmeq, start, end);
 }
 
 static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
@@ -1408,7 +1410,8 @@ static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth)
 
 	nvmeq->q_dmadev = dev->dev;
 	nvmeq->dev = dev;
-	spin_lock_init(&nvmeq->q_lock);
+	spin_lock_init(&nvmeq->sq_lock);
+	spin_lock_init(&nvmeq->cq_lock);
 	nvmeq->cq_head = 0;
 	nvmeq->cq_phase = 1;
 	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
@@ -1444,7 +1447,7 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
 {
 	struct nvme_dev *dev = nvmeq->dev;
 
-	spin_lock_irq(&nvmeq->q_lock);
+	spin_lock_irq(&nvmeq->cq_lock);
 	nvmeq->sq_tail = 0;
 	nvmeq->cq_head = 0;
 	nvmeq->cq_phase = 1;
@@ -1452,13 +1455,14 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
 	memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
 	nvme_dbbuf_init(dev, nvmeq, qid);
 	dev->online_queues++;
-	spin_unlock_irq(&nvmeq->q_lock);
+	spin_unlock_irq(&nvmeq->cq_lock);
 }
 
 static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
 {
 	struct nvme_dev *dev = nvmeq->dev;
 	int result;
+	s16 vector;
 
 	if (dev->cmb && use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) {
 		unsigned offset = (qid - 1) * roundup(SQ_SIZE(nvmeq->q_depth),
@@ -1471,15 +1475,21 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
 	 * A queue's vector matches the queue identifier unless the controller
 	 * has only one vector available.
 	 */
-	nvmeq->cq_vector = dev->num_vecs == 1 ? 0 : qid;
-	result = adapter_alloc_cq(dev, qid, nvmeq);
+	vector = dev->num_vecs == 1 ? 0 : qid;
+	result = adapter_alloc_cq(dev, qid, nvmeq, vector);
 	if (result < 0)
-		goto release_vector;
+		goto out;
 
 	result = adapter_alloc_sq(dev, qid, nvmeq);
 	if (result < 0)
 		goto release_cq;
 
+	/*
+	 * Set cq_vector after alloc cq/sq, otherwise nvme_suspend_queue will
+	 * invoke free_irq for it and cause a 'Trying to free already-free IRQ
+	 * xxx' warning if the create CQ/SQ command times out.
+	 */
+	nvmeq->cq_vector = vector;
 	nvme_init_queue(nvmeq, qid);
 	result = queue_request_irq(nvmeq);
 	if (result < 0)
@@ -1487,13 +1497,13 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
 
 	return result;
 
- release_sq:
+release_sq:
+	nvmeq->cq_vector = -1;
 	dev->online_queues--;
 	adapter_delete_sq(dev, qid);
- release_cq:
+release_cq:
 	adapter_delete_cq(dev, qid);
- release_vector:
-	nvmeq->cq_vector = -1;
+out:
 	return result;
 }
 
@@ -1997,19 +2007,22 @@ static void nvme_del_queue_end(struct request *req, blk_status_t error)
 static void nvme_del_cq_end(struct request *req, blk_status_t error)
 {
 	struct nvme_queue *nvmeq = req->end_io_data;
+	u16 start, end;
 
 	if (!error) {
 		unsigned long flags;
 
 		/*
-		 * We might be called with the AQ q_lock held
-		 * and the I/O queue q_lock should always
+		 * We might be called with the AQ cq_lock held
+		 * and the I/O queue cq_lock should always
 		 * nest inside the AQ one.
 		 */
-		spin_lock_irqsave_nested(&nvmeq->q_lock, flags,
+		spin_lock_irqsave_nested(&nvmeq->cq_lock, flags,
 					SINGLE_DEPTH_NESTING);
-		nvme_process_cq(nvmeq);
-		spin_unlock_irqrestore(&nvmeq->q_lock, flags);
+		nvme_process_cq(nvmeq, &start, &end, -1);
+		spin_unlock_irqrestore(&nvmeq->cq_lock, flags);
+
+		nvme_complete_cqes(nvmeq, start, end);
 	}
 
 	nvme_del_queue_end(req, error);
@@ -2497,6 +2510,15 @@ static unsigned long check_vendor_combination_bug(struct pci_dev *pdev)
 	return 0;
 }
 
+static void nvme_async_probe(void *data, async_cookie_t cookie)
+{
+	struct nvme_dev *dev = data;
+
+	nvme_reset_ctrl_sync(&dev->ctrl);
+	flush_work(&dev->ctrl.scan_work);
+	nvme_put_ctrl(&dev->ctrl);
+}
+
 static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 {
 	int node, result = -ENOMEM;
@@ -2541,7 +2563,8 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 
 	dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
 
-	nvme_reset_ctrl(&dev->ctrl);
+	nvme_get_ctrl(&dev->ctrl);
+	async_schedule(nvme_async_probe, dev);
 
 	return 0;
 
@@ -2685,6 +2708,9 @@ static pci_ers_result_t nvme_slot_reset(struct pci_dev *pdev)
 
 static void nvme_error_resume(struct pci_dev *pdev)
 {
+	struct nvme_dev *dev = pci_get_drvdata(pdev);
+
+	flush_work(&dev->ctrl.reset_work);
 	pci_cleanup_aer_uncorrect_error_status(pdev);
 }
 
@@ -2714,6 +2740,8 @@ static const struct pci_device_id nvme_id_table[] = {
 				NVME_QUIRK_MEDIUM_PRIO_SQ },
 	{ PCI_VDEVICE(INTEL, 0x5845),	/* Qemu emulated controller */
 		.driver_data = NVME_QUIRK_IDENTIFY_CNS, },
+	{ PCI_DEVICE(0x1bb1, 0x0100),   /* Seagate Nytro Flash Storage */
+		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
 	{ PCI_DEVICE(0x1c58, 0x0003),	/* HGST adapter */
 		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
 	{ PCI_DEVICE(0x1c58, 0x0023),	/* WDC SN200 adapter */
@@ -2728,6 +2756,8 @@ static const struct pci_device_id nvme_id_table[] = {
 		.driver_data = NVME_QUIRK_LIGHTNVM, },
 	{ PCI_DEVICE(0x1d1d, 0x2807),	/* CNEX WL */
 		.driver_data = NVME_QUIRK_LIGHTNVM, },
+	{ PCI_DEVICE(0x1d1d, 0x2601),	/* CNEX Granby */
+		.driver_data = NVME_QUIRK_LIGHTNVM, },
 	{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) },
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 1eb4438..7b3f084 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -778,7 +778,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
 	if (error) {
 		dev_err(ctrl->ctrl.device,
 			"prop_get NVME_REG_CAP failed\n");
-		goto out_cleanup_queue;
+		goto out_stop_queue;
 	}
 
 	ctrl->ctrl.sqsize =
@@ -786,23 +786,25 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
 
 	error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap);
 	if (error)
-		goto out_cleanup_queue;
+		goto out_stop_queue;
 
 	ctrl->ctrl.max_hw_sectors =
 		(ctrl->max_fr_pages - 1) << (ilog2(SZ_4K) - 9);
 
 	error = nvme_init_identify(&ctrl->ctrl);
 	if (error)
-		goto out_cleanup_queue;
+		goto out_stop_queue;
 
 	error = nvme_rdma_alloc_qe(ctrl->queues[0].device->dev,
 			&ctrl->async_event_sqe, sizeof(struct nvme_command),
 			DMA_TO_DEVICE);
 	if (error)
-		goto out_cleanup_queue;
+		goto out_stop_queue;
 
 	return 0;
 
+out_stop_queue:
+	nvme_rdma_stop_queue(&ctrl->queues[0]);
 out_cleanup_queue:
 	if (new)
 		blk_cleanup_queue(ctrl->ctrl.admin_q);
@@ -1598,7 +1600,7 @@ nvme_rdma_timeout(struct request *rq, bool reserved)
 	/* fail with DNR on cmd timeout */
 	nvme_req(rq)->status = NVME_SC_ABORT_REQ | NVME_SC_DNR;
 
-	return BLK_EH_HANDLED;
+	return BLK_EH_DONE;
 }
 
 static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
diff --git a/drivers/nvme/host/trace.h b/drivers/nvme/host/trace.h
index ea91fcc..01390f0 100644
--- a/drivers/nvme/host/trace.h
+++ b/drivers/nvme/host/trace.h
@@ -148,8 +148,8 @@ TRACE_EVENT(nvme_complete_rq,
 		    __entry->flags = nvme_req(req)->flags;
 		    __entry->status = nvme_req(req)->status;
 	    ),
-	    TP_printk("cmdid=%u, qid=%d, res=%llu, retries=%u, flags=0x%x, status=%u",
-		      __entry->cid, __entry->qid, __entry->result,
+	    TP_printk("qid=%d, cmdid=%u, res=%llu, retries=%u, flags=0x%x, status=%u",
+		      __entry->qid, __entry->cid, __entry->result,
 		      __entry->retries, __entry->flags, __entry->status)
 
 );
diff --git a/drivers/nvme/target/Makefile b/drivers/nvme/target/Makefile
index 4882501..8118c93 100644
--- a/drivers/nvme/target/Makefile
+++ b/drivers/nvme/target/Makefile
@@ -6,8 +6,8 @@ obj-$(CONFIG_NVME_TARGET_RDMA)		+= nvmet-rdma.o
 obj-$(CONFIG_NVME_TARGET_FC)		+= nvmet-fc.o
 obj-$(CONFIG_NVME_TARGET_FCLOOP)	+= nvme-fcloop.o
 
-nvmet-y		+= core.o configfs.o admin-cmd.o io-cmd.o fabrics-cmd.o \
-			discovery.o
+nvmet-y		+= core.o configfs.o admin-cmd.o fabrics-cmd.o \
+			discovery.o io-cmd-file.o io-cmd-bdev.o
 nvme-loop-y	+= loop.o
 nvmet-rdma-y	+= rdma.o
 nvmet-fc-y	+= fc.o
diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 5e0e9fc..ead8fbe 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -32,6 +32,11 @@ u32 nvmet_get_log_page_len(struct nvme_command *cmd)
 	return len;
 }
 
+static void nvmet_execute_get_log_page_noop(struct nvmet_req *req)
+{
+	nvmet_req_complete(req, nvmet_zero_sgl(req, 0, req->data_len));
+}
+
 static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req,
 		struct nvme_smart_log *slog)
 {
@@ -45,6 +50,10 @@ static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req,
 		return NVME_SC_INVALID_NS;
 	}
 
+	/* we don't have the right data for file backed ns */
+	if (!ns->bdev)
+		goto out;
+
 	host_reads = part_stat_read(ns->bdev->bd_part, ios[READ]);
 	data_units_read = part_stat_read(ns->bdev->bd_part, sectors[READ]);
 	host_writes = part_stat_read(ns->bdev->bd_part, ios[WRITE]);
@@ -54,6 +63,7 @@ static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req,
 	put_unaligned_le64(data_units_read, &slog->data_units_read[0]);
 	put_unaligned_le64(host_writes, &slog->host_writes[0]);
 	put_unaligned_le64(data_units_written, &slog->data_units_written[0]);
+out:
 	nvmet_put_namespace(ns);
 
 	return NVME_SC_SUCCESS;
@@ -71,6 +81,9 @@ static u16 nvmet_get_smart_log_all(struct nvmet_req *req,
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) {
+		/* we don't have the right data for file backed ns */
+		if (!ns->bdev)
+			continue;
 		host_reads += part_stat_read(ns->bdev->bd_part, ios[READ]);
 		data_units_read +=
 			part_stat_read(ns->bdev->bd_part, sectors[READ]);
@@ -89,74 +102,50 @@ static u16 nvmet_get_smart_log_all(struct nvmet_req *req,
 	return NVME_SC_SUCCESS;
 }
 
-static u16 nvmet_get_smart_log(struct nvmet_req *req,
-		struct nvme_smart_log *slog)
+static void nvmet_execute_get_log_page_smart(struct nvmet_req *req)
 {
-	u16 status;
+	struct nvme_smart_log *log;
+	u16 status = NVME_SC_INTERNAL;
+
+	if (req->data_len != sizeof(*log))
+		goto out;
+
+	log = kzalloc(sizeof(*log), GFP_KERNEL);
+	if (!log)
+		goto out;
 
-	WARN_ON(req == NULL || slog == NULL);
 	if (req->cmd->get_log_page.nsid == cpu_to_le32(NVME_NSID_ALL))
-		status = nvmet_get_smart_log_all(req, slog);
+		status = nvmet_get_smart_log_all(req, log);
 	else
-		status = nvmet_get_smart_log_nsid(req, slog);
-	return status;
+		status = nvmet_get_smart_log_nsid(req, log);
+	if (status)
+		goto out;
+
+	status = nvmet_copy_to_sgl(req, 0, log, sizeof(*log));
+out:
+	nvmet_req_complete(req, status);
 }
 
-static void nvmet_execute_get_log_page(struct nvmet_req *req)
+static void nvmet_execute_get_log_changed_ns(struct nvmet_req *req)
 {
-	struct nvme_smart_log *smart_log;
-	size_t data_len = nvmet_get_log_page_len(req->cmd);
-	void *buf;
-	u16 status = 0;
+	struct nvmet_ctrl *ctrl = req->sq->ctrl;
+	u16 status = NVME_SC_INTERNAL;
+	size_t len;
 
-	buf = kzalloc(data_len, GFP_KERNEL);
-	if (!buf) {
-		status = NVME_SC_INTERNAL;
+	if (req->data_len != NVME_MAX_CHANGED_NAMESPACES * sizeof(__le32))
 		goto out;
-	}
 
-	switch (req->cmd->get_log_page.lid) {
-	case NVME_LOG_ERROR:
-		/*
-		 * We currently never set the More bit in the status field,
-		 * so all error log entries are invalid and can be zeroed out.
-		 * This is called a minum viable implementation (TM) of this
-		 * mandatory log page.
-		 */
-		break;
-	case NVME_LOG_SMART:
-		/*
-		 * XXX: fill out actual smart log
-		 *
-		 * We might have a hard time coming up with useful values for
-		 * many of the fields, and even when we have useful data
-		 * available (e.g. units or commands read/written) those aren't
-		 * persistent over power loss.
-		 */
-		if (data_len != sizeof(*smart_log)) {
-			status = NVME_SC_INTERNAL;
-			goto err;
-		}
-		smart_log = buf;
-		status = nvmet_get_smart_log(req, smart_log);
-		if (status)
-			goto err;
-		break;
-	case NVME_LOG_FW_SLOT:
-		/*
-		 * We only support a single firmware slot which always is
-		 * active, so we can zero out the whole firmware slot log and
-		 * still claim to fully implement this mandatory log page.
-		 */
-		break;
-	default:
-		BUG();
-	}
-
-	status = nvmet_copy_to_sgl(req, 0, buf, data_len);
-
-err:
-	kfree(buf);
+	mutex_lock(&ctrl->lock);
+	if (ctrl->nr_changed_ns == U32_MAX)
+		len = sizeof(__le32);
+	else
+		len = ctrl->nr_changed_ns * sizeof(__le32);
+	status = nvmet_copy_to_sgl(req, 0, ctrl->changed_ns_list, len);
+	if (!status)
+		status = nvmet_zero_sgl(req, len, req->data_len - len);
+	ctrl->nr_changed_ns = 0;
+	clear_bit(NVME_AEN_CFG_NS_ATTR, &ctrl->aen_masked);
+	mutex_unlock(&ctrl->lock);
 out:
 	nvmet_req_complete(req, status);
 }
@@ -201,7 +190,7 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
 	id->ver = cpu_to_le32(ctrl->subsys->ver);
 
 	/* XXX: figure out what to do about RTD3R/RTD3 */
-	id->oaes = cpu_to_le32(1 << 8);
+	id->oaes = cpu_to_le32(NVMET_AEN_CFG_OPTIONAL);
 	id->ctratt = cpu_to_le32(1 << 0);
 
 	id->oacs = 0;
@@ -447,6 +436,16 @@ static void nvmet_execute_set_features(struct nvmet_req *req)
 		req->sq->ctrl->kato = DIV_ROUND_UP(val32, 1000);
 		nvmet_set_result(req, req->sq->ctrl->kato);
 		break;
+	case NVME_FEAT_ASYNC_EVENT:
+		val32 = le32_to_cpu(req->cmd->common.cdw10[1]);
+		if (val32 & ~NVMET_AEN_CFG_ALL) {
+			status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+			break;
+		}
+
+		WRITE_ONCE(req->sq->ctrl->aen_enabled, val32);
+		nvmet_set_result(req, val32);
+		break;
 	case NVME_FEAT_HOST_ID:
 		status = NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
 		break;
@@ -485,9 +484,10 @@ static void nvmet_execute_get_features(struct nvmet_req *req)
 		break;
 	case NVME_FEAT_WRITE_ATOMIC:
 		break;
+#endif
 	case NVME_FEAT_ASYNC_EVENT:
+		nvmet_set_result(req, READ_ONCE(req->sq->ctrl->aen_enabled));
 		break;
-#endif
 	case NVME_FEAT_VOLATILE_WC:
 		nvmet_set_result(req, 1);
 		break;
@@ -548,8 +548,6 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req)
 	struct nvme_command *cmd = req->cmd;
 	u16 ret;
 
-	req->ns = NULL;
-
 	ret = nvmet_check_ctrl_status(req, cmd);
 	if (unlikely(ret))
 		return ret;
@@ -560,9 +558,28 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req)
 
 		switch (cmd->get_log_page.lid) {
 		case NVME_LOG_ERROR:
+			/*
+			 * We currently never set the More bit in the status
+			 * field, so all error log entries are invalid and can
+			 * be zeroed out.  This is called a minum viable
+			 * implementation (TM) of this mandatory log page.
+			 */
+			req->execute = nvmet_execute_get_log_page_noop;
+			return 0;
 		case NVME_LOG_SMART:
+			req->execute = nvmet_execute_get_log_page_smart;
+			return 0;
 		case NVME_LOG_FW_SLOT:
-			req->execute = nvmet_execute_get_log_page;
+			/*
+			 * We only support a single firmware slot which always
+			 * is active, so we can zero out the whole firmware slot
+			 * log and still claim to fully implement this mandatory
+			 * log page.
+			 */
+			req->execute = nvmet_execute_get_log_page_noop;
+			return 0;
+		case NVME_LOG_CHANGED_NS:
+			req->execute = nvmet_execute_get_log_changed_ns;
 			return 0;
 		}
 		break;
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index e95424f..a03da76 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -57,6 +57,13 @@ u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf, size_t len)
 	return 0;
 }
 
+u16 nvmet_zero_sgl(struct nvmet_req *req, off_t off, size_t len)
+{
+	if (sg_zero_buffer(req->sg, req->sg_cnt, len, off) != len)
+		return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR;
+	return 0;
+}
+
 static unsigned int nvmet_max_nsid(struct nvmet_subsys *subsys)
 {
 	struct nvmet_ns *ns;
@@ -137,6 +144,51 @@ static void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
 	schedule_work(&ctrl->async_event_work);
 }
 
+static bool nvmet_aen_disabled(struct nvmet_ctrl *ctrl, u32 aen)
+{
+	if (!(READ_ONCE(ctrl->aen_enabled) & aen))
+		return true;
+	return test_and_set_bit(aen, &ctrl->aen_masked);
+}
+
+static void nvmet_add_to_changed_ns_log(struct nvmet_ctrl *ctrl, __le32 nsid)
+{
+	u32 i;
+
+	mutex_lock(&ctrl->lock);
+	if (ctrl->nr_changed_ns > NVME_MAX_CHANGED_NAMESPACES)
+		goto out_unlock;
+
+	for (i = 0; i < ctrl->nr_changed_ns; i++) {
+		if (ctrl->changed_ns_list[i] == nsid)
+			goto out_unlock;
+	}
+
+	if (ctrl->nr_changed_ns == NVME_MAX_CHANGED_NAMESPACES) {
+		ctrl->changed_ns_list[0] = cpu_to_le32(0xffffffff);
+		ctrl->nr_changed_ns = U32_MAX;
+		goto out_unlock;
+	}
+
+	ctrl->changed_ns_list[ctrl->nr_changed_ns++] = nsid;
+out_unlock:
+	mutex_unlock(&ctrl->lock);
+}
+
+static void nvmet_ns_changed(struct nvmet_subsys *subsys, u32 nsid)
+{
+	struct nvmet_ctrl *ctrl;
+
+	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
+		nvmet_add_to_changed_ns_log(ctrl, cpu_to_le32(nsid));
+		if (nvmet_aen_disabled(ctrl, NVME_AEN_CFG_NS_ATTR))
+			continue;
+		nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE,
+				NVME_AER_NOTICE_NS_CHANGED,
+				NVME_LOG_CHANGED_NS);
+	}
+}
+
 int nvmet_register_transport(const struct nvmet_fabrics_ops *ops)
 {
 	int ret = 0;
@@ -271,33 +323,31 @@ void nvmet_put_namespace(struct nvmet_ns *ns)
 	percpu_ref_put(&ns->ref);
 }
 
+static void nvmet_ns_dev_disable(struct nvmet_ns *ns)
+{
+	nvmet_bdev_ns_disable(ns);
+	nvmet_file_ns_disable(ns);
+}
+
 int nvmet_ns_enable(struct nvmet_ns *ns)
 {
 	struct nvmet_subsys *subsys = ns->subsys;
-	struct nvmet_ctrl *ctrl;
 	int ret = 0;
 
 	mutex_lock(&subsys->lock);
 	if (ns->enabled)
 		goto out_unlock;
 
-	ns->bdev = blkdev_get_by_path(ns->device_path, FMODE_READ | FMODE_WRITE,
-			NULL);
-	if (IS_ERR(ns->bdev)) {
-		pr_err("failed to open block device %s: (%ld)\n",
-		       ns->device_path, PTR_ERR(ns->bdev));
-		ret = PTR_ERR(ns->bdev);
-		ns->bdev = NULL;
+	ret = nvmet_bdev_ns_enable(ns);
+	if (ret)
+		ret = nvmet_file_ns_enable(ns);
+	if (ret)
 		goto out_unlock;
-	}
-
-	ns->size = i_size_read(ns->bdev->bd_inode);
-	ns->blksize_shift = blksize_bits(bdev_logical_block_size(ns->bdev));
 
 	ret = percpu_ref_init(&ns->ref, nvmet_destroy_namespace,
 				0, GFP_KERNEL);
 	if (ret)
-		goto out_blkdev_put;
+		goto out_dev_put;
 
 	if (ns->nsid > subsys->max_nsid)
 		subsys->max_nsid = ns->nsid;
@@ -320,24 +370,20 @@ int nvmet_ns_enable(struct nvmet_ns *ns)
 		list_add_tail_rcu(&ns->dev_link, &old->dev_link);
 	}
 
-	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
-		nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE, 0, 0);
-
+	nvmet_ns_changed(subsys, ns->nsid);
 	ns->enabled = true;
 	ret = 0;
 out_unlock:
 	mutex_unlock(&subsys->lock);
 	return ret;
-out_blkdev_put:
-	blkdev_put(ns->bdev, FMODE_WRITE|FMODE_READ);
-	ns->bdev = NULL;
+out_dev_put:
+	nvmet_ns_dev_disable(ns);
 	goto out_unlock;
 }
 
 void nvmet_ns_disable(struct nvmet_ns *ns)
 {
 	struct nvmet_subsys *subsys = ns->subsys;
-	struct nvmet_ctrl *ctrl;
 
 	mutex_lock(&subsys->lock);
 	if (!ns->enabled)
@@ -363,11 +409,8 @@ void nvmet_ns_disable(struct nvmet_ns *ns)
 	percpu_ref_exit(&ns->ref);
 
 	mutex_lock(&subsys->lock);
-	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
-		nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE, 0, 0);
-
-	if (ns->bdev)
-		blkdev_put(ns->bdev, FMODE_WRITE|FMODE_READ);
+	nvmet_ns_changed(subsys, ns->nsid);
+	nvmet_ns_dev_disable(ns);
 out_unlock:
 	mutex_unlock(&subsys->lock);
 }
@@ -499,6 +542,25 @@ int nvmet_sq_init(struct nvmet_sq *sq)
 }
 EXPORT_SYMBOL_GPL(nvmet_sq_init);
 
+static u16 nvmet_parse_io_cmd(struct nvmet_req *req)
+{
+	struct nvme_command *cmd = req->cmd;
+	u16 ret;
+
+	ret = nvmet_check_ctrl_status(req, cmd);
+	if (unlikely(ret))
+		return ret;
+
+	req->ns = nvmet_find_namespace(req->sq->ctrl, cmd->rw.nsid);
+	if (unlikely(!req->ns))
+		return NVME_SC_INVALID_NS | NVME_SC_DNR;
+
+	if (req->ns->file)
+		return nvmet_file_parse_io_cmd(req);
+	else
+		return nvmet_bdev_parse_io_cmd(req);
+}
+
 bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
 		struct nvmet_sq *sq, const struct nvmet_fabrics_ops *ops)
 {
@@ -710,15 +772,14 @@ out:
 u16 nvmet_check_ctrl_status(struct nvmet_req *req, struct nvme_command *cmd)
 {
 	if (unlikely(!(req->sq->ctrl->cc & NVME_CC_ENABLE))) {
-		pr_err("got io cmd %d while CC.EN == 0 on qid = %d\n",
+		pr_err("got cmd %d while CC.EN == 0 on qid = %d\n",
 		       cmd->common.opcode, req->sq->qid);
 		return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
 	}
 
 	if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) {
-		pr_err("got io cmd %d while CSTS.RDY == 0 on qid = %d\n",
+		pr_err("got cmd %d while CSTS.RDY == 0 on qid = %d\n",
 		       cmd->common.opcode, req->sq->qid);
-		req->ns = NULL;
 		return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
 	}
 	return 0;
@@ -809,12 +870,18 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
 
 	kref_init(&ctrl->ref);
 	ctrl->subsys = subsys;
+	WRITE_ONCE(ctrl->aen_enabled, NVMET_AEN_CFG_OPTIONAL);
+
+	ctrl->changed_ns_list = kmalloc_array(NVME_MAX_CHANGED_NAMESPACES,
+			sizeof(__le32), GFP_KERNEL);
+	if (!ctrl->changed_ns_list)
+		goto out_free_ctrl;
 
 	ctrl->cqs = kcalloc(subsys->max_qid + 1,
 			sizeof(struct nvmet_cq *),
 			GFP_KERNEL);
 	if (!ctrl->cqs)
-		goto out_free_ctrl;
+		goto out_free_changed_ns_list;
 
 	ctrl->sqs = kcalloc(subsys->max_qid + 1,
 			sizeof(struct nvmet_sq *),
@@ -872,6 +939,8 @@ out_free_sqs:
 	kfree(ctrl->sqs);
 out_free_cqs:
 	kfree(ctrl->cqs);
+out_free_changed_ns_list:
+	kfree(ctrl->changed_ns_list);
 out_free_ctrl:
 	kfree(ctrl);
 out_put_subsystem:
@@ -898,6 +967,7 @@ static void nvmet_ctrl_free(struct kref *ref)
 
 	kfree(ctrl->sqs);
 	kfree(ctrl->cqs);
+	kfree(ctrl->changed_ns_list);
 	kfree(ctrl);
 
 	nvmet_subsys_put(subsys);
diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c
index 231e04e..08656b8 100644
--- a/drivers/nvme/target/discovery.c
+++ b/drivers/nvme/target/discovery.c
@@ -187,8 +187,6 @@ u16 nvmet_parse_discovery_cmd(struct nvmet_req *req)
 {
 	struct nvme_command *cmd = req->cmd;
 
-	req->ns = NULL;
-
 	if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) {
 		pr_err("got cmd %d while not ready\n",
 		       cmd->common.opcode);
diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c
index 19e9e42..d84ae004 100644
--- a/drivers/nvme/target/fabrics-cmd.c
+++ b/drivers/nvme/target/fabrics-cmd.c
@@ -77,8 +77,6 @@ u16 nvmet_parse_fabrics_cmd(struct nvmet_req *req)
 {
 	struct nvme_command *cmd = req->cmd;
 
-	req->ns = NULL;
-
 	switch (cmd->fabrics.fctype) {
 	case nvme_fabrics_type_property_set:
 		req->data_len = 0;
@@ -242,8 +240,6 @@ u16 nvmet_parse_connect_cmd(struct nvmet_req *req)
 {
 	struct nvme_command *cmd = req->cmd;
 
-	req->ns = NULL;
-
 	if (cmd->common.opcode != nvme_fabrics_command) {
 		pr_err("invalid command 0x%x on unconnected queue.\n",
 			cmd->fabrics.opcode);
diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c
index 33ee8d3..408279c 100644
--- a/drivers/nvme/target/fc.c
+++ b/drivers/nvme/target/fc.c
@@ -31,7 +31,7 @@
 /* *************************** Data Structures/Defines ****************** */
 
 
-#define NVMET_LS_CTX_COUNT		4
+#define NVMET_LS_CTX_COUNT		256
 
 /* for this implementation, assume small single frame rqst/rsp */
 #define NVME_FC_MAX_LS_BUFFER_SIZE		2048
diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd-bdev.c
index cd23441..e0b0f7d 100644
--- a/drivers/nvme/target/io-cmd.c
+++ b/drivers/nvme/target/io-cmd-bdev.c
@@ -16,6 +16,34 @@
 #include <linux/module.h>
 #include "nvmet.h"
 
+int nvmet_bdev_ns_enable(struct nvmet_ns *ns)
+{
+	int ret;
+
+	ns->bdev = blkdev_get_by_path(ns->device_path,
+			FMODE_READ | FMODE_WRITE, NULL);
+	if (IS_ERR(ns->bdev)) {
+		ret = PTR_ERR(ns->bdev);
+		if (ret != -ENOTBLK) {
+			pr_err("failed to open block device %s: (%ld)\n",
+					ns->device_path, PTR_ERR(ns->bdev));
+		}
+		ns->bdev = NULL;
+		return ret;
+	}
+	ns->size = i_size_read(ns->bdev->bd_inode);
+	ns->blksize_shift = blksize_bits(bdev_logical_block_size(ns->bdev));
+	return 0;
+}
+
+void nvmet_bdev_ns_disable(struct nvmet_ns *ns)
+{
+	if (ns->bdev) {
+		blkdev_put(ns->bdev, FMODE_WRITE | FMODE_READ);
+		ns->bdev = NULL;
+	}
+}
+
 static void nvmet_bio_done(struct bio *bio)
 {
 	struct nvmet_req *req = bio->bi_private;
@@ -23,20 +51,14 @@ static void nvmet_bio_done(struct bio *bio)
 	nvmet_req_complete(req,
 		bio->bi_status ? NVME_SC_INTERNAL | NVME_SC_DNR : 0);
 
-	if (bio != &req->inline_bio)
+	if (bio != &req->b.inline_bio)
 		bio_put(bio);
 }
 
-static inline u32 nvmet_rw_len(struct nvmet_req *req)
-{
-	return ((u32)le16_to_cpu(req->cmd->rw.length) + 1) <<
-			req->ns->blksize_shift;
-}
-
-static void nvmet_execute_rw(struct nvmet_req *req)
+static void nvmet_bdev_execute_rw(struct nvmet_req *req)
 {
 	int sg_cnt = req->sg_cnt;
-	struct bio *bio = &req->inline_bio;
+	struct bio *bio = &req->b.inline_bio;
 	struct scatterlist *sg;
 	sector_t sector;
 	blk_qc_t cookie;
@@ -89,9 +111,9 @@ static void nvmet_execute_rw(struct nvmet_req *req)
 	blk_poll(bdev_get_queue(req->ns->bdev), cookie);
 }
 
-static void nvmet_execute_flush(struct nvmet_req *req)
+static void nvmet_bdev_execute_flush(struct nvmet_req *req)
 {
-	struct bio *bio = &req->inline_bio;
+	struct bio *bio = &req->b.inline_bio;
 
 	bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec));
 	bio_set_dev(bio, req->ns->bdev);
@@ -102,7 +124,7 @@ static void nvmet_execute_flush(struct nvmet_req *req)
 	submit_bio(bio);
 }
 
-static u16 nvmet_discard_range(struct nvmet_ns *ns,
+static u16 nvmet_bdev_discard_range(struct nvmet_ns *ns,
 		struct nvme_dsm_range *range, struct bio **bio)
 {
 	int ret;
@@ -116,7 +138,7 @@ static u16 nvmet_discard_range(struct nvmet_ns *ns,
 	return 0;
 }
 
-static void nvmet_execute_discard(struct nvmet_req *req)
+static void nvmet_bdev_execute_discard(struct nvmet_req *req)
 {
 	struct nvme_dsm_range range;
 	struct bio *bio = NULL;
@@ -129,7 +151,7 @@ static void nvmet_execute_discard(struct nvmet_req *req)
 		if (status)
 			break;
 
-		status = nvmet_discard_range(req->ns, &range, &bio);
+		status = nvmet_bdev_discard_range(req->ns, &range, &bio);
 		if (status)
 			break;
 	}
@@ -148,11 +170,11 @@ static void nvmet_execute_discard(struct nvmet_req *req)
 	}
 }
 
-static void nvmet_execute_dsm(struct nvmet_req *req)
+static void nvmet_bdev_execute_dsm(struct nvmet_req *req)
 {
 	switch (le32_to_cpu(req->cmd->dsm.attributes)) {
 	case NVME_DSMGMT_AD:
-		nvmet_execute_discard(req);
+		nvmet_bdev_execute_discard(req);
 		return;
 	case NVME_DSMGMT_IDR:
 	case NVME_DSMGMT_IDW:
@@ -163,7 +185,7 @@ static void nvmet_execute_dsm(struct nvmet_req *req)
 	}
 }
 
-static void nvmet_execute_write_zeroes(struct nvmet_req *req)
+static void nvmet_bdev_execute_write_zeroes(struct nvmet_req *req)
 {
 	struct nvme_write_zeroes_cmd *write_zeroes = &req->cmd->write_zeroes;
 	struct bio *bio = NULL;
@@ -189,38 +211,27 @@ static void nvmet_execute_write_zeroes(struct nvmet_req *req)
 	}
 }
 
-u16 nvmet_parse_io_cmd(struct nvmet_req *req)
+u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req)
 {
 	struct nvme_command *cmd = req->cmd;
-	u16 ret;
-
-	ret = nvmet_check_ctrl_status(req, cmd);
-	if (unlikely(ret)) {
-		req->ns = NULL;
-		return ret;
-	}
-
-	req->ns = nvmet_find_namespace(req->sq->ctrl, cmd->rw.nsid);
-	if (unlikely(!req->ns))
-		return NVME_SC_INVALID_NS | NVME_SC_DNR;
 
 	switch (cmd->common.opcode) {
 	case nvme_cmd_read:
 	case nvme_cmd_write:
-		req->execute = nvmet_execute_rw;
+		req->execute = nvmet_bdev_execute_rw;
 		req->data_len = nvmet_rw_len(req);
 		return 0;
 	case nvme_cmd_flush:
-		req->execute = nvmet_execute_flush;
+		req->execute = nvmet_bdev_execute_flush;
 		req->data_len = 0;
 		return 0;
 	case nvme_cmd_dsm:
-		req->execute = nvmet_execute_dsm;
+		req->execute = nvmet_bdev_execute_dsm;
 		req->data_len = (le32_to_cpu(cmd->dsm.nr) + 1) *
 			sizeof(struct nvme_dsm_range);
 		return 0;
 	case nvme_cmd_write_zeroes:
-		req->execute = nvmet_execute_write_zeroes;
+		req->execute = nvmet_bdev_execute_write_zeroes;
 		return 0;
 	default:
 		pr_err("unhandled cmd %d on qid %d\n", cmd->common.opcode,
diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c
new file mode 100644
index 0000000..8c42b3a
--- /dev/null
+++ b/drivers/nvme/target/io-cmd-file.c
@@ -0,0 +1,304 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NVMe Over Fabrics Target File I/O commands implementation.
+ * Copyright (c) 2017-2018 Western Digital Corporation or its
+ * affiliates.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/uio.h>
+#include <linux/falloc.h>
+#include <linux/file.h>
+#include "nvmet.h"
+
+#define NVMET_MAX_MPOOL_BVEC		16
+#define NVMET_MIN_MPOOL_OBJ		16
+
+void nvmet_file_ns_disable(struct nvmet_ns *ns)
+{
+	if (ns->file) {
+		mempool_destroy(ns->bvec_pool);
+		ns->bvec_pool = NULL;
+		kmem_cache_destroy(ns->bvec_cache);
+		ns->bvec_cache = NULL;
+		fput(ns->file);
+		ns->file = NULL;
+	}
+}
+
+int nvmet_file_ns_enable(struct nvmet_ns *ns)
+{
+	int ret;
+	struct kstat stat;
+
+	ns->file = filp_open(ns->device_path,
+			O_RDWR | O_LARGEFILE | O_DIRECT, 0);
+	if (IS_ERR(ns->file)) {
+		pr_err("failed to open file %s: (%ld)\n",
+				ns->device_path, PTR_ERR(ns->file));
+		return PTR_ERR(ns->file);
+	}
+
+	ret = vfs_getattr(&ns->file->f_path,
+			&stat, STATX_SIZE, AT_STATX_FORCE_SYNC);
+	if (ret)
+		goto err;
+
+	ns->size = stat.size;
+	ns->blksize_shift = file_inode(ns->file)->i_blkbits;
+
+	ns->bvec_cache = kmem_cache_create("nvmet-bvec",
+			NVMET_MAX_MPOOL_BVEC * sizeof(struct bio_vec),
+			0, SLAB_HWCACHE_ALIGN, NULL);
+	if (!ns->bvec_cache) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	ns->bvec_pool = mempool_create(NVMET_MIN_MPOOL_OBJ, mempool_alloc_slab,
+			mempool_free_slab, ns->bvec_cache);
+
+	if (!ns->bvec_pool) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	return ret;
+err:
+	ns->size = 0;
+	ns->blksize_shift = 0;
+	nvmet_file_ns_disable(ns);
+	return ret;
+}
+
+static void nvmet_file_init_bvec(struct bio_vec *bv, struct sg_page_iter *iter)
+{
+	bv->bv_page = sg_page_iter_page(iter);
+	bv->bv_offset = iter->sg->offset;
+	bv->bv_len = PAGE_SIZE - iter->sg->offset;
+}
+
+static ssize_t nvmet_file_submit_bvec(struct nvmet_req *req, loff_t pos,
+		unsigned long nr_segs, size_t count)
+{
+	struct kiocb *iocb = &req->f.iocb;
+	ssize_t (*call_iter)(struct kiocb *iocb, struct iov_iter *iter);
+	struct iov_iter iter;
+	int ki_flags = 0, rw;
+	ssize_t ret;
+
+	if (req->cmd->rw.opcode == nvme_cmd_write) {
+		if (req->cmd->rw.control & cpu_to_le16(NVME_RW_FUA))
+			ki_flags = IOCB_DSYNC;
+		call_iter = req->ns->file->f_op->write_iter;
+		rw = WRITE;
+	} else {
+		call_iter = req->ns->file->f_op->read_iter;
+		rw = READ;
+	}
+
+	iov_iter_bvec(&iter, ITER_BVEC | rw, req->f.bvec, nr_segs, count);
+
+	iocb->ki_pos = pos;
+	iocb->ki_filp = req->ns->file;
+	iocb->ki_flags = IOCB_DIRECT | ki_flags;
+
+	ret = call_iter(iocb, &iter);
+
+	if (ret != -EIOCBQUEUED && iocb->ki_complete)
+		iocb->ki_complete(iocb, ret, 0);
+
+	return ret;
+}
+
+static void nvmet_file_io_done(struct kiocb *iocb, long ret, long ret2)
+{
+	struct nvmet_req *req = container_of(iocb, struct nvmet_req, f.iocb);
+
+	if (req->f.bvec != req->inline_bvec) {
+		if (likely(req->f.mpool_alloc == false))
+			kfree(req->f.bvec);
+		else
+			mempool_free(req->f.bvec, req->ns->bvec_pool);
+	}
+
+	nvmet_req_complete(req, ret != req->data_len ?
+			NVME_SC_INTERNAL | NVME_SC_DNR : 0);
+}
+
+static void nvmet_file_execute_rw(struct nvmet_req *req)
+{
+	ssize_t nr_bvec = DIV_ROUND_UP(req->data_len, PAGE_SIZE);
+	struct sg_page_iter sg_pg_iter;
+	unsigned long bv_cnt = 0;
+	bool is_sync = false;
+	size_t len = 0, total_len = 0;
+	ssize_t ret = 0;
+	loff_t pos;
+
+	if (!req->sg_cnt || !nr_bvec) {
+		nvmet_req_complete(req, 0);
+		return;
+	}
+
+	if (nr_bvec > NVMET_MAX_INLINE_BIOVEC)
+		req->f.bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec),
+				GFP_KERNEL);
+	else
+		req->f.bvec = req->inline_bvec;
+
+	req->f.mpool_alloc = false;
+	if (unlikely(!req->f.bvec)) {
+		/* fallback under memory pressure */
+		req->f.bvec = mempool_alloc(req->ns->bvec_pool, GFP_KERNEL);
+		req->f.mpool_alloc = true;
+		if (nr_bvec > NVMET_MAX_MPOOL_BVEC)
+			is_sync = true;
+	}
+
+	pos = le64_to_cpu(req->cmd->rw.slba) << req->ns->blksize_shift;
+
+	memset(&req->f.iocb, 0, sizeof(struct kiocb));
+	for_each_sg_page(req->sg, &sg_pg_iter, req->sg_cnt, 0) {
+		nvmet_file_init_bvec(&req->f.bvec[bv_cnt], &sg_pg_iter);
+		len += req->f.bvec[bv_cnt].bv_len;
+		total_len += req->f.bvec[bv_cnt].bv_len;
+		bv_cnt++;
+
+		WARN_ON_ONCE((nr_bvec - 1) < 0);
+
+		if (unlikely(is_sync) &&
+		    (nr_bvec - 1 == 0 || bv_cnt == NVMET_MAX_MPOOL_BVEC)) {
+			ret = nvmet_file_submit_bvec(req, pos, bv_cnt, len);
+			if (ret < 0)
+				goto out;
+			pos += len;
+			bv_cnt = 0;
+			len = 0;
+		}
+		nr_bvec--;
+	}
+
+	if (WARN_ON_ONCE(total_len != req->data_len))
+		ret = -EIO;
+out:
+	if (unlikely(is_sync || ret)) {
+		nvmet_file_io_done(&req->f.iocb, ret < 0 ? ret : total_len, 0);
+		return;
+	}
+	req->f.iocb.ki_complete = nvmet_file_io_done;
+	nvmet_file_submit_bvec(req, pos, bv_cnt, total_len);
+}
+
+static void nvmet_file_flush_work(struct work_struct *w)
+{
+	struct nvmet_req *req = container_of(w, struct nvmet_req, f.work);
+	int ret;
+
+	ret = vfs_fsync(req->ns->file, 1);
+
+	nvmet_req_complete(req, ret < 0 ? NVME_SC_INTERNAL | NVME_SC_DNR : 0);
+}
+
+static void nvmet_file_execute_flush(struct nvmet_req *req)
+{
+	INIT_WORK(&req->f.work, nvmet_file_flush_work);
+	schedule_work(&req->f.work);
+}
+
+static void nvmet_file_execute_discard(struct nvmet_req *req)
+{
+	int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
+	struct nvme_dsm_range range;
+	loff_t offset;
+	loff_t len;
+	int i, ret;
+
+	for (i = 0; i <= le32_to_cpu(req->cmd->dsm.nr); i++) {
+		if (nvmet_copy_from_sgl(req, i * sizeof(range), &range,
+					sizeof(range)))
+			break;
+		offset = le64_to_cpu(range.slba) << req->ns->blksize_shift;
+		len = le32_to_cpu(range.nlb) << req->ns->blksize_shift;
+		ret = vfs_fallocate(req->ns->file, mode, offset, len);
+		if (ret)
+			break;
+	}
+
+	nvmet_req_complete(req, ret < 0 ? NVME_SC_INTERNAL | NVME_SC_DNR : 0);
+}
+
+static void nvmet_file_dsm_work(struct work_struct *w)
+{
+	struct nvmet_req *req = container_of(w, struct nvmet_req, f.work);
+
+	switch (le32_to_cpu(req->cmd->dsm.attributes)) {
+	case NVME_DSMGMT_AD:
+		nvmet_file_execute_discard(req);
+		return;
+	case NVME_DSMGMT_IDR:
+	case NVME_DSMGMT_IDW:
+	default:
+		/* Not supported yet */
+		nvmet_req_complete(req, 0);
+		return;
+	}
+}
+
+static void nvmet_file_execute_dsm(struct nvmet_req *req)
+{
+	INIT_WORK(&req->f.work, nvmet_file_dsm_work);
+	schedule_work(&req->f.work);
+}
+
+static void nvmet_file_write_zeroes_work(struct work_struct *w)
+{
+	struct nvmet_req *req = container_of(w, struct nvmet_req, f.work);
+	struct nvme_write_zeroes_cmd *write_zeroes = &req->cmd->write_zeroes;
+	int mode = FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE;
+	loff_t offset;
+	loff_t len;
+	int ret;
+
+	offset = le64_to_cpu(write_zeroes->slba) << req->ns->blksize_shift;
+	len = (((sector_t)le16_to_cpu(write_zeroes->length) + 1) <<
+			req->ns->blksize_shift);
+
+	ret = vfs_fallocate(req->ns->file, mode, offset, len);
+	nvmet_req_complete(req, ret < 0 ? NVME_SC_INTERNAL | NVME_SC_DNR : 0);
+}
+
+static void nvmet_file_execute_write_zeroes(struct nvmet_req *req)
+{
+	INIT_WORK(&req->f.work, nvmet_file_write_zeroes_work);
+	schedule_work(&req->f.work);
+}
+
+u16 nvmet_file_parse_io_cmd(struct nvmet_req *req)
+{
+	struct nvme_command *cmd = req->cmd;
+
+	switch (cmd->common.opcode) {
+	case nvme_cmd_read:
+	case nvme_cmd_write:
+		req->execute = nvmet_file_execute_rw;
+		req->data_len = nvmet_rw_len(req);
+		return 0;
+	case nvme_cmd_flush:
+		req->execute = nvmet_file_execute_flush;
+		req->data_len = 0;
+		return 0;
+	case nvme_cmd_dsm:
+		req->execute = nvmet_file_execute_dsm;
+		req->data_len = (le32_to_cpu(cmd->dsm.nr) + 1) *
+			sizeof(struct nvme_dsm_range);
+		return 0;
+	case nvme_cmd_write_zeroes:
+		req->execute = nvmet_file_execute_write_zeroes;
+		req->data_len = 0;
+		return 0;
+	default:
+		pr_err("unhandled cmd for file ns %d on qid %d\n",
+				cmd->common.opcode, req->sq->qid);
+		return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
+	}
+}
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index 27a8561..1304ec3 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -45,6 +45,7 @@ struct nvme_loop_ctrl {
 	struct nvme_ctrl	ctrl;
 
 	struct nvmet_ctrl	*target_ctrl;
+	struct nvmet_port	*port;
 };
 
 static inline struct nvme_loop_ctrl *to_loop_ctrl(struct nvme_ctrl *ctrl)
@@ -63,7 +64,8 @@ struct nvme_loop_queue {
 	unsigned long		flags;
 };
 
-static struct nvmet_port *nvmet_loop_port;
+static LIST_HEAD(nvme_loop_ports);
+static DEFINE_MUTEX(nvme_loop_ports_mutex);
 
 static LIST_HEAD(nvme_loop_ctrl_list);
 static DEFINE_MUTEX(nvme_loop_ctrl_mutex);
@@ -146,7 +148,7 @@ nvme_loop_timeout(struct request *rq, bool reserved)
 	/* fail with DNR on admin cmd timeout */
 	nvme_req(rq)->status = NVME_SC_ABORT_REQ | NVME_SC_DNR;
 
-	return BLK_EH_HANDLED;
+	return BLK_EH_DONE;
 }
 
 static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx,
@@ -169,12 +171,12 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx,
 
 	blk_mq_start_request(req);
 	iod->cmd.common.flags |= NVME_CMD_SGL_METABUF;
-	iod->req.port = nvmet_loop_port;
+	iod->req.port = queue->ctrl->port;
 	if (!nvmet_req_init(&iod->req, &queue->nvme_cq,
 			&queue->nvme_sq, &nvme_loop_ops))
 		return BLK_STS_OK;
 
-	if (blk_rq_payload_bytes(req)) {
+	if (blk_rq_nr_phys_segments(req)) {
 		iod->sg_table.sgl = iod->first_sgl;
 		if (sg_alloc_table_chained(&iod->sg_table,
 				blk_rq_nr_phys_segments(req),
@@ -517,6 +519,7 @@ static const struct nvme_ctrl_ops nvme_loop_ctrl_ops = {
 	.free_ctrl		= nvme_loop_free_ctrl,
 	.submit_async_event	= nvme_loop_submit_async_event,
 	.delete_ctrl		= nvme_loop_delete_ctrl_host,
+	.get_address		= nvmf_get_address,
 };
 
 static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl)
@@ -565,6 +568,23 @@ out_destroy_queues:
 	return ret;
 }
 
+static struct nvmet_port *nvme_loop_find_port(struct nvme_ctrl *ctrl)
+{
+	struct nvmet_port *p, *found = NULL;
+
+	mutex_lock(&nvme_loop_ports_mutex);
+	list_for_each_entry(p, &nvme_loop_ports, entry) {
+		/* if no transport address is specified use the first port */
+		if ((ctrl->opts->mask & NVMF_OPT_TRADDR) &&
+		    strcmp(ctrl->opts->traddr, p->disc_addr.traddr))
+			continue;
+		found = p;
+		break;
+	}
+	mutex_unlock(&nvme_loop_ports_mutex);
+	return found;
+}
+
 static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev,
 		struct nvmf_ctrl_options *opts)
 {
@@ -589,6 +609,7 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev,
 
 	ctrl->ctrl.sqsize = opts->queue_size - 1;
 	ctrl->ctrl.kato = opts->kato;
+	ctrl->port = nvme_loop_find_port(&ctrl->ctrl);
 
 	ctrl->queues = kcalloc(opts->nr_io_queues + 1, sizeof(*ctrl->queues),
 			GFP_KERNEL);
@@ -646,27 +667,17 @@ out_put_ctrl:
 
 static int nvme_loop_add_port(struct nvmet_port *port)
 {
-	/*
-	 * XXX: disalow adding more than one port so
-	 * there is no connection rejections when a
-	 * a subsystem is assigned to a port for which
-	 * loop doesn't have a pointer.
-	 * This scenario would be possible if we allowed
-	 * more than one port to be added and a subsystem
-	 * was assigned to a port other than nvmet_loop_port.
-	 */
-
-	if (nvmet_loop_port)
-		return -EPERM;
-
-	nvmet_loop_port = port;
+	mutex_lock(&nvme_loop_ports_mutex);
+	list_add_tail(&port->entry, &nvme_loop_ports);
+	mutex_unlock(&nvme_loop_ports_mutex);
 	return 0;
 }
 
 static void nvme_loop_remove_port(struct nvmet_port *port)
 {
-	if (port == nvmet_loop_port)
-		nvmet_loop_port = NULL;
+	mutex_lock(&nvme_loop_ports_mutex);
+	list_del_init(&port->entry);
+	mutex_unlock(&nvme_loop_ports_mutex);
 }
 
 static const struct nvmet_fabrics_ops nvme_loop_ops = {
@@ -682,6 +693,7 @@ static struct nvmf_transport_ops nvme_loop_transport = {
 	.name		= "loop",
 	.module		= THIS_MODULE,
 	.create_ctrl	= nvme_loop_create_ctrl,
+	.allowed_opts	= NVMF_OPT_TRADDR,
 };
 
 static int __init nvme_loop_init_module(void)
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 15fd84a..480dfe1 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -30,6 +30,21 @@
 #define NVMET_ASYNC_EVENTS		4
 #define NVMET_ERROR_LOG_SLOTS		128
 
+
+/*
+ * Supported optional AENs:
+ */
+#define NVMET_AEN_CFG_OPTIONAL \
+	NVME_AEN_CFG_NS_ATTR
+
+/*
+ * Plus mandatory SMART AENs (we'll never send them, but allow enabling them):
+ */
+#define NVMET_AEN_CFG_ALL \
+	(NVME_SMART_CRIT_SPARE | NVME_SMART_CRIT_TEMPERATURE | \
+	 NVME_SMART_CRIT_RELIABILITY | NVME_SMART_CRIT_MEDIA | \
+	 NVME_SMART_CRIT_VOLATILE_MEMORY | NVMET_AEN_CFG_OPTIONAL)
+
 /* Helper Macros when NVMe error is NVME_SC_CONNECT_INVALID_PARAM
  * The 16 bit shift is to set IATTR bit to 1, which means offending
  * offset starts in the data section of connect()
@@ -43,6 +58,7 @@ struct nvmet_ns {
 	struct list_head	dev_link;
 	struct percpu_ref	ref;
 	struct block_device	*bdev;
+	struct file		*file;
 	u32			nsid;
 	u32			blksize_shift;
 	loff_t			size;
@@ -57,6 +73,8 @@ struct nvmet_ns {
 	struct config_group	group;
 
 	struct completion	disable_done;
+	mempool_t		*bvec_pool;
+	struct kmem_cache	*bvec_cache;
 };
 
 static inline struct nvmet_ns *to_nvmet_ns(struct config_item *item)
@@ -82,7 +100,7 @@ struct nvmet_sq {
 /**
  * struct nvmet_port -	Common structure to keep port
  *				information for the target.
- * @entry:		List head for holding a list of these elements.
+ * @entry:		Entry into referrals or transport list.
  * @disc_addr:		Address information is stored in a format defined
  *				for a discovery log page entry.
  * @group:		ConfigFS group for this element's folder.
@@ -120,6 +138,8 @@ struct nvmet_ctrl {
 	u16			cntlid;
 	u32			kato;
 
+	u32			aen_enabled;
+	unsigned long		aen_masked;
 	struct nvmet_req	*async_event_cmds[NVMET_ASYNC_EVENTS];
 	unsigned int		nr_async_event_cmds;
 	struct list_head	async_events;
@@ -132,6 +152,9 @@ struct nvmet_ctrl {
 
 	const struct nvmet_fabrics_ops *ops;
 
+	__le32			*changed_ns_list;
+	u32			nr_changed_ns;
+
 	char			subsysnqn[NVMF_NQN_FIELD_LEN];
 	char			hostnqn[NVMF_NQN_FIELD_LEN];
 };
@@ -222,8 +245,18 @@ struct nvmet_req {
 	struct nvmet_cq		*cq;
 	struct nvmet_ns		*ns;
 	struct scatterlist	*sg;
-	struct bio		inline_bio;
 	struct bio_vec		inline_bvec[NVMET_MAX_INLINE_BIOVEC];
+	union {
+		struct {
+			struct bio      inline_bio;
+		} b;
+		struct {
+			bool			mpool_alloc;
+			struct kiocb            iocb;
+			struct bio_vec          *bvec;
+			struct work_struct      work;
+		} f;
+	};
 	int			sg_cnt;
 	/* data length as parsed from the command: */
 	size_t			data_len;
@@ -263,7 +296,8 @@ struct nvmet_async_event {
 };
 
 u16 nvmet_parse_connect_cmd(struct nvmet_req *req);
-u16 nvmet_parse_io_cmd(struct nvmet_req *req);
+u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req);
+u16 nvmet_file_parse_io_cmd(struct nvmet_req *req);
 u16 nvmet_parse_admin_cmd(struct nvmet_req *req);
 u16 nvmet_parse_discovery_cmd(struct nvmet_req *req);
 u16 nvmet_parse_fabrics_cmd(struct nvmet_req *req);
@@ -316,6 +350,7 @@ u16 nvmet_copy_to_sgl(struct nvmet_req *req, off_t off, const void *buf,
 		size_t len);
 u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf,
 		size_t len);
+u16 nvmet_zero_sgl(struct nvmet_req *req, off_t off, size_t len);
 
 u32 nvmet_get_log_page_len(struct nvme_command *cmd);
 
@@ -338,4 +373,14 @@ extern struct rw_semaphore nvmet_config_sem;
 bool nvmet_host_allowed(struct nvmet_req *req, struct nvmet_subsys *subsys,
 		const char *hostnqn);
 
+int nvmet_bdev_ns_enable(struct nvmet_ns *ns);
+int nvmet_file_ns_enable(struct nvmet_ns *ns);
+void nvmet_bdev_ns_disable(struct nvmet_ns *ns);
+void nvmet_file_ns_disable(struct nvmet_ns *ns);
+
+static inline u32 nvmet_rw_len(struct nvmet_req *req)
+{
+	return ((u32)le16_to_cpu(req->cmd->rw.length) + 1) <<
+			req->ns->blksize_shift;
+}
 #endif /* _NVMET_H */
diff --git a/drivers/of/device.c b/drivers/of/device.c
index 064c818..33d8551 100644
--- a/drivers/of/device.c
+++ b/drivers/of/device.c
@@ -76,6 +76,8 @@ int of_device_add(struct platform_device *ofdev)
  * of_dma_configure - Setup DMA configuration
  * @dev:	Device to apply DMA configuration
  * @np:		Pointer to OF node having DMA configuration
+ * @force_dma:  Whether device is to be set up by of_dma_configure() even if
+ *		DMA capability is not explicitly described by firmware.
  *
  * Try to get devices's DMA configuration from DT and update it
  * accordingly.
@@ -84,7 +86,7 @@ int of_device_add(struct platform_device *ofdev)
  * can use a platform bus notifier and handle BUS_NOTIFY_ADD_DEVICE events
  * to fix up DMA configuration.
  */
-int of_dma_configure(struct device *dev, struct device_node *np)
+int of_dma_configure(struct device *dev, struct device_node *np, bool force_dma)
 {
 	u64 dma_addr, paddr, size = 0;
 	int ret;
@@ -100,7 +102,7 @@ int of_dma_configure(struct device *dev, struct device_node *np)
 		 * DMA configuration regardless of whether "dma-ranges" is
 		 * correctly specified or not.
 		 */
-		if (!dev->bus->force_dma)
+		if (!force_dma)
 			return ret == -ENODEV ? 0 : ret;
 
 		dma_addr = offset = 0;
diff --git a/drivers/of/of_reserved_mem.c b/drivers/of/of_reserved_mem.c
index 9a4f4246..895c83e 100644
--- a/drivers/of/of_reserved_mem.c
+++ b/drivers/of/of_reserved_mem.c
@@ -353,7 +353,7 @@ int of_reserved_mem_device_init_by_idx(struct device *dev,
 		/* ensure that dma_ops is set for virtual devices
 		 * using reserved memory
 		 */
-		of_dma_configure(dev, np);
+		of_dma_configure(dev, np, true);
 
 		dev_info(dev, "assigned reserved memory node %s\n", rmem->name);
 	} else {
diff --git a/drivers/parisc/Kconfig b/drivers/parisc/Kconfig
index 3a102a8..5a48b56 100644
--- a/drivers/parisc/Kconfig
+++ b/drivers/parisc/Kconfig
@@ -103,11 +103,6 @@ config IOMMU_SBA
 	depends on PCI_LBA
 	default PCI_LBA
 
-config IOMMU_HELPER
-	bool
-	depends on IOMMU_SBA || IOMMU_CCIO
-	default y
-
 source "drivers/pcmcia/Kconfig"
 
 endmenu
diff --git a/drivers/parisc/ccio-dma.c b/drivers/parisc/ccio-dma.c
index 297599f..6148236 100644
--- a/drivers/parisc/ccio-dma.c
+++ b/drivers/parisc/ccio-dma.c
@@ -1108,19 +1108,6 @@ static int ccio_proc_info(struct seq_file *m, void *p)
 	return 0;
 }
 
-static int ccio_proc_info_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, &ccio_proc_info, NULL);
-}
-
-static const struct file_operations ccio_proc_info_fops = {
-	.owner = THIS_MODULE,
-	.open = ccio_proc_info_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
-
 static int ccio_proc_bitmap_info(struct seq_file *m, void *p)
 {
 	struct ioc *ioc = ioc_list;
@@ -1135,19 +1122,6 @@ static int ccio_proc_bitmap_info(struct seq_file *m, void *p)
 
 	return 0;
 }
-
-static int ccio_proc_bitmap_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, &ccio_proc_bitmap_info, NULL);
-}
-
-static const struct file_operations ccio_proc_bitmap_fops = {
-	.owner = THIS_MODULE,
-	.open = ccio_proc_bitmap_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
 #endif /* CONFIG_PROC_FS */
 
 /**
@@ -1589,15 +1563,13 @@ static int __init ccio_probe(struct parisc_device *dev)
 
 #ifdef CONFIG_PROC_FS
 	if (ioc_count == 0) {
-		proc_create(MODULE_NAME, 0, proc_runway_root,
-			    &ccio_proc_info_fops);
-		proc_create(MODULE_NAME"-bitmap", 0, proc_runway_root,
-			    &ccio_proc_bitmap_fops);
+		proc_create_single(MODULE_NAME, 0, proc_runway_root,
+				ccio_proc_info);
+		proc_create_single(MODULE_NAME"-bitmap", 0, proc_runway_root,
+				ccio_proc_bitmap_info);
 	}
 #endif
 	ioc_count++;
-
-	parisc_has_iommu();
 	return 0;
 }
 
diff --git a/drivers/parisc/sba_iommu.c b/drivers/parisc/sba_iommu.c
index 0a9c762..11de0ec 100644
--- a/drivers/parisc/sba_iommu.c
+++ b/drivers/parisc/sba_iommu.c
@@ -1864,20 +1864,6 @@ static int sba_proc_info(struct seq_file *m, void *p)
 }
 
 static int
-sba_proc_open(struct inode *i, struct file *f)
-{
-	return single_open(f, &sba_proc_info, NULL);
-}
-
-static const struct file_operations sba_proc_fops = {
-	.owner = THIS_MODULE,
-	.open = sba_proc_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
-
-static int
 sba_proc_bitmap_info(struct seq_file *m, void *p)
 {
 	struct sba_device *sba_dev = sba_list;
@@ -1889,20 +1875,6 @@ sba_proc_bitmap_info(struct seq_file *m, void *p)
 
 	return 0;
 }
-
-static int
-sba_proc_bitmap_open(struct inode *i, struct file *f)
-{
-	return single_open(f, &sba_proc_bitmap_info, NULL);
-}
-
-static const struct file_operations sba_proc_bitmap_fops = {
-	.owner = THIS_MODULE,
-	.open = sba_proc_bitmap_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
 #endif /* CONFIG_PROC_FS */
 
 static const struct parisc_device_id sba_tbl[] __initconst = {
@@ -2014,11 +1986,9 @@ static int __init sba_driver_callback(struct parisc_device *dev)
 		break;
 	}
 
-	proc_create("sba_iommu", 0, root, &sba_proc_fops);
-	proc_create("sba_iommu-bitmap", 0, root, &sba_proc_bitmap_fops);
+	proc_create_single("sba_iommu", 0, root, sba_proc_info);
+	proc_create_single("sba_iommu-bitmap", 0, root, sba_proc_bitmap_info);
 #endif
-
-	parisc_has_iommu();
 	return 0;
 }
 
diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
index 34b56a8..29a487f 100644
--- a/drivers/pci/Kconfig
+++ b/drivers/pci/Kconfig
@@ -5,10 +5,6 @@
 
 source "drivers/pci/pcie/Kconfig"
 
-config PCI_BUS_ADDR_T_64BIT
-	def_bool y if (ARCH_DMA_ADDR_T_64BIT || 64BIT)
-	depends on PCI
-
 config PCI_MSI
 	bool "Message Signaled Interrupts (MSI and MSI-X)"
 	depends on PCI
diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c
index bc2ded4..35b7fc8 100644
--- a/drivers/pci/bus.c
+++ b/drivers/pci/bus.c
@@ -120,7 +120,7 @@ int devm_request_pci_bus_resources(struct device *dev,
 EXPORT_SYMBOL_GPL(devm_request_pci_bus_resources);
 
 static struct pci_bus_region pci_32_bit = {0, 0xffffffffULL};
-#ifdef CONFIG_PCI_BUS_ADDR_T_64BIT
+#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
 static struct pci_bus_region pci_64_bit = {0,
 				(pci_bus_addr_t) 0xffffffffffffffffULL};
 static struct pci_bus_region pci_high = {(pci_bus_addr_t) 0x100000000ULL,
@@ -230,7 +230,7 @@ int pci_bus_alloc_resource(struct pci_bus *bus, struct resource *res,
 					  resource_size_t),
 		void *alignf_data)
 {
-#ifdef CONFIG_PCI_BUS_ADDR_T_64BIT
+#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
 	int rc;
 
 	if (res->flags & IORESOURCE_MEM_64) {
diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index b9a1311..f8269a7 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -16,6 +16,8 @@
 #include <linux/pm_runtime.h>
 #include <linux/suspend.h>
 #include <linux/kexec.h>
+#include <linux/of_device.h>
+#include <linux/acpi.h>
 #include "pci.h"
 #include "pcie/portdrv.h"
 
@@ -1577,6 +1579,35 @@ static int pci_bus_num_vf(struct device *dev)
 	return pci_num_vf(to_pci_dev(dev));
 }
 
+/**
+ * pci_dma_configure - Setup DMA configuration
+ * @dev: ptr to dev structure
+ *
+ * Function to update PCI devices's DMA configuration using the same
+ * info from the OF node or ACPI node of host bridge's parent (if any).
+ */
+static int pci_dma_configure(struct device *dev)
+{
+	struct device *bridge;
+	int ret = 0;
+
+	bridge = pci_get_host_bridge_device(to_pci_dev(dev));
+
+	if (IS_ENABLED(CONFIG_OF) && bridge->parent &&
+	    bridge->parent->of_node) {
+		ret = of_dma_configure(dev, bridge->parent->of_node, true);
+	} else if (has_acpi_companion(bridge)) {
+		struct acpi_device *adev = to_acpi_device_node(bridge->fwnode);
+		enum dev_dma_attr attr = acpi_get_dma_attr(adev);
+
+		if (attr != DEV_DMA_NOT_SUPPORTED)
+			ret = acpi_dma_configure(dev, attr);
+	}
+
+	pci_put_host_bridge_device(bridge);
+	return ret;
+}
+
 struct bus_type pci_bus_type = {
 	.name		= "pci",
 	.match		= pci_bus_match,
@@ -1589,7 +1620,7 @@ struct bus_type pci_bus_type = {
 	.drv_groups	= pci_drv_groups,
 	.pm		= PCI_PM_OPS_PTR,
 	.num_vf		= pci_bus_num_vf,
-	.force_dma	= true,
+	.dma_configure	= pci_dma_configure,
 };
 EXPORT_SYMBOL(pci_bus_type);
 
diff --git a/drivers/pci/proc.c b/drivers/pci/proc.c
index 1ee8927..7ac035af 100644
--- a/drivers/pci/proc.c
+++ b/drivers/pci/proc.c
@@ -435,25 +435,12 @@ int pci_proc_detach_bus(struct pci_bus *bus)
 	return 0;
 }
 
-static int proc_bus_pci_dev_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &proc_bus_pci_devices_op);
-}
-
-static const struct file_operations proc_bus_pci_dev_operations = {
-	.owner		= THIS_MODULE,
-	.open		= proc_bus_pci_dev_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
 static int __init pci_proc_init(void)
 {
 	struct pci_dev *dev = NULL;
 	proc_bus_pci_dir = proc_mkdir("bus/pci", NULL);
-	proc_create("devices", 0, proc_bus_pci_dir,
-		    &proc_bus_pci_dev_operations);
+	proc_create_seq("devices", 0, proc_bus_pci_dir,
+		    &proc_bus_pci_devices_op);
 	proc_initialized = 1;
 	for_each_pci_dev(dev)
 		pci_proc_attach_device(dev);
diff --git a/drivers/platform/chrome/Kconfig b/drivers/platform/chrome/Kconfig
index e728a96..cb0df9e 100644
--- a/drivers/platform/chrome/Kconfig
+++ b/drivers/platform/chrome/Kconfig
@@ -38,6 +38,17 @@ config CHROMEOS_PSTORE
 	  If you have a supported Chromebook, choose Y or M here.
 	  The module will be called chromeos_pstore.
 
+config CHROMEOS_TBMC
+	tristate "ChromeOS Tablet Switch Controller"
+	depends on ACPI
+	depends on INPUT
+	help
+	  This option adds a driver for the tablet switch on
+	  select Chrome OS systems.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called chromeos_tbmc.
+
 config CROS_EC_CTL
         tristate
 
diff --git a/drivers/platform/chrome/Makefile b/drivers/platform/chrome/Makefile
index ff3b369..e44c37a 100644
--- a/drivers/platform/chrome/Makefile
+++ b/drivers/platform/chrome/Makefile
@@ -2,6 +2,7 @@
 
 obj-$(CONFIG_CHROMEOS_LAPTOP)		+= chromeos_laptop.o
 obj-$(CONFIG_CHROMEOS_PSTORE)		+= chromeos_pstore.o
+obj-$(CONFIG_CHROMEOS_TBMC)		+= chromeos_tbmc.o
 cros_ec_ctl-objs			:= cros_ec_sysfs.o cros_ec_lightbar.o \
 					   cros_ec_vbc.o cros_ec_debugfs.o
 obj-$(CONFIG_CROS_EC_CTL)		+= cros_ec_ctl.o
diff --git a/drivers/platform/chrome/chromeos_laptop.c b/drivers/platform/chrome/chromeos_laptop.c
index 5c47f45..24326ee 100644
--- a/drivers/platform/chrome/chromeos_laptop.c
+++ b/drivers/platform/chrome/chromeos_laptop.c
@@ -6,6 +6,7 @@
 
 #define pr_fmt(fmt)		KBUILD_MODNAME ": " fmt
 
+#include <linux/acpi.h>
 #include <linux/dmi.h>
 #include <linux/i2c.h>
 #include <linux/input.h>
@@ -54,6 +55,11 @@ struct i2c_peripheral {
 	struct i2c_client *client;
 };
 
+struct acpi_peripheral {
+	char hid[ACPI_ID_LEN];
+	const struct property_entry *properties;
+};
+
 struct chromeos_laptop {
 	/*
 	 * Note that we can't mark this pointer as const because
@@ -61,6 +67,9 @@ struct chromeos_laptop {
 	 */
 	struct i2c_peripheral *i2c_peripherals;
 	unsigned int num_i2c_peripherals;
+
+	const struct acpi_peripheral *acpi_peripherals;
+	unsigned int num_acpi_peripherals;
 };
 
 static const struct chromeos_laptop *cros_laptop;
@@ -148,6 +157,38 @@ static void chromeos_laptop_check_adapter(struct i2c_adapter *adapter)
 	}
 }
 
+static bool chromeos_laptop_adjust_client(struct i2c_client *client)
+{
+	const struct acpi_peripheral *acpi_dev;
+	struct acpi_device_id acpi_ids[2] = { };
+	int i;
+	int error;
+
+	if (!has_acpi_companion(&client->dev))
+		return false;
+
+	for (i = 0; i < cros_laptop->num_acpi_peripherals; i++) {
+		acpi_dev = &cros_laptop->acpi_peripherals[i];
+
+		memcpy(acpi_ids[0].id, acpi_dev->hid, ACPI_ID_LEN);
+
+		if (acpi_match_device(acpi_ids, &client->dev)) {
+			error = device_add_properties(&client->dev,
+						      acpi_dev->properties);
+			if (error) {
+				dev_err(&client->dev,
+					"failed to add properties: %d\n",
+					error);
+				break;
+			}
+
+			return true;
+		}
+	}
+
+	return false;
+}
+
 static void chromeos_laptop_detach_i2c_client(struct i2c_client *client)
 {
 	struct i2c_peripheral *i2c_dev;
@@ -170,6 +211,8 @@ static int chromeos_laptop_i2c_notifier_call(struct notifier_block *nb,
 	case BUS_NOTIFY_ADD_DEVICE:
 		if (dev->type == &i2c_adapter_type)
 			chromeos_laptop_check_adapter(to_i2c_adapter(dev));
+		else if (dev->type == &i2c_client_type)
+			chromeos_laptop_adjust_client(to_i2c_client(dev));
 		break;
 
 	case BUS_NOTIFY_REMOVED_DEVICE:
@@ -191,6 +234,12 @@ static const struct chromeos_laptop _name __initconst = {		\
 	.num_i2c_peripherals	= ARRAY_SIZE(_name##_peripherals),	\
 }
 
+#define DECLARE_ACPI_CROS_LAPTOP(_name)					\
+static const struct chromeos_laptop _name __initconst = {		\
+	.acpi_peripherals	= _name##_peripherals,			\
+	.num_acpi_peripherals	= ARRAY_SIZE(_name##_peripherals),	\
+}
+
 static struct i2c_peripheral samsung_series_5_550_peripherals[] __initdata = {
 	/* Touchpad. */
 	{
@@ -234,16 +283,25 @@ static const int chromebook_pixel_tp_keys[] __initconst = {
 
 static const struct property_entry
 chromebook_pixel_trackpad_props[] __initconst = {
+	PROPERTY_ENTRY_STRING("compatible", "atmel,maxtouch"),
 	PROPERTY_ENTRY_U32_ARRAY("linux,gpio-keymap", chromebook_pixel_tp_keys),
 	{ }
 };
 
+static const struct property_entry
+chromebook_atmel_touchscreen_props[] __initconst = {
+	PROPERTY_ENTRY_STRING("compatible", "atmel,maxtouch"),
+	{ }
+};
+
 static struct i2c_peripheral chromebook_pixel_peripherals[] __initdata = {
 	/* Touch Screen. */
 	{
 		.board_info	= {
 			I2C_BOARD_INFO("atmel_mxt_ts",
 					ATMEL_TS_I2C_ADDR),
+			.properties	=
+				chromebook_atmel_touchscreen_props,
 			.flags		= I2C_CLIENT_WAKE,
 		},
 		.dmi_name	= "touchscreen",
@@ -354,6 +412,8 @@ static struct i2c_peripheral acer_c720_peripherals[] __initdata = {
 		.board_info	= {
 			I2C_BOARD_INFO("atmel_mxt_ts",
 					ATMEL_TS_I2C_ADDR),
+			.properties	=
+				chromebook_atmel_touchscreen_props,
 			.flags		= I2C_CLIENT_WAKE,
 		},
 		.dmi_name	= "touchscreen",
@@ -419,6 +479,47 @@ static struct i2c_peripheral cr48_peripherals[] __initdata = {
 };
 DECLARE_CROS_LAPTOP(cr48);
 
+static const u32 samus_touchpad_buttons[] __initconst = {
+	KEY_RESERVED,
+	KEY_RESERVED,
+	KEY_RESERVED,
+	BTN_LEFT
+};
+
+static const struct property_entry samus_trackpad_props[] __initconst = {
+	PROPERTY_ENTRY_STRING("compatible", "atmel,maxtouch"),
+	PROPERTY_ENTRY_U32_ARRAY("linux,gpio-keymap", samus_touchpad_buttons),
+	{ }
+};
+
+static struct acpi_peripheral samus_peripherals[] __initdata = {
+	/* Touchpad */
+	{
+		.hid		= "ATML0000",
+		.properties	= samus_trackpad_props,
+	},
+	/* Touchsceen */
+	{
+		.hid		= "ATML0001",
+		.properties	= chromebook_atmel_touchscreen_props,
+	},
+};
+DECLARE_ACPI_CROS_LAPTOP(samus);
+
+static struct acpi_peripheral generic_atmel_peripherals[] __initdata = {
+	/* Touchpad */
+	{
+		.hid		= "ATML0000",
+		.properties	= chromebook_pixel_trackpad_props,
+	},
+	/* Touchsceen */
+	{
+		.hid		= "ATML0001",
+		.properties	= chromebook_atmel_touchscreen_props,
+	},
+};
+DECLARE_ACPI_CROS_LAPTOP(generic_atmel);
+
 static const struct dmi_system_id chromeos_laptop_dmi_table[] __initconst = {
 	{
 		.ident = "Samsung Series 5 550",
@@ -502,17 +603,72 @@ static const struct dmi_system_id chromeos_laptop_dmi_table[] __initconst = {
 		},
 		.driver_data = (void *)&cr48,
 	},
+	/* Devices with peripherals incompletely described in ACPI */
+	{
+		.ident = "Chromebook Pro",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Google"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "Caroline"),
+		},
+		.driver_data = (void *)&samus,
+	},
+	{
+		.ident = "Google Pixel 2 (2015)",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "GOOGLE"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "Samus"),
+		},
+		.driver_data = (void *)&samus,
+	},
+	{
+		.ident = "Samsung Chromebook 3",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "GOOGLE"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "Celes"),
+		},
+		.driver_data = (void *)&samus,
+	},
+	{
+		/*
+		 * Other Chromebooks with Atmel touch controllers:
+		 * - Winky (touchpad)
+		 * - Clapper, Expresso, Rambi, Glimmer (touchscreen)
+		 */
+		.ident = "Other Chromebook",
+		.matches = {
+			/*
+			 * This will match all Google devices, not only devices
+			 * with Atmel, but we will validate that the device
+			 * actually has matching peripherals.
+			 */
+			DMI_MATCH(DMI_SYS_VENDOR, "GOOGLE"),
+		},
+		.driver_data = (void *)&generic_atmel,
+	},
 	{ }
 };
 MODULE_DEVICE_TABLE(dmi, chromeos_laptop_dmi_table);
 
-static int __init chromeos_laptop_scan_adapter(struct device *dev, void *data)
+static int __init chromeos_laptop_scan_peripherals(struct device *dev, void *data)
 {
-	struct i2c_adapter *adapter;
+	int error;
 
-	adapter = i2c_verify_adapter(dev);
-	if (adapter)
-		chromeos_laptop_check_adapter(adapter);
+	if (dev->type == &i2c_adapter_type) {
+		chromeos_laptop_check_adapter(to_i2c_adapter(dev));
+	} else if (dev->type == &i2c_client_type) {
+		if (chromeos_laptop_adjust_client(to_i2c_client(dev))) {
+			/*
+			 * Now that we have needed properties re-trigger
+			 * driver probe in case driver was initialized
+			 * earlier and probe failed.
+			 */
+			error = device_attach(dev);
+			if (error < 0)
+				dev_warn(dev,
+					 "%s: device_attach() failed: %d\n",
+					 __func__, error);
+		}
+	}
 
 	return 0;
 }
@@ -556,27 +712,24 @@ static int __init chromeos_laptop_setup_irq(struct i2c_peripheral *i2c_dev)
 	return 0;
 }
 
-static struct chromeos_laptop * __init
-chromeos_laptop_prepare(const struct chromeos_laptop *src)
+static int __init
+chromeos_laptop_prepare_i2c_peripherals(struct chromeos_laptop *cros_laptop,
+					const struct chromeos_laptop *src)
 {
-	struct chromeos_laptop *cros_laptop;
 	struct i2c_peripheral *i2c_dev;
 	struct i2c_board_info *info;
-	int error;
 	int i;
+	int error;
 
-	cros_laptop = kzalloc(sizeof(*cros_laptop), GFP_KERNEL);
-	if (!cros_laptop)
-		return ERR_PTR(-ENOMEM);
+	if (!src->num_i2c_peripherals)
+		return 0;
 
 	cros_laptop->i2c_peripherals = kmemdup(src->i2c_peripherals,
 					       src->num_i2c_peripherals *
 						sizeof(*src->i2c_peripherals),
 					       GFP_KERNEL);
-	if (!cros_laptop->i2c_peripherals) {
-		error = -ENOMEM;
-		goto err_free_cros_laptop;
-	}
+	if (!cros_laptop->i2c_peripherals)
+		return -ENOMEM;
 
 	cros_laptop->num_i2c_peripherals = src->num_i2c_peripherals;
 
@@ -586,7 +739,7 @@ chromeos_laptop_prepare(const struct chromeos_laptop *src)
 
 		error = chromeos_laptop_setup_irq(i2c_dev);
 		if (error)
-			goto err_destroy_cros_peripherals;
+			goto err_out;
 
 		/* We need to deep-copy properties */
 		if (info->properties) {
@@ -594,14 +747,14 @@ chromeos_laptop_prepare(const struct chromeos_laptop *src)
 				property_entries_dup(info->properties);
 			if (IS_ERR(info->properties)) {
 				error = PTR_ERR(info->properties);
-				goto err_destroy_cros_peripherals;
+				goto err_out;
 			}
 		}
 	}
 
-	return cros_laptop;
+	return 0;
 
-err_destroy_cros_peripherals:
+err_out:
 	while (--i >= 0) {
 		i2c_dev = &cros_laptop->i2c_peripherals[i];
 		info = &i2c_dev->board_info;
@@ -609,13 +762,74 @@ err_destroy_cros_peripherals:
 			property_entries_free(info->properties);
 	}
 	kfree(cros_laptop->i2c_peripherals);
-err_free_cros_laptop:
-	kfree(cros_laptop);
-	return ERR_PTR(error);
+	return error;
+}
+
+static int __init
+chromeos_laptop_prepare_acpi_peripherals(struct chromeos_laptop *cros_laptop,
+					const struct chromeos_laptop *src)
+{
+	struct acpi_peripheral *acpi_peripherals;
+	struct acpi_peripheral *acpi_dev;
+	const struct acpi_peripheral *src_dev;
+	int n_peripherals = 0;
+	int i;
+	int error;
+
+	for (i = 0; i < src->num_acpi_peripherals; i++) {
+		if (acpi_dev_present(src->acpi_peripherals[i].hid, NULL, -1))
+			n_peripherals++;
+	}
+
+	if (!n_peripherals)
+		return 0;
+
+	acpi_peripherals = kcalloc(n_peripherals,
+				   sizeof(*src->acpi_peripherals),
+				   GFP_KERNEL);
+	if (!acpi_peripherals)
+		return -ENOMEM;
+
+	acpi_dev = acpi_peripherals;
+	for (i = 0; i < src->num_acpi_peripherals; i++) {
+		src_dev = &src->acpi_peripherals[i];
+		if (!acpi_dev_present(src_dev->hid, NULL, -1))
+			continue;
+
+		*acpi_dev = *src_dev;
+
+		/* We need to deep-copy properties */
+		if (src_dev->properties) {
+			acpi_dev->properties =
+				property_entries_dup(src_dev->properties);
+			if (IS_ERR(acpi_dev->properties)) {
+				error = PTR_ERR(acpi_dev->properties);
+				goto err_out;
+			}
+		}
+
+		acpi_dev++;
+	}
+
+	cros_laptop->acpi_peripherals = acpi_peripherals;
+	cros_laptop->num_acpi_peripherals = n_peripherals;
+
+	return 0;
+
+err_out:
+	while (--i >= 0) {
+		acpi_dev = &acpi_peripherals[i];
+		if (acpi_dev->properties)
+			property_entries_free(acpi_dev->properties);
+	}
+
+	kfree(acpi_peripherals);
+	return error;
 }
 
 static void chromeos_laptop_destroy(const struct chromeos_laptop *cros_laptop)
 {
+	const struct acpi_peripheral *acpi_dev;
 	struct i2c_peripheral *i2c_dev;
 	struct i2c_board_info *info;
 	int i;
@@ -631,10 +845,41 @@ static void chromeos_laptop_destroy(const struct chromeos_laptop *cros_laptop)
 			property_entries_free(info->properties);
 	}
 
+	for (i = 0; i < cros_laptop->num_acpi_peripherals; i++) {
+		acpi_dev = &cros_laptop->acpi_peripherals[i];
+
+		if (acpi_dev->properties)
+			property_entries_free(acpi_dev->properties);
+	}
+
 	kfree(cros_laptop->i2c_peripherals);
+	kfree(cros_laptop->acpi_peripherals);
 	kfree(cros_laptop);
 }
 
+static struct chromeos_laptop * __init
+chromeos_laptop_prepare(const struct chromeos_laptop *src)
+{
+	struct chromeos_laptop *cros_laptop;
+	int error;
+
+	cros_laptop = kzalloc(sizeof(*cros_laptop), GFP_KERNEL);
+	if (!cros_laptop)
+		return ERR_PTR(-ENOMEM);
+
+	error = chromeos_laptop_prepare_i2c_peripherals(cros_laptop, src);
+	if (!error)
+		error = chromeos_laptop_prepare_acpi_peripherals(cros_laptop,
+								 src);
+
+	if (error) {
+		chromeos_laptop_destroy(cros_laptop);
+		return ERR_PTR(error);
+	}
+
+	return cros_laptop;
+}
+
 static int __init chromeos_laptop_init(void)
 {
 	const struct dmi_system_id *dmi_id;
@@ -652,21 +897,33 @@ static int __init chromeos_laptop_init(void)
 	if (IS_ERR(cros_laptop))
 		return PTR_ERR(cros_laptop);
 
+	if (!cros_laptop->num_i2c_peripherals &&
+	    !cros_laptop->num_acpi_peripherals) {
+		pr_debug("no relevant devices detected\n");
+		error = -ENODEV;
+		goto err_destroy_cros_laptop;
+	}
+
 	error = bus_register_notifier(&i2c_bus_type,
 				      &chromeos_laptop_i2c_notifier);
 	if (error) {
-		pr_err("failed to register i2c bus notifier: %d\n", error);
-		chromeos_laptop_destroy(cros_laptop);
-		return error;
+		pr_err("failed to register i2c bus notifier: %d\n",
+		       error);
+		goto err_destroy_cros_laptop;
 	}
 
 	/*
-	 * Scan adapters that have been registered before we installed
-	 * the notifier to make sure we do not miss any devices.
+	 * Scan adapters that have been registered and clients that have
+	 * been created before we installed the notifier to make sure
+	 * we do not miss any devices.
 	 */
-	i2c_for_each_dev(NULL, chromeos_laptop_scan_adapter);
+	i2c_for_each_dev(NULL, chromeos_laptop_scan_peripherals);
 
 	return 0;
+
+err_destroy_cros_laptop:
+	chromeos_laptop_destroy(cros_laptop);
+	return error;
 }
 
 static void __exit chromeos_laptop_exit(void)
diff --git a/drivers/platform/chrome/chromeos_tbmc.c b/drivers/platform/chrome/chromeos_tbmc.c
new file mode 100644
index 0000000..b935df6
--- /dev/null
+++ b/drivers/platform/chrome/chromeos_tbmc.c
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: GPL-2.0+
+// Driver to detect Tablet Mode for ChromeOS convertible.
+//
+// Copyright (C) 2017 Google, Inc.
+// Author: Gwendal Grignou <gwendal@chromium.org>
+
+#include <linux/acpi.h>
+#include <linux/input.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+
+#define DRV_NAME "chromeos_tbmc"
+#define ACPI_DRV_NAME "GOOG0006"
+
+static int chromeos_tbmc_query_switch(struct acpi_device *adev,
+				     struct input_dev *idev)
+{
+	unsigned long long state;
+	acpi_status status;
+
+	status = acpi_evaluate_integer(adev->handle, "TBMC", NULL, &state);
+	if (ACPI_FAILURE(status))
+		return -ENODEV;
+
+	/* input layer checks if event is redundant */
+	input_report_switch(idev, SW_TABLET_MODE, state);
+	input_sync(idev);
+
+	return 0;
+}
+
+static __maybe_unused int chromeos_tbmc_resume(struct device *dev)
+{
+	struct acpi_device *adev = to_acpi_device(dev);
+
+	return chromeos_tbmc_query_switch(adev, adev->driver_data);
+}
+
+static void chromeos_tbmc_notify(struct acpi_device *adev, u32 event)
+{
+	switch (event) {
+	case 0x80:
+		chromeos_tbmc_query_switch(adev, adev->driver_data);
+		break;
+	default:
+		dev_err(&adev->dev, "Unexpected event: 0x%08X\n", event);
+	}
+}
+
+static int chromeos_tbmc_open(struct input_dev *idev)
+{
+	struct acpi_device *adev = input_get_drvdata(idev);
+
+	return chromeos_tbmc_query_switch(adev, idev);
+}
+
+static int chromeos_tbmc_add(struct acpi_device *adev)
+{
+	struct input_dev *idev;
+	struct device *dev = &adev->dev;
+	int ret;
+
+	idev = devm_input_allocate_device(dev);
+	if (!idev)
+		return -ENOMEM;
+
+	idev->name = "Tablet Mode Switch";
+	idev->phys = acpi_device_hid(adev);
+
+	idev->id.bustype = BUS_HOST;
+	idev->id.version = 1;
+	idev->id.product = 0;
+	idev->open = chromeos_tbmc_open;
+
+	input_set_drvdata(idev, adev);
+	adev->driver_data = idev;
+
+	input_set_capability(idev, EV_SW, SW_TABLET_MODE);
+	ret = input_register_device(idev);
+	if (ret) {
+		dev_err(dev, "cannot register input device\n");
+		return ret;
+	}
+	return 0;
+}
+
+static const struct acpi_device_id chromeos_tbmc_acpi_device_ids[] = {
+	{ ACPI_DRV_NAME, 0 },
+	{ }
+};
+MODULE_DEVICE_TABLE(acpi, chromeos_tbmc_acpi_device_ids);
+
+static const SIMPLE_DEV_PM_OPS(chromeos_tbmc_pm_ops, NULL,
+		chromeos_tbmc_resume);
+
+static struct acpi_driver chromeos_tbmc_driver = {
+	.name = DRV_NAME,
+	.class = DRV_NAME,
+	.ids = chromeos_tbmc_acpi_device_ids,
+	.ops = {
+		.add = chromeos_tbmc_add,
+		.notify = chromeos_tbmc_notify,
+	},
+	.drv.pm = &chromeos_tbmc_pm_ops,
+};
+
+module_acpi_driver(chromeos_tbmc_driver);
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("ChromeOS ACPI tablet switch driver");
diff --git a/drivers/platform/chrome/cros_ec_lightbar.c b/drivers/platform/chrome/cros_ec_lightbar.c
index 6ea79d4..68193bb 100644
--- a/drivers/platform/chrome/cros_ec_lightbar.c
+++ b/drivers/platform/chrome/cros_ec_lightbar.c
@@ -170,8 +170,7 @@ static ssize_t version_show(struct device *dev,
 			    struct device_attribute *attr, char *buf)
 {
 	uint32_t version = 0, flags = 0;
-	struct cros_ec_dev *ec = container_of(dev,
-					      struct cros_ec_dev, class_dev);
+	struct cros_ec_dev *ec = to_cros_ec_dev(dev);
 	int ret;
 
 	ret = lb_throttle();
@@ -193,8 +192,7 @@ static ssize_t brightness_store(struct device *dev,
 	struct cros_ec_command *msg;
 	int ret;
 	unsigned int val;
-	struct cros_ec_dev *ec = container_of(dev,
-					      struct cros_ec_dev, class_dev);
+	struct cros_ec_dev *ec = to_cros_ec_dev(dev);
 
 	if (kstrtouint(buf, 0, &val))
 		return -EINVAL;
@@ -238,8 +236,7 @@ static ssize_t led_rgb_store(struct device *dev, struct device_attribute *attr,
 {
 	struct ec_params_lightbar *param;
 	struct cros_ec_command *msg;
-	struct cros_ec_dev *ec = container_of(dev,
-					      struct cros_ec_dev, class_dev);
+	struct cros_ec_dev *ec = to_cros_ec_dev(dev);
 	unsigned int val[4];
 	int ret, i = 0, j = 0, ok = 0;
 
@@ -311,8 +308,7 @@ static ssize_t sequence_show(struct device *dev,
 	struct ec_response_lightbar *resp;
 	struct cros_ec_command *msg;
 	int ret;
-	struct cros_ec_dev *ec = container_of(dev,
-					      struct cros_ec_dev, class_dev);
+	struct cros_ec_dev *ec = to_cros_ec_dev(dev);
 
 	msg = alloc_lightbar_cmd_msg(ec);
 	if (!msg)
@@ -439,8 +435,7 @@ static ssize_t sequence_store(struct device *dev, struct device_attribute *attr,
 	struct cros_ec_command *msg;
 	unsigned int num;
 	int ret, len;
-	struct cros_ec_dev *ec = container_of(dev,
-					      struct cros_ec_dev, class_dev);
+	struct cros_ec_dev *ec = to_cros_ec_dev(dev);
 
 	for (len = 0; len < count; len++)
 		if (!isalnum(buf[len]))
@@ -488,8 +483,7 @@ static ssize_t program_store(struct device *dev, struct device_attribute *attr,
 	int extra_bytes, max_size, ret;
 	struct ec_params_lightbar *param;
 	struct cros_ec_command *msg;
-	struct cros_ec_dev *ec = container_of(dev, struct cros_ec_dev,
-					      class_dev);
+	struct cros_ec_dev *ec = to_cros_ec_dev(dev);
 
 	/*
 	 * We might need to reject the program for size reasons. The EC
@@ -599,8 +593,7 @@ static umode_t cros_ec_lightbar_attrs_are_visible(struct kobject *kobj,
 						  struct attribute *a, int n)
 {
 	struct device *dev = container_of(kobj, struct device, kobj);
-	struct cros_ec_dev *ec = container_of(dev,
-					      struct cros_ec_dev, class_dev);
+	struct cros_ec_dev *ec = to_cros_ec_dev(dev);
 	struct platform_device *pdev = to_platform_device(ec->dev);
 	struct cros_ec_platform *pdata = pdev->dev.platform_data;
 	int is_cros_ec;
diff --git a/drivers/platform/chrome/cros_ec_lpc.c b/drivers/platform/chrome/cros_ec_lpc.c
index 3682e15..31c8b8c 100644
--- a/drivers/platform/chrome/cros_ec_lpc.c
+++ b/drivers/platform/chrome/cros_ec_lpc.c
@@ -435,7 +435,13 @@ static int __init cros_ec_lpc_init(void)
 	int ret;
 	acpi_status status;
 
-	if (!dmi_check_system(cros_ec_lpc_dmi_table)) {
+	status = acpi_get_devices(ACPI_DRV_NAME, cros_ec_lpc_parse_device,
+				  &cros_ec_lpc_acpi_device_found, NULL);
+	if (ACPI_FAILURE(status))
+		pr_warn(DRV_NAME ": Looking for %s failed\n", ACPI_DRV_NAME);
+
+	if (!cros_ec_lpc_acpi_device_found &&
+	    !dmi_check_system(cros_ec_lpc_dmi_table)) {
 		pr_err(DRV_NAME ": unsupported system.\n");
 		return -ENODEV;
 	}
@@ -450,11 +456,6 @@ static int __init cros_ec_lpc_init(void)
 		return ret;
 	}
 
-	status = acpi_get_devices(ACPI_DRV_NAME, cros_ec_lpc_parse_device,
-				  &cros_ec_lpc_acpi_device_found, NULL);
-	if (ACPI_FAILURE(status))
-		pr_warn(DRV_NAME ": Looking for %s failed\n", ACPI_DRV_NAME);
-
 	if (!cros_ec_lpc_acpi_device_found) {
 		/* Register the device, and it'll get hooked up automatically */
 		ret = platform_device_register(&cros_ec_lpc_device);
diff --git a/drivers/platform/chrome/cros_ec_sysfs.c b/drivers/platform/chrome/cros_ec_sysfs.c
index 5a6db3f..f34a501 100644
--- a/drivers/platform/chrome/cros_ec_sysfs.c
+++ b/drivers/platform/chrome/cros_ec_sysfs.c
@@ -34,8 +34,6 @@
 #include <linux/types.h>
 #include <linux/uaccess.h>
 
-#define to_cros_ec_dev(dev)  container_of(dev, struct cros_ec_dev, class_dev)
-
 /* Accessor functions */
 
 static ssize_t reboot_show(struct device *dev,
diff --git a/drivers/platform/chrome/cros_ec_vbc.c b/drivers/platform/chrome/cros_ec_vbc.c
index 6d38e6b..5356f26b 100644
--- a/drivers/platform/chrome/cros_ec_vbc.c
+++ b/drivers/platform/chrome/cros_ec_vbc.c
@@ -29,8 +29,7 @@ static ssize_t vboot_context_read(struct file *filp, struct kobject *kobj,
 				  loff_t pos, size_t count)
 {
 	struct device *dev = container_of(kobj, struct device, kobj);
-	struct cros_ec_dev *ec = container_of(dev, struct cros_ec_dev,
-					      class_dev);
+	struct cros_ec_dev *ec = to_cros_ec_dev(dev);
 	struct cros_ec_device *ecdev = ec->ec_dev;
 	struct ec_params_vbnvcontext *params;
 	struct cros_ec_command *msg;
@@ -70,8 +69,7 @@ static ssize_t vboot_context_write(struct file *filp, struct kobject *kobj,
 				   loff_t pos, size_t count)
 {
 	struct device *dev = container_of(kobj, struct device, kobj);
-	struct cros_ec_dev *ec = container_of(dev, struct cros_ec_dev,
-					      class_dev);
+	struct cros_ec_dev *ec = to_cros_ec_dev(dev);
 	struct cros_ec_device *ecdev = ec->ec_dev;
 	struct ec_params_vbnvcontext *params;
 	struct cros_ec_command *msg;
@@ -111,8 +109,7 @@ static umode_t cros_ec_vbc_is_visible(struct kobject *kobj,
 				      struct bin_attribute *a, int n)
 {
 	struct device *dev = container_of(kobj, struct device, kobj);
-	struct cros_ec_dev *ec = container_of(dev, struct cros_ec_dev,
-					      class_dev);
+	struct cros_ec_dev *ec = to_cros_ec_dev(dev);
 	struct device_node *np = ec->ec_dev->dev->of_node;
 
 	if (IS_ENABLED(CONFIG_OF) && np) {
diff --git a/drivers/platform/x86/toshiba_acpi.c b/drivers/platform/x86/toshiba_acpi.c
index e8d058c..eef76bf 100644
--- a/drivers/platform/x86/toshiba_acpi.c
+++ b/drivers/platform/x86/toshiba_acpi.c
@@ -1689,19 +1689,6 @@ static int version_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int version_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, version_proc_show, PDE_DATA(inode));
-}
-
-static const struct file_operations version_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= version_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 /*
  * Proc and module init
  */
@@ -1722,8 +1709,8 @@ static void create_toshiba_proc_entries(struct toshiba_acpi_dev *dev)
 	if (dev->hotkey_dev)
 		proc_create_data("keys", S_IRUGO | S_IWUSR, toshiba_proc_dir,
 				 &keys_proc_fops, dev);
-	proc_create_data("version", S_IRUGO, toshiba_proc_dir,
-			 &version_proc_fops, dev);
+	proc_create_single_data("version", S_IRUGO, toshiba_proc_dir,
+			version_proc_show, dev);
 }
 
 static void remove_toshiba_proc_entries(struct toshiba_acpi_dev *dev)
diff --git a/drivers/pnp/pnpbios/proc.c b/drivers/pnp/pnpbios/proc.c
index 7d4aca7..fe1c8f5 100644
--- a/drivers/pnp/pnpbios/proc.c
+++ b/drivers/pnp/pnpbios/proc.c
@@ -47,19 +47,6 @@ static int pnpconfig_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int pnpconfig_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, pnpconfig_proc_show, NULL);
-}
-
-static const struct file_operations pnpconfig_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= pnpconfig_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int escd_info_proc_show(struct seq_file *m, void *v)
 {
 	struct escd_info_struc escd;
@@ -74,19 +61,6 @@ static int escd_info_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int escd_info_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, escd_info_proc_show, NULL);
-}
-
-static const struct file_operations escd_info_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= escd_info_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 #define MAX_SANE_ESCD_SIZE (32*1024)
 static int escd_proc_show(struct seq_file *m, void *v)
 {
@@ -129,19 +103,6 @@ static int escd_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int escd_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, escd_proc_show, NULL);
-}
-
-static const struct file_operations escd_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= escd_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int pnp_legacyres_proc_show(struct seq_file *m, void *v)
 {
 	void *buf;
@@ -159,19 +120,6 @@ static int pnp_legacyres_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int pnp_legacyres_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, pnp_legacyres_proc_show, NULL);
-}
-
-static const struct file_operations pnp_legacyres_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= pnp_legacyres_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int pnp_devices_proc_show(struct seq_file *m, void *v)
 {
 	struct pnp_bios_node *node;
@@ -202,19 +150,6 @@ static int pnp_devices_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int pnp_devices_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, pnp_devices_proc_show, NULL);
-}
-
-static const struct file_operations pnp_devices_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= pnp_devices_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int pnpbios_proc_show(struct seq_file *m, void *v)
 {
 	void *data = m->private;
@@ -318,12 +253,13 @@ int __init pnpbios_proc_init(void)
 	proc_pnp_boot = proc_mkdir("boot", proc_pnp);
 	if (!proc_pnp_boot)
 		return -EIO;
-	proc_create("devices", 0, proc_pnp, &pnp_devices_proc_fops);
-	proc_create("configuration_info", 0, proc_pnp, &pnpconfig_proc_fops);
-	proc_create("escd_info", 0, proc_pnp, &escd_info_proc_fops);
-	proc_create("escd", S_IRUSR, proc_pnp, &escd_proc_fops);
-	proc_create("legacy_device_resources", 0, proc_pnp, &pnp_legacyres_proc_fops);
-
+	proc_create_single("devices", 0, proc_pnp, pnp_devices_proc_show);
+	proc_create_single("configuration_info", 0, proc_pnp,
+			pnpconfig_proc_show);
+	proc_create_single("escd_info", 0, proc_pnp, escd_info_proc_show);
+	proc_create_single("escd", S_IRUSR, proc_pnp, escd_proc_show);
+	proc_create_single("legacy_device_resources", 0, proc_pnp,
+			pnp_legacyres_proc_show);
 	return 0;
 }
 
diff --git a/drivers/rtc/rtc-proc.c b/drivers/rtc/rtc-proc.c
index 31e7e23..a9dd921 100644
--- a/drivers/rtc/rtc-proc.c
+++ b/drivers/rtc/rtc-proc.c
@@ -107,40 +107,11 @@ static int rtc_proc_show(struct seq_file *seq, void *offset)
 	return 0;
 }
 
-static int rtc_proc_open(struct inode *inode, struct file *file)
-{
-	int ret;
-	struct rtc_device *rtc = PDE_DATA(inode);
-
-	if (!try_module_get(rtc->owner))
-		return -ENODEV;
-
-	ret = single_open(file, rtc_proc_show, rtc);
-	if (ret)
-		module_put(rtc->owner);
-	return ret;
-}
-
-static int rtc_proc_release(struct inode *inode, struct file *file)
-{
-	int res = single_release(inode, file);
-	struct rtc_device *rtc = PDE_DATA(inode);
-
-	module_put(rtc->owner);
-	return res;
-}
-
-static const struct file_operations rtc_proc_fops = {
-	.open		= rtc_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= rtc_proc_release,
-};
-
 void rtc_proc_add_device(struct rtc_device *rtc)
 {
 	if (is_rtc_hctosys(rtc))
-		proc_create_data("driver/rtc", 0, NULL, &rtc_proc_fops, rtc);
+		proc_create_single_data("driver/rtc", 0, NULL, rtc_proc_show,
+				rtc);
 }
 
 void rtc_proc_del_device(struct rtc_device *rtc)
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index 02c03e4..cb9b685 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -3054,7 +3054,7 @@ out:
  *
  * Return values:
  * BLK_EH_RESET_TIMER if the request should be left running
- * BLK_EH_NOT_HANDLED if the request is handled or terminated
+ * BLK_EH_DONE if the request is handled or terminated
  *		      by the driver.
  */
 enum blk_eh_timer_return dasd_times_out(struct request *req, bool reserved)
@@ -3067,7 +3067,7 @@ enum blk_eh_timer_return dasd_times_out(struct request *req, bool reserved)
 
 	cqr = *((struct dasd_ccw_req **) blk_mq_rq_to_pdu(req));
 	if (!cqr)
-		return BLK_EH_NOT_HANDLED;
+		return BLK_EH_DONE;
 
 	spin_lock_irqsave(&cqr->dq->lock, flags);
 	device = cqr->startdev ? cqr->startdev : block->base;
@@ -3126,7 +3126,7 @@ enum blk_eh_timer_return dasd_times_out(struct request *req, bool reserved)
 	spin_unlock(&block->queue_lock);
 	spin_unlock_irqrestore(&cqr->dq->lock, flags);
 
-	return rc ? BLK_EH_RESET_TIMER : BLK_EH_NOT_HANDLED;
+	return rc ? BLK_EH_RESET_TIMER : BLK_EH_DONE;
 }
 
 static int dasd_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
diff --git a/drivers/s390/block/dasd_proc.c b/drivers/s390/block/dasd_proc.c
index c33788a..5cb80c6 100644
--- a/drivers/s390/block/dasd_proc.c
+++ b/drivers/s390/block/dasd_proc.c
@@ -131,19 +131,6 @@ static const struct seq_operations dasd_devices_seq_ops = {
 	.show		= dasd_devices_show,
 };
 
-static int dasd_devices_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &dasd_devices_seq_ops);
-}
-
-static const struct file_operations dasd_devices_file_ops = {
-	.owner		= THIS_MODULE,
-	.open		= dasd_devices_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
 #ifdef CONFIG_DASD_PROFILE
 static int dasd_stats_all_block_on(void)
 {
@@ -352,10 +339,10 @@ dasd_proc_init(void)
 	dasd_proc_root_entry = proc_mkdir("dasd", NULL);
 	if (!dasd_proc_root_entry)
 		goto out_nodasd;
-	dasd_devices_entry = proc_create("devices",
+	dasd_devices_entry = proc_create_seq("devices",
 					 S_IFREG | S_IRUGO | S_IWUSR,
 					 dasd_proc_root_entry,
-					 &dasd_devices_file_ops);
+					 &dasd_devices_seq_ops);
 	if (!dasd_devices_entry)
 		goto out_nodevices;
 	dasd_statistics_entry = proc_create("statistics",
diff --git a/drivers/s390/char/tape_proc.c b/drivers/s390/char/tape_proc.c
index faae304..32a14ee 100644
--- a/drivers/s390/char/tape_proc.c
+++ b/drivers/s390/char/tape_proc.c
@@ -105,29 +105,14 @@ static const struct seq_operations tape_proc_seq = {
 	.show		= tape_proc_show,
 };
 
-static int tape_proc_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &tape_proc_seq);
-}
-
-static const struct file_operations tape_proc_ops =
-{
-	.owner		= THIS_MODULE,
-	.open		= tape_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
 /*
  * Initialize procfs stuff on startup
  */
 void
 tape_proc_init(void)
 {
-	tape_proc_devices =
-		proc_create("tapedevices", S_IFREG | S_IRUGO | S_IWUSR, NULL,
-			    &tape_proc_ops);
+	tape_proc_devices = proc_create_seq("tapedevices",
+			S_IFREG | S_IRUGO | S_IWUSR, NULL,  &tape_proc_seq);
 	if (tape_proc_devices == NULL) {
 		return;
 	}
diff --git a/drivers/sbus/char/Kconfig b/drivers/sbus/char/Kconfig
index bf3c5f7..89edd13 100644
--- a/drivers/sbus/char/Kconfig
+++ b/drivers/sbus/char/Kconfig
@@ -28,13 +28,6 @@ config TADPOLE_TS102_UCTRL
 	  events, and can also notice the attachment/detachment of external
 	  monitors and mice.
 
-config SUN_JSFLASH
-	tristate "JavaStation OS Flash SIMM"
-	depends on SPARC32
-	help
-	  If you say Y here, you will be able to boot from your JavaStation's
-	  Flash memory.
-
 config BBC_I2C
 	tristate "UltraSPARC-III bootbus i2c controller driver"
 	depends on PCI && SPARC64
diff --git a/drivers/sbus/char/Makefile b/drivers/sbus/char/Makefile
index 8c48ed9..44347c9 100644
--- a/drivers/sbus/char/Makefile
+++ b/drivers/sbus/char/Makefile
@@ -15,6 +15,5 @@ obj-$(CONFIG_DISPLAY7SEG)		+= display7seg.o
 obj-$(CONFIG_OBP_FLASH)			+= flash.o
 obj-$(CONFIG_SUN_OPENPROMIO)		+= openprom.o
 obj-$(CONFIG_TADPOLE_TS102_UCTRL)	+= uctrl.o
-obj-$(CONFIG_SUN_JSFLASH)		+= jsflash.o
 obj-$(CONFIG_BBC_I2C)			+= bbc.o
 obj-$(CONFIG_ORACLE_DAX) 		+= oradax.o
diff --git a/drivers/sbus/char/jsflash.c b/drivers/sbus/char/jsflash.c
deleted file mode 100644
index 14f377a..0000000
--- a/drivers/sbus/char/jsflash.c
+++ /dev/null
@@ -1,658 +0,0 @@
-/*
- * drivers/sbus/char/jsflash.c
- *
- *  Copyright (C) 1991, 1992  Linus Torvalds	(drivers/char/mem.c)
- *  Copyright (C) 1997  Eddie C. Dost		(drivers/sbus/char/flash.c)
- *  Copyright (C) 1997-2000 Pavel Machek <pavel@ucw.cz>   (drivers/block/nbd.c)
- *  Copyright (C) 1999-2000 Pete Zaitcev
- *
- * This driver is used to program OS into a Flash SIMM on
- * Krups and Espresso platforms.
- *
- * TODO: do not allow erase/programming if file systems are mounted.
- * TODO: Erase/program both banks of a 8MB SIMM.
- *
- * It is anticipated that programming an OS Flash will be a routine
- * procedure. In the same time it is exceedingly dangerous because
- * a user can program its OBP flash with OS image and effectively
- * kill the machine.
- *
- * This driver uses an interface different from Eddie's flash.c
- * as a silly safeguard.
- *
- * XXX The flash.c manipulates page caching characteristics in a certain
- * dubious way; also it assumes that remap_pfn_range() can remap
- * PCI bus locations, which may be false. ioremap() must be used
- * instead. We should discuss this.
- */
-
-#include <linux/module.h>
-#include <linux/mutex.h>
-#include <linux/types.h>
-#include <linux/errno.h>
-#include <linux/miscdevice.h>
-#include <linux/fcntl.h>
-#include <linux/poll.h>
-#include <linux/init.h>
-#include <linux/string.h>
-#include <linux/genhd.h>
-#include <linux/blkdev.h>
-#include <linux/uaccess.h>
-#include <asm/pgtable.h>
-#include <asm/io.h>
-#include <asm/pcic.h>
-#include <asm/oplib.h>
-
-#include <asm/jsflash.h>		/* ioctl arguments. <linux/> ?? */
-#define JSFIDSZ		(sizeof(struct jsflash_ident_arg))
-#define JSFPRGSZ	(sizeof(struct jsflash_program_arg))
-
-/*
- * Our device numbers have no business in system headers.
- * The only thing a user knows is the device name /dev/jsflash.
- *
- * Block devices are laid out like this:
- *   minor+0	- Bootstrap, for 8MB SIMM 0x20400000[0x800000]
- *   minor+1	- Filesystem to mount, normally 0x20400400[0x7ffc00]
- *   minor+2	- Whole flash area for any case... 0x20000000[0x01000000]
- * Total 3 minors per flash device.
- *
- * It is easier to have static size vectors, so we define
- * a total minor range JSF_MAX, which must cover all minors.
- */
-/* character device */
-#define JSF_MINOR	178	/* 178 is registered with hpa */
-/* block device */
-#define JSF_MAX		 3	/* 3 minors wasted total so far. */
-#define JSF_NPART	 3	/* 3 minors per flash device */
-#define JSF_PART_BITS	 2	/* 2 bits of minors to cover JSF_NPART */
-#define JSF_PART_MASK	 0x3	/* 2 bits mask */
-
-static DEFINE_MUTEX(jsf_mutex);
-
-/*
- * Access functions.
- * We could ioremap(), but it's easier this way.
- */
-static unsigned int jsf_inl(unsigned long addr)
-{
-	unsigned long retval;
-
-	__asm__ __volatile__("lda [%1] %2, %0\n\t" :
-				"=r" (retval) :
-				"r" (addr), "i" (ASI_M_BYPASS));
-        return retval;
-}
-
-static void jsf_outl(unsigned long addr, __u32 data)
-{
-
-	__asm__ __volatile__("sta %0, [%1] %2\n\t" : :
-				"r" (data), "r" (addr), "i" (ASI_M_BYPASS) :
-				"memory");
-}
-
-/*
- * soft carrier
- */
-
-struct jsfd_part {
-	unsigned long dbase;
-	unsigned long dsize;
-};
-
-struct jsflash {
-	unsigned long base;
-	unsigned long size;
-	unsigned long busy;		/* In use? */
-	struct jsflash_ident_arg id;
-	/* int mbase; */		/* Minor base, typically zero */
-	struct jsfd_part dv[JSF_NPART];
-};
-
-/*
- * We do not map normal memory or obio as a safety precaution.
- * But offsets are real, for ease of userland programming.
- */
-#define JSF_BASE_TOP	0x30000000
-#define JSF_BASE_ALL	0x20000000
-
-#define JSF_BASE_JK	0x20400000
-
-/*
- */
-static struct gendisk *jsfd_disk[JSF_MAX];
-
-/*
- * Let's pretend we may have several of these...
- */
-static struct jsflash jsf0;
-
-/*
- * Wait for AMD to finish its embedded algorithm.
- * We use the Toggle bit DQ6 (0x40) because it does not
- * depend on the data value as /DATA bit DQ7 does.
- *
- * XXX Do we need any timeout here? So far it never hanged, beware broken hw.
- */
-static void jsf_wait(unsigned long p) {
-	unsigned int x1, x2;
-
-	for (;;) {
-		x1 = jsf_inl(p);
-		x2 = jsf_inl(p);
-		if ((x1 & 0x40404040) == (x2 & 0x40404040)) return;
-	}
-}
-
-/*
- * Programming will only work if Flash is clean,
- * we leave it to the programmer application.
- *
- * AMD must be programmed one byte at a time;
- * thus, Simple Tech SIMM must be written 4 bytes at a time.
- *
- * Write waits for the chip to become ready after the write
- * was finished. This is done so that application would read
- * consistent data after the write is done.
- */
-static void jsf_write4(unsigned long fa, u32 data) {
-
-	jsf_outl(fa, 0xAAAAAAAA);		/* Unlock 1 Write 1 */
-	jsf_outl(fa, 0x55555555);		/* Unlock 1 Write 2 */
-	jsf_outl(fa, 0xA0A0A0A0);		/* Byte Program */
-	jsf_outl(fa, data);
-
-	jsf_wait(fa);
-}
-
-/*
- */
-static void jsfd_read(char *buf, unsigned long p, size_t togo) {
-	union byte4 {
-		char s[4];
-		unsigned int n;
-	} b;
-
-	while (togo >= 4) {
-		togo -= 4;
-		b.n = jsf_inl(p);
-		memcpy(buf, b.s, 4);
-		p += 4;
-		buf += 4;
-	}
-}
-
-static int jsfd_queue;
-
-static struct request *jsfd_next_request(void)
-{
-	struct request_queue *q;
-	struct request *rq;
-	int old_pos = jsfd_queue;
-
-	do {
-		q = jsfd_disk[jsfd_queue]->queue;
-		if (++jsfd_queue == JSF_MAX)
-			jsfd_queue = 0;
-		if (q) {
-			rq = blk_fetch_request(q);
-			if (rq)
-				return rq;
-		}
-	} while (jsfd_queue != old_pos);
-
-	return NULL;
-}
-
-static void jsfd_request(void)
-{
-	struct request *req;
-
-	req = jsfd_next_request();
-	while (req) {
-		struct jsfd_part *jdp = req->rq_disk->private_data;
-		unsigned long offset = blk_rq_pos(req) << 9;
-		size_t len = blk_rq_cur_bytes(req);
-		blk_status_t err = BLK_STS_IOERR;
-
-		if ((offset + len) > jdp->dsize)
-			goto end;
-
-		if (rq_data_dir(req) != READ) {
-			printk(KERN_ERR "jsfd: write\n");
-			goto end;
-		}
-
-		if ((jdp->dbase & 0xff000000) != 0x20000000) {
-			printk(KERN_ERR "jsfd: bad base %x\n", (int)jdp->dbase);
-			goto end;
-		}
-
-		jsfd_read(bio_data(req->bio), jdp->dbase + offset, len);
-		err = BLK_STS_OK;
-	end:
-		if (!__blk_end_request_cur(req, err))
-			req = jsfd_next_request();
-	}
-}
-
-static void jsfd_do_request(struct request_queue *q)
-{
-	jsfd_request();
-}
-
-/*
- * The memory devices use the full 32/64 bits of the offset, and so we cannot
- * check against negative addresses: they are ok. The return value is weird,
- * though, in that case (0).
- *
- * also note that seeking relative to the "end of file" isn't supported:
- * it has no meaning, so it returns -EINVAL.
- */
-static loff_t jsf_lseek(struct file * file, loff_t offset, int orig)
-{
-	loff_t ret;
-
-	mutex_lock(&jsf_mutex);
-	switch (orig) {
-		case 0:
-			file->f_pos = offset;
-			ret = file->f_pos;
-			break;
-		case 1:
-			file->f_pos += offset;
-			ret = file->f_pos;
-			break;
-		default:
-			ret = -EINVAL;
-	}
-	mutex_unlock(&jsf_mutex);
-	return ret;
-}
-
-/*
- * OS SIMM Cannot be read in other size but a 32bits word.
- */
-static ssize_t jsf_read(struct file * file, char __user * buf, 
-    size_t togo, loff_t *ppos)
-{
-	unsigned long p = *ppos;
-	char __user *tmp = buf;
-
-	union byte4 {
-		char s[4];
-		unsigned int n;
-	} b;
-
-	if (p < JSF_BASE_ALL || p >= JSF_BASE_TOP) {
-		return 0;
-	}
-
-	if ((p + togo) < p	/* wrap */
-	   || (p + togo) >= JSF_BASE_TOP) {
-		togo = JSF_BASE_TOP - p;
-	}
-
-	if (p < JSF_BASE_ALL && togo != 0) {
-#if 0 /* __bzero XXX */
-		size_t x = JSF_BASE_ALL - p;
-		if (x > togo) x = togo;
-		clear_user(tmp, x);
-		tmp += x;
-		p += x;
-		togo -= x;
-#else
-		/*
-		 * Implementation of clear_user() calls __bzero
-		 * without regard to modversions,
-		 * so we cannot build a module.
-		 */
-		return 0;
-#endif
-	}
-
-	while (togo >= 4) {
-		togo -= 4;
-		b.n = jsf_inl(p);
-		if (copy_to_user(tmp, b.s, 4))
-			return -EFAULT;
-		tmp += 4;
-		p += 4;
-	}
-
-	/*
-	 * XXX Small togo may remain if 1 byte is ordered.
-	 * It would be nice if we did a word size read and unpacked it.
-	 */
-
-	*ppos = p;
-	return tmp-buf;
-}
-
-static ssize_t jsf_write(struct file * file, const char __user * buf,
-    size_t count, loff_t *ppos)
-{
-	return -ENOSPC;
-}
-
-/*
- */
-static int jsf_ioctl_erase(unsigned long arg)
-{
-	unsigned long p;
-
-	/* p = jsf0.base;	hits wrong bank */
-	p = 0x20400000;
-
-	jsf_outl(p, 0xAAAAAAAA);		/* Unlock 1 Write 1 */
-	jsf_outl(p, 0x55555555);		/* Unlock 1 Write 2 */
-	jsf_outl(p, 0x80808080);		/* Erase setup */
-	jsf_outl(p, 0xAAAAAAAA);		/* Unlock 2 Write 1 */
-	jsf_outl(p, 0x55555555);		/* Unlock 2 Write 2 */
-	jsf_outl(p, 0x10101010);		/* Chip erase */
-
-#if 0
-	/*
-	 * This code is ok, except that counter based timeout
-	 * has no place in this world. Let's just drop timeouts...
-	 */
-	{
-		int i;
-		__u32 x;
-		for (i = 0; i < 1000000; i++) {
-			x = jsf_inl(p);
-			if ((x & 0x80808080) == 0x80808080) break;
-		}
-		if ((x & 0x80808080) != 0x80808080) {
-			printk("jsf0: erase timeout with 0x%08x\n", x);
-		} else {
-			printk("jsf0: erase done with 0x%08x\n", x);
-		}
-	}
-#else
-	jsf_wait(p);
-#endif
-
-	return 0;
-}
-
-/*
- * Program a block of flash.
- * Very simple because we can do it byte by byte anyway.
- */
-static int jsf_ioctl_program(void __user *arg)
-{
-	struct jsflash_program_arg abuf;
-	char __user *uptr;
-	unsigned long p;
-	unsigned int togo;
-	union {
-		unsigned int n;
-		char s[4];
-	} b;
-
-	if (copy_from_user(&abuf, arg, JSFPRGSZ))
-		return -EFAULT; 
-	p = abuf.off;
-	togo = abuf.size;
-	if ((togo & 3) || (p & 3)) return -EINVAL;
-
-	uptr = (char __user *) (unsigned long) abuf.data;
-	while (togo != 0) {
-		togo -= 4;
-		if (copy_from_user(&b.s[0], uptr, 4))
-			return -EFAULT;
-		jsf_write4(p, b.n);
-		p += 4;
-		uptr += 4;
-	}
-
-	return 0;
-}
-
-static long jsf_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
-{
-	mutex_lock(&jsf_mutex);
-	int error = -ENOTTY;
-	void __user *argp = (void __user *)arg;
-
-	if (!capable(CAP_SYS_ADMIN)) {
-		mutex_unlock(&jsf_mutex);
-		return -EPERM;
-	}
-	switch (cmd) {
-	case JSFLASH_IDENT:
-		if (copy_to_user(argp, &jsf0.id, JSFIDSZ)) {
-			mutex_unlock(&jsf_mutex);
-			return -EFAULT;
-		}
-		break;
-	case JSFLASH_ERASE:
-		error = jsf_ioctl_erase(arg);
-		break;
-	case JSFLASH_PROGRAM:
-		error = jsf_ioctl_program(argp);
-		break;
-	}
-
-	mutex_unlock(&jsf_mutex);
-	return error;
-}
-
-static int jsf_mmap(struct file * file, struct vm_area_struct * vma)
-{
-	return -ENXIO;
-}
-
-static int jsf_open(struct inode * inode, struct file * filp)
-{
-	mutex_lock(&jsf_mutex);
-	if (jsf0.base == 0) {
-		mutex_unlock(&jsf_mutex);
-		return -ENXIO;
-	}
-	if (test_and_set_bit(0, (void *)&jsf0.busy) != 0) {
-		mutex_unlock(&jsf_mutex);
-		return -EBUSY;
-	}
-
-	mutex_unlock(&jsf_mutex);
-	return 0;	/* XXX What security? */
-}
-
-static int jsf_release(struct inode *inode, struct file *file)
-{
-	jsf0.busy = 0;
-	return 0;
-}
-
-static const struct file_operations jsf_fops = {
-	.owner =	THIS_MODULE,
-	.llseek =	jsf_lseek,
-	.read =		jsf_read,
-	.write =	jsf_write,
-	.unlocked_ioctl =	jsf_ioctl,
-	.mmap =		jsf_mmap,
-	.open =		jsf_open,
-	.release =	jsf_release,
-};
-
-static struct miscdevice jsf_dev = { JSF_MINOR, "jsflash", &jsf_fops };
-
-static const struct block_device_operations jsfd_fops = {
-	.owner =	THIS_MODULE,
-};
-
-static int jsflash_init(void)
-{
-	int rc;
-	struct jsflash *jsf;
-	phandle node;
-	char banner[128];
-	struct linux_prom_registers reg0;
-
-	node = prom_getchild(prom_root_node);
-	node = prom_searchsiblings(node, "flash-memory");
-	if (node != 0 && (s32)node != -1) {
-		if (prom_getproperty(node, "reg",
-		    (char *)&reg0, sizeof(reg0)) == -1) {
-			printk("jsflash: no \"reg\" property\n");
-			return -ENXIO;
-		}
-		if (reg0.which_io != 0) {
-			printk("jsflash: bus number nonzero: 0x%x:%x\n",
-			    reg0.which_io, reg0.phys_addr);
-			return -ENXIO;
-		}
-		/*
-		 * Flash may be somewhere else, for instance on Ebus.
-		 * So, don't do the following check for IIep flash space.
-		 */
-#if 0
-		if ((reg0.phys_addr >> 24) != 0x20) {
-			printk("jsflash: suspicious address: 0x%x:%x\n",
-			    reg0.which_io, reg0.phys_addr);
-			return -ENXIO;
-		}
-#endif
-		if ((int)reg0.reg_size <= 0) {
-			printk("jsflash: bad size 0x%x\n", (int)reg0.reg_size);
-			return -ENXIO;
-		}
-	} else {
-		/* XXX Remove this code once PROLL ID12 got widespread */
-		printk("jsflash: no /flash-memory node, use PROLL >= 12\n");
-		prom_getproperty(prom_root_node, "banner-name", banner, 128);
-		if (strcmp (banner, "JavaStation-NC") != 0 &&
-		    strcmp (banner, "JavaStation-E") != 0) {
-			return -ENXIO;
-		}
-		reg0.which_io = 0;
-		reg0.phys_addr = 0x20400000;
-		reg0.reg_size  = 0x00800000;
-	}
-
-	/* Let us be really paranoid for modifications to probing code. */
-	if (sparc_cpu_model != sun4m) {
-		/* We must be on sun4m because we use MMU Bypass ASI. */
-		return -ENXIO;
-	}
-
-	if (jsf0.base == 0) {
-		jsf = &jsf0;
-
-		jsf->base = reg0.phys_addr;
-		jsf->size = reg0.reg_size;
-
-		/* XXX Redo the userland interface. */
-		jsf->id.off = JSF_BASE_ALL;
-		jsf->id.size = 0x01000000;	/* 16M - all segments */
-		strcpy(jsf->id.name, "Krups_all");
-
-		jsf->dv[0].dbase = jsf->base;
-		jsf->dv[0].dsize = jsf->size;
-		jsf->dv[1].dbase = jsf->base + 1024;
-		jsf->dv[1].dsize = jsf->size - 1024;
-		jsf->dv[2].dbase = JSF_BASE_ALL;
-		jsf->dv[2].dsize = 0x01000000;
-
-		printk("Espresso Flash @0x%lx [%d MB]\n", jsf->base,
-		    (int) (jsf->size / (1024*1024)));
-	}
-
-	if ((rc = misc_register(&jsf_dev)) != 0) {
-		printk(KERN_ERR "jsf: unable to get misc minor %d\n",
-		    JSF_MINOR);
-		jsf0.base = 0;
-		return rc;
-	}
-
-	return 0;
-}
-
-static int jsfd_init(void)
-{
-	static DEFINE_SPINLOCK(lock);
-	struct jsflash *jsf;
-	struct jsfd_part *jdp;
-	int err;
-	int i;
-
-	if (jsf0.base == 0)
-		return -ENXIO;
-
-	err = -ENOMEM;
-	for (i = 0; i < JSF_MAX; i++) {
-		struct gendisk *disk = alloc_disk(1);
-		if (!disk)
-			goto out;
-		disk->queue = blk_init_queue(jsfd_do_request, &lock);
-		if (!disk->queue) {
-			put_disk(disk);
-			goto out;
-		}
-		blk_queue_bounce_limit(disk->queue, BLK_BOUNCE_HIGH);
-		jsfd_disk[i] = disk;
-	}
-
-	if (register_blkdev(JSFD_MAJOR, "jsfd")) {
-		err = -EIO;
-		goto out;
-	}
-
-	for (i = 0; i < JSF_MAX; i++) {
-		struct gendisk *disk = jsfd_disk[i];
-		if ((i & JSF_PART_MASK) >= JSF_NPART) continue;
-		jsf = &jsf0;	/* actually, &jsfv[i >> JSF_PART_BITS] */
-		jdp = &jsf->dv[i&JSF_PART_MASK];
-
-		disk->major = JSFD_MAJOR;
-		disk->first_minor = i;
-		sprintf(disk->disk_name, "jsfd%d", i);
-		disk->fops = &jsfd_fops;
-		set_capacity(disk, jdp->dsize >> 9);
-		disk->private_data = jdp;
-		add_disk(disk);
-		set_disk_ro(disk, 1);
-	}
-	return 0;
-out:
-	while (i--)
-		put_disk(jsfd_disk[i]);
-	return err;
-}
-
-MODULE_LICENSE("GPL");
-
-static int __init jsflash_init_module(void) {
-	int rc;
-
-	if ((rc = jsflash_init()) == 0) {
-		jsfd_init();
-		return 0;
-	}
-	return rc;
-}
-
-static void __exit jsflash_cleanup_module(void)
-{
-	int i;
-
-	for (i = 0; i < JSF_MAX; i++) {
-		if ((i & JSF_PART_MASK) >= JSF_NPART) continue;
-		del_gendisk(jsfd_disk[i]);
-		blk_cleanup_queue(jsfd_disk[i]->queue);
-		put_disk(jsfd_disk[i]);
-	}
-	if (jsf0.busy)
-		printk("jsf0: cleaning busy unit\n");
-	jsf0.base = 0;
-	jsf0.busy = 0;
-
-	misc_deregister(&jsf_dev);
-	unregister_blkdev(JSFD_MAJOR, "jsfd");
-}
-
-module_init(jsflash_init_module);
-module_exit(jsflash_cleanup_module);
diff --git a/drivers/scsi/gdth.c b/drivers/scsi/gdth.c
index c35f05c..8560479 100644
--- a/drivers/scsi/gdth.c
+++ b/drivers/scsi/gdth.c
@@ -3882,7 +3882,7 @@ static enum blk_eh_timer_return gdth_timed_out(struct scsi_cmnd *scp)
 	struct gdth_cmndinfo *cmndinfo = gdth_cmnd_priv(scp);
 	u8 b, t;
 	unsigned long flags;
-	enum blk_eh_timer_return retval = BLK_EH_NOT_HANDLED;
+	enum blk_eh_timer_return retval = BLK_EH_DONE;
 
 	TRACE(("%s() cmd 0x%x\n", scp->cmnd[0], __func__));
 	b = scp->device->channel;
diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c
index 15a2fef..71bdc0b 100644
--- a/drivers/scsi/libiscsi.c
+++ b/drivers/scsi/libiscsi.c
@@ -1963,7 +1963,7 @@ static int iscsi_has_ping_timed_out(struct iscsi_conn *conn)
 
 enum blk_eh_timer_return iscsi_eh_cmd_timed_out(struct scsi_cmnd *sc)
 {
-	enum blk_eh_timer_return rc = BLK_EH_NOT_HANDLED;
+	enum blk_eh_timer_return rc = BLK_EH_DONE;
 	struct iscsi_task *task = NULL, *running_task;
 	struct iscsi_cls_session *cls_session;
 	struct iscsi_session *session;
@@ -1982,7 +1982,7 @@ enum blk_eh_timer_return iscsi_eh_cmd_timed_out(struct scsi_cmnd *sc)
 		 * Raced with completion. Blk layer has taken ownership
 		 * so let timeout code complete it now.
 		 */
-		rc = BLK_EH_HANDLED;
+		rc = BLK_EH_DONE;
 		goto done;
 	}
 
@@ -1997,7 +1997,7 @@ enum blk_eh_timer_return iscsi_eh_cmd_timed_out(struct scsi_cmnd *sc)
 		if (unlikely(system_state != SYSTEM_RUNNING)) {
 			sc->result = DID_NO_CONNECT << 16;
 			ISCSI_DBG_EH(session, "sc on shutdown, handled\n");
-			rc = BLK_EH_HANDLED;
+			rc = BLK_EH_DONE;
 			goto done;
 		}
 		/*
diff --git a/drivers/scsi/megaraid.c b/drivers/scsi/megaraid.c
index 7195cff..91f5e2c 100644
--- a/drivers/scsi/megaraid.c
+++ b/drivers/scsi/megaraid.c
@@ -2731,53 +2731,6 @@ proc_show_rdrv_40(struct seq_file *m, void *v)
 	return proc_show_rdrv(m, m->private, 30, 39);
 }
 
-
-/*
- * seq_file wrappers for procfile show routines.
- */
-static int mega_proc_open(struct inode *inode, struct file *file)
-{
-	adapter_t *adapter = proc_get_parent_data(inode);
-	int (*show)(struct seq_file *, void *) = PDE_DATA(inode);
-
-	return single_open(file, show, adapter);
-}
-
-static const struct file_operations mega_proc_fops = {
-	.open		= mega_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
-/*
- * Table of proc files we need to create.
- */
-struct mega_proc_file {
-	const char *name;
-	unsigned short ptr_offset;
-	int (*show) (struct seq_file *m, void *v);
-};
-
-static const struct mega_proc_file mega_proc_files[] = {
-	{ "config",	      offsetof(adapter_t, proc_read), proc_show_config },
-	{ "stat",	      offsetof(adapter_t, proc_stat), proc_show_stat },
-	{ "mailbox",	      offsetof(adapter_t, proc_mbox), proc_show_mbox },
-#if MEGA_HAVE_ENH_PROC
-	{ "rebuild-rate",     offsetof(adapter_t, proc_rr), proc_show_rebuild_rate },
-	{ "battery-status",   offsetof(adapter_t, proc_battery), proc_show_battery },
-	{ "diskdrives-ch0",   offsetof(adapter_t, proc_pdrvstat[0]), proc_show_pdrv_ch0 },
-	{ "diskdrives-ch1",   offsetof(adapter_t, proc_pdrvstat[1]), proc_show_pdrv_ch1 },
-	{ "diskdrives-ch2",   offsetof(adapter_t, proc_pdrvstat[2]), proc_show_pdrv_ch2 },
-	{ "diskdrives-ch3",   offsetof(adapter_t, proc_pdrvstat[3]), proc_show_pdrv_ch3 },
-	{ "raiddrives-0-9",   offsetof(adapter_t, proc_rdrvstat[0]), proc_show_rdrv_10 },
-	{ "raiddrives-10-19", offsetof(adapter_t, proc_rdrvstat[1]), proc_show_rdrv_20 },
-	{ "raiddrives-20-29", offsetof(adapter_t, proc_rdrvstat[2]), proc_show_rdrv_30 },
-	{ "raiddrives-30-39", offsetof(adapter_t, proc_rdrvstat[3]), proc_show_rdrv_40 },
-#endif
-	{ NULL }
-};
-
 /**
  * mega_create_proc_entry()
  * @index - index in soft state array
@@ -2788,31 +2741,45 @@ static const struct mega_proc_file mega_proc_files[] = {
 static void
 mega_create_proc_entry(int index, struct proc_dir_entry *parent)
 {
-	const struct mega_proc_file *f;
-	adapter_t	*adapter = hba_soft_state[index];
-	struct proc_dir_entry	*dir, *de, **ppde;
-	u8		string[16];
+	adapter_t *adapter = hba_soft_state[index];
+	struct proc_dir_entry *dir;
+	u8 string[16];
 
 	sprintf(string, "hba%d", adapter->host->host_no);
-
-	dir = adapter->controller_proc_dir_entry =
-		proc_mkdir_data(string, 0, parent, adapter);
-	if(!dir) {
+	dir = proc_mkdir_data(string, 0, parent, adapter);
+	if (!dir) {
 		dev_warn(&adapter->dev->dev, "proc_mkdir failed\n");
 		return;
 	}
 
-	for (f = mega_proc_files; f->name; f++) {
-		de = proc_create_data(f->name, S_IRUSR, dir, &mega_proc_fops,
-				      f->show);
-		if (!de) {
-			dev_warn(&adapter->dev->dev, "proc_create failed\n");
-			return;
-		}
-
-		ppde = (void *)adapter + f->ptr_offset;
-		*ppde = de;
-	}
+	proc_create_single_data("config", S_IRUSR, dir,
+			proc_show_config, adapter);
+	proc_create_single_data("stat", S_IRUSR, dir,
+			proc_show_stat, adapter);
+	proc_create_single_data("mailbox", S_IRUSR, dir,
+			proc_show_mbox, adapter);
+#if MEGA_HAVE_ENH_PROC
+	proc_create_single_data("rebuild-rate", S_IRUSR, dir,
+			proc_show_rebuild_rate, adapter);
+	proc_create_single_data("battery-status", S_IRUSR, dir,
+			proc_show_battery, adapter);
+	proc_create_single_data("diskdrives-ch0", S_IRUSR, dir,
+			proc_show_pdrv_ch0, adapter);
+	proc_create_single_data("diskdrives-ch1", S_IRUSR, dir,
+			proc_show_pdrv_ch1, adapter);
+	proc_create_single_data("diskdrives-ch2", S_IRUSR, dir,
+			proc_show_pdrv_ch2, adapter);
+	proc_create_single_data("diskdrives-ch3", S_IRUSR, dir,
+			proc_show_pdrv_ch3, adapter);
+	proc_create_single_data("raiddrives-0-9", S_IRUSR, dir,
+			proc_show_rdrv_10, adapter);
+	proc_create_single_data("raiddrives-10-19", S_IRUSR, dir,
+			proc_show_rdrv_20, adapter);
+	proc_create_single_data("raiddrives-20-29", S_IRUSR, dir,
+			proc_show_rdrv_30, adapter);
+	proc_create_single_data("raiddrives-30-39", S_IRUSR, dir,
+			proc_show_rdrv_40, adapter);
+#endif
 }
 
 #else
@@ -4580,6 +4547,7 @@ megaraid_remove_one(struct pci_dev *pdev)
 {
 	struct Scsi_Host *host = pci_get_drvdata(pdev);
 	adapter_t *adapter = (adapter_t *)host->hostdata;
+	char buf[12] = { 0 };
 
 	scsi_remove_host(host);
 
@@ -4594,44 +4562,8 @@ megaraid_remove_one(struct pci_dev *pdev)
 
 	mega_free_sgl(adapter);
 
-#ifdef CONFIG_PROC_FS
-	if (adapter->controller_proc_dir_entry) {
-		remove_proc_entry("stat", adapter->controller_proc_dir_entry);
-		remove_proc_entry("config",
-				adapter->controller_proc_dir_entry);
-		remove_proc_entry("mailbox",
-				adapter->controller_proc_dir_entry);
-#if MEGA_HAVE_ENH_PROC
-		remove_proc_entry("rebuild-rate",
-				adapter->controller_proc_dir_entry);
-		remove_proc_entry("battery-status",
-				adapter->controller_proc_dir_entry);
-
-		remove_proc_entry("diskdrives-ch0",
-				adapter->controller_proc_dir_entry);
-		remove_proc_entry("diskdrives-ch1",
-				adapter->controller_proc_dir_entry);
-		remove_proc_entry("diskdrives-ch2",
-				adapter->controller_proc_dir_entry);
-		remove_proc_entry("diskdrives-ch3",
-				adapter->controller_proc_dir_entry);
-
-		remove_proc_entry("raiddrives-0-9",
-				adapter->controller_proc_dir_entry);
-		remove_proc_entry("raiddrives-10-19",
-				adapter->controller_proc_dir_entry);
-		remove_proc_entry("raiddrives-20-29",
-				adapter->controller_proc_dir_entry);
-		remove_proc_entry("raiddrives-30-39",
-				adapter->controller_proc_dir_entry);
-#endif
-		{
-			char	buf[12] = { 0 };
-			sprintf(buf, "hba%d", adapter->host->host_no);
-			remove_proc_entry(buf, mega_proc_dir_entry);
-		}
-	}
-#endif
+	sprintf(buf, "hba%d", adapter->host->host_no);
+	remove_proc_subtree(buf, mega_proc_dir_entry);
 
 	pci_free_consistent(adapter->dev, MEGA_BUFFER_SIZE,
 			adapter->mega_buffer, adapter->buf_dma_handle);
diff --git a/drivers/scsi/megaraid.h b/drivers/scsi/megaraid.h
index 21eba2f..18e85d9 100644
--- a/drivers/scsi/megaraid.h
+++ b/drivers/scsi/megaraid.h
@@ -814,18 +814,6 @@ typedef struct {
 
 #ifdef CONFIG_PROC_FS
 	struct proc_dir_entry	*controller_proc_dir_entry;
-	struct proc_dir_entry	*proc_read;
-	struct proc_dir_entry	*proc_stat;
-	struct proc_dir_entry	*proc_mbox;
-
-#if MEGA_HAVE_ENH_PROC
-	struct proc_dir_entry	*proc_rr;
-	struct proc_dir_entry	*proc_battery;
-#define MAX_PROC_CHANNELS	4
-	struct proc_dir_entry	*proc_pdrvstat[MAX_PROC_CHANNELS];
-	struct proc_dir_entry	*proc_rdrvstat[MAX_PROC_CHANNELS];
-#endif
-
 #endif
 
 	int	has_64bit_addr;		/* are we using 64-bit addressing */
diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c
index b89c6e6..ce656c4 100644
--- a/drivers/scsi/megaraid/megaraid_sas_base.c
+++ b/drivers/scsi/megaraid/megaraid_sas_base.c
@@ -2772,7 +2772,7 @@ blk_eh_timer_return megasas_reset_timer(struct scsi_cmnd *scmd)
 
 	if (time_after(jiffies, scmd->jiffies_at_alloc +
 				(scmd_timeout * 2) * HZ)) {
-		return BLK_EH_NOT_HANDLED;
+		return BLK_EH_DONE;
 	}
 
 	instance = (struct megasas_instance *)scmd->device->host->hostdata;
diff --git a/drivers/scsi/mvumi.c b/drivers/scsi/mvumi.c
index fe97401..afd2716 100644
--- a/drivers/scsi/mvumi.c
+++ b/drivers/scsi/mvumi.c
@@ -2155,7 +2155,7 @@ static enum blk_eh_timer_return mvumi_timed_out(struct scsi_cmnd *scmd)
 	mvumi_return_cmd(mhba, cmd);
 	spin_unlock_irqrestore(mhba->shost->host_lock, flags);
 
-	return BLK_EH_NOT_HANDLED;
+	return BLK_EH_DONE;
 }
 
 static int
diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c
index e188771..5a33e1a 100644
--- a/drivers/scsi/osd/osd_initiator.c
+++ b/drivers/scsi/osd/osd_initiator.c
@@ -99,7 +99,7 @@ static int _osd_get_print_system_info(struct osd_dev *od,
 	int nelem = ARRAY_SIZE(get_attrs), a = 0;
 	int ret;
 
-	or = osd_start_request(od, GFP_KERNEL);
+	or = osd_start_request(od);
 	if (!or)
 		return -ENOMEM;
 
@@ -409,16 +409,15 @@ static void _osd_request_free(struct osd_request *or)
 	kfree(or);
 }
 
-struct osd_request *osd_start_request(struct osd_dev *dev, gfp_t gfp)
+struct osd_request *osd_start_request(struct osd_dev *dev)
 {
 	struct osd_request *or;
 
-	or = _osd_request_alloc(gfp);
+	or = _osd_request_alloc(GFP_KERNEL);
 	if (!or)
 		return NULL;
 
 	or->osd_dev = dev;
-	or->alloc_flags = gfp;
 	or->timeout = dev->def_timeout;
 	or->retries = OSD_REQ_RETRIES;
 
@@ -546,7 +545,7 @@ static int _osd_realloc_seg(struct osd_request *or,
 	if (seg->alloc_size >= max_bytes)
 		return 0;
 
-	buff = krealloc(seg->buff, max_bytes, or->alloc_flags);
+	buff = krealloc(seg->buff, max_bytes, GFP_KERNEL);
 	if (!buff) {
 		OSD_ERR("Failed to Realloc %d-bytes was-%d\n", max_bytes,
 			seg->alloc_size);
@@ -728,7 +727,7 @@ static int _osd_req_list_objects(struct osd_request *or,
 		_osd_req_encode_olist(or, list);
 
 	WARN_ON(or->in.bio);
-	bio = bio_map_kern(q, list, len, or->alloc_flags);
+	bio = bio_map_kern(q, list, len, GFP_KERNEL);
 	if (IS_ERR(bio)) {
 		OSD_ERR("!!! Failed to allocate list_objects BIO\n");
 		return PTR_ERR(bio);
@@ -1190,14 +1189,14 @@ static int _req_append_segment(struct osd_request *or,
 			pad_buff = io->pad_buff;
 
 		ret = blk_rq_map_kern(io->req->q, io->req, pad_buff, padding,
-				       or->alloc_flags);
+				       GFP_KERNEL);
 		if (ret)
 			return ret;
 		io->total_bytes += padding;
 	}
 
 	ret = blk_rq_map_kern(io->req->q, io->req, seg->buff, seg->total_bytes,
-			       or->alloc_flags);
+			       GFP_KERNEL);
 	if (ret)
 		return ret;
 
@@ -1564,14 +1563,14 @@ static int _osd_req_finalize_data_integrity(struct osd_request *or,
  * osd_finalize_request and helpers
  */
 static struct request *_make_request(struct request_queue *q, bool has_write,
-			      struct _osd_io_info *oii, gfp_t flags)
+			      struct _osd_io_info *oii)
 {
 	struct request *req;
 	struct bio *bio = oii->bio;
 	int ret;
 
 	req = blk_get_request(q, has_write ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN,
-			flags);
+			0);
 	if (IS_ERR(req))
 		return req;
 
@@ -1589,13 +1588,12 @@ static struct request *_make_request(struct request_queue *q, bool has_write,
 static int _init_blk_request(struct osd_request *or,
 	bool has_in, bool has_out)
 {
-	gfp_t flags = or->alloc_flags;
 	struct scsi_device *scsi_device = or->osd_dev->scsi_device;
 	struct request_queue *q = scsi_device->request_queue;
 	struct request *req;
 	int ret;
 
-	req = _make_request(q, has_out, has_out ? &or->out : &or->in, flags);
+	req = _make_request(q, has_out, has_out ? &or->out : &or->in);
 	if (IS_ERR(req)) {
 		ret = PTR_ERR(req);
 		goto out;
@@ -1611,7 +1609,7 @@ static int _init_blk_request(struct osd_request *or,
 		or->out.req = req;
 		if (has_in) {
 			/* allocate bidi request */
-			req = _make_request(q, false, &or->in, flags);
+			req = _make_request(q, false, &or->in);
 			if (IS_ERR(req)) {
 				OSD_DEBUG("blk_get_request for bidi failed\n");
 				ret = PTR_ERR(req);
diff --git a/drivers/scsi/osst.c b/drivers/scsi/osst.c
index 20ec1c0..2bbe797 100644
--- a/drivers/scsi/osst.c
+++ b/drivers/scsi/osst.c
@@ -368,7 +368,7 @@ static int osst_execute(struct osst_request *SRpnt, const unsigned char *cmd,
 	int write = (data_direction == DMA_TO_DEVICE);
 
 	req = blk_get_request(SRpnt->stp->device->request_queue,
-			write ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, GFP_KERNEL);
+			write ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, 0);
 	if (IS_ERR(req))
 		return DRIVER_ERROR << 24;
 
diff --git a/drivers/scsi/qla4xxx/ql4_os.c b/drivers/scsi/qla4xxx/ql4_os.c
index 94c14ce..0e13349 100644
--- a/drivers/scsi/qla4xxx/ql4_os.c
+++ b/drivers/scsi/qla4xxx/ql4_os.c
@@ -1848,7 +1848,7 @@ static enum blk_eh_timer_return qla4xxx_eh_cmd_timed_out(struct scsi_cmnd *sc)
 	struct iscsi_cls_session *session;
 	struct iscsi_session *sess;
 	unsigned long flags;
-	enum blk_eh_timer_return ret = BLK_EH_NOT_HANDLED;
+	enum blk_eh_timer_return ret = BLK_EH_DONE;
 
 	session = starget_to_session(scsi_target(sc->device));
 	sess = session->dd_data;
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index 9460391..9c02ba2 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -282,7 +282,7 @@ void scsi_eh_scmd_add(struct scsi_cmnd *scmd)
 enum blk_eh_timer_return scsi_times_out(struct request *req)
 {
 	struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(req);
-	enum blk_eh_timer_return rtn = BLK_EH_NOT_HANDLED;
+	enum blk_eh_timer_return rtn = BLK_EH_DONE;
 	struct Scsi_Host *host = scmd->device->host;
 
 	trace_scsi_dispatch_cmd_timeout(scmd);
@@ -294,7 +294,7 @@ enum blk_eh_timer_return scsi_times_out(struct request *req)
 	if (host->hostt->eh_timed_out)
 		rtn = host->hostt->eh_timed_out(scmd);
 
-	if (rtn == BLK_EH_NOT_HANDLED) {
+	if (rtn == BLK_EH_DONE) {
 		if (scsi_abort_command(scmd) != SUCCESS) {
 			set_host_byte(scmd, DID_TIME_OUT);
 			scsi_eh_scmd_add(scmd);
@@ -1933,11 +1933,7 @@ static void scsi_eh_lock_door(struct scsi_device *sdev)
 	struct request *req;
 	struct scsi_request *rq;
 
-	/*
-	 * blk_get_request with GFP_KERNEL (__GFP_RECLAIM) sleeps until a
-	 * request becomes available
-	 */
-	req = blk_get_request(sdev->request_queue, REQ_OP_SCSI_IN, GFP_KERNEL);
+	req = blk_get_request(sdev->request_queue, REQ_OP_SCSI_IN, 0);
 	if (IS_ERR(req))
 		return;
 	rq = scsi_req(req);
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index e9b4f27..fb38aef 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -265,7 +265,7 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
 	struct scsi_request *rq;
 	int ret = DRIVER_ERROR << 24;
 
-	req = blk_get_request_flags(sdev->request_queue,
+	req = blk_get_request(sdev->request_queue,
 			data_direction == DMA_TO_DEVICE ?
 			REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, BLK_MQ_REQ_PREEMPT);
 	if (IS_ERR(req))
@@ -273,7 +273,7 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
 	rq = scsi_req(req);
 
 	if (bufflen &&	blk_rq_map_kern(sdev->request_queue, req,
-					buffer, bufflen, __GFP_RECLAIM))
+					buffer, bufflen, GFP_NOIO))
 		goto out;
 
 	rq->cmd_len = COMMAND_SIZE(cmd[0]);
@@ -2149,27 +2149,6 @@ static int scsi_map_queues(struct blk_mq_tag_set *set)
 	return blk_mq_map_queues(set);
 }
 
-static u64 scsi_calculate_bounce_limit(struct Scsi_Host *shost)
-{
-	struct device *host_dev;
-	u64 bounce_limit = 0xffffffff;
-
-	if (shost->unchecked_isa_dma)
-		return BLK_BOUNCE_ISA;
-	/*
-	 * Platforms with virtual-DMA translation
-	 * hardware have no practical limit.
-	 */
-	if (!PCI_DMA_BUS_IS_PHYS)
-		return BLK_BOUNCE_ANY;
-
-	host_dev = scsi_get_device(shost);
-	if (host_dev && host_dev->dma_mask)
-		bounce_limit = (u64)dma_max_pfn(host_dev) << PAGE_SHIFT;
-
-	return bounce_limit;
-}
-
 void __scsi_init_queue(struct Scsi_Host *shost, struct request_queue *q)
 {
 	struct device *dev = shost->dma_dev;
@@ -2189,7 +2168,8 @@ void __scsi_init_queue(struct Scsi_Host *shost, struct request_queue *q)
 	}
 
 	blk_queue_max_hw_sectors(q, shost->max_sectors);
-	blk_queue_bounce_limit(q, scsi_calculate_bounce_limit(shost));
+	if (shost->unchecked_isa_dma)
+		blk_queue_bounce_limit(q, BLK_BOUNCE_ISA);
 	blk_queue_segment_boundary(q, shost->dma_boundary);
 	dma_set_seg_boundary(dev, shost->dma_boundary);
 
diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c
index be3be0f..1da3d71 100644
--- a/drivers/scsi/scsi_transport_fc.c
+++ b/drivers/scsi/scsi_transport_fc.c
@@ -2087,7 +2087,7 @@ fc_eh_timed_out(struct scsi_cmnd *scmd)
 	if (rport->port_state == FC_PORTSTATE_BLOCKED)
 		return BLK_EH_RESET_TIMER;
 
-	return BLK_EH_NOT_HANDLED;
+	return BLK_EH_DONE;
 }
 EXPORT_SYMBOL(fc_eh_timed_out);
 
@@ -3591,10 +3591,9 @@ fc_bsg_job_timeout(struct request *req)
 	}
 
 	/* the blk_end_sync_io() doesn't check the error */
-	if (!inflight)
-		return BLK_EH_NOT_HANDLED;
-	else
-		return BLK_EH_HANDLED;
+	if (inflight)
+		blk_mq_complete_request(req);
+	return BLK_EH_DONE;
 }
 
 /**
@@ -3781,8 +3780,7 @@ fc_bsg_hostadd(struct Scsi_Host *shost, struct fc_host_attrs *fc_host)
 	snprintf(bsg_name, sizeof(bsg_name),
 		 "fc_host%d", shost->host_no);
 
-	q = bsg_setup_queue(dev, bsg_name, fc_bsg_dispatch, i->f->dd_bsg_size,
-			NULL);
+	q = bsg_setup_queue(dev, bsg_name, fc_bsg_dispatch, i->f->dd_bsg_size);
 	if (IS_ERR(q)) {
 		dev_err(dev,
 			"fc_host%d: bsg interface failed to initialize - setup queue\n",
@@ -3827,8 +3825,8 @@ fc_bsg_rportadd(struct Scsi_Host *shost, struct fc_rport *rport)
 	if (!i->f->bsg_request)
 		return -ENOTSUPP;
 
-	q = bsg_setup_queue(dev, NULL, fc_bsg_dispatch, i->f->dd_bsg_size,
-			NULL);
+	q = bsg_setup_queue(dev, dev_name(dev), fc_bsg_dispatch,
+			i->f->dd_bsg_size);
 	if (IS_ERR(q)) {
 		dev_err(dev, "failed to setup bsg queue\n");
 		return PTR_ERR(q);
diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c
index 65f6c94..6fd2fe2 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -1542,7 +1542,7 @@ iscsi_bsg_host_add(struct Scsi_Host *shost, struct iscsi_cls_host *ihost)
 		return -ENOTSUPP;
 
 	snprintf(bsg_name, sizeof(bsg_name), "iscsi_host%d", shost->host_no);
-	q = bsg_setup_queue(dev, bsg_name, iscsi_bsg_host_dispatch, 0, NULL);
+	q = bsg_setup_queue(dev, bsg_name, iscsi_bsg_host_dispatch, 0);
 	if (IS_ERR(q)) {
 		shost_printk(KERN_ERR, shost, "bsg interface failed to "
 			     "initialize - no request queue\n");
diff --git a/drivers/scsi/scsi_transport_sas.c b/drivers/scsi/scsi_transport_sas.c
index 08acbab..e2953b4 100644
--- a/drivers/scsi/scsi_transport_sas.c
+++ b/drivers/scsi/scsi_transport_sas.c
@@ -187,16 +187,6 @@ static int sas_smp_dispatch(struct bsg_job *job)
 	return 0;
 }
 
-static void sas_host_release(struct device *dev)
-{
-	struct Scsi_Host *shost = dev_to_shost(dev);
-	struct sas_host_attrs *sas_host = to_sas_host_attrs(shost);
-	struct request_queue *q = sas_host->q;
-
-	if (q)
-		blk_cleanup_queue(q);
-}
-
 static int sas_bsg_initialize(struct Scsi_Host *shost, struct sas_rphy *rphy)
 {
 	struct request_queue *q;
@@ -208,7 +198,7 @@ static int sas_bsg_initialize(struct Scsi_Host *shost, struct sas_rphy *rphy)
 
 	if (rphy) {
 		q = bsg_setup_queue(&rphy->dev, dev_name(&rphy->dev),
-				sas_smp_dispatch, 0, NULL);
+				sas_smp_dispatch, 0);
 		if (IS_ERR(q))
 			return PTR_ERR(q);
 		rphy->q = q;
@@ -217,7 +207,7 @@ static int sas_bsg_initialize(struct Scsi_Host *shost, struct sas_rphy *rphy)
 
 		snprintf(name, sizeof(name), "sas_host%d", shost->host_no);
 		q = bsg_setup_queue(&shost->shost_gendev, name,
-				sas_smp_dispatch, 0, sas_host_release);
+				sas_smp_dispatch, 0);
 		if (IS_ERR(q))
 			return PTR_ERR(q);
 		to_sas_host_attrs(shost)->q = q;
@@ -260,8 +250,11 @@ static int sas_host_remove(struct transport_container *tc, struct device *dev,
 	struct Scsi_Host *shost = dev_to_shost(dev);
 	struct request_queue *q = to_sas_host_attrs(shost)->q;
 
-	if (q)
+	if (q) {
 		bsg_unregister_queue(q);
+		blk_cleanup_queue(q);
+	}
+
 	return 0;
 }
 
diff --git a/drivers/scsi/scsi_transport_srp.c b/drivers/scsi/scsi_transport_srp.c
index 456ce9f..4e46fdb 100644
--- a/drivers/scsi/scsi_transport_srp.c
+++ b/drivers/scsi/scsi_transport_srp.c
@@ -604,7 +604,7 @@ EXPORT_SYMBOL(srp_reconnect_rport);
  *
  * If a timeout occurs while an rport is in the blocked state, ask the SCSI
  * EH to continue waiting (BLK_EH_RESET_TIMER). Otherwise let the SCSI core
- * handle the timeout (BLK_EH_NOT_HANDLED).
+ * handle the timeout (BLK_EH_DONE).
  *
  * Note: This function is called from soft-IRQ context and with the request
  * queue lock held.
@@ -620,7 +620,7 @@ enum blk_eh_timer_return srp_timed_out(struct scsi_cmnd *scmd)
 	return rport && rport->fast_io_fail_tmo < 0 &&
 		rport->dev_loss_tmo < 0 &&
 		i->f->reset_timer_if_blocked && scsi_device_blocked(sdev) ?
-		BLK_EH_RESET_TIMER : BLK_EH_NOT_HANDLED;
+		BLK_EH_RESET_TIMER : BLK_EH_DONE;
 }
 EXPORT_SYMBOL(srp_timed_out);
 
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 5c40d80..6fc58e2 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -66,7 +66,6 @@ static int sg_version_num = 30536;	/* 2 digits for each component */
 static char *sg_version_date = "20140603";
 
 static int sg_proc_init(void);
-static void sg_proc_cleanup(void);
 #endif
 
 #define SG_ALLOW_DIO_DEF 0
@@ -1661,7 +1660,7 @@ static void __exit
 exit_sg(void)
 {
 #ifdef CONFIG_SCSI_PROC_FS
-	sg_proc_cleanup();
+	remove_proc_subtree("scsi/sg", NULL);
 #endif				/* CONFIG_SCSI_PROC_FS */
 	scsi_unregister_interface(&sg_interface);
 	class_destroy(sg_sysfs_class);
@@ -1715,7 +1714,7 @@ sg_start_req(Sg_request *srp, unsigned char *cmd)
 	 * does not sleep except under memory pressure.
 	 */
 	rq = blk_get_request(q, hp->dxfer_direction == SG_DXFER_TO_DEV ?
-			REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, GFP_KERNEL);
+			REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, 0);
 	if (IS_ERR(rq)) {
 		kfree(long_cmdp);
 		return PTR_ERR(rq);
@@ -2274,11 +2273,6 @@ sg_get_dev(int dev)
 }
 
 #ifdef CONFIG_SCSI_PROC_FS
-
-static struct proc_dir_entry *sg_proc_sgp = NULL;
-
-static char sg_proc_sg_dirname[] = "scsi/sg";
-
 static int sg_proc_seq_show_int(struct seq_file *s, void *v);
 
 static int sg_proc_single_open_adio(struct inode *inode, struct file *file);
@@ -2306,37 +2300,11 @@ static const struct file_operations dressz_fops = {
 };
 
 static int sg_proc_seq_show_version(struct seq_file *s, void *v);
-static int sg_proc_single_open_version(struct inode *inode, struct file *file);
-static const struct file_operations version_fops = {
-	.owner = THIS_MODULE,
-	.open = sg_proc_single_open_version,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
-
 static int sg_proc_seq_show_devhdr(struct seq_file *s, void *v);
-static int sg_proc_single_open_devhdr(struct inode *inode, struct file *file);
-static const struct file_operations devhdr_fops = {
-	.owner = THIS_MODULE,
-	.open = sg_proc_single_open_devhdr,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
-
 static int sg_proc_seq_show_dev(struct seq_file *s, void *v);
-static int sg_proc_open_dev(struct inode *inode, struct file *file);
 static void * dev_seq_start(struct seq_file *s, loff_t *pos);
 static void * dev_seq_next(struct seq_file *s, void *v, loff_t *pos);
 static void dev_seq_stop(struct seq_file *s, void *v);
-static const struct file_operations dev_fops = {
-	.owner = THIS_MODULE,
-	.open = sg_proc_open_dev,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = seq_release,
-};
 static const struct seq_operations dev_seq_ops = {
 	.start = dev_seq_start,
 	.next  = dev_seq_next,
@@ -2345,14 +2313,6 @@ static const struct seq_operations dev_seq_ops = {
 };
 
 static int sg_proc_seq_show_devstrs(struct seq_file *s, void *v);
-static int sg_proc_open_devstrs(struct inode *inode, struct file *file);
-static const struct file_operations devstrs_fops = {
-	.owner = THIS_MODULE,
-	.open = sg_proc_open_devstrs,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = seq_release,
-};
 static const struct seq_operations devstrs_seq_ops = {
 	.start = dev_seq_start,
 	.next  = dev_seq_next,
@@ -2361,14 +2321,6 @@ static const struct seq_operations devstrs_seq_ops = {
 };
 
 static int sg_proc_seq_show_debug(struct seq_file *s, void *v);
-static int sg_proc_open_debug(struct inode *inode, struct file *file);
-static const struct file_operations debug_fops = {
-	.owner = THIS_MODULE,
-	.open = sg_proc_open_debug,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = seq_release,
-};
 static const struct seq_operations debug_seq_ops = {
 	.start = dev_seq_start,
 	.next  = dev_seq_next,
@@ -2376,50 +2328,23 @@ static const struct seq_operations debug_seq_ops = {
 	.show  = sg_proc_seq_show_debug,
 };
 
-
-struct sg_proc_leaf {
-	const char * name;
-	const struct file_operations * fops;
-};
-
-static const struct sg_proc_leaf sg_proc_leaf_arr[] = {
-	{"allow_dio", &adio_fops},
-	{"debug", &debug_fops},
-	{"def_reserved_size", &dressz_fops},
-	{"device_hdr", &devhdr_fops},
-	{"devices", &dev_fops},
-	{"device_strs", &devstrs_fops},
-	{"version", &version_fops}
-};
-
 static int
 sg_proc_init(void)
 {
-	int num_leaves = ARRAY_SIZE(sg_proc_leaf_arr);
-	int k;
+	struct proc_dir_entry *p;
 
-	sg_proc_sgp = proc_mkdir(sg_proc_sg_dirname, NULL);
-	if (!sg_proc_sgp)
+	p = proc_mkdir("scsi/sg", NULL);
+	if (!p)
 		return 1;
-	for (k = 0; k < num_leaves; ++k) {
-		const struct sg_proc_leaf *leaf = &sg_proc_leaf_arr[k];
-		umode_t mask = leaf->fops->write ? S_IRUGO | S_IWUSR : S_IRUGO;
-		proc_create(leaf->name, mask, sg_proc_sgp, leaf->fops);
-	}
-	return 0;
-}
 
-static void
-sg_proc_cleanup(void)
-{
-	int k;
-	int num_leaves = ARRAY_SIZE(sg_proc_leaf_arr);
-
-	if (!sg_proc_sgp)
-		return;
-	for (k = 0; k < num_leaves; ++k)
-		remove_proc_entry(sg_proc_leaf_arr[k].name, sg_proc_sgp);
-	remove_proc_entry(sg_proc_sg_dirname, NULL);
+	proc_create("allow_dio", S_IRUGO | S_IWUSR, p, &adio_fops);
+	proc_create_seq("debug", S_IRUGO, p, &debug_seq_ops);
+	proc_create("def_reserved_size", S_IRUGO | S_IWUSR, p, &dressz_fops);
+	proc_create_single("device_hdr", S_IRUGO, p, sg_proc_seq_show_devhdr);
+	proc_create_seq("devices", S_IRUGO, p, &dev_seq_ops);
+	proc_create_seq("device_strs", S_IRUGO, p, &devstrs_seq_ops);
+	proc_create_single("version", S_IRUGO, p, sg_proc_seq_show_version);
+	return 0;
 }
 
 
@@ -2482,22 +2407,12 @@ static int sg_proc_seq_show_version(struct seq_file *s, void *v)
 	return 0;
 }
 
-static int sg_proc_single_open_version(struct inode *inode, struct file *file)
-{
-	return single_open(file, sg_proc_seq_show_version, NULL);
-}
-
 static int sg_proc_seq_show_devhdr(struct seq_file *s, void *v)
 {
 	seq_puts(s, "host\tchan\tid\tlun\ttype\topens\tqdepth\tbusy\tonline\n");
 	return 0;
 }
 
-static int sg_proc_single_open_devhdr(struct inode *inode, struct file *file)
-{
-	return single_open(file, sg_proc_seq_show_devhdr, NULL);
-}
-
 struct sg_proc_deviter {
 	loff_t	index;
 	size_t	max;
@@ -2531,11 +2446,6 @@ static void dev_seq_stop(struct seq_file *s, void *v)
 	kfree(s->private);
 }
 
-static int sg_proc_open_dev(struct inode *inode, struct file *file)
-{
-        return seq_open(file, &dev_seq_ops);
-}
-
 static int sg_proc_seq_show_dev(struct seq_file *s, void *v)
 {
 	struct sg_proc_deviter * it = (struct sg_proc_deviter *) v;
@@ -2562,11 +2472,6 @@ static int sg_proc_seq_show_dev(struct seq_file *s, void *v)
 	return 0;
 }
 
-static int sg_proc_open_devstrs(struct inode *inode, struct file *file)
-{
-        return seq_open(file, &devstrs_seq_ops);
-}
-
 static int sg_proc_seq_show_devstrs(struct seq_file *s, void *v)
 {
 	struct sg_proc_deviter * it = (struct sg_proc_deviter *) v;
@@ -2650,11 +2555,6 @@ static void sg_proc_debug_helper(struct seq_file *s, Sg_device * sdp)
 	}
 }
 
-static int sg_proc_open_debug(struct inode *inode, struct file *file)
-{
-        return seq_open(file, &debug_seq_ops);
-}
-
 static int sg_proc_seq_show_debug(struct seq_file *s, void *v)
 {
 	struct sg_proc_deviter * it = (struct sg_proc_deviter *) v;
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c
index 6c39948..a427ce9 100644
--- a/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@ -545,7 +545,7 @@ static int st_scsi_execute(struct st_request *SRpnt, const unsigned char *cmd,
 
 	req = blk_get_request(SRpnt->stp->device->request_queue,
 			data_direction == DMA_TO_DEVICE ?
-			REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, GFP_KERNEL);
+			REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, 0);
 	if (IS_ERR(req))
 		return DRIVER_ERROR << 24;
 	rq = scsi_req(req);
diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
index 00e7905..d0a1674 100644
--- a/drivers/scsi/ufs/ufshcd.c
+++ b/drivers/scsi/ufs/ufshcd.c
@@ -6497,12 +6497,12 @@ static enum blk_eh_timer_return ufshcd_eh_timed_out(struct scsi_cmnd *scmd)
 	bool found = false;
 
 	if (!scmd || !scmd->device || !scmd->device->host)
-		return BLK_EH_NOT_HANDLED;
+		return BLK_EH_DONE;
 
 	host = scmd->device->host;
 	hba = shost_priv(host);
 	if (!hba)
-		return BLK_EH_NOT_HANDLED;
+		return BLK_EH_DONE;
 
 	spin_lock_irqsave(host->host_lock, flags);
 
@@ -6520,7 +6520,7 @@ static enum blk_eh_timer_return ufshcd_eh_timed_out(struct scsi_cmnd *scmd)
 	 * SCSI command was not actually dispatched to UFS driver, otherwise
 	 * let SCSI layer handle the error as usual.
 	 */
-	return found ? BLK_EH_NOT_HANDLED : BLK_EH_RESET_TIMER;
+	return found ? BLK_EH_DONE : BLK_EH_RESET_TIMER;
 }
 
 static const struct attribute_group *ufshcd_driver_groups[] = {
diff --git a/drivers/spi/Kconfig b/drivers/spi/Kconfig
index 2d4146c..ad5d68e 100644
--- a/drivers/spi/Kconfig
+++ b/drivers/spi/Kconfig
@@ -47,6 +47,13 @@ config SPI_MASTER
 
 if SPI_MASTER
 
+config SPI_MEM
+	bool "SPI memory extension"
+	help
+	  Enable this option if you want to enable the SPI memory extension.
+	  This extension is meant to simplify interaction with SPI memories
+	  by providing a high-level interface to send memory-like commands.
+
 comment "SPI Master Controller Drivers"
 
 config SPI_ALTERA
@@ -71,7 +78,6 @@ config SPI_ARMADA_3700
 
 config SPI_ATMEL
 	tristate "Atmel SPI Controller"
-	depends on HAS_DMA
 	depends on ARCH_AT91 || COMPILE_TEST
 	help
 	  This selects a driver for the Atmel SPI Controller, present on
@@ -115,14 +121,6 @@ config SPI_BCM2835AUX
 	  "universal SPI master", and the regular SPI controller.
 	  This driver is for the universal/auxiliary SPI controller.
 
-config SPI_BCM53XX
-	tristate "Broadcom BCM53xx SPI controller"
-	depends on ARCH_BCM_5301X
-	depends on BCMA_POSSIBLE
-	select BCMA
-	help
-          Enable support for the SPI controller on Broadcom BCM53xx ARM SoCs.
-
 config SPI_BCM63XX
 	tristate "Broadcom BCM63xx SPI controller"
 	depends on BCM63XX || COMPILE_TEST
@@ -233,7 +231,6 @@ config SPI_EFM32
 
 config SPI_EP93XX
 	tristate "Cirrus Logic EP93xx SPI controller"
-	depends on HAS_DMA
 	depends on ARCH_EP93XX || COMPILE_TEST
 	help
 	  This enables using the Cirrus EP93xx SPI controller in master
@@ -355,7 +352,6 @@ config SPI_FSL_SPI
 config SPI_FSL_DSPI
 	tristate "Freescale DSPI controller"
 	select REGMAP_MMIO
-	depends on HAS_DMA
 	depends on SOC_VF610 || SOC_LS1021A || ARCH_LAYERSCAPE || M5441x || COMPILE_TEST
 	help
 	  This enables support for the Freescale DSPI controller in master
@@ -431,7 +427,6 @@ config SPI_OMAP_UWIRE
 
 config SPI_OMAP24XX
 	tristate "McSPI driver for OMAP"
-	depends on HAS_DMA
 	depends on ARCH_OMAP2PLUS || COMPILE_TEST
 	select SG_SPLIT
 	help
@@ -440,7 +435,6 @@ config SPI_OMAP24XX
 
 config SPI_TI_QSPI
 	tristate "DRA7xxx QSPI controller support"
-	depends on HAS_DMA
 	depends on ARCH_OMAP2PLUS || COMPILE_TEST
 	help
 	  QSPI master controller for DRA7xxx used for flash devices.
@@ -469,7 +463,6 @@ config SPI_PIC32
 config SPI_PIC32_SQI
 	tristate "Microchip PIC32 Quad SPI driver"
 	depends on MACH_PIC32 || COMPILE_TEST
-	depends on HAS_DMA
 	help
 	  SPI driver for PIC32 Quad SPI controller.
 
@@ -572,7 +565,7 @@ config SPI_SC18IS602
 
 config SPI_SH_MSIOF
 	tristate "SuperH MSIOF SPI controller"
-	depends on HAVE_CLK && HAS_DMA
+	depends on HAVE_CLK
 	depends on ARCH_SHMOBILE || ARCH_RENESAS || COMPILE_TEST
 	help
 	  SPI driver for SuperH and SH Mobile MSIOF blocks.
@@ -650,7 +643,7 @@ config SPI_MXS
 config SPI_TEGRA114
 	tristate "NVIDIA Tegra114 SPI Controller"
 	depends on (ARCH_TEGRA && TEGRA20_APB_DMA) || COMPILE_TEST
-	depends on RESET_CONTROLLER && HAS_DMA
+	depends on RESET_CONTROLLER
 	help
 	  SPI driver for NVIDIA Tegra114 SPI Controller interface. This controller
 	  is different than the older SoCs SPI controller and also register interface
@@ -668,7 +661,7 @@ config SPI_TEGRA20_SFLASH
 config SPI_TEGRA20_SLINK
 	tristate "Nvidia Tegra20/Tegra30 SLINK Controller"
 	depends on (ARCH_TEGRA && TEGRA20_APB_DMA) || COMPILE_TEST
-	depends on RESET_CONTROLLER && HAS_DMA
+	depends on RESET_CONTROLLER
 	help
 	  SPI driver for Nvidia Tegra20/Tegra30 SLINK Controller interface.
 
diff --git a/drivers/spi/Makefile b/drivers/spi/Makefile
index b935f10..cb1f437 100644
--- a/drivers/spi/Makefile
+++ b/drivers/spi/Makefile
@@ -8,6 +8,7 @@ ccflags-$(CONFIG_SPI_DEBUG) := -DDEBUG
 # small core, mostly translating board-specific
 # config declarations into driver model code
 obj-$(CONFIG_SPI_MASTER)		+= spi.o
+obj-$(CONFIG_SPI_MEM)			+= spi-mem.o
 obj-$(CONFIG_SPI_SPIDEV)		+= spidev.o
 obj-$(CONFIG_SPI_LOOPBACK_TEST)		+= spi-loopback-test.o
 
@@ -20,7 +21,6 @@ obj-$(CONFIG_SPI_AU1550)		+= spi-au1550.o
 obj-$(CONFIG_SPI_AXI_SPI_ENGINE)	+= spi-axi-spi-engine.o
 obj-$(CONFIG_SPI_BCM2835)		+= spi-bcm2835.o
 obj-$(CONFIG_SPI_BCM2835AUX)		+= spi-bcm2835aux.o
-obj-$(CONFIG_SPI_BCM53XX)		+= spi-bcm53xx.o
 obj-$(CONFIG_SPI_BCM63XX)		+= spi-bcm63xx.o
 obj-$(CONFIG_SPI_BCM63XX_HSSPI)		+= spi-bcm63xx-hsspi.o
 obj-$(CONFIG_SPI_BCM_QSPI)		+= spi-iproc-qspi.o spi-brcmstb-qspi.o spi-bcm-qspi.o
diff --git a/drivers/spi/internals.h b/drivers/spi/internals.h
new file mode 100644
index 0000000..4a28a83
--- /dev/null
+++ b/drivers/spi/internals.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Copyright (C) 2018 Exceet Electronics GmbH
+ * Copyright (C) 2018 Bootlin
+ *
+ * Author: Boris Brezillon <boris.brezillon@bootlin.com>
+ *
+ * Helpers needed by the spi or spi-mem logic. Should not be used outside of
+ * spi-mem.c and spi.c.
+ */
+
+#ifndef __LINUX_SPI_INTERNALS_H
+#define __LINUX_SPI_INTERNALS_H
+
+#include <linux/device.h>
+#include <linux/dma-direction.h>
+#include <linux/scatterlist.h>
+#include <linux/spi/spi.h>
+
+void spi_flush_queue(struct spi_controller *ctrl);
+
+#ifdef CONFIG_HAS_DMA
+int spi_map_buf(struct spi_controller *ctlr, struct device *dev,
+		struct sg_table *sgt, void *buf, size_t len,
+		enum dma_data_direction dir);
+void spi_unmap_buf(struct spi_controller *ctlr, struct device *dev,
+		   struct sg_table *sgt, enum dma_data_direction dir);
+#else /* !CONFIG_HAS_DMA */
+static inline int spi_map_buf(struct spi_controller *ctlr, struct device *dev,
+			      struct sg_table *sgt, void *buf, size_t len,
+			      enum dma_data_direction dir)
+{
+	return -EINVAL;
+}
+
+static inline void spi_unmap_buf(struct spi_controller *ctlr,
+				 struct device *dev, struct sg_table *sgt,
+				 enum dma_data_direction dir)
+{
+}
+#endif /* CONFIG_HAS_DMA */
+
+#endif /* __LINUX_SPI_INTERNALS_H */
diff --git a/drivers/spi/spi-bcm-qspi.c b/drivers/spi/spi-bcm-qspi.c
index 6573152..8612525 100644
--- a/drivers/spi/spi-bcm-qspi.c
+++ b/drivers/spi/spi-bcm-qspi.c
@@ -30,6 +30,7 @@
 #include <linux/platform_device.h>
 #include <linux/slab.h>
 #include <linux/spi/spi.h>
+#include <linux/spi/spi-mem.h>
 #include <linux/sysfs.h>
 #include <linux/types.h>
 #include "spi-bcm-qspi.h"
@@ -215,10 +216,10 @@ struct bcm_qspi {
 	int bspi_maj_rev;
 	int bspi_min_rev;
 	int bspi_enabled;
-	struct spi_flash_read_message *bspi_rf_msg;
-	u32 bspi_rf_msg_idx;
-	u32 bspi_rf_msg_len;
-	u32 bspi_rf_msg_status;
+	const struct spi_mem_op *bspi_rf_op;
+	u32 bspi_rf_op_idx;
+	u32 bspi_rf_op_len;
+	u32 bspi_rf_op_status;
 	struct bcm_xfer_mode xfer_mode;
 	u32 s3_strap_override_ctrl;
 	bool bspi_mode;
@@ -313,26 +314,26 @@ static inline void bcm_qspi_bspi_lr_clear(struct bcm_qspi *qspi)
 
 static void bcm_qspi_bspi_lr_data_read(struct bcm_qspi *qspi)
 {
-	u32 *buf = (u32 *)qspi->bspi_rf_msg->buf;
+	u32 *buf = (u32 *)qspi->bspi_rf_op->data.buf.in;
 	u32 data = 0;
 
-	dev_dbg(&qspi->pdev->dev, "xfer %p rx %p rxlen %d\n", qspi->bspi_rf_msg,
-		qspi->bspi_rf_msg->buf, qspi->bspi_rf_msg_len);
+	dev_dbg(&qspi->pdev->dev, "xfer %p rx %p rxlen %d\n", qspi->bspi_rf_op,
+		qspi->bspi_rf_op->data.buf.in, qspi->bspi_rf_op_len);
 	while (!bcm_qspi_bspi_lr_is_fifo_empty(qspi)) {
 		data = bcm_qspi_bspi_lr_read_fifo(qspi);
-		if (likely(qspi->bspi_rf_msg_len >= 4) &&
+		if (likely(qspi->bspi_rf_op_len >= 4) &&
 		    IS_ALIGNED((uintptr_t)buf, 4)) {
-			buf[qspi->bspi_rf_msg_idx++] = data;
-			qspi->bspi_rf_msg_len -= 4;
+			buf[qspi->bspi_rf_op_idx++] = data;
+			qspi->bspi_rf_op_len -= 4;
 		} else {
 			/* Read out remaining bytes, make sure*/
-			u8 *cbuf = (u8 *)&buf[qspi->bspi_rf_msg_idx];
+			u8 *cbuf = (u8 *)&buf[qspi->bspi_rf_op_idx];
 
 			data = cpu_to_le32(data);
-			while (qspi->bspi_rf_msg_len) {
+			while (qspi->bspi_rf_op_len) {
 				*cbuf++ = (u8)data;
 				data >>= 8;
-				qspi->bspi_rf_msg_len--;
+				qspi->bspi_rf_op_len--;
 			}
 		}
 	}
@@ -349,14 +350,12 @@ static void bcm_qspi_bspi_set_xfer_params(struct bcm_qspi *qspi, u8 cmd_byte,
 }
 
 static int bcm_qspi_bspi_set_flex_mode(struct bcm_qspi *qspi,
-				       struct spi_flash_read_message *msg,
-				       int hp)
+				       const struct spi_mem_op *op, int hp)
 {
 	int bpc = 0, bpp = 0;
-	u8 command = msg->read_opcode;
-	int width  = msg->data_nbits ? msg->data_nbits : SPI_NBITS_SINGLE;
-	int addrlen = msg->addr_width;
-	int addr_nbits = msg->addr_nbits ? msg->addr_nbits : SPI_NBITS_SINGLE;
+	u8 command = op->cmd.opcode;
+	int width  = op->cmd.buswidth ? op->cmd.buswidth : SPI_NBITS_SINGLE;
+	int addrlen = op->addr.nbytes * 8;
 	int flex_mode = 1;
 
 	dev_dbg(&qspi->pdev->dev, "set flex mode w %x addrlen %x hp %d\n",
@@ -365,7 +364,7 @@ static int bcm_qspi_bspi_set_flex_mode(struct bcm_qspi *qspi,
 	if (addrlen == BSPI_ADDRLEN_4BYTES)
 		bpp = BSPI_BPP_ADDR_SELECT_MASK;
 
-	bpp |= msg->dummy_bytes * (8/addr_nbits);
+	bpp |= (op->dummy.nbytes * 8) / op->dummy.buswidth;
 
 	switch (width) {
 	case SPI_NBITS_SINGLE:
@@ -397,11 +396,10 @@ static int bcm_qspi_bspi_set_flex_mode(struct bcm_qspi *qspi,
 }
 
 static int bcm_qspi_bspi_set_override(struct bcm_qspi *qspi,
-				      struct spi_flash_read_message *msg,
-				      int hp)
+				      const struct spi_mem_op *op, int hp)
 {
-	int width = msg->data_nbits ? msg->data_nbits : SPI_NBITS_SINGLE;
-	int addrlen = msg->addr_width;
+	int width = op->data.buswidth ? op->data.buswidth : SPI_NBITS_SINGLE;
+	int addrlen = op->addr.nbytes;
 	u32 data = bcm_qspi_read(qspi, BSPI, BSPI_STRAP_OVERRIDE_CTRL);
 
 	dev_dbg(&qspi->pdev->dev, "set override mode w %x addrlen %x hp %d\n",
@@ -437,17 +435,17 @@ static int bcm_qspi_bspi_set_override(struct bcm_qspi *qspi,
 	/* set the override mode */
 	data |=	BSPI_STRAP_OVERRIDE_CTRL_OVERRIDE;
 	bcm_qspi_write(qspi, BSPI, BSPI_STRAP_OVERRIDE_CTRL, data);
-	bcm_qspi_bspi_set_xfer_params(qspi, msg->read_opcode, 0, 0, 0);
+	bcm_qspi_bspi_set_xfer_params(qspi, op->cmd.opcode, 0, 0, 0);
 
 	return 0;
 }
 
 static int bcm_qspi_bspi_set_mode(struct bcm_qspi *qspi,
-				  struct spi_flash_read_message *msg, int hp)
+				  const struct spi_mem_op *op, int hp)
 {
 	int error = 0;
-	int width = msg->data_nbits ? msg->data_nbits : SPI_NBITS_SINGLE;
-	int addrlen = msg->addr_width;
+	int width = op->data.buswidth ? op->data.buswidth : SPI_NBITS_SINGLE;
+	int addrlen = op->addr.nbytes;
 
 	/* default mode */
 	qspi->xfer_mode.flex_mode = true;
@@ -460,12 +458,12 @@ static int bcm_qspi_bspi_set_mode(struct bcm_qspi *qspi,
 		if (val & mask || qspi->s3_strap_override_ctrl & mask) {
 			qspi->xfer_mode.flex_mode = false;
 			bcm_qspi_write(qspi, BSPI, BSPI_FLEX_MODE_ENABLE, 0);
-			error = bcm_qspi_bspi_set_override(qspi, msg, hp);
+			error = bcm_qspi_bspi_set_override(qspi, op, hp);
 		}
 	}
 
 	if (qspi->xfer_mode.flex_mode)
-		error = bcm_qspi_bspi_set_flex_mode(qspi, msg, hp);
+		error = bcm_qspi_bspi_set_flex_mode(qspi, op, hp);
 
 	if (error) {
 		dev_warn(&qspi->pdev->dev,
@@ -802,19 +800,20 @@ done:
 	return slot;
 }
 
-static int bcm_qspi_bspi_flash_read(struct spi_device *spi,
-				    struct spi_flash_read_message *msg)
+static int bcm_qspi_bspi_exec_mem_op(struct spi_device *spi,
+				     const struct spi_mem_op *op)
 {
 	struct bcm_qspi *qspi = spi_master_get_devdata(spi->master);
-	u32 addr = 0, len, rdlen, len_words;
+	u32 addr = 0, len, rdlen, len_words, from = 0;
 	int ret = 0;
 	unsigned long timeo = msecs_to_jiffies(100);
 	struct bcm_qspi_soc_intc *soc_intc = qspi->soc_intc;
 
 	if (bcm_qspi_bspi_ver_three(qspi))
-		if (msg->addr_width == BSPI_ADDRLEN_4BYTES)
+		if (op->addr.nbytes == BSPI_ADDRLEN_4BYTES)
 			return -EIO;
 
+	from = op->addr.val;
 	bcm_qspi_chip_select(qspi, spi->chip_select);
 	bcm_qspi_write(qspi, MSPI, MSPI_WRITE_LOCK, 0);
 
@@ -823,15 +822,15 @@ static int bcm_qspi_bspi_flash_read(struct spi_device *spi,
 	 * the upper address byte to bspi
 	 */
 	if (bcm_qspi_bspi_ver_three(qspi) == false) {
-		addr = msg->from & 0xff000000;
+		addr = from & 0xff000000;
 		bcm_qspi_write(qspi, BSPI,
 			       BSPI_BSPI_FLASH_UPPER_ADDR_BYTE, addr);
 	}
 
 	if (!qspi->xfer_mode.flex_mode)
-		addr = msg->from;
+		addr = from;
 	else
-		addr = msg->from & 0x00ffffff;
+		addr = from & 0x00ffffff;
 
 	if (bcm_qspi_bspi_ver_three(qspi) == true)
 		addr = (addr + 0xc00000) & 0xffffff;
@@ -840,8 +839,8 @@ static int bcm_qspi_bspi_flash_read(struct spi_device *spi,
 	 * read into the entire buffer by breaking the reads
 	 * into RAF buffer read lengths
 	 */
-	len = msg->len;
-	qspi->bspi_rf_msg_idx = 0;
+	len = op->data.nbytes;
+	qspi->bspi_rf_op_idx = 0;
 
 	do {
 		if (len > BSPI_READ_LENGTH)
@@ -852,9 +851,9 @@ static int bcm_qspi_bspi_flash_read(struct spi_device *spi,
 		reinit_completion(&qspi->bspi_done);
 		bcm_qspi_enable_bspi(qspi);
 		len_words = (rdlen + 3) >> 2;
-		qspi->bspi_rf_msg = msg;
-		qspi->bspi_rf_msg_status = 0;
-		qspi->bspi_rf_msg_len = rdlen;
+		qspi->bspi_rf_op = op;
+		qspi->bspi_rf_op_status = 0;
+		qspi->bspi_rf_op_len = rdlen;
 		dev_dbg(&qspi->pdev->dev,
 			"bspi xfr addr 0x%x len 0x%x", addr, rdlen);
 		bcm_qspi_write(qspi, BSPI, BSPI_RAF_START_ADDR, addr);
@@ -879,7 +878,6 @@ static int bcm_qspi_bspi_flash_read(struct spi_device *spi,
 		}
 
 		/* set msg return length */
-		msg->retlen += rdlen;
 		addr += rdlen;
 		len -= rdlen;
 	} while (len);
@@ -914,61 +912,63 @@ static int bcm_qspi_transfer_one(struct spi_master *master,
 	return 0;
 }
 
-static int bcm_qspi_mspi_flash_read(struct spi_device *spi,
-				    struct spi_flash_read_message *msg)
+static int bcm_qspi_mspi_exec_mem_op(struct spi_device *spi,
+				     const struct spi_mem_op *op)
 {
-	struct bcm_qspi *qspi = spi_master_get_devdata(spi->master);
+	struct spi_master *master = spi->master;
+	struct bcm_qspi *qspi = spi_master_get_devdata(master);
 	struct spi_transfer t[2];
-	u8 cmd[6];
-	int ret;
+	u8 cmd[6] = { };
+	int ret, i;
 
 	memset(cmd, 0, sizeof(cmd));
 	memset(t, 0, sizeof(t));
 
 	/* tx */
 	/* opcode is in cmd[0] */
-	cmd[0] = msg->read_opcode;
-	cmd[1] = msg->from >> (msg->addr_width * 8 -  8);
-	cmd[2] = msg->from >> (msg->addr_width * 8 - 16);
-	cmd[3] = msg->from >> (msg->addr_width * 8 - 24);
-	cmd[4] = msg->from >> (msg->addr_width * 8 - 32);
+	cmd[0] = op->cmd.opcode;
+	for (i = 0; i < op->addr.nbytes; i++)
+		cmd[1 + i] = op->addr.val >> (8 * (op->addr.nbytes - i - 1));
+
 	t[0].tx_buf = cmd;
-	t[0].len = msg->addr_width + msg->dummy_bytes + 1;
+	t[0].len = op->addr.nbytes + op->dummy.nbytes + 1;
 	t[0].bits_per_word = spi->bits_per_word;
-	t[0].tx_nbits = msg->opcode_nbits;
+	t[0].tx_nbits = op->cmd.buswidth;
 	/* lets mspi know that this is not last transfer */
 	qspi->trans_pos.mspi_last_trans = false;
-	ret = bcm_qspi_transfer_one(spi->master, spi, &t[0]);
+	ret = bcm_qspi_transfer_one(master, spi, &t[0]);
 
 	/* rx */
 	qspi->trans_pos.mspi_last_trans = true;
 	if (!ret) {
 		/* rx */
-		t[1].rx_buf = msg->buf;
-		t[1].len = msg->len;
-		t[1].rx_nbits =  msg->data_nbits;
+		t[1].rx_buf = op->data.buf.in;
+		t[1].len = op->data.nbytes;
+		t[1].rx_nbits =  op->data.buswidth;
 		t[1].bits_per_word = spi->bits_per_word;
-		ret = bcm_qspi_transfer_one(spi->master, spi, &t[1]);
+		ret = bcm_qspi_transfer_one(master, spi, &t[1]);
 	}
 
-	if (!ret)
-		msg->retlen = msg->len;
-
 	return ret;
 }
 
-static int bcm_qspi_flash_read(struct spi_device *spi,
-			       struct spi_flash_read_message *msg)
+static int bcm_qspi_exec_mem_op(struct spi_mem *mem,
+				const struct spi_mem_op *op)
 {
+	struct spi_device *spi = mem->spi;
 	struct bcm_qspi *qspi = spi_master_get_devdata(spi->master);
 	int ret = 0;
 	bool mspi_read = false;
-	u32 addr, len;
+	u32 addr = 0, len;
 	u_char *buf;
 
-	buf = msg->buf;
-	addr = msg->from;
-	len = msg->len;
+	if (!op->data.nbytes || !op->addr.nbytes || op->addr.nbytes > 4 ||
+	    op->data.dir != SPI_MEM_DATA_IN)
+		return -ENOTSUPP;
+
+	buf = op->data.buf.in;
+	addr = op->addr.val;
+	len = op->data.nbytes;
 
 	if (bcm_qspi_bspi_ver_three(qspi) == true) {
 		/*
@@ -990,12 +990,12 @@ static int bcm_qspi_flash_read(struct spi_device *spi,
 		mspi_read = true;
 
 	if (mspi_read)
-		return bcm_qspi_mspi_flash_read(spi, msg);
+		return bcm_qspi_mspi_exec_mem_op(spi, op);
 
-	ret = bcm_qspi_bspi_set_mode(qspi, msg, -1);
+	ret = bcm_qspi_bspi_set_mode(qspi, op, -1);
 
 	if (!ret)
-		ret = bcm_qspi_bspi_flash_read(spi, msg);
+		ret = bcm_qspi_bspi_exec_mem_op(spi, op);
 
 	return ret;
 }
@@ -1034,10 +1034,10 @@ static irqreturn_t bcm_qspi_bspi_lr_l2_isr(int irq, void *dev_id)
 	struct bcm_qspi_soc_intc *soc_intc = qspi->soc_intc;
 	u32 status = qspi_dev_id->irqp->mask;
 
-	if (qspi->bspi_enabled && qspi->bspi_rf_msg) {
+	if (qspi->bspi_enabled && qspi->bspi_rf_op) {
 		bcm_qspi_bspi_lr_data_read(qspi);
-		if (qspi->bspi_rf_msg_len == 0) {
-			qspi->bspi_rf_msg = NULL;
+		if (qspi->bspi_rf_op_len == 0) {
+			qspi->bspi_rf_op = NULL;
 			if (qspi->soc_intc) {
 				/* disable soc BSPI interrupt */
 				soc_intc->bcm_qspi_int_set(soc_intc, BSPI_DONE,
@@ -1046,7 +1046,7 @@ static irqreturn_t bcm_qspi_bspi_lr_l2_isr(int irq, void *dev_id)
 				status = INTR_BSPI_LR_SESSION_DONE_MASK;
 			}
 
-			if (qspi->bspi_rf_msg_status)
+			if (qspi->bspi_rf_op_status)
 				bcm_qspi_bspi_lr_clear(qspi);
 			else
 				bcm_qspi_bspi_flush_prefetch_buffers(qspi);
@@ -1058,7 +1058,7 @@ static irqreturn_t bcm_qspi_bspi_lr_l2_isr(int irq, void *dev_id)
 	}
 
 	status &= INTR_BSPI_LR_SESSION_DONE_MASK;
-	if (qspi->bspi_enabled && status && qspi->bspi_rf_msg_len == 0)
+	if (qspi->bspi_enabled && status && qspi->bspi_rf_op_len == 0)
 		complete(&qspi->bspi_done);
 
 	return IRQ_HANDLED;
@@ -1071,7 +1071,7 @@ static irqreturn_t bcm_qspi_bspi_lr_err_l2_isr(int irq, void *dev_id)
 	struct bcm_qspi_soc_intc *soc_intc = qspi->soc_intc;
 
 	dev_err(&qspi->pdev->dev, "BSPI INT error\n");
-	qspi->bspi_rf_msg_status = -EIO;
+	qspi->bspi_rf_op_status = -EIO;
 	if (qspi->soc_intc)
 		/* clear soc interrupt */
 		soc_intc->bcm_qspi_int_ack(soc_intc, BSPI_ERR);
@@ -1194,6 +1194,10 @@ static void bcm_qspi_hw_uninit(struct bcm_qspi *qspi)
 
 }
 
+static const struct spi_controller_mem_ops bcm_qspi_mem_ops = {
+	.exec_op = bcm_qspi_exec_mem_op,
+};
+
 static const struct of_device_id bcm_qspi_of_match[] = {
 	{ .compatible = "brcm,spi-bcm-qspi" },
 	{},
@@ -1236,7 +1240,7 @@ int bcm_qspi_probe(struct platform_device *pdev,
 	master->mode_bits = SPI_CPHA | SPI_CPOL | SPI_RX_DUAL | SPI_RX_QUAD;
 	master->setup = bcm_qspi_setup;
 	master->transfer_one = bcm_qspi_transfer_one;
-	master->spi_flash_read = bcm_qspi_flash_read;
+	master->mem_ops = &bcm_qspi_mem_ops;
 	master->cleanup = bcm_qspi_cleanup;
 	master->dev.of_node = dev->of_node;
 	master->num_chipselect = NUM_CHIPSELECT;
diff --git a/drivers/spi/spi-bcm53xx.c b/drivers/spi/spi-bcm53xx.c
deleted file mode 100644
index d02ceb7..0000000
--- a/drivers/spi/spi-bcm53xx.c
+++ /dev/null
@@ -1,360 +0,0 @@
-/*
- * Copyright (C) 2014-2016 Rafał Miłecki <rafal@milecki.pl>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#define pr_fmt(fmt)		KBUILD_MODNAME ": " fmt
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/delay.h>
-#include <linux/bcma/bcma.h>
-#include <linux/spi/spi.h>
-
-#include "spi-bcm53xx.h"
-
-#define BCM53XXSPI_MAX_SPI_BAUD	13500000	/* 216 MHz? */
-#define BCM53XXSPI_FLASH_WINDOW	SZ_32M
-
-/* The longest observed required wait was 19 ms */
-#define BCM53XXSPI_SPE_TIMEOUT_MS	80
-
-struct bcm53xxspi {
-	struct bcma_device *core;
-	struct spi_master *master;
-	void __iomem *mmio_base;
-	bool bspi;				/* Boot SPI mode with memory mapping */
-};
-
-static inline u32 bcm53xxspi_read(struct bcm53xxspi *b53spi, u16 offset)
-{
-	return bcma_read32(b53spi->core, offset);
-}
-
-static inline void bcm53xxspi_write(struct bcm53xxspi *b53spi, u16 offset,
-				    u32 value)
-{
-	bcma_write32(b53spi->core, offset, value);
-}
-
-static void bcm53xxspi_disable_bspi(struct bcm53xxspi *b53spi)
-{
-	struct device *dev = &b53spi->core->dev;
-	unsigned long deadline;
-	u32 tmp;
-
-	if (!b53spi->bspi)
-		return;
-
-	tmp = bcm53xxspi_read(b53spi, B53SPI_BSPI_MAST_N_BOOT_CTRL);
-	if (tmp & 0x1)
-		return;
-
-	deadline = jiffies + usecs_to_jiffies(200);
-	do {
-		tmp = bcm53xxspi_read(b53spi, B53SPI_BSPI_BUSY_STATUS);
-		if (!(tmp & 0x1)) {
-			bcm53xxspi_write(b53spi, B53SPI_BSPI_MAST_N_BOOT_CTRL,
-					 0x1);
-			ndelay(200);
-			b53spi->bspi = false;
-			return;
-		}
-		udelay(1);
-	} while (!time_after_eq(jiffies, deadline));
-
-	dev_warn(dev, "Timeout disabling BSPI\n");
-}
-
-static void bcm53xxspi_enable_bspi(struct bcm53xxspi *b53spi)
-{
-	u32 tmp;
-
-	if (b53spi->bspi)
-		return;
-
-	tmp = bcm53xxspi_read(b53spi, B53SPI_BSPI_MAST_N_BOOT_CTRL);
-	if (!(tmp & 0x1))
-		return;
-
-	bcm53xxspi_write(b53spi, B53SPI_BSPI_MAST_N_BOOT_CTRL, 0x0);
-	b53spi->bspi = true;
-}
-
-static inline unsigned int bcm53xxspi_calc_timeout(size_t len)
-{
-	/* Do some magic calculation based on length and buad. Add 10% and 1. */
-	return (len * 9000 / BCM53XXSPI_MAX_SPI_BAUD * 110 / 100) + 1;
-}
-
-static int bcm53xxspi_wait(struct bcm53xxspi *b53spi, unsigned int timeout_ms)
-{
-	unsigned long deadline;
-	u32 tmp;
-
-	/* SPE bit has to be 0 before we read MSPI STATUS */
-	deadline = jiffies + msecs_to_jiffies(BCM53XXSPI_SPE_TIMEOUT_MS);
-	do {
-		tmp = bcm53xxspi_read(b53spi, B53SPI_MSPI_SPCR2);
-		if (!(tmp & B53SPI_MSPI_SPCR2_SPE))
-			break;
-		udelay(5);
-	} while (!time_after_eq(jiffies, deadline));
-
-	if (tmp & B53SPI_MSPI_SPCR2_SPE)
-		goto spi_timeout;
-
-	/* Check status */
-	deadline = jiffies + msecs_to_jiffies(timeout_ms);
-	do {
-		tmp = bcm53xxspi_read(b53spi, B53SPI_MSPI_MSPI_STATUS);
-		if (tmp & B53SPI_MSPI_MSPI_STATUS_SPIF) {
-			bcm53xxspi_write(b53spi, B53SPI_MSPI_MSPI_STATUS, 0);
-			return 0;
-		}
-
-		cpu_relax();
-		udelay(100);
-	} while (!time_after_eq(jiffies, deadline));
-
-spi_timeout:
-	bcm53xxspi_write(b53spi, B53SPI_MSPI_MSPI_STATUS, 0);
-
-	pr_err("Timeout waiting for SPI to be ready!\n");
-
-	return -EBUSY;
-}
-
-static void bcm53xxspi_buf_write(struct bcm53xxspi *b53spi, u8 *w_buf,
-				 size_t len, bool cont)
-{
-	u32 tmp;
-	int i;
-
-	for (i = 0; i < len; i++) {
-		/* Transmit Register File MSB */
-		bcm53xxspi_write(b53spi, B53SPI_MSPI_TXRAM + 4 * (i * 2),
-				 (unsigned int)w_buf[i]);
-	}
-
-	for (i = 0; i < len; i++) {
-		tmp = B53SPI_CDRAM_CONT | B53SPI_CDRAM_PCS_DISABLE_ALL |
-		      B53SPI_CDRAM_PCS_DSCK;
-		if (!cont && i == len - 1)
-			tmp &= ~B53SPI_CDRAM_CONT;
-		tmp &= ~0x1;
-		/* Command Register File */
-		bcm53xxspi_write(b53spi, B53SPI_MSPI_CDRAM + 4 * i, tmp);
-	}
-
-	/* Set queue pointers */
-	bcm53xxspi_write(b53spi, B53SPI_MSPI_NEWQP, 0);
-	bcm53xxspi_write(b53spi, B53SPI_MSPI_ENDQP, len - 1);
-
-	if (cont)
-		bcm53xxspi_write(b53spi, B53SPI_MSPI_WRITE_LOCK, 1);
-
-	/* Start SPI transfer */
-	tmp = bcm53xxspi_read(b53spi, B53SPI_MSPI_SPCR2);
-	tmp |= B53SPI_MSPI_SPCR2_SPE;
-	if (cont)
-		tmp |= B53SPI_MSPI_SPCR2_CONT_AFTER_CMD;
-	bcm53xxspi_write(b53spi, B53SPI_MSPI_SPCR2, tmp);
-
-	/* Wait for SPI to finish */
-	bcm53xxspi_wait(b53spi, bcm53xxspi_calc_timeout(len));
-
-	if (!cont)
-		bcm53xxspi_write(b53spi, B53SPI_MSPI_WRITE_LOCK, 0);
-}
-
-static void bcm53xxspi_buf_read(struct bcm53xxspi *b53spi, u8 *r_buf,
-				size_t len, bool cont)
-{
-	u32 tmp;
-	int i;
-
-	for (i = 0; i < len; i++) {
-		tmp = B53SPI_CDRAM_CONT | B53SPI_CDRAM_PCS_DISABLE_ALL |
-		      B53SPI_CDRAM_PCS_DSCK;
-		if (!cont && i == len - 1)
-			tmp &= ~B53SPI_CDRAM_CONT;
-		tmp &= ~0x1;
-		/* Command Register File */
-		bcm53xxspi_write(b53spi, B53SPI_MSPI_CDRAM + 4 * i, tmp);
-	}
-
-	/* Set queue pointers */
-	bcm53xxspi_write(b53spi, B53SPI_MSPI_NEWQP, 0);
-	bcm53xxspi_write(b53spi, B53SPI_MSPI_ENDQP, len - 1);
-
-	if (cont)
-		bcm53xxspi_write(b53spi, B53SPI_MSPI_WRITE_LOCK, 1);
-
-	/* Start SPI transfer */
-	tmp = bcm53xxspi_read(b53spi, B53SPI_MSPI_SPCR2);
-	tmp |= B53SPI_MSPI_SPCR2_SPE;
-	if (cont)
-		tmp |= B53SPI_MSPI_SPCR2_CONT_AFTER_CMD;
-	bcm53xxspi_write(b53spi, B53SPI_MSPI_SPCR2, tmp);
-
-	/* Wait for SPI to finish */
-	bcm53xxspi_wait(b53spi, bcm53xxspi_calc_timeout(len));
-
-	if (!cont)
-		bcm53xxspi_write(b53spi, B53SPI_MSPI_WRITE_LOCK, 0);
-
-	for (i = 0; i < len; ++i) {
-		u16 reg = B53SPI_MSPI_RXRAM + 4 * (1 + i * 2);
-
-		/* Data stored in the transmit register file LSB */
-		r_buf[i] = (u8)bcm53xxspi_read(b53spi, reg);
-	}
-}
-
-static int bcm53xxspi_transfer_one(struct spi_master *master,
-				   struct spi_device *spi,
-				   struct spi_transfer *t)
-{
-	struct bcm53xxspi *b53spi = spi_master_get_devdata(master);
-	u8 *buf;
-	size_t left;
-
-	bcm53xxspi_disable_bspi(b53spi);
-
-	if (t->tx_buf) {
-		buf = (u8 *)t->tx_buf;
-		left = t->len;
-		while (left) {
-			size_t to_write = min_t(size_t, 16, left);
-			bool cont = !spi_transfer_is_last(master, t) ||
-				    left - to_write > 0;
-
-			bcm53xxspi_buf_write(b53spi, buf, to_write, cont);
-			left -= to_write;
-			buf += to_write;
-		}
-	}
-
-	if (t->rx_buf) {
-		buf = (u8 *)t->rx_buf;
-		left = t->len;
-		while (left) {
-			size_t to_read = min_t(size_t, 16, left);
-			bool cont = !spi_transfer_is_last(master, t) ||
-				    left - to_read > 0;
-
-			bcm53xxspi_buf_read(b53spi, buf, to_read, cont);
-			left -= to_read;
-			buf += to_read;
-		}
-	}
-
-	return 0;
-}
-
-static int bcm53xxspi_flash_read(struct spi_device *spi,
-				 struct spi_flash_read_message *msg)
-{
-	struct bcm53xxspi *b53spi = spi_master_get_devdata(spi->master);
-	int ret = 0;
-
-	if (msg->from + msg->len > BCM53XXSPI_FLASH_WINDOW)
-		return -EINVAL;
-
-	bcm53xxspi_enable_bspi(b53spi);
-	memcpy_fromio(msg->buf, b53spi->mmio_base + msg->from, msg->len);
-	msg->retlen = msg->len;
-
-	return ret;
-}
-
-/**************************************************
- * BCMA
- **************************************************/
-
-static const struct bcma_device_id bcm53xxspi_bcma_tbl[] = {
-	BCMA_CORE(BCMA_MANUF_BCM, BCMA_CORE_NS_QSPI, BCMA_ANY_REV, BCMA_ANY_CLASS),
-	{},
-};
-MODULE_DEVICE_TABLE(bcma, bcm53xxspi_bcma_tbl);
-
-static int bcm53xxspi_bcma_probe(struct bcma_device *core)
-{
-	struct device *dev = &core->dev;
-	struct bcm53xxspi *b53spi;
-	struct spi_master *master;
-	int err;
-
-	if (core->bus->drv_cc.core->id.rev != 42) {
-		pr_err("SPI on SoC with unsupported ChipCommon rev\n");
-		return -ENOTSUPP;
-	}
-
-	master = spi_alloc_master(dev, sizeof(*b53spi));
-	if (!master)
-		return -ENOMEM;
-
-	b53spi = spi_master_get_devdata(master);
-	b53spi->master = master;
-	b53spi->core = core;
-
-	if (core->addr_s[0])
-		b53spi->mmio_base = devm_ioremap(dev, core->addr_s[0],
-						 BCM53XXSPI_FLASH_WINDOW);
-	b53spi->bspi = true;
-	bcm53xxspi_disable_bspi(b53spi);
-
-	master->dev.of_node = dev->of_node;
-	master->transfer_one = bcm53xxspi_transfer_one;
-	if (b53spi->mmio_base)
-		master->spi_flash_read = bcm53xxspi_flash_read;
-
-	bcma_set_drvdata(core, b53spi);
-
-	err = devm_spi_register_master(dev, master);
-	if (err) {
-		spi_master_put(master);
-		bcma_set_drvdata(core, NULL);
-		return err;
-	}
-
-	return 0;
-}
-
-static struct bcma_driver bcm53xxspi_bcma_driver = {
-	.name		= KBUILD_MODNAME,
-	.id_table	= bcm53xxspi_bcma_tbl,
-	.probe		= bcm53xxspi_bcma_probe,
-};
-
-/**************************************************
- * Init & exit
- **************************************************/
-
-static int __init bcm53xxspi_module_init(void)
-{
-	int err = 0;
-
-	err = bcma_driver_register(&bcm53xxspi_bcma_driver);
-	if (err)
-		pr_err("Failed to register bcma driver: %d\n", err);
-
-	return err;
-}
-
-static void __exit bcm53xxspi_module_exit(void)
-{
-	bcma_driver_unregister(&bcm53xxspi_bcma_driver);
-}
-
-module_init(bcm53xxspi_module_init);
-module_exit(bcm53xxspi_module_exit);
-
-MODULE_DESCRIPTION("Broadcom BCM53xx SPI Controller driver");
-MODULE_AUTHOR("Rafał Miłecki <zajec5@gmail.com>");
-MODULE_LICENSE("GPL v2");
diff --git a/drivers/spi/spi-bcm53xx.h b/drivers/spi/spi-bcm53xx.h
deleted file mode 100644
index 03e3442..0000000
--- a/drivers/spi/spi-bcm53xx.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef SPI_BCM53XX_H
-#define SPI_BCM53XX_H
-
-#define B53SPI_BSPI_REVISION_ID			0x000
-#define B53SPI_BSPI_SCRATCH			0x004
-#define B53SPI_BSPI_MAST_N_BOOT_CTRL		0x008
-#define B53SPI_BSPI_BUSY_STATUS			0x00c
-#define B53SPI_BSPI_INTR_STATUS			0x010
-#define B53SPI_BSPI_B0_STATUS			0x014
-#define B53SPI_BSPI_B0_CTRL			0x018
-#define B53SPI_BSPI_B1_STATUS			0x01c
-#define B53SPI_BSPI_B1_CTRL			0x020
-#define B53SPI_BSPI_STRAP_OVERRIDE_CTRL		0x024
-#define B53SPI_BSPI_FLEX_MODE_ENABLE		0x028
-#define B53SPI_BSPI_BITS_PER_CYCLE		0x02c
-#define B53SPI_BSPI_BITS_PER_PHASE		0x030
-#define B53SPI_BSPI_CMD_AND_MODE_BYTE		0x034
-#define B53SPI_BSPI_BSPI_FLASH_UPPER_ADDR_BYTE	0x038
-#define B53SPI_BSPI_BSPI_XOR_VALUE		0x03c
-#define B53SPI_BSPI_BSPI_XOR_ENABLE		0x040
-#define B53SPI_BSPI_BSPI_PIO_MODE_ENABLE	0x044
-#define B53SPI_BSPI_BSPI_PIO_IODIR		0x048
-#define B53SPI_BSPI_BSPI_PIO_DATA		0x04c
-
-/* RAF */
-#define B53SPI_RAF_START_ADDR			0x100
-#define B53SPI_RAF_NUM_WORDS			0x104
-#define B53SPI_RAF_CTRL				0x108
-#define B53SPI_RAF_FULLNESS			0x10c
-#define B53SPI_RAF_WATERMARK			0x110
-#define B53SPI_RAF_STATUS			0x114
-#define B53SPI_RAF_READ_DATA			0x118
-#define B53SPI_RAF_WORD_CNT			0x11c
-#define B53SPI_RAF_CURR_ADDR			0x120
-
-/* MSPI */
-#define B53SPI_MSPI_SPCR0_LSB			0x200
-#define B53SPI_MSPI_SPCR0_MSB			0x204
-#define B53SPI_MSPI_SPCR1_LSB			0x208
-#define B53SPI_MSPI_SPCR1_MSB			0x20c
-#define B53SPI_MSPI_NEWQP			0x210
-#define B53SPI_MSPI_ENDQP			0x214
-#define B53SPI_MSPI_SPCR2			0x218
-#define  B53SPI_MSPI_SPCR2_SPE			0x00000040
-#define  B53SPI_MSPI_SPCR2_CONT_AFTER_CMD	0x00000080
-#define B53SPI_MSPI_MSPI_STATUS			0x220
-#define  B53SPI_MSPI_MSPI_STATUS_SPIF		0x00000001
-#define B53SPI_MSPI_CPTQP			0x224
-#define B53SPI_MSPI_TXRAM			0x240 /* 32 registers, up to 0x2b8 */
-#define B53SPI_MSPI_RXRAM			0x2c0 /* 32 registers, up to 0x33c */
-#define B53SPI_MSPI_CDRAM			0x340 /* 16 registers, up to 0x37c */
-#define  B53SPI_CDRAM_PCS_PCS0			0x00000001
-#define  B53SPI_CDRAM_PCS_PCS1			0x00000002
-#define  B53SPI_CDRAM_PCS_PCS2			0x00000004
-#define  B53SPI_CDRAM_PCS_PCS3			0x00000008
-#define  B53SPI_CDRAM_PCS_DISABLE_ALL		0x0000000f
-#define  B53SPI_CDRAM_PCS_DSCK			0x00000010
-#define  B53SPI_CDRAM_BITSE			0x00000040
-#define  B53SPI_CDRAM_CONT			0x00000080
-#define B53SPI_MSPI_WRITE_LOCK			0x380
-#define B53SPI_MSPI_DISABLE_FLUSH_GEN		0x384
-
-/* Interrupt */
-#define B53SPI_INTR_RAF_LR_FULLNESS_REACHED	0x3a0
-#define B53SPI_INTR_RAF_LR_TRUNCATED		0x3a4
-#define B53SPI_INTR_RAF_LR_IMPATIENT		0x3a8
-#define B53SPI_INTR_RAF_LR_SESSION_DONE		0x3ac
-#define B53SPI_INTR_RAF_LR_OVERREAD		0x3b0
-#define B53SPI_INTR_MSPI_DONE			0x3b4
-#define B53SPI_INTR_MSPI_HALT_SET_TRANSACTION_DONE	0x3b8
-
-#endif /* SPI_BCM53XX_H */
diff --git a/drivers/spi/spi-bcm63xx-hsspi.c b/drivers/spi/spi-bcm63xx-hsspi.c
index cbcba61..c23849f 100644
--- a/drivers/spi/spi-bcm63xx-hsspi.c
+++ b/drivers/spi/spi-bcm63xx-hsspi.c
@@ -352,22 +352,31 @@ static int bcm63xx_hsspi_probe(struct platform_device *pdev)
 	if (IS_ERR(clk))
 		return PTR_ERR(clk);
 
+	ret = clk_prepare_enable(clk);
+	if (ret)
+		return ret;
+
 	rate = clk_get_rate(clk);
 	if (!rate) {
 		struct clk *pll_clk = devm_clk_get(dev, "pll");
 
-		if (IS_ERR(pll_clk))
-			return PTR_ERR(pll_clk);
+		if (IS_ERR(pll_clk)) {
+			ret = PTR_ERR(pll_clk);
+			goto out_disable_clk;
+		}
+
+		ret = clk_prepare_enable(pll_clk);
+		if (ret)
+			goto out_disable_clk;
 
 		rate = clk_get_rate(pll_clk);
-		if (!rate)
-			return -EINVAL;
+		clk_disable_unprepare(pll_clk);
+		if (!rate) {
+			ret = -EINVAL;
+			goto out_disable_clk;
+		}
 	}
 
-	ret = clk_prepare_enable(clk);
-	if (ret)
-		return ret;
-
 	master = spi_alloc_master(&pdev->dev, sizeof(*bs));
 	if (!master) {
 		ret = -ENOMEM;
diff --git a/drivers/spi/spi-cadence.c b/drivers/spi/spi-cadence.c
index 4a00163..f3dad6f 100644
--- a/drivers/spi/spi-cadence.c
+++ b/drivers/spi/spi-cadence.c
@@ -694,8 +694,7 @@ static int cdns_spi_remove(struct platform_device *pdev)
  */
 static int __maybe_unused cdns_spi_suspend(struct device *dev)
 {
-	struct platform_device *pdev = to_platform_device(dev);
-	struct spi_master *master = platform_get_drvdata(pdev);
+	struct spi_master *master = dev_get_drvdata(dev);
 
 	return spi_master_suspend(master);
 }
@@ -710,8 +709,7 @@ static int __maybe_unused cdns_spi_suspend(struct device *dev)
  */
 static int __maybe_unused cdns_spi_resume(struct device *dev)
 {
-	struct platform_device *pdev = to_platform_device(dev);
-	struct spi_master *master = platform_get_drvdata(pdev);
+	struct spi_master *master = dev_get_drvdata(dev);
 	struct cdns_spi *xspi = spi_master_get_devdata(master);
 
 	cdns_spi_init_hw(xspi);
diff --git a/drivers/spi/spi-fsl-lpspi.c b/drivers/spi/spi-fsl-lpspi.c
index cb3c730..e6d5cc6 100644
--- a/drivers/spi/spi-fsl-lpspi.c
+++ b/drivers/spi/spi-fsl-lpspi.c
@@ -1,19 +1,8 @@
-/*
- * Freescale i.MX7ULP LPSPI driver
- *
- * Copyright 2016 Freescale Semiconductor, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- */
+// SPDX-License-Identifier: GPL-2.0+
+//
+// Freescale i.MX7ULP LPSPI driver
+//
+// Copyright 2016 Freescale Semiconductor, Inc.
 
 #include <linux/clk.h>
 #include <linux/completion.h>
diff --git a/drivers/spi/spi-imx.c b/drivers/spi/spi-imx.c
index a056ee8..866246f 100644
--- a/drivers/spi/spi-imx.c
+++ b/drivers/spi/spi-imx.c
@@ -1,22 +1,6 @@
-/*
- * Copyright 2004-2007 Freescale Semiconductor, Inc. All Rights Reserved.
- * Copyright (C) 2008 Juergen Beisert
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the
- * Free Software Foundation
- * 51 Franklin Street, Fifth Floor
- * Boston, MA  02110-1301, USA.
- */
+// SPDX-License-Identifier: GPL-2.0+
+// Copyright 2004-2007 Freescale Semiconductor, Inc. All Rights Reserved.
+// Copyright (C) 2008 Juergen Beisert
 
 #include <linux/clk.h>
 #include <linux/completion.h>
diff --git a/drivers/spi/spi-mem.c b/drivers/spi/spi-mem.c
new file mode 100644
index 0000000..990770d
--- /dev/null
+++ b/drivers/spi/spi-mem.c
@@ -0,0 +1,410 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2018 Exceet Electronics GmbH
+ * Copyright (C) 2018 Bootlin
+ *
+ * Author: Boris Brezillon <boris.brezillon@bootlin.com>
+ */
+#include <linux/dmaengine.h>
+#include <linux/pm_runtime.h>
+#include <linux/spi/spi.h>
+#include <linux/spi/spi-mem.h>
+
+#include "internals.h"
+
+/**
+ * spi_controller_dma_map_mem_op_data() - DMA-map the buffer attached to a
+ *					  memory operation
+ * @ctlr: the SPI controller requesting this dma_map()
+ * @op: the memory operation containing the buffer to map
+ * @sgt: a pointer to a non-initialized sg_table that will be filled by this
+ *	 function
+ *
+ * Some controllers might want to do DMA on the data buffer embedded in @op.
+ * This helper prepares everything for you and provides a ready-to-use
+ * sg_table. This function is not intended to be called from spi drivers.
+ * Only SPI controller drivers should use it.
+ * Note that the caller must ensure the memory region pointed by
+ * op->data.buf.{in,out} is DMA-able before calling this function.
+ *
+ * Return: 0 in case of success, a negative error code otherwise.
+ */
+int spi_controller_dma_map_mem_op_data(struct spi_controller *ctlr,
+				       const struct spi_mem_op *op,
+				       struct sg_table *sgt)
+{
+	struct device *dmadev;
+
+	if (!op->data.nbytes)
+		return -EINVAL;
+
+	if (op->data.dir == SPI_MEM_DATA_OUT && ctlr->dma_tx)
+		dmadev = ctlr->dma_tx->device->dev;
+	else if (op->data.dir == SPI_MEM_DATA_IN && ctlr->dma_rx)
+		dmadev = ctlr->dma_rx->device->dev;
+	else
+		dmadev = ctlr->dev.parent;
+
+	if (!dmadev)
+		return -EINVAL;
+
+	return spi_map_buf(ctlr, dmadev, sgt, op->data.buf.in, op->data.nbytes,
+			   op->data.dir == SPI_MEM_DATA_IN ?
+			   DMA_FROM_DEVICE : DMA_TO_DEVICE);
+}
+EXPORT_SYMBOL_GPL(spi_controller_dma_map_mem_op_data);
+
+/**
+ * spi_controller_dma_unmap_mem_op_data() - DMA-unmap the buffer attached to a
+ *					    memory operation
+ * @ctlr: the SPI controller requesting this dma_unmap()
+ * @op: the memory operation containing the buffer to unmap
+ * @sgt: a pointer to an sg_table previously initialized by
+ *	 spi_controller_dma_map_mem_op_data()
+ *
+ * Some controllers might want to do DMA on the data buffer embedded in @op.
+ * This helper prepares things so that the CPU can access the
+ * op->data.buf.{in,out} buffer again.
+ *
+ * This function is not intended to be called from SPI drivers. Only SPI
+ * controller drivers should use it.
+ *
+ * This function should be called after the DMA operation has finished and is
+ * only valid if the previous spi_controller_dma_map_mem_op_data() call
+ * returned 0.
+ *
+ * Return: 0 in case of success, a negative error code otherwise.
+ */
+void spi_controller_dma_unmap_mem_op_data(struct spi_controller *ctlr,
+					  const struct spi_mem_op *op,
+					  struct sg_table *sgt)
+{
+	struct device *dmadev;
+
+	if (!op->data.nbytes)
+		return;
+
+	if (op->data.dir == SPI_MEM_DATA_OUT && ctlr->dma_tx)
+		dmadev = ctlr->dma_tx->device->dev;
+	else if (op->data.dir == SPI_MEM_DATA_IN && ctlr->dma_rx)
+		dmadev = ctlr->dma_rx->device->dev;
+	else
+		dmadev = ctlr->dev.parent;
+
+	spi_unmap_buf(ctlr, dmadev, sgt,
+		      op->data.dir == SPI_MEM_DATA_IN ?
+		      DMA_FROM_DEVICE : DMA_TO_DEVICE);
+}
+EXPORT_SYMBOL_GPL(spi_controller_dma_unmap_mem_op_data);
+
+static int spi_check_buswidth_req(struct spi_mem *mem, u8 buswidth, bool tx)
+{
+	u32 mode = mem->spi->mode;
+
+	switch (buswidth) {
+	case 1:
+		return 0;
+
+	case 2:
+		if ((tx && (mode & (SPI_TX_DUAL | SPI_TX_QUAD))) ||
+		    (!tx && (mode & (SPI_RX_DUAL | SPI_RX_QUAD))))
+			return 0;
+
+		break;
+
+	case 4:
+		if ((tx && (mode & SPI_TX_QUAD)) ||
+		    (!tx && (mode & SPI_RX_QUAD)))
+			return 0;
+
+		break;
+
+	default:
+		break;
+	}
+
+	return -ENOTSUPP;
+}
+
+static bool spi_mem_default_supports_op(struct spi_mem *mem,
+					const struct spi_mem_op *op)
+{
+	if (spi_check_buswidth_req(mem, op->cmd.buswidth, true))
+		return false;
+
+	if (op->addr.nbytes &&
+	    spi_check_buswidth_req(mem, op->addr.buswidth, true))
+		return false;
+
+	if (op->dummy.nbytes &&
+	    spi_check_buswidth_req(mem, op->dummy.buswidth, true))
+		return false;
+
+	if (op->data.nbytes &&
+	    spi_check_buswidth_req(mem, op->data.buswidth,
+				   op->data.dir == SPI_MEM_DATA_OUT))
+		return false;
+
+	return true;
+}
+EXPORT_SYMBOL_GPL(spi_mem_default_supports_op);
+
+/**
+ * spi_mem_supports_op() - Check if a memory device and the controller it is
+ *			   connected to support a specific memory operation
+ * @mem: the SPI memory
+ * @op: the memory operation to check
+ *
+ * Some controllers are only supporting Single or Dual IOs, others might only
+ * support specific opcodes, or it can even be that the controller and device
+ * both support Quad IOs but the hardware prevents you from using it because
+ * only 2 IO lines are connected.
+ *
+ * This function checks whether a specific operation is supported.
+ *
+ * Return: true if @op is supported, false otherwise.
+ */
+bool spi_mem_supports_op(struct spi_mem *mem, const struct spi_mem_op *op)
+{
+	struct spi_controller *ctlr = mem->spi->controller;
+
+	if (ctlr->mem_ops && ctlr->mem_ops->supports_op)
+		return ctlr->mem_ops->supports_op(mem, op);
+
+	return spi_mem_default_supports_op(mem, op);
+}
+EXPORT_SYMBOL_GPL(spi_mem_supports_op);
+
+/**
+ * spi_mem_exec_op() - Execute a memory operation
+ * @mem: the SPI memory
+ * @op: the memory operation to execute
+ *
+ * Executes a memory operation.
+ *
+ * This function first checks that @op is supported and then tries to execute
+ * it.
+ *
+ * Return: 0 in case of success, a negative error code otherwise.
+ */
+int spi_mem_exec_op(struct spi_mem *mem, const struct spi_mem_op *op)
+{
+	unsigned int tmpbufsize, xferpos = 0, totalxferlen = 0;
+	struct spi_controller *ctlr = mem->spi->controller;
+	struct spi_transfer xfers[4] = { };
+	struct spi_message msg;
+	u8 *tmpbuf;
+	int ret;
+
+	if (!spi_mem_supports_op(mem, op))
+		return -ENOTSUPP;
+
+	if (ctlr->mem_ops) {
+		/*
+		 * Flush the message queue before executing our SPI memory
+		 * operation to prevent preemption of regular SPI transfers.
+		 */
+		spi_flush_queue(ctlr);
+
+		if (ctlr->auto_runtime_pm) {
+			ret = pm_runtime_get_sync(ctlr->dev.parent);
+			if (ret < 0) {
+				dev_err(&ctlr->dev,
+					"Failed to power device: %d\n",
+					ret);
+				return ret;
+			}
+		}
+
+		mutex_lock(&ctlr->bus_lock_mutex);
+		mutex_lock(&ctlr->io_mutex);
+		ret = ctlr->mem_ops->exec_op(mem, op);
+		mutex_unlock(&ctlr->io_mutex);
+		mutex_unlock(&ctlr->bus_lock_mutex);
+
+		if (ctlr->auto_runtime_pm)
+			pm_runtime_put(ctlr->dev.parent);
+
+		/*
+		 * Some controllers only optimize specific paths (typically the
+		 * read path) and expect the core to use the regular SPI
+		 * interface in other cases.
+		 */
+		if (!ret || ret != -ENOTSUPP)
+			return ret;
+	}
+
+	tmpbufsize = sizeof(op->cmd.opcode) + op->addr.nbytes +
+		     op->dummy.nbytes;
+
+	/*
+	 * Allocate a buffer to transmit the CMD, ADDR cycles with kmalloc() so
+	 * we're guaranteed that this buffer is DMA-able, as required by the
+	 * SPI layer.
+	 */
+	tmpbuf = kzalloc(tmpbufsize, GFP_KERNEL | GFP_DMA);
+	if (!tmpbuf)
+		return -ENOMEM;
+
+	spi_message_init(&msg);
+
+	tmpbuf[0] = op->cmd.opcode;
+	xfers[xferpos].tx_buf = tmpbuf;
+	xfers[xferpos].len = sizeof(op->cmd.opcode);
+	xfers[xferpos].tx_nbits = op->cmd.buswidth;
+	spi_message_add_tail(&xfers[xferpos], &msg);
+	xferpos++;
+	totalxferlen++;
+
+	if (op->addr.nbytes) {
+		int i;
+
+		for (i = 0; i < op->addr.nbytes; i++)
+			tmpbuf[i + 1] = op->addr.val >>
+					(8 * (op->addr.nbytes - i - 1));
+
+		xfers[xferpos].tx_buf = tmpbuf + 1;
+		xfers[xferpos].len = op->addr.nbytes;
+		xfers[xferpos].tx_nbits = op->addr.buswidth;
+		spi_message_add_tail(&xfers[xferpos], &msg);
+		xferpos++;
+		totalxferlen += op->addr.nbytes;
+	}
+
+	if (op->dummy.nbytes) {
+		memset(tmpbuf + op->addr.nbytes + 1, 0xff, op->dummy.nbytes);
+		xfers[xferpos].tx_buf = tmpbuf + op->addr.nbytes + 1;
+		xfers[xferpos].len = op->dummy.nbytes;
+		xfers[xferpos].tx_nbits = op->dummy.buswidth;
+		spi_message_add_tail(&xfers[xferpos], &msg);
+		xferpos++;
+		totalxferlen += op->dummy.nbytes;
+	}
+
+	if (op->data.nbytes) {
+		if (op->data.dir == SPI_MEM_DATA_IN) {
+			xfers[xferpos].rx_buf = op->data.buf.in;
+			xfers[xferpos].rx_nbits = op->data.buswidth;
+		} else {
+			xfers[xferpos].tx_buf = op->data.buf.out;
+			xfers[xferpos].tx_nbits = op->data.buswidth;
+		}
+
+		xfers[xferpos].len = op->data.nbytes;
+		spi_message_add_tail(&xfers[xferpos], &msg);
+		xferpos++;
+		totalxferlen += op->data.nbytes;
+	}
+
+	ret = spi_sync(mem->spi, &msg);
+
+	kfree(tmpbuf);
+
+	if (ret)
+		return ret;
+
+	if (msg.actual_length != totalxferlen)
+		return -EIO;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(spi_mem_exec_op);
+
+/**
+ * spi_mem_adjust_op_size() - Adjust the data size of a SPI mem operation to
+ *			      match controller limitations
+ * @mem: the SPI memory
+ * @op: the operation to adjust
+ *
+ * Some controllers have FIFO limitations and must split a data transfer
+ * operation into multiple ones, others require a specific alignment for
+ * optimized accesses. This function allows SPI mem drivers to split a single
+ * operation into multiple sub-operations when required.
+ *
+ * Return: a negative error code if the controller can't properly adjust @op,
+ *	   0 otherwise. Note that @op->data.nbytes will be updated if @op
+ *	   can't be handled in a single step.
+ */
+int spi_mem_adjust_op_size(struct spi_mem *mem, struct spi_mem_op *op)
+{
+	struct spi_controller *ctlr = mem->spi->controller;
+
+	if (ctlr->mem_ops && ctlr->mem_ops->adjust_op_size)
+		return ctlr->mem_ops->adjust_op_size(mem, op);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(spi_mem_adjust_op_size);
+
+static inline struct spi_mem_driver *to_spi_mem_drv(struct device_driver *drv)
+{
+	return container_of(drv, struct spi_mem_driver, spidrv.driver);
+}
+
+static int spi_mem_probe(struct spi_device *spi)
+{
+	struct spi_mem_driver *memdrv = to_spi_mem_drv(spi->dev.driver);
+	struct spi_mem *mem;
+
+	mem = devm_kzalloc(&spi->dev, sizeof(*mem), GFP_KERNEL);
+	if (!mem)
+		return -ENOMEM;
+
+	mem->spi = spi;
+	spi_set_drvdata(spi, mem);
+
+	return memdrv->probe(mem);
+}
+
+static int spi_mem_remove(struct spi_device *spi)
+{
+	struct spi_mem_driver *memdrv = to_spi_mem_drv(spi->dev.driver);
+	struct spi_mem *mem = spi_get_drvdata(spi);
+
+	if (memdrv->remove)
+		return memdrv->remove(mem);
+
+	return 0;
+}
+
+static void spi_mem_shutdown(struct spi_device *spi)
+{
+	struct spi_mem_driver *memdrv = to_spi_mem_drv(spi->dev.driver);
+	struct spi_mem *mem = spi_get_drvdata(spi);
+
+	if (memdrv->shutdown)
+		memdrv->shutdown(mem);
+}
+
+/**
+ * spi_mem_driver_register_with_owner() - Register a SPI memory driver
+ * @memdrv: the SPI memory driver to register
+ * @owner: the owner of this driver
+ *
+ * Registers a SPI memory driver.
+ *
+ * Return: 0 in case of success, a negative error core otherwise.
+ */
+
+int spi_mem_driver_register_with_owner(struct spi_mem_driver *memdrv,
+				       struct module *owner)
+{
+	memdrv->spidrv.probe = spi_mem_probe;
+	memdrv->spidrv.remove = spi_mem_remove;
+	memdrv->spidrv.shutdown = spi_mem_shutdown;
+
+	return __spi_register_driver(owner, &memdrv->spidrv);
+}
+EXPORT_SYMBOL_GPL(spi_mem_driver_register_with_owner);
+
+/**
+ * spi_mem_driver_unregister_with_owner() - Unregister a SPI memory driver
+ * @memdrv: the SPI memory driver to unregister
+ *
+ * Unregisters a SPI memory driver.
+ */
+void spi_mem_driver_unregister(struct spi_mem_driver *memdrv)
+{
+	spi_unregister_driver(&memdrv->spidrv);
+}
+EXPORT_SYMBOL_GPL(spi_mem_driver_unregister);
diff --git a/drivers/spi/spi-meson-spicc.c b/drivers/spi/spi-meson-spicc.c
index 5c82910..7fe4488 100644
--- a/drivers/spi/spi-meson-spicc.c
+++ b/drivers/spi/spi-meson-spicc.c
@@ -574,10 +574,15 @@ static int meson_spicc_probe(struct platform_device *pdev)
 		master->max_speed_hz = rate >> 2;
 
 	ret = devm_spi_register_master(&pdev->dev, master);
-	if (!ret)
-		return 0;
+	if (ret) {
+		dev_err(&pdev->dev, "spi master registration failed\n");
+		goto out_clk;
+	}
 
-	dev_err(&pdev->dev, "spi master registration failed\n");
+	return 0;
+
+out_clk:
+	clk_disable_unprepare(spicc->core);
 
 out_master:
 	spi_master_put(master);
diff --git a/drivers/spi/spi-mpc52xx.c b/drivers/spi/spi-mpc52xx.c
index e8b59ce..0e55784 100644
--- a/drivers/spi/spi-mpc52xx.c
+++ b/drivers/spi/spi-mpc52xx.c
@@ -447,7 +447,7 @@ static int mpc52xx_spi_probe(struct platform_device *op)
 
 		for (i = 0; i < ms->gpio_cs_count; i++) {
 			gpio_cs = of_get_gpio(op->dev.of_node, i);
-			if (gpio_cs < 0) {
+			if (!gpio_is_valid(gpio_cs)) {
 				dev_err(&op->dev,
 					"could not parse the gpio field in oftree\n");
 				rc = -ENODEV;
diff --git a/drivers/spi/spi-mxs.c b/drivers/spi/spi-mxs.c
index 3d216b9..6ac95a2 100644
--- a/drivers/spi/spi-mxs.c
+++ b/drivers/spi/spi-mxs.c
@@ -1,32 +1,22 @@
-/*
- * Freescale MXS SPI master driver
- *
- * Copyright 2012 DENX Software Engineering, GmbH.
- * Copyright 2012 Freescale Semiconductor, Inc.
- * Copyright 2008 Embedded Alley Solutions, Inc All Rights Reserved.
- *
- * Rework and transition to new API by:
- * Marek Vasut <marex@denx.de>
- *
- * Based on previous attempt by:
- * Fabio Estevam <fabio.estevam@freescale.com>
- *
- * Based on code from U-Boot bootloader by:
- * Marek Vasut <marex@denx.de>
- *
- * Based on spi-stmp.c, which is:
- * Author: Dmitry Pervushin <dimka@embeddedalley.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- */
+// SPDX-License-Identifier: GPL-2.0+
+//
+// Freescale MXS SPI master driver
+//
+// Copyright 2012 DENX Software Engineering, GmbH.
+// Copyright 2012 Freescale Semiconductor, Inc.
+// Copyright 2008 Embedded Alley Solutions, Inc All Rights Reserved.
+//
+// Rework and transition to new API by:
+// Marek Vasut <marex@denx.de>
+//
+// Based on previous attempt by:
+// Fabio Estevam <fabio.estevam@freescale.com>
+//
+// Based on code from U-Boot bootloader by:
+// Marek Vasut <marex@denx.de>
+//
+// Based on spi-stmp.c, which is:
+// Author: Dmitry Pervushin <dimka@embeddedalley.com>
 
 #include <linux/kernel.h>
 #include <linux/ioport.h>
diff --git a/drivers/spi/spi-omap2-mcspi.c b/drivers/spi/spi-omap2-mcspi.c
index 9bf64e6..6c628a5 100644
--- a/drivers/spi/spi-omap2-mcspi.c
+++ b/drivers/spi/spi-omap2-mcspi.c
@@ -255,6 +255,7 @@ static void omap2_mcspi_set_cs(struct spi_device *spi, bool enable)
 	if (spi->controller_state) {
 		int err = pm_runtime_get_sync(mcspi->dev);
 		if (err < 0) {
+			pm_runtime_put_noidle(mcspi->dev);
 			dev_err(mcspi->dev, "failed to get sync: %d\n", err);
 			return;
 		}
@@ -350,20 +351,6 @@ disable_fifo:
 	mcspi->fifo_depth = 0;
 }
 
-static void omap2_mcspi_restore_ctx(struct omap2_mcspi *mcspi)
-{
-	struct spi_master	*spi_cntrl = mcspi->master;
-	struct omap2_mcspi_regs	*ctx = &mcspi->ctx;
-	struct omap2_mcspi_cs	*cs;
-
-	/* McSPI: context restore */
-	mcspi_write_reg(spi_cntrl, OMAP2_MCSPI_MODULCTRL, ctx->modulctrl);
-	mcspi_write_reg(spi_cntrl, OMAP2_MCSPI_WAKEUPENABLE, ctx->wakeupenable);
-
-	list_for_each_entry(cs, &ctx->cs, node)
-		writel_relaxed(cs->chconf0, cs->base + OMAP2_MCSPI_CHCONF0);
-}
-
 static int mcspi_wait_for_reg_bit(void __iomem *reg, unsigned long bit)
 {
 	unsigned long timeout;
@@ -1065,8 +1052,11 @@ static int omap2_mcspi_setup(struct spi_device *spi)
 	}
 
 	ret = pm_runtime_get_sync(mcspi->dev);
-	if (ret < 0)
+	if (ret < 0) {
+		pm_runtime_put_noidle(mcspi->dev);
+
 		return ret;
+	}
 
 	ret = omap2_mcspi_setup_transfer(spi, NULL);
 	pm_runtime_mark_last_busy(mcspi->dev);
@@ -1284,8 +1274,11 @@ static int omap2_mcspi_master_setup(struct omap2_mcspi *mcspi)
 	int			ret = 0;
 
 	ret = pm_runtime_get_sync(mcspi->dev);
-	if (ret < 0)
+	if (ret < 0) {
+		pm_runtime_put_noidle(mcspi->dev);
+
 		return ret;
+	}
 
 	mcspi_write_reg(master, OMAP2_MCSPI_WAKEUPENABLE,
 			OMAP2_MCSPI_WAKEUPENABLE_WKEN);
@@ -1297,14 +1290,39 @@ static int omap2_mcspi_master_setup(struct omap2_mcspi *mcspi)
 	return 0;
 }
 
+/*
+ * When SPI wake up from off-mode, CS is in activate state. If it was in
+ * inactive state when driver was suspend, then force it to inactive state at
+ * wake up.
+ */
 static int omap_mcspi_runtime_resume(struct device *dev)
 {
-	struct omap2_mcspi	*mcspi;
-	struct spi_master	*master;
+	struct spi_master *master = dev_get_drvdata(dev);
+	struct omap2_mcspi *mcspi = spi_master_get_devdata(master);
+	struct omap2_mcspi_regs *ctx = &mcspi->ctx;
+	struct omap2_mcspi_cs *cs;
 
-	master = dev_get_drvdata(dev);
-	mcspi = spi_master_get_devdata(master);
-	omap2_mcspi_restore_ctx(mcspi);
+	/* McSPI: context restore */
+	mcspi_write_reg(master, OMAP2_MCSPI_MODULCTRL, ctx->modulctrl);
+	mcspi_write_reg(master, OMAP2_MCSPI_WAKEUPENABLE, ctx->wakeupenable);
+
+	list_for_each_entry(cs, &ctx->cs, node) {
+		/*
+		 * We need to toggle CS state for OMAP take this
+		 * change in account.
+		 */
+		if ((cs->chconf0 & OMAP2_MCSPI_CHCONF_FORCE) == 0) {
+			cs->chconf0 |= OMAP2_MCSPI_CHCONF_FORCE;
+			writel_relaxed(cs->chconf0,
+				       cs->base + OMAP2_MCSPI_CHCONF0);
+			cs->chconf0 &= ~OMAP2_MCSPI_CHCONF_FORCE;
+			writel_relaxed(cs->chconf0,
+				       cs->base + OMAP2_MCSPI_CHCONF0);
+		} else {
+			writel_relaxed(cs->chconf0,
+				       cs->base + OMAP2_MCSPI_CHCONF0);
+		}
+	}
 
 	return 0;
 }
@@ -1447,50 +1465,33 @@ static int omap2_mcspi_remove(struct platform_device *pdev)
 MODULE_ALIAS("platform:omap2_mcspi");
 
 #ifdef	CONFIG_SUSPEND
-/*
- * When SPI wake up from off-mode, CS is in activate state. If it was in
- * unactive state when driver was suspend, then force it to unactive state at
- * wake up.
- */
-static int omap2_mcspi_resume(struct device *dev)
+static int omap2_mcspi_suspend_noirq(struct device *dev)
 {
-	struct spi_master	*master = dev_get_drvdata(dev);
-	struct omap2_mcspi	*mcspi = spi_master_get_devdata(master);
-	struct omap2_mcspi_regs	*ctx = &mcspi->ctx;
-	struct omap2_mcspi_cs	*cs;
-
-	pm_runtime_get_sync(mcspi->dev);
-	list_for_each_entry(cs, &ctx->cs, node) {
-		if ((cs->chconf0 & OMAP2_MCSPI_CHCONF_FORCE) == 0) {
-			/*
-			 * We need to toggle CS state for OMAP take this
-			 * change in account.
-			 */
-			cs->chconf0 |= OMAP2_MCSPI_CHCONF_FORCE;
-			writel_relaxed(cs->chconf0, cs->base + OMAP2_MCSPI_CHCONF0);
-			cs->chconf0 &= ~OMAP2_MCSPI_CHCONF_FORCE;
-			writel_relaxed(cs->chconf0, cs->base + OMAP2_MCSPI_CHCONF0);
-		}
-	}
-	pm_runtime_mark_last_busy(mcspi->dev);
-	pm_runtime_put_autosuspend(mcspi->dev);
-
-	return pinctrl_pm_select_default_state(dev);
+	return pinctrl_pm_select_sleep_state(dev);
 }
 
-static int omap2_mcspi_suspend(struct device *dev)
+static int omap2_mcspi_resume_noirq(struct device *dev)
 {
-	return pinctrl_pm_select_sleep_state(dev);
+	struct spi_master *master = dev_get_drvdata(dev);
+	struct omap2_mcspi *mcspi = spi_master_get_devdata(master);
+	int error;
+
+	error = pinctrl_pm_select_default_state(dev);
+	if (error)
+		dev_warn(mcspi->dev, "%s: failed to set pins: %i\n",
+			 __func__, error);
+
+	return 0;
 }
 
 #else
-#define omap2_mcspi_suspend	NULL
-#define	omap2_mcspi_resume	NULL
+#define omap2_mcspi_suspend_noirq	NULL
+#define omap2_mcspi_resume_noirq	NULL
 #endif
 
 static const struct dev_pm_ops omap2_mcspi_pm_ops = {
-	.resume = omap2_mcspi_resume,
-	.suspend = omap2_mcspi_suspend,
+	.suspend_noirq = omap2_mcspi_suspend_noirq,
+	.resume_noirq = omap2_mcspi_resume_noirq,
 	.runtime_resume	= omap_mcspi_runtime_resume,
 };
 
diff --git a/drivers/spi/spi-pxa2xx-dma.c b/drivers/spi/spi-pxa2xx-dma.c
index 3d7f660..2fa7f4b 100644
--- a/drivers/spi/spi-pxa2xx-dma.c
+++ b/drivers/spi/spi-pxa2xx-dma.c
@@ -51,19 +51,15 @@ static void pxa2xx_spi_dma_transfer_complete(struct driver_data *drv_data,
 		if (!pxa25x_ssp_comp(drv_data))
 			pxa2xx_spi_write(drv_data, SSTO, 0);
 
-		if (!error) {
-			msg->actual_length += drv_data->len;
-			msg->state = pxa2xx_spi_next_transfer(drv_data);
-		} else {
+		if (error) {
 			/* In case we got an error we disable the SSP now */
 			pxa2xx_spi_write(drv_data, SSCR0,
 					 pxa2xx_spi_read(drv_data, SSCR0)
 					 & ~SSCR0_SSE);
-
-			msg->state = ERROR_STATE;
+			msg->status = -EIO;
 		}
 
-		tasklet_schedule(&drv_data->pump_transfers);
+		spi_finalize_current_transfer(drv_data->master);
 	}
 }
 
@@ -74,11 +70,11 @@ static void pxa2xx_spi_dma_callback(void *data)
 
 static struct dma_async_tx_descriptor *
 pxa2xx_spi_dma_prepare_one(struct driver_data *drv_data,
-			   enum dma_transfer_direction dir)
+			   enum dma_transfer_direction dir,
+			   struct spi_transfer *xfer)
 {
 	struct chip_data *chip =
 		spi_get_ctldata(drv_data->master->cur_msg->spi);
-	struct spi_transfer *xfer = drv_data->cur_transfer;
 	enum dma_slave_buswidth width;
 	struct dma_slave_config cfg;
 	struct dma_chan *chan;
@@ -144,12 +140,13 @@ irqreturn_t pxa2xx_spi_dma_transfer(struct driver_data *drv_data)
 	return IRQ_NONE;
 }
 
-int pxa2xx_spi_dma_prepare(struct driver_data *drv_data, u32 dma_burst)
+int pxa2xx_spi_dma_prepare(struct driver_data *drv_data,
+			   struct spi_transfer *xfer)
 {
 	struct dma_async_tx_descriptor *tx_desc, *rx_desc;
 	int err;
 
-	tx_desc = pxa2xx_spi_dma_prepare_one(drv_data, DMA_MEM_TO_DEV);
+	tx_desc = pxa2xx_spi_dma_prepare_one(drv_data, DMA_MEM_TO_DEV, xfer);
 	if (!tx_desc) {
 		dev_err(&drv_data->pdev->dev,
 			"failed to get DMA TX descriptor\n");
@@ -157,7 +154,7 @@ int pxa2xx_spi_dma_prepare(struct driver_data *drv_data, u32 dma_burst)
 		goto err_tx;
 	}
 
-	rx_desc = pxa2xx_spi_dma_prepare_one(drv_data, DMA_DEV_TO_MEM);
+	rx_desc = pxa2xx_spi_dma_prepare_one(drv_data, DMA_DEV_TO_MEM, xfer);
 	if (!rx_desc) {
 		dev_err(&drv_data->pdev->dev,
 			"failed to get DMA RX descriptor\n");
@@ -187,6 +184,13 @@ void pxa2xx_spi_dma_start(struct driver_data *drv_data)
 	atomic_set(&drv_data->dma_running, 1);
 }
 
+void pxa2xx_spi_dma_stop(struct driver_data *drv_data)
+{
+	atomic_set(&drv_data->dma_running, 0);
+	dmaengine_terminate_sync(drv_data->master->dma_rx);
+	dmaengine_terminate_sync(drv_data->master->dma_tx);
+}
+
 int pxa2xx_spi_dma_setup(struct driver_data *drv_data)
 {
 	struct pxa2xx_spi_master *pdata = drv_data->master_info;
diff --git a/drivers/spi/spi-pxa2xx.c b/drivers/spi/spi-pxa2xx.c
index 82dcb88..0b2d60d 100644
--- a/drivers/spi/spi-pxa2xx.c
+++ b/drivers/spi/spi-pxa2xx.c
@@ -340,9 +340,11 @@ static void lpss_ssp_setup(struct driver_data *drv_data)
 	}
 }
 
-static void lpss_ssp_select_cs(struct driver_data *drv_data,
+static void lpss_ssp_select_cs(struct spi_device *spi,
 			       const struct lpss_config *config)
 {
+	struct driver_data *drv_data =
+		spi_controller_get_devdata(spi->controller);
 	u32 value, cs;
 
 	if (!config->cs_sel_mask)
@@ -350,7 +352,7 @@ static void lpss_ssp_select_cs(struct driver_data *drv_data,
 
 	value = __lpss_ssp_read_priv(drv_data, config->reg_cs_ctrl);
 
-	cs = drv_data->master->cur_msg->spi->chip_select;
+	cs = spi->chip_select;
 	cs <<= config->cs_sel_shift;
 	if (cs != (value & config->cs_sel_mask)) {
 		/*
@@ -369,15 +371,17 @@ static void lpss_ssp_select_cs(struct driver_data *drv_data,
 	}
 }
 
-static void lpss_ssp_cs_control(struct driver_data *drv_data, bool enable)
+static void lpss_ssp_cs_control(struct spi_device *spi, bool enable)
 {
+	struct driver_data *drv_data =
+		spi_controller_get_devdata(spi->controller);
 	const struct lpss_config *config;
 	u32 value;
 
 	config = lpss_get_config(drv_data);
 
 	if (enable)
-		lpss_ssp_select_cs(drv_data, config);
+		lpss_ssp_select_cs(spi, config);
 
 	value = __lpss_ssp_read_priv(drv_data, config->reg_cs_ctrl);
 	if (enable)
@@ -387,10 +391,11 @@ static void lpss_ssp_cs_control(struct driver_data *drv_data, bool enable)
 	__lpss_ssp_write_priv(drv_data, config->reg_cs_ctrl, value);
 }
 
-static void cs_assert(struct driver_data *drv_data)
+static void cs_assert(struct spi_device *spi)
 {
-	struct chip_data *chip =
-		spi_get_ctldata(drv_data->master->cur_msg->spi);
+	struct chip_data *chip = spi_get_ctldata(spi);
+	struct driver_data *drv_data =
+		spi_controller_get_devdata(spi->controller);
 
 	if (drv_data->ssp_type == CE4100_SSP) {
 		pxa2xx_spi_write(drv_data, SSSR, chip->frm);
@@ -408,13 +413,14 @@ static void cs_assert(struct driver_data *drv_data)
 	}
 
 	if (is_lpss_ssp(drv_data))
-		lpss_ssp_cs_control(drv_data, true);
+		lpss_ssp_cs_control(spi, true);
 }
 
-static void cs_deassert(struct driver_data *drv_data)
+static void cs_deassert(struct spi_device *spi)
 {
-	struct chip_data *chip =
-		spi_get_ctldata(drv_data->master->cur_msg->spi);
+	struct chip_data *chip = spi_get_ctldata(spi);
+	struct driver_data *drv_data =
+		spi_controller_get_devdata(spi->controller);
 	unsigned long timeout;
 
 	if (drv_data->ssp_type == CE4100_SSP)
@@ -437,7 +443,15 @@ static void cs_deassert(struct driver_data *drv_data)
 	}
 
 	if (is_lpss_ssp(drv_data))
-		lpss_ssp_cs_control(drv_data, false);
+		lpss_ssp_cs_control(spi, false);
+}
+
+static void pxa2xx_spi_set_cs(struct spi_device *spi, bool level)
+{
+	if (level)
+		cs_deassert(spi);
+	else
+		cs_assert(spi);
 }
 
 int pxa2xx_spi_flush(struct driver_data *drv_data)
@@ -549,70 +563,6 @@ static int u32_reader(struct driver_data *drv_data)
 	return drv_data->rx == drv_data->rx_end;
 }
 
-void *pxa2xx_spi_next_transfer(struct driver_data *drv_data)
-{
-	struct spi_message *msg = drv_data->master->cur_msg;
-	struct spi_transfer *trans = drv_data->cur_transfer;
-
-	/* Move to next transfer */
-	if (trans->transfer_list.next != &msg->transfers) {
-		drv_data->cur_transfer =
-			list_entry(trans->transfer_list.next,
-					struct spi_transfer,
-					transfer_list);
-		return RUNNING_STATE;
-	} else
-		return DONE_STATE;
-}
-
-/* caller already set message->status; dma and pio irqs are blocked */
-static void giveback(struct driver_data *drv_data)
-{
-	struct spi_transfer* last_transfer;
-	struct spi_message *msg;
-
-	msg = drv_data->master->cur_msg;
-	drv_data->cur_transfer = NULL;
-
-	last_transfer = list_last_entry(&msg->transfers, struct spi_transfer,
-					transfer_list);
-
-	/* Delay if requested before any change in chip select */
-	if (last_transfer->delay_usecs)
-		udelay(last_transfer->delay_usecs);
-
-	/* Drop chip select UNLESS cs_change is true or we are returning
-	 * a message with an error, or next message is for another chip
-	 */
-	if (!last_transfer->cs_change)
-		cs_deassert(drv_data);
-	else {
-		struct spi_message *next_msg;
-
-		/* Holding of cs was hinted, but we need to make sure
-		 * the next message is for the same chip.  Don't waste
-		 * time with the following tests unless this was hinted.
-		 *
-		 * We cannot postpone this until pump_messages, because
-		 * after calling msg->complete (below) the driver that
-		 * sent the current message could be unloaded, which
-		 * could invalidate the cs_control() callback...
-		 */
-
-		/* get a pointer to the next message, if any */
-		next_msg = spi_get_next_queued_message(drv_data->master);
-
-		/* see if the next and current messages point
-		 * to the same chip
-		 */
-		if ((next_msg && next_msg->spi != msg->spi) ||
-		    msg->state == ERROR_STATE)
-			cs_deassert(drv_data);
-	}
-
-	spi_finalize_current_message(drv_data->master);
-}
-
 static void reset_sccr1(struct driver_data *drv_data)
 {
 	struct chip_data *chip =
@@ -648,8 +598,8 @@ static void int_error_stop(struct driver_data *drv_data, const char* msg)
 
 	dev_err(&drv_data->pdev->dev, "%s\n", msg);
 
-	drv_data->master->cur_msg->state = ERROR_STATE;
-	tasklet_schedule(&drv_data->pump_transfers);
+	drv_data->master->cur_msg->status = -EIO;
+	spi_finalize_current_transfer(drv_data->master);
 }
 
 static void int_transfer_complete(struct driver_data *drv_data)
@@ -660,19 +610,7 @@ static void int_transfer_complete(struct driver_data *drv_data)
 	if (!pxa25x_ssp_comp(drv_data))
 		pxa2xx_spi_write(drv_data, SSTO, 0);
 
-	/* Update total byte transferred return count actual bytes read */
-	drv_data->master->cur_msg->actual_length += drv_data->len -
-				(drv_data->rx_end - drv_data->rx);
-
-	/* Transfer delays and chip select release are
-	 * handled in pump_transfers or giveback
-	 */
-
-	/* Move to next transfer */
-	drv_data->master->cur_msg->state = pxa2xx_spi_next_transfer(drv_data);
-
-	/* Schedule transfer tasklet */
-	tasklet_schedule(&drv_data->pump_transfers);
+	spi_finalize_current_transfer(drv_data->master);
 }
 
 static irqreturn_t interrupt_transfer(struct driver_data *drv_data)
@@ -973,17 +911,16 @@ static bool pxa2xx_spi_can_dma(struct spi_controller *master,
 	       xfer->len >= chip->dma_burst_size;
 }
 
-static void pump_transfers(unsigned long data)
+static int pxa2xx_spi_transfer_one(struct spi_controller *master,
+				   struct spi_device *spi,
+				   struct spi_transfer *transfer)
 {
-	struct driver_data *drv_data = (struct driver_data *)data;
-	struct spi_controller *master = drv_data->master;
+	struct driver_data *drv_data = spi_controller_get_devdata(master);
 	struct spi_message *message = master->cur_msg;
 	struct chip_data *chip = spi_get_ctldata(message->spi);
 	u32 dma_thresh = chip->dma_threshold;
 	u32 dma_burst = chip->dma_burst_size;
 	u32 change_mask = pxa2xx_spi_get_ssrc1_change_mask(drv_data);
-	struct spi_transfer *transfer;
-	struct spi_transfer *previous;
 	u32 clk_div;
 	u8 bits;
 	u32 speed;
@@ -992,36 +929,6 @@ static void pump_transfers(unsigned long data)
 	int err;
 	int dma_mapped;
 
-	/* Get current state information */
-	transfer = drv_data->cur_transfer;
-
-	/* Handle for abort */
-	if (message->state == ERROR_STATE) {
-		message->status = -EIO;
-		giveback(drv_data);
-		return;
-	}
-
-	/* Handle end of message */
-	if (message->state == DONE_STATE) {
-		message->status = 0;
-		giveback(drv_data);
-		return;
-	}
-
-	/* Delay if requested at end of transfer before CS change */
-	if (message->state == RUNNING_STATE) {
-		previous = list_entry(transfer->transfer_list.prev,
-					struct spi_transfer,
-					transfer_list);
-		if (previous->delay_usecs)
-			udelay(previous->delay_usecs);
-
-		/* Drop chip select only if cs_change is requested */
-		if (previous->cs_change)
-			cs_deassert(drv_data);
-	}
-
 	/* Check if we can DMA this transfer */
 	if (transfer->len > MAX_DMA_LEN && chip->enable_dma) {
 
@@ -1029,34 +936,27 @@ static void pump_transfers(unsigned long data)
 		if (message->is_dma_mapped
 				|| transfer->rx_dma || transfer->tx_dma) {
 			dev_err(&drv_data->pdev->dev,
-				"pump_transfers: mapped transfer length of "
-				"%u is greater than %d\n",
+				"Mapped transfer length of %u is greater than %d\n",
 				transfer->len, MAX_DMA_LEN);
-			message->status = -EINVAL;
-			giveback(drv_data);
-			return;
+			return -EINVAL;
 		}
 
 		/* warn ... we force this to PIO mode */
 		dev_warn_ratelimited(&message->spi->dev,
-				     "pump_transfers: DMA disabled for transfer length %ld "
-				     "greater than %d\n",
-				     (long)drv_data->len, MAX_DMA_LEN);
+				     "DMA disabled for transfer length %ld greater than %d\n",
+				     (long)transfer->len, MAX_DMA_LEN);
 	}
 
 	/* Setup the transfer state based on the type of transfer */
 	if (pxa2xx_spi_flush(drv_data) == 0) {
-		dev_err(&drv_data->pdev->dev, "pump_transfers: flush failed\n");
-		message->status = -EIO;
-		giveback(drv_data);
-		return;
+		dev_err(&drv_data->pdev->dev, "Flush failed\n");
+		return -EIO;
 	}
 	drv_data->n_bytes = chip->n_bytes;
 	drv_data->tx = (void *)transfer->tx_buf;
 	drv_data->tx_end = drv_data->tx + transfer->len;
 	drv_data->rx = transfer->rx_buf;
 	drv_data->rx_end = drv_data->rx + transfer->len;
-	drv_data->len = transfer->len;
 	drv_data->write = drv_data->tx ? chip->write : null_writer;
 	drv_data->read = drv_data->rx ? chip->read : null_reader;
 
@@ -1095,11 +995,9 @@ static void pump_transfers(unsigned long data)
 						bits, &dma_burst,
 						&dma_thresh))
 			dev_warn_ratelimited(&message->spi->dev,
-					     "pump_transfers: DMA burst size reduced to match bits_per_word\n");
+					     "DMA burst size reduced to match bits_per_word\n");
 	}
 
-	message->state = RUNNING_STATE;
-
 	dma_mapped = master->can_dma &&
 		     master->can_dma(master, message->spi, transfer) &&
 		     master->cur_msg_mapped;
@@ -1108,12 +1006,9 @@ static void pump_transfers(unsigned long data)
 		/* Ensure we have the correct interrupt handler */
 		drv_data->transfer_handler = pxa2xx_spi_dma_transfer;
 
-		err = pxa2xx_spi_dma_prepare(drv_data, dma_burst);
-		if (err) {
-			message->status = err;
-			giveback(drv_data);
-			return;
-		}
+		err = pxa2xx_spi_dma_prepare(drv_data, transfer);
+		if (err)
+			return err;
 
 		/* Clear status and start DMA engine */
 		cr1 = chip->cr1 | dma_thresh | drv_data->dma_cr1;
@@ -1175,27 +1070,40 @@ static void pump_transfers(unsigned long data)
 			pxa2xx_spi_write(drv_data, SSTO, chip->timeout);
 	}
 
-	cs_assert(drv_data);
-
-	/* after chip select, release the data by enabling service
-	 * requests and interrupts, without changing any mode bits */
+	/*
+	 * Release the data by enabling service requests and interrupts,
+	 * without changing any mode bits
+	 */
 	pxa2xx_spi_write(drv_data, SSCR1, cr1);
+
+	return 1;
 }
 
-static int pxa2xx_spi_transfer_one_message(struct spi_controller *master,
-					   struct spi_message *msg)
+static void pxa2xx_spi_handle_err(struct spi_controller *master,
+				 struct spi_message *msg)
 {
 	struct driver_data *drv_data = spi_controller_get_devdata(master);
 
-	/* Initial message state*/
-	msg->state = START_STATE;
-	drv_data->cur_transfer = list_entry(msg->transfers.next,
-						struct spi_transfer,
-						transfer_list);
+	/* Disable the SSP */
+	pxa2xx_spi_write(drv_data, SSCR0,
+			 pxa2xx_spi_read(drv_data, SSCR0) & ~SSCR0_SSE);
+	/* Clear and disable interrupts and service requests */
+	write_SSSR_CS(drv_data, drv_data->clear_sr);
+	pxa2xx_spi_write(drv_data, SSCR1,
+			 pxa2xx_spi_read(drv_data, SSCR1)
+			 & ~(drv_data->int_cr1 | drv_data->dma_cr1));
+	if (!pxa25x_ssp_comp(drv_data))
+		pxa2xx_spi_write(drv_data, SSTO, 0);
 
-	/* Mark as busy and launch transfers */
-	tasklet_schedule(&drv_data->pump_transfers);
-	return 0;
+	/*
+	 * Stop the DMA if running. Note DMA callback handler may have unset
+	 * the dma_running already, which is fine as stopping is not needed
+	 * then but we shouldn't rely this flag for anything else than
+	 * stopping. For instance to differentiate between PIO and DMA
+	 * transfers.
+	 */
+	if (atomic_read(&drv_data->dma_running))
+		pxa2xx_spi_dma_stop(drv_data);
 }
 
 static int pxa2xx_spi_unprepare_transfer(struct spi_controller *master)
@@ -1651,7 +1559,9 @@ static int pxa2xx_spi_probe(struct platform_device *pdev)
 	master->dma_alignment = DMA_ALIGNMENT;
 	master->cleanup = cleanup;
 	master->setup = setup;
-	master->transfer_one_message = pxa2xx_spi_transfer_one_message;
+	master->set_cs = pxa2xx_spi_set_cs;
+	master->transfer_one = pxa2xx_spi_transfer_one;
+	master->handle_err = pxa2xx_spi_handle_err;
 	master->unprepare_transfer_hardware = pxa2xx_spi_unprepare_transfer;
 	master->fw_translate_cs = pxa2xx_spi_fw_translate_cs;
 	master->auto_runtime_pm = true;
@@ -1702,7 +1612,9 @@ static int pxa2xx_spi_probe(struct platform_device *pdev)
 	}
 
 	/* Enable SOC clock */
-	clk_prepare_enable(ssp->clk);
+	status = clk_prepare_enable(ssp->clk);
+	if (status)
+		goto out_error_dma_irq_alloc;
 
 	master->max_speed_hz = clk_get_rate(ssp->clk);
 
@@ -1787,9 +1699,6 @@ static int pxa2xx_spi_probe(struct platform_device *pdev)
 		}
 	}
 
-	tasklet_init(&drv_data->pump_transfers, pump_transfers,
-		     (unsigned long)drv_data);
-
 	pm_runtime_set_autosuspend_delay(&pdev->dev, 50);
 	pm_runtime_use_autosuspend(&pdev->dev);
 	pm_runtime_set_active(&pdev->dev);
@@ -1809,6 +1718,8 @@ out_error_clock_enabled:
 	pm_runtime_put_noidle(&pdev->dev);
 	pm_runtime_disable(&pdev->dev);
 	clk_disable_unprepare(ssp->clk);
+
+out_error_dma_irq_alloc:
 	pxa2xx_spi_dma_release(drv_data);
 	free_irq(ssp->irq, drv_data);
 
@@ -1882,8 +1793,11 @@ static int pxa2xx_spi_resume(struct device *dev)
 	int status;
 
 	/* Enable the SSP clock */
-	if (!pm_runtime_suspended(dev))
-		clk_prepare_enable(ssp->clk);
+	if (!pm_runtime_suspended(dev)) {
+		status = clk_prepare_enable(ssp->clk);
+		if (status)
+			return status;
+	}
 
 	/* Restore LPSS private register bits */
 	if (is_lpss_ssp(drv_data))
@@ -1912,9 +1826,10 @@ static int pxa2xx_spi_runtime_suspend(struct device *dev)
 static int pxa2xx_spi_runtime_resume(struct device *dev)
 {
 	struct driver_data *drv_data = dev_get_drvdata(dev);
+	int status;
 
-	clk_prepare_enable(drv_data->ssp->clk);
-	return 0;
+	status = clk_prepare_enable(drv_data->ssp->clk);
+	return status;
 }
 #endif
 
diff --git a/drivers/spi/spi-pxa2xx.h b/drivers/spi/spi-pxa2xx.h
index 0ae7def..513c53a 100644
--- a/drivers/spi/spi-pxa2xx.h
+++ b/drivers/spi/spi-pxa2xx.h
@@ -46,15 +46,10 @@ struct driver_data {
 	u32 clear_sr;
 	u32 mask_sr;
 
-	/* Message Transfer pump */
-	struct tasklet_struct pump_transfers;
-
 	/* DMA engine support */
 	atomic_t dma_running;
 
-	/* Current message transfer state info */
-	struct spi_transfer *cur_transfer;
-	size_t len;
+	/* Current transfer state info */
 	void *tx;
 	void *tx_end;
 	void *rx;
@@ -104,11 +99,6 @@ static  inline void pxa2xx_spi_write(const struct driver_data *drv_data,
 	__raw_writel(val, drv_data->ioaddr + reg);
 }
 
-#define START_STATE ((void *)0)
-#define RUNNING_STATE ((void *)1)
-#define DONE_STATE ((void *)2)
-#define ERROR_STATE ((void *)-1)
-
 #define DMA_ALIGNMENT		8
 
 static inline int pxa25x_ssp_comp(struct driver_data *drv_data)
@@ -133,14 +123,15 @@ static inline void write_SSSR_CS(struct driver_data *drv_data, u32 val)
 }
 
 extern int pxa2xx_spi_flush(struct driver_data *drv_data);
-extern void *pxa2xx_spi_next_transfer(struct driver_data *drv_data);
 
 #define MAX_DMA_LEN		SZ_64K
 #define DEFAULT_DMA_CR1		(SSCR1_TSRE | SSCR1_RSRE | SSCR1_TRAIL)
 
 extern irqreturn_t pxa2xx_spi_dma_transfer(struct driver_data *drv_data);
-extern int pxa2xx_spi_dma_prepare(struct driver_data *drv_data, u32 dma_burst);
+extern int pxa2xx_spi_dma_prepare(struct driver_data *drv_data,
+				  struct spi_transfer *xfer);
 extern void pxa2xx_spi_dma_start(struct driver_data *drv_data);
+extern void pxa2xx_spi_dma_stop(struct driver_data *drv_data);
 extern int pxa2xx_spi_dma_setup(struct driver_data *drv_data);
 extern void pxa2xx_spi_dma_release(struct driver_data *drv_data);
 extern int pxa2xx_spi_set_dma_burst_and_threshold(struct chip_data *chip,
diff --git a/drivers/spi/spi-s3c64xx.c b/drivers/spi/spi-s3c64xx.c
index baa3a9f..7b7151e 100644
--- a/drivers/spi/spi-s3c64xx.c
+++ b/drivers/spi/spi-s3c64xx.c
@@ -28,15 +28,15 @@
 
 #define S3C64XX_SPI_CH_CFG		0x00
 #define S3C64XX_SPI_CLK_CFG		0x04
-#define S3C64XX_SPI_MODE_CFG	0x08
-#define S3C64XX_SPI_SLAVE_SEL	0x0C
+#define S3C64XX_SPI_MODE_CFG		0x08
+#define S3C64XX_SPI_SLAVE_SEL		0x0C
 #define S3C64XX_SPI_INT_EN		0x10
 #define S3C64XX_SPI_STATUS		0x14
 #define S3C64XX_SPI_TX_DATA		0x18
 #define S3C64XX_SPI_RX_DATA		0x1C
-#define S3C64XX_SPI_PACKET_CNT	0x20
-#define S3C64XX_SPI_PENDING_CLR	0x24
-#define S3C64XX_SPI_SWAP_CFG	0x28
+#define S3C64XX_SPI_PACKET_CNT		0x20
+#define S3C64XX_SPI_PENDING_CLR		0x24
+#define S3C64XX_SPI_SWAP_CFG		0x28
 #define S3C64XX_SPI_FB_CLK		0x2C
 
 #define S3C64XX_SPI_CH_HS_EN		(1<<6)	/* High Speed Enable */
@@ -77,9 +77,9 @@
 #define S3C64XX_SPI_INT_TX_FIFORDY_EN		(1<<0)
 
 #define S3C64XX_SPI_ST_RX_OVERRUN_ERR		(1<<5)
-#define S3C64XX_SPI_ST_RX_UNDERRUN_ERR	(1<<4)
+#define S3C64XX_SPI_ST_RX_UNDERRUN_ERR		(1<<4)
 #define S3C64XX_SPI_ST_TX_OVERRUN_ERR		(1<<3)
-#define S3C64XX_SPI_ST_TX_UNDERRUN_ERR	(1<<2)
+#define S3C64XX_SPI_ST_TX_UNDERRUN_ERR		(1<<2)
 #define S3C64XX_SPI_ST_RX_FIFORDY		(1<<1)
 #define S3C64XX_SPI_ST_TX_FIFORDY		(1<<0)
 
@@ -100,7 +100,7 @@
 #define S3C64XX_SPI_SWAP_TX_BIT			(1<<1)
 #define S3C64XX_SPI_SWAP_TX_EN			(1<<0)
 
-#define S3C64XX_SPI_FBCLK_MSK		(3<<0)
+#define S3C64XX_SPI_FBCLK_MSK			(3<<0)
 
 #define FIFO_LVL_MASK(i) ((i)->port_conf->fifo_lvl_mask[i->port_id])
 #define S3C64XX_SPI_ST_TX_DONE(v, i) (((v) & \
@@ -156,7 +156,6 @@ struct s3c64xx_spi_port_config {
  * @ioclk: Pointer to the i/o clock between master and slave
  * @master: Pointer to the SPI Protocol master.
  * @cntrlr_info: Platform specific data for the controller this driver manages.
- * @tgl_spi: Pointer to the last CS left untoggled by the cs_change hint.
  * @lock: Controller specific lock.
  * @state: Set of FLAGS to indicate status.
  * @rx_dmach: Controller's DMA channel for Rx.
@@ -177,7 +176,6 @@ struct s3c64xx_spi_driver_data {
 	struct platform_device          *pdev;
 	struct spi_master               *master;
 	struct s3c64xx_spi_info  *cntrlr_info;
-	struct spi_device               *tgl_spi;
 	spinlock_t                      lock;
 	unsigned long                   sfr_start;
 	struct completion               xfer_completion;
@@ -190,7 +188,7 @@ struct s3c64xx_spi_driver_data {
 	unsigned int			port_id;
 };
 
-static void flush_fifo(struct s3c64xx_spi_driver_data *sdd)
+static void s3c64xx_flush_fifo(struct s3c64xx_spi_driver_data *sdd)
 {
 	void __iomem *regs = sdd->regs;
 	unsigned long loops;
@@ -350,9 +348,8 @@ static bool s3c64xx_spi_can_dma(struct spi_master *master,
 	return xfer->len > (FIFO_LVL_MASK(sdd) >> 1) + 1;
 }
 
-static void enable_datapath(struct s3c64xx_spi_driver_data *sdd,
-				struct spi_device *spi,
-				struct spi_transfer *xfer, int dma_mode)
+static void s3c64xx_enable_datapath(struct s3c64xx_spi_driver_data *sdd,
+				    struct spi_transfer *xfer, int dma_mode)
 {
 	void __iomem *regs = sdd->regs;
 	u32 modecfg, chcfg;
@@ -442,8 +439,8 @@ static u32 s3c64xx_spi_wait_for_timeout(struct s3c64xx_spi_driver_data *sdd,
 	return RX_FIFO_LVL(status, sdd);
 }
 
-static int wait_for_dma(struct s3c64xx_spi_driver_data *sdd,
-			struct spi_transfer *xfer)
+static int s3c64xx_wait_for_dma(struct s3c64xx_spi_driver_data *sdd,
+				struct spi_transfer *xfer)
 {
 	void __iomem *regs = sdd->regs;
 	unsigned long val;
@@ -485,8 +482,8 @@ static int wait_for_dma(struct s3c64xx_spi_driver_data *sdd,
 	return 0;
 }
 
-static int wait_for_pio(struct s3c64xx_spi_driver_data *sdd,
-			struct spi_transfer *xfer)
+static int s3c64xx_wait_for_pio(struct s3c64xx_spi_driver_data *sdd,
+				struct spi_transfer *xfer)
 {
 	void __iomem *regs = sdd->regs;
 	unsigned long val;
@@ -505,6 +502,8 @@ static int wait_for_pio(struct s3c64xx_spi_driver_data *sdd,
 		status = readl(regs + S3C64XX_SPI_STATUS);
 	} while (RX_FIFO_LVL(status, sdd) < xfer->len && --val);
 
+	if (!val)
+		return -EIO;
 
 	/* If it was only Tx */
 	if (!xfer->rx_buf) {
@@ -635,11 +634,15 @@ static int s3c64xx_spi_transfer_one(struct spi_master *master,
 				    struct spi_transfer *xfer)
 {
 	struct s3c64xx_spi_driver_data *sdd = spi_master_get_devdata(master);
+	const unsigned int fifo_len = (FIFO_LVL_MASK(sdd) >> 1) + 1;
+	const void *tx_buf = NULL;
+	void *rx_buf = NULL;
+	int target_len = 0, origin_len = 0;
+	int use_dma = 0;
 	int status;
 	u32 speed;
 	u8 bpw;
 	unsigned long flags;
-	int use_dma;
 
 	reinit_completion(&sdd->xfer_completion);
 
@@ -654,48 +657,77 @@ static int s3c64xx_spi_transfer_one(struct spi_master *master,
 		s3c64xx_spi_config(sdd);
 	}
 
-	/* Polling method for xfers not bigger than FIFO capacity */
-	use_dma = 0;
-	if (!is_polling(sdd) &&
-	    (sdd->rx_dma.ch && sdd->tx_dma.ch &&
-	     (xfer->len > ((FIFO_LVL_MASK(sdd) >> 1) + 1))))
+	if (!is_polling(sdd) && (xfer->len > fifo_len) &&
+	    sdd->rx_dma.ch && sdd->tx_dma.ch) {
 		use_dma = 1;
 
-	spin_lock_irqsave(&sdd->lock, flags);
+	} else if (is_polling(sdd) && xfer->len > fifo_len) {
+		tx_buf = xfer->tx_buf;
+		rx_buf = xfer->rx_buf;
+		origin_len = xfer->len;
 
-	/* Pending only which is to be done */
-	sdd->state &= ~RXBUSY;
-	sdd->state &= ~TXBUSY;
+		target_len = xfer->len;
+		if (xfer->len > fifo_len)
+			xfer->len = fifo_len;
+	}
 
-	enable_datapath(sdd, spi, xfer, use_dma);
+	do {
+		spin_lock_irqsave(&sdd->lock, flags);
 
-	/* Start the signals */
-	s3c64xx_spi_set_cs(spi, true);
+		/* Pending only which is to be done */
+		sdd->state &= ~RXBUSY;
+		sdd->state &= ~TXBUSY;
 
-	spin_unlock_irqrestore(&sdd->lock, flags);
+		s3c64xx_enable_datapath(sdd, xfer, use_dma);
 
-	if (use_dma)
-		status = wait_for_dma(sdd, xfer);
-	else
-		status = wait_for_pio(sdd, xfer);
-
-	if (status) {
-		dev_err(&spi->dev, "I/O Error: rx-%d tx-%d res:rx-%c tx-%c len-%d\n",
-			xfer->rx_buf ? 1 : 0, xfer->tx_buf ? 1 : 0,
-			(sdd->state & RXBUSY) ? 'f' : 'p',
-			(sdd->state & TXBUSY) ? 'f' : 'p',
-			xfer->len);
-
-		if (use_dma) {
-			if (xfer->tx_buf != NULL
-			    && (sdd->state & TXBUSY))
-				dmaengine_terminate_all(sdd->tx_dma.ch);
-			if (xfer->rx_buf != NULL
-			    && (sdd->state & RXBUSY))
-				dmaengine_terminate_all(sdd->rx_dma.ch);
+		/* Start the signals */
+		s3c64xx_spi_set_cs(spi, true);
+
+		spin_unlock_irqrestore(&sdd->lock, flags);
+
+		if (use_dma)
+			status = s3c64xx_wait_for_dma(sdd, xfer);
+		else
+			status = s3c64xx_wait_for_pio(sdd, xfer);
+
+		if (status) {
+			dev_err(&spi->dev,
+				"I/O Error: rx-%d tx-%d res:rx-%c tx-%c len-%d\n",
+				xfer->rx_buf ? 1 : 0, xfer->tx_buf ? 1 : 0,
+				(sdd->state & RXBUSY) ? 'f' : 'p',
+				(sdd->state & TXBUSY) ? 'f' : 'p',
+				xfer->len);
+
+			if (use_dma) {
+				if (xfer->tx_buf && (sdd->state & TXBUSY))
+					dmaengine_terminate_all(sdd->tx_dma.ch);
+				if (xfer->rx_buf && (sdd->state & RXBUSY))
+					dmaengine_terminate_all(sdd->rx_dma.ch);
+			}
+		} else {
+			s3c64xx_flush_fifo(sdd);
 		}
-	} else {
-		flush_fifo(sdd);
+		if (target_len > 0) {
+			target_len -= xfer->len;
+
+			if (xfer->tx_buf)
+				xfer->tx_buf += xfer->len;
+
+			if (xfer->rx_buf)
+				xfer->rx_buf += xfer->len;
+
+			if (target_len > fifo_len)
+				xfer->len = fifo_len;
+			else
+				xfer->len = target_len;
+		}
+	} while (target_len > 0);
+
+	if (origin_len) {
+		/* Restore original xfer buffers and length */
+		xfer->tx_buf = tx_buf;
+		xfer->rx_buf = rx_buf;
+		xfer->len = origin_len;
 	}
 
 	return status;
@@ -891,7 +923,7 @@ static irqreturn_t s3c64xx_spi_irq(int irq, void *data)
 	return IRQ_HANDLED;
 }
 
-static void s3c64xx_spi_hwinit(struct s3c64xx_spi_driver_data *sdd, int channel)
+static void s3c64xx_spi_hwinit(struct s3c64xx_spi_driver_data *sdd)
 {
 	struct s3c64xx_spi_info *sci = sdd->cntrlr_info;
 	void __iomem *regs = sdd->regs;
@@ -929,7 +961,7 @@ static void s3c64xx_spi_hwinit(struct s3c64xx_spi_driver_data *sdd, int channel)
 	val |= (S3C64XX_SPI_TRAILCNT << S3C64XX_SPI_TRAILCNT_OFF);
 	writel(val, regs + S3C64XX_SPI_MODE_CFG);
 
-	flush_fifo(sdd);
+	s3c64xx_flush_fifo(sdd);
 }
 
 #ifdef CONFIG_OF
@@ -1145,7 +1177,7 @@ static int s3c64xx_spi_probe(struct platform_device *pdev)
 	pm_runtime_get_sync(&pdev->dev);
 
 	/* Setup Deufult Mode */
-	s3c64xx_spi_hwinit(sdd, sdd->port_id);
+	s3c64xx_spi_hwinit(sdd);
 
 	spin_lock_init(&sdd->lock);
 	init_completion(&sdd->xfer_completion);
@@ -1260,8 +1292,6 @@ static int s3c64xx_spi_resume(struct device *dev)
 	if (ret < 0)
 		return ret;
 
-	s3c64xx_spi_hwinit(sdd, sdd->port_id);
-
 	return spi_master_resume(master);
 }
 #endif /* CONFIG_PM_SLEEP */
@@ -1299,6 +1329,8 @@ static int s3c64xx_spi_runtime_resume(struct device *dev)
 	if (ret != 0)
 		goto err_disable_src_clk;
 
+	s3c64xx_spi_hwinit(sdd);
+
 	return 0;
 
 err_disable_src_clk:
@@ -1344,15 +1376,6 @@ static struct s3c64xx_spi_port_config exynos4_spi_port_config = {
 	.clk_from_cmu	= true,
 };
 
-static struct s3c64xx_spi_port_config exynos5440_spi_port_config = {
-	.fifo_lvl_mask	= { 0x1ff },
-	.rx_lvl_offset	= 15,
-	.tx_st_done	= 25,
-	.high_speed	= true,
-	.clk_from_cmu	= true,
-	.quirks		= S3C64XX_SPI_QUIRK_POLL,
-};
-
 static struct s3c64xx_spi_port_config exynos7_spi_port_config = {
 	.fifo_lvl_mask	= { 0x1ff, 0x7F, 0x7F, 0x7F, 0x7F, 0x1ff},
 	.rx_lvl_offset	= 15,
@@ -1396,9 +1419,6 @@ static const struct of_device_id s3c64xx_spi_dt_match[] = {
 	{ .compatible = "samsung,exynos4210-spi",
 			.data = (void *)&exynos4_spi_port_config,
 	},
-	{ .compatible = "samsung,exynos5440-spi",
-			.data = (void *)&exynos5440_spi_port_config,
-	},
 	{ .compatible = "samsung,exynos7-spi",
 			.data = (void *)&exynos7_spi_port_config,
 	},
diff --git a/drivers/spi/spi-sh-msiof.c b/drivers/spi/spi-sh-msiof.c
index 8171eed..0e74cbf 100644
--- a/drivers/spi/spi-sh-msiof.c
+++ b/drivers/spi/spi-sh-msiof.c
@@ -39,7 +39,7 @@ struct sh_msiof_chipdata {
 	u16 tx_fifo_size;
 	u16 rx_fifo_size;
 	u16 master_flags;
-	u16 min_div;
+	u16 min_div_pow;
 };
 
 struct sh_msiof_spi_priv {
@@ -51,7 +51,7 @@ struct sh_msiof_spi_priv {
 	struct completion done;
 	unsigned int tx_fifo_size;
 	unsigned int rx_fifo_size;
-	unsigned int min_div;
+	unsigned int min_div_pow;
 	void *tx_dma_page;
 	void *rx_dma_page;
 	dma_addr_t tx_dma_addr;
@@ -249,43 +249,46 @@ static irqreturn_t sh_msiof_spi_irq(int irq, void *data)
 	return IRQ_HANDLED;
 }
 
-static struct {
-	unsigned short div;
-	unsigned short brdv;
-} const sh_msiof_spi_div_table[] = {
-	{ 1,	SCR_BRDV_DIV_1 },
-	{ 2,	SCR_BRDV_DIV_2 },
-	{ 4,	SCR_BRDV_DIV_4 },
-	{ 8,	SCR_BRDV_DIV_8 },
-	{ 16,	SCR_BRDV_DIV_16 },
-	{ 32,	SCR_BRDV_DIV_32 },
+static const u32 sh_msiof_spi_div_array[] = {
+	SCR_BRDV_DIV_1, SCR_BRDV_DIV_2,	 SCR_BRDV_DIV_4,
+	SCR_BRDV_DIV_8,	SCR_BRDV_DIV_16, SCR_BRDV_DIV_32,
 };
 
 static void sh_msiof_spi_set_clk_regs(struct sh_msiof_spi_priv *p,
 				      unsigned long parent_rate, u32 spi_hz)
 {
-	unsigned long div = 1024;
+	unsigned long div;
 	u32 brps, scr;
-	size_t k;
+	unsigned int div_pow = p->min_div_pow;
 
-	if (!WARN_ON(!spi_hz || !parent_rate))
-		div = DIV_ROUND_UP(parent_rate, spi_hz);
-
-	div = max_t(unsigned long, div, p->min_div);
+	if (!spi_hz || !parent_rate) {
+		WARN(1, "Invalid clock rate parameters %lu and %u\n",
+		     parent_rate, spi_hz);
+		return;
+	}
 
-	for (k = 0; k < ARRAY_SIZE(sh_msiof_spi_div_table); k++) {
-		brps = DIV_ROUND_UP(div, sh_msiof_spi_div_table[k].div);
+	div = DIV_ROUND_UP(parent_rate, spi_hz);
+	if (div <= 1024) {
 		/* SCR_BRDV_DIV_1 is valid only if BRPS is x 1/1 or x 1/2 */
-		if (sh_msiof_spi_div_table[k].div == 1 && brps > 2)
-			continue;
-		if (brps <= 32) /* max of brdv is 32 */
-			break;
-	}
+		if (!div_pow && div <= 32 && div > 2)
+			div_pow = 1;
+
+		if (div_pow)
+			brps = (div + 1) >> div_pow;
+		else
+			brps = div;
 
-	k = min_t(int, k, ARRAY_SIZE(sh_msiof_spi_div_table) - 1);
-	brps = min_t(int, brps, 32);
+		for (; brps > 32; div_pow++)
+			brps = (brps + 1) >> 1;
+	} else {
+		/* Set transfer rate composite divisor to 2^5 * 32 = 1024 */
+		dev_err(&p->pdev->dev,
+			"Requested SPI transfer rate %d is too low\n", spi_hz);
+		div_pow = 5;
+		brps = 32;
+	}
 
-	scr = sh_msiof_spi_div_table[k].brdv | SCR_BRPS(brps);
+	scr = sh_msiof_spi_div_array[div_pow] | SCR_BRPS(brps);
 	sh_msiof_write(p, TSCR, scr);
 	if (!(p->master->flags & SPI_MASTER_MUST_TX))
 		sh_msiof_write(p, RSCR, scr);
@@ -564,14 +567,16 @@ static int sh_msiof_spi_setup(struct spi_device *spi)
 
 	/* Configure native chip select mode/polarity early */
 	clr = MDR1_SYNCMD_MASK;
-	set = MDR1_TRMD | TMDR1_PCON | MDR1_SYNCMD_SPI;
+	set = MDR1_SYNCMD_SPI;
 	if (spi->mode & SPI_CS_HIGH)
 		clr |= BIT(MDR1_SYNCAC_SHIFT);
 	else
 		set |= BIT(MDR1_SYNCAC_SHIFT);
 	pm_runtime_get_sync(&p->pdev->dev);
 	tmp = sh_msiof_read(p, TMDR1) & ~clr;
-	sh_msiof_write(p, TMDR1, tmp | set);
+	sh_msiof_write(p, TMDR1, tmp | set | MDR1_TRMD | TMDR1_PCON);
+	tmp = sh_msiof_read(p, RMDR1) & ~clr;
+	sh_msiof_write(p, RMDR1, tmp | set);
 	pm_runtime_put(&p->pdev->dev);
 	p->native_cs_high = spi->mode & SPI_CS_HIGH;
 	p->native_cs_inited = true;
@@ -1041,21 +1046,21 @@ static const struct sh_msiof_chipdata sh_data = {
 	.tx_fifo_size = 64,
 	.rx_fifo_size = 64,
 	.master_flags = 0,
-	.min_div = 1,
+	.min_div_pow = 0,
 };
 
 static const struct sh_msiof_chipdata rcar_gen2_data = {
 	.tx_fifo_size = 64,
 	.rx_fifo_size = 64,
 	.master_flags = SPI_MASTER_MUST_TX,
-	.min_div = 1,
+	.min_div_pow = 0,
 };
 
 static const struct sh_msiof_chipdata rcar_gen3_data = {
 	.tx_fifo_size = 64,
 	.rx_fifo_size = 64,
 	.master_flags = SPI_MASTER_MUST_TX,
-	.min_div = 2,
+	.min_div_pow = 1,
 };
 
 static const struct of_device_id sh_msiof_match[] = {
@@ -1319,7 +1324,7 @@ static int sh_msiof_spi_probe(struct platform_device *pdev)
 	platform_set_drvdata(pdev, p);
 	p->master = master;
 	p->info = info;
-	p->min_div = chipdata->min_div;
+	p->min_div_pow = chipdata->min_div_pow;
 
 	init_completion(&p->done);
 
diff --git a/drivers/spi/spi-stm32.c b/drivers/spi/spi-stm32.c
index ba9743f..ad1e55d 100644
--- a/drivers/spi/spi-stm32.c
+++ b/drivers/spi/spi-stm32.c
@@ -1129,7 +1129,7 @@ static int stm32_spi_probe(struct platform_device *pdev)
 	if (!spi->clk_rate) {
 		dev_err(&pdev->dev, "clk rate = 0\n");
 		ret = -EINVAL;
-		goto err_master_put;
+		goto err_clk_disable;
 	}
 
 	spi->rst = devm_reset_control_get_exclusive(&pdev->dev, NULL);
diff --git a/drivers/spi/spi-ti-qspi.c b/drivers/spi/spi-ti-qspi.c
index c24d9b4..5f19016 100644
--- a/drivers/spi/spi-ti-qspi.c
+++ b/drivers/spi/spi-ti-qspi.c
@@ -36,6 +36,7 @@
 #include <linux/sizes.h>
 
 #include <linux/spi/spi.h>
+#include <linux/spi/spi-mem.h>
 
 struct ti_qspi_regs {
 	u32 clkctrl;
@@ -50,6 +51,7 @@ struct ti_qspi {
 	struct spi_master	*master;
 	void __iomem            *base;
 	void __iomem            *mmap_base;
+	size_t			mmap_size;
 	struct regmap		*ctrl_base;
 	unsigned int		ctrl_reg;
 	struct clk		*fclk;
@@ -434,12 +436,10 @@ static int ti_qspi_dma_xfer(struct ti_qspi *qspi, dma_addr_t dma_dst,
 	return 0;
 }
 
-static int ti_qspi_dma_bounce_buffer(struct ti_qspi *qspi,
-				     struct spi_flash_read_message *msg)
+static int ti_qspi_dma_bounce_buffer(struct ti_qspi *qspi, loff_t offs,
+				     void *to, size_t readsize)
 {
-	size_t readsize = msg->len;
-	void *to = msg->buf;
-	dma_addr_t dma_src = qspi->mmap_phys_base + msg->from;
+	dma_addr_t dma_src = qspi->mmap_phys_base + offs;
 	int ret = 0;
 
 	/*
@@ -507,13 +507,14 @@ static void ti_qspi_disable_memory_map(struct spi_device *spi)
 	qspi->mmap_enabled = false;
 }
 
-static void ti_qspi_setup_mmap_read(struct spi_device *spi,
-				    struct spi_flash_read_message *msg)
+static void ti_qspi_setup_mmap_read(struct spi_device *spi, u8 opcode,
+				    u8 data_nbits, u8 addr_width,
+				    u8 dummy_bytes)
 {
 	struct ti_qspi  *qspi = spi_master_get_devdata(spi->master);
-	u32 memval = msg->read_opcode;
+	u32 memval = opcode;
 
-	switch (msg->data_nbits) {
+	switch (data_nbits) {
 	case SPI_NBITS_QUAD:
 		memval |= QSPI_SETUP_RD_QUAD;
 		break;
@@ -524,48 +525,64 @@ static void ti_qspi_setup_mmap_read(struct spi_device *spi,
 		memval |= QSPI_SETUP_RD_NORMAL;
 		break;
 	}
-	memval |= ((msg->addr_width - 1) << QSPI_SETUP_ADDR_SHIFT |
-		   msg->dummy_bytes << QSPI_SETUP_DUMMY_SHIFT);
+	memval |= ((addr_width - 1) << QSPI_SETUP_ADDR_SHIFT |
+		   dummy_bytes << QSPI_SETUP_DUMMY_SHIFT);
 	ti_qspi_write(qspi, memval,
 		      QSPI_SPI_SETUP_REG(spi->chip_select));
 }
 
-static bool ti_qspi_spi_flash_can_dma(struct spi_device *spi,
-				      struct spi_flash_read_message *msg)
+static int ti_qspi_exec_mem_op(struct spi_mem *mem,
+			       const struct spi_mem_op *op)
 {
-	return virt_addr_valid(msg->buf);
-}
-
-static int ti_qspi_spi_flash_read(struct spi_device *spi,
-				  struct spi_flash_read_message *msg)
-{
-	struct ti_qspi *qspi = spi_master_get_devdata(spi->master);
+	struct ti_qspi *qspi = spi_master_get_devdata(mem->spi->master);
+	u32 from = 0;
 	int ret = 0;
 
+	/* Only optimize read path. */
+	if (!op->data.nbytes || op->data.dir != SPI_MEM_DATA_IN ||
+	    !op->addr.nbytes || op->addr.nbytes > 4)
+		return -ENOTSUPP;
+
+	/* Address exceeds MMIO window size, fall back to regular mode. */
+	from = op->addr.val;
+	if (from + op->data.nbytes > qspi->mmap_size)
+		return -ENOTSUPP;
+
 	mutex_lock(&qspi->list_lock);
 
 	if (!qspi->mmap_enabled)
-		ti_qspi_enable_memory_map(spi);
-	ti_qspi_setup_mmap_read(spi, msg);
+		ti_qspi_enable_memory_map(mem->spi);
+	ti_qspi_setup_mmap_read(mem->spi, op->cmd.opcode, op->data.buswidth,
+				op->addr.nbytes, op->dummy.nbytes);
 
 	if (qspi->rx_chan) {
-		if (msg->cur_msg_mapped)
-			ret = ti_qspi_dma_xfer_sg(qspi, msg->rx_sg, msg->from);
-		else
-			ret = ti_qspi_dma_bounce_buffer(qspi, msg);
-		if (ret)
-			goto err_unlock;
+		struct sg_table sgt;
+
+		if (virt_addr_valid(op->data.buf.in) &&
+		    !spi_controller_dma_map_mem_op_data(mem->spi->master, op,
+							&sgt)) {
+			ret = ti_qspi_dma_xfer_sg(qspi, sgt, from);
+			spi_controller_dma_unmap_mem_op_data(mem->spi->master,
+							     op, &sgt);
+		} else {
+			ret = ti_qspi_dma_bounce_buffer(qspi, from,
+							op->data.buf.in,
+							op->data.nbytes);
+		}
 	} else {
-		memcpy_fromio(msg->buf, qspi->mmap_base + msg->from, msg->len);
+		memcpy_fromio(op->data.buf.in, qspi->mmap_base + from,
+			      op->data.nbytes);
 	}
-	msg->retlen = msg->len;
 
-err_unlock:
 	mutex_unlock(&qspi->list_lock);
 
 	return ret;
 }
 
+static const struct spi_controller_mem_ops ti_qspi_mem_ops = {
+	.exec_op = ti_qspi_exec_mem_op,
+};
+
 static int ti_qspi_start_transfer_one(struct spi_master *master,
 		struct spi_message *m)
 {
@@ -672,7 +689,7 @@ static int ti_qspi_probe(struct platform_device *pdev)
 	master->dev.of_node = pdev->dev.of_node;
 	master->bits_per_word_mask = SPI_BPW_MASK(32) | SPI_BPW_MASK(16) |
 				     SPI_BPW_MASK(8);
-	master->spi_flash_read = ti_qspi_spi_flash_read;
+	master->mem_ops = &ti_qspi_mem_ops;
 
 	if (!of_property_read_u32(np, "num-cs", &num_cs))
 		master->num_chipselect = num_cs;
@@ -702,6 +719,9 @@ static int ti_qspi_probe(struct platform_device *pdev)
 		}
 	}
 
+	if (res_mmap)
+		qspi->mmap_size = resource_size(res_mmap);
+
 	irq = platform_get_irq(pdev, 0);
 	if (irq < 0) {
 		dev_err(&pdev->dev, "no irq resource?\n");
@@ -770,7 +790,6 @@ static int ti_qspi_probe(struct platform_device *pdev)
 		dma_release_channel(qspi->rx_chan);
 		goto no_dma;
 	}
-	master->spi_flash_can_dma = ti_qspi_spi_flash_can_dma;
 	master->dma_rx = qspi->rx_chan;
 	init_completion(&qspi->transfer_complete);
 	if (res_mmap)
@@ -784,7 +803,7 @@ no_dma:
 				 "mmap failed with error %ld using PIO mode\n",
 				 PTR_ERR(qspi->mmap_base));
 			qspi->mmap_base = NULL;
-			master->spi_flash_read = NULL;
+			master->mem_ops = NULL;
 		}
 	}
 	qspi->mmap_enabled = false;
diff --git a/drivers/spi/spi-zynqmp-gqspi.c b/drivers/spi/spi-zynqmp-gqspi.c
index 18aeace..cc4d310 100644
--- a/drivers/spi/spi-zynqmp-gqspi.c
+++ b/drivers/spi/spi-zynqmp-gqspi.c
@@ -20,6 +20,7 @@
 #include <linux/of_irq.h>
 #include <linux/of_address.h>
 #include <linux/platform_device.h>
+#include <linux/pm_runtime.h>
 #include <linux/spi/spi.h>
 #include <linux/spinlock.h>
 #include <linux/workqueue.h>
@@ -135,6 +136,7 @@
 #define GQSPI_DMA_UNALIGN		0x3
 #define GQSPI_DEFAULT_NUM_CS	1	/* Default number of chip selects */
 
+#define SPI_AUTOSUSPEND_TIMEOUT		3000
 enum mode_type {GQSPI_MODE_IO, GQSPI_MODE_DMA};
 
 /**
@@ -356,21 +358,9 @@ static void zynqmp_qspi_copy_read_data(struct zynqmp_qspi *xqspi,
 static int zynqmp_prepare_transfer_hardware(struct spi_master *master)
 {
 	struct zynqmp_qspi *xqspi = spi_master_get_devdata(master);
-	int ret;
-
-	ret = clk_enable(xqspi->refclk);
-	if (ret)
-		return ret;
-
-	ret = clk_enable(xqspi->pclk);
-	if (ret)
-		goto clk_err;
 
 	zynqmp_gqspi_write(xqspi, GQSPI_EN_OFST, GQSPI_EN_MASK);
 	return 0;
-clk_err:
-	clk_disable(xqspi->refclk);
-	return ret;
 }
 
 /**
@@ -387,8 +377,6 @@ static int zynqmp_unprepare_transfer_hardware(struct spi_master *master)
 	struct zynqmp_qspi *xqspi = spi_master_get_devdata(master);
 
 	zynqmp_gqspi_write(xqspi, GQSPI_EN_OFST, 0x0);
-	clk_disable(xqspi->refclk);
-	clk_disable(xqspi->pclk);
 	return 0;
 }
 
@@ -918,8 +906,7 @@ static int zynqmp_qspi_start_transfer(struct spi_master *master,
  */
 static int __maybe_unused zynqmp_qspi_suspend(struct device *dev)
 {
-	struct platform_device *pdev = to_platform_device(dev);
-	struct spi_master *master = platform_get_drvdata(pdev);
+	struct spi_master *master = dev_get_drvdata(dev);
 
 	spi_master_suspend(master);
 
@@ -939,8 +926,7 @@ static int __maybe_unused zynqmp_qspi_suspend(struct device *dev)
  */
 static int __maybe_unused zynqmp_qspi_resume(struct device *dev)
 {
-	struct platform_device *pdev = to_platform_device(dev);
-	struct spi_master *master = platform_get_drvdata(pdev);
+	struct spi_master *master = dev_get_drvdata(dev);
 	struct zynqmp_qspi *xqspi = spi_master_get_devdata(master);
 	int ret = 0;
 
@@ -959,11 +945,67 @@ static int __maybe_unused zynqmp_qspi_resume(struct device *dev)
 
 	spi_master_resume(master);
 
+	clk_disable(xqspi->refclk);
+	clk_disable(xqspi->pclk);
 	return 0;
 }
 
-static SIMPLE_DEV_PM_OPS(zynqmp_qspi_dev_pm_ops, zynqmp_qspi_suspend,
-			 zynqmp_qspi_resume);
+/**
+ * zynqmp_runtime_suspend - Runtime suspend method for the SPI driver
+ * @dev:	Address of the platform_device structure
+ *
+ * This function disables the clocks
+ *
+ * Return:	Always 0
+ */
+static int __maybe_unused zynqmp_runtime_suspend(struct device *dev)
+{
+	struct platform_device *pdev = to_platform_device(dev);
+	struct spi_master *master = platform_get_drvdata(pdev);
+	struct zynqmp_qspi *xqspi = spi_master_get_devdata(master);
+
+	clk_disable(xqspi->refclk);
+	clk_disable(xqspi->pclk);
+
+	return 0;
+}
+
+/**
+ * zynqmp_runtime_resume - Runtime resume method for the SPI driver
+ * @dev:	Address of the platform_device structure
+ *
+ * This function enables the clocks
+ *
+ * Return:	0 on success and error value on error
+ */
+static int __maybe_unused zynqmp_runtime_resume(struct device *dev)
+{
+	struct platform_device *pdev = to_platform_device(dev);
+	struct spi_master *master = platform_get_drvdata(pdev);
+	struct zynqmp_qspi *xqspi = spi_master_get_devdata(master);
+	int ret;
+
+	ret = clk_enable(xqspi->pclk);
+	if (ret) {
+		dev_err(dev, "Cannot enable APB clock.\n");
+		return ret;
+	}
+
+	ret = clk_enable(xqspi->refclk);
+	if (ret) {
+		dev_err(dev, "Cannot enable device clock.\n");
+		clk_disable(xqspi->pclk);
+		return ret;
+	}
+
+	return 0;
+}
+
+static const struct dev_pm_ops zynqmp_qspi_dev_pm_ops = {
+	SET_RUNTIME_PM_OPS(zynqmp_runtime_suspend,
+			   zynqmp_runtime_resume, NULL)
+	SET_SYSTEM_SLEEP_PM_OPS(zynqmp_qspi_suspend, zynqmp_qspi_resume)
+};
 
 /**
  * zynqmp_qspi_probe:	Probe method for the QSPI driver
@@ -1023,9 +1065,15 @@ static int zynqmp_qspi_probe(struct platform_device *pdev)
 		goto clk_dis_pclk;
 	}
 
+	pm_runtime_use_autosuspend(&pdev->dev);
+	pm_runtime_set_autosuspend_delay(&pdev->dev, SPI_AUTOSUSPEND_TIMEOUT);
+	pm_runtime_set_active(&pdev->dev);
+	pm_runtime_enable(&pdev->dev);
 	/* QSPI controller initializations */
 	zynqmp_qspi_init_hw(xqspi);
 
+	pm_runtime_mark_last_busy(&pdev->dev);
+	pm_runtime_put_autosuspend(&pdev->dev);
 	xqspi->irq = platform_get_irq(pdev, 0);
 	if (xqspi->irq <= 0) {
 		ret = -ENXIO;
@@ -1063,6 +1111,8 @@ static int zynqmp_qspi_probe(struct platform_device *pdev)
 	return 0;
 
 clk_dis_all:
+	pm_runtime_set_suspended(&pdev->dev);
+	pm_runtime_disable(&pdev->dev);
 	clk_disable_unprepare(xqspi->refclk);
 clk_dis_pclk:
 	clk_disable_unprepare(xqspi->pclk);
@@ -1090,6 +1140,8 @@ static int zynqmp_qspi_remove(struct platform_device *pdev)
 	zynqmp_gqspi_write(xqspi, GQSPI_EN_OFST, 0x0);
 	clk_disable_unprepare(xqspi->refclk);
 	clk_disable_unprepare(xqspi->pclk);
+	pm_runtime_set_suspended(&pdev->dev);
+	pm_runtime_disable(&pdev->dev);
 
 	spi_unregister_master(master);
 
diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index 7b213fa..20b5b27 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -28,6 +28,7 @@
 #include <linux/slab.h>
 #include <linux/mod_devicetable.h>
 #include <linux/spi/spi.h>
+#include <linux/spi/spi-mem.h>
 #include <linux/of_gpio.h>
 #include <linux/pm_runtime.h>
 #include <linux/pm_domain.h>
@@ -46,6 +47,8 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/spi.h>
 
+#include "internals.h"
+
 static DEFINE_IDR(spi_master_idr);
 
 static void spidev_release(struct device *dev)
@@ -740,9 +743,9 @@ static void spi_set_cs(struct spi_device *spi, bool enable)
 }
 
 #ifdef CONFIG_HAS_DMA
-static int spi_map_buf(struct spi_controller *ctlr, struct device *dev,
-		       struct sg_table *sgt, void *buf, size_t len,
-		       enum dma_data_direction dir)
+int spi_map_buf(struct spi_controller *ctlr, struct device *dev,
+		struct sg_table *sgt, void *buf, size_t len,
+		enum dma_data_direction dir)
 {
 	const bool vmalloced_buf = is_vmalloc_addr(buf);
 	unsigned int max_seg_size = dma_get_max_seg_size(dev);
@@ -821,8 +824,8 @@ static int spi_map_buf(struct spi_controller *ctlr, struct device *dev,
 	return 0;
 }
 
-static void spi_unmap_buf(struct spi_controller *ctlr, struct device *dev,
-			  struct sg_table *sgt, enum dma_data_direction dir)
+void spi_unmap_buf(struct spi_controller *ctlr, struct device *dev,
+		   struct sg_table *sgt, enum dma_data_direction dir)
 {
 	if (sgt->orig_nents) {
 		dma_unmap_sg(dev, sgt->sgl, sgt->orig_nents, dir);
@@ -907,19 +910,6 @@ static int __spi_unmap_msg(struct spi_controller *ctlr, struct spi_message *msg)
 	return 0;
 }
 #else /* !CONFIG_HAS_DMA */
-static inline int spi_map_buf(struct spi_controller *ctlr, struct device *dev,
-			      struct sg_table *sgt, void *buf, size_t len,
-			      enum dma_data_direction dir)
-{
-	return -EINVAL;
-}
-
-static inline void spi_unmap_buf(struct spi_controller *ctlr,
-				 struct device *dev, struct sg_table *sgt,
-				 enum dma_data_direction dir)
-{
-}
-
 static inline int __spi_map_msg(struct spi_controller *ctlr,
 				struct spi_message *msg)
 {
@@ -1222,6 +1212,7 @@ static void __spi_pump_messages(struct spi_controller *ctlr, bool in_kthread)
 	if (!was_busy && ctlr->auto_runtime_pm) {
 		ret = pm_runtime_get_sync(ctlr->dev.parent);
 		if (ret < 0) {
+			pm_runtime_put_noidle(ctlr->dev.parent);
 			dev_err(&ctlr->dev, "Failed to power device: %d\n",
 				ret);
 			mutex_unlock(&ctlr->io_mutex);
@@ -1533,6 +1524,22 @@ err_init_queue:
 	return ret;
 }
 
+/**
+ * spi_flush_queue - Send all pending messages in the queue from the callers'
+ *		     context
+ * @ctlr: controller to process queue for
+ *
+ * This should be used when one wants to ensure all pending messages have been
+ * sent before doing something. Is used by the spi-mem code to make sure SPI
+ * memory operations do not preempt regular SPI transfers that have been queued
+ * before the spi-mem operation.
+ */
+void spi_flush_queue(struct spi_controller *ctlr)
+{
+	if (ctlr->transfer == spi_queued_transfer)
+		__spi_pump_messages(ctlr, false);
+}
+
 /*-------------------------------------------------------------------------*/
 
 #if defined(CONFIG_OF)
@@ -2063,6 +2070,26 @@ static int of_spi_register_master(struct spi_controller *ctlr)
 }
 #endif
 
+static int spi_controller_check_ops(struct spi_controller *ctlr)
+{
+	/*
+	 * The controller may implement only the high-level SPI-memory like
+	 * operations if it does not support regular SPI transfers, and this is
+	 * valid use case.
+	 * If ->mem_ops is NULL, we request that at least one of the
+	 * ->transfer_xxx() method be implemented.
+	 */
+	if (ctlr->mem_ops) {
+		if (!ctlr->mem_ops->exec_op)
+			return -EINVAL;
+	} else if (!ctlr->transfer && !ctlr->transfer_one &&
+		   !ctlr->transfer_one_message) {
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 /**
  * spi_register_controller - register SPI master or slave controller
  * @ctlr: initialized master, originally from spi_alloc_master() or
@@ -2096,6 +2123,14 @@ int spi_register_controller(struct spi_controller *ctlr)
 	if (!dev)
 		return -ENODEV;
 
+	/*
+	 * Make sure all necessary hooks are implemented before registering
+	 * the SPI controller.
+	 */
+	status = spi_controller_check_ops(ctlr);
+	if (status)
+		return status;
+
 	if (!spi_controller_is_slave(ctlr)) {
 		status = of_spi_register_master(ctlr);
 		if (status)
@@ -2161,10 +2196,14 @@ int spi_register_controller(struct spi_controller *ctlr)
 			spi_controller_is_slave(ctlr) ? "slave" : "master",
 			dev_name(&ctlr->dev));
 
-	/* If we're using a queued driver, start the queue */
-	if (ctlr->transfer)
+	/*
+	 * If we're using a queued driver, start the queue. Note that we don't
+	 * need the queueing logic if the driver is only supporting high-level
+	 * memory operations.
+	 */
+	if (ctlr->transfer) {
 		dev_info(dev, "controller is unqueued, this is deprecated\n");
-	else {
+	} else if (ctlr->transfer_one || ctlr->transfer_one_message) {
 		status = spi_controller_initialize_queue(ctlr);
 		if (status) {
 			device_del(&ctlr->dev);
@@ -2894,6 +2933,13 @@ static int __spi_async(struct spi_device *spi, struct spi_message *message)
 {
 	struct spi_controller *ctlr = spi->controller;
 
+	/*
+	 * Some controllers do not support doing regular SPI transfers. Return
+	 * ENOTSUPP when this is the case.
+	 */
+	if (!ctlr->transfer)
+		return -ENOTSUPP;
+
 	message->spi = spi;
 
 	SPI_STATISTICS_INCREMENT_FIELD(&ctlr->statistics, spi_async);
@@ -3010,63 +3056,6 @@ int spi_async_locked(struct spi_device *spi, struct spi_message *message)
 }
 EXPORT_SYMBOL_GPL(spi_async_locked);
 
-
-int spi_flash_read(struct spi_device *spi,
-		   struct spi_flash_read_message *msg)
-
-{
-	struct spi_controller *master = spi->controller;
-	struct device *rx_dev = NULL;
-	int ret;
-
-	if ((msg->opcode_nbits == SPI_NBITS_DUAL ||
-	     msg->addr_nbits == SPI_NBITS_DUAL) &&
-	    !(spi->mode & (SPI_TX_DUAL | SPI_TX_QUAD)))
-		return -EINVAL;
-	if ((msg->opcode_nbits == SPI_NBITS_QUAD ||
-	     msg->addr_nbits == SPI_NBITS_QUAD) &&
-	    !(spi->mode & SPI_TX_QUAD))
-		return -EINVAL;
-	if (msg->data_nbits == SPI_NBITS_DUAL &&
-	    !(spi->mode & (SPI_RX_DUAL | SPI_RX_QUAD)))
-		return -EINVAL;
-	if (msg->data_nbits == SPI_NBITS_QUAD &&
-	    !(spi->mode &  SPI_RX_QUAD))
-		return -EINVAL;
-
-	if (master->auto_runtime_pm) {
-		ret = pm_runtime_get_sync(master->dev.parent);
-		if (ret < 0) {
-			dev_err(&master->dev, "Failed to power device: %d\n",
-				ret);
-			return ret;
-		}
-	}
-
-	mutex_lock(&master->bus_lock_mutex);
-	mutex_lock(&master->io_mutex);
-	if (master->dma_rx && master->spi_flash_can_dma(spi, msg)) {
-		rx_dev = master->dma_rx->device->dev;
-		ret = spi_map_buf(master, rx_dev, &msg->rx_sg,
-				  msg->buf, msg->len,
-				  DMA_FROM_DEVICE);
-		if (!ret)
-			msg->cur_msg_mapped = true;
-	}
-	ret = master->spi_flash_read(spi, msg);
-	if (msg->cur_msg_mapped)
-		spi_unmap_buf(master, rx_dev, &msg->rx_sg,
-			      DMA_FROM_DEVICE);
-	mutex_unlock(&master->io_mutex);
-	mutex_unlock(&master->bus_lock_mutex);
-
-	if (master->auto_runtime_pm)
-		pm_runtime_put(master->dev.parent);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(spi_flash_read);
-
 /*-------------------------------------------------------------------------*/
 
 /* Utility methods for SPI protocol drivers, layered on
diff --git a/drivers/staging/comedi/drivers/serial2002.c b/drivers/staging/comedi/drivers/serial2002.c
index b3f3b4a..5471b22 100644
--- a/drivers/staging/comedi/drivers/serial2002.c
+++ b/drivers/staging/comedi/drivers/serial2002.c
@@ -113,7 +113,7 @@ static void serial2002_tty_read_poll_wait(struct file *f, int timeout)
 		long elapsed;
 		__poll_t mask;
 
-		mask = f->f_op->poll(f, &table.pt);
+		mask = vfs_poll(f, &table.pt);
 		if (mask & (EPOLLRDNORM | EPOLLRDBAND | EPOLLIN |
 			    EPOLLHUP | EPOLLERR)) {
 			break;
@@ -136,7 +136,7 @@ static int serial2002_tty_read(struct file *f, int timeout)
 
 	result = -1;
 	if (!IS_ERR(f)) {
-		if (f->f_op->poll) {
+		if (file_can_poll(f)) {
 			serial2002_tty_read_poll_wait(f, timeout);
 
 			if (kernel_read(f, &ch, 1, &pos) == 1)
diff --git a/drivers/staging/comedi/proc.c b/drivers/staging/comedi/proc.c
index 50d3893..8bc8e42 100644
--- a/drivers/staging/comedi/proc.c
+++ b/drivers/staging/comedi/proc.c
@@ -62,25 +62,9 @@ static int comedi_read(struct seq_file *m, void *v)
 	return 0;
 }
 
-/*
- * seq_file wrappers for procfile show routines.
- */
-static int comedi_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, comedi_read, NULL);
-}
-
-static const struct file_operations comedi_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= comedi_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 void __init comedi_proc_init(void)
 {
-	if (!proc_create("comedi", 0444, NULL, &comedi_proc_fops))
+	if (!proc_create_single("comedi", 0444, NULL, comedi_read))
 		pr_warn("comedi: unable to create proc entry\n");
 }
 
diff --git a/drivers/staging/fwserial/fwserial.c b/drivers/staging/fwserial/fwserial.c
index e8bfe55..fa0dd42 100644
--- a/drivers/staging/fwserial/fwserial.c
+++ b/drivers/staging/fwserial/fwserial.c
@@ -1506,11 +1506,6 @@ static int fwtty_debugfs_peers_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int fwtty_proc_open(struct inode *inode, struct file *fp)
-{
-	return single_open(fp, fwtty_proc_show, NULL);
-}
-
 static int fwtty_stats_open(struct inode *inode, struct file *fp)
 {
 	return single_open(fp, fwtty_debugfs_stats_show, inode->i_private);
@@ -1537,14 +1532,6 @@ static const struct file_operations fwtty_peers_fops = {
 	.release =	single_release,
 };
 
-static const struct file_operations fwtty_proc_fops = {
-	.owner =        THIS_MODULE,
-	.open =         fwtty_proc_open,
-	.read =         seq_read,
-	.llseek =       seq_lseek,
-	.release =      single_release,
-};
-
 static const struct tty_port_operations fwtty_port_ops = {
 	.dtr_rts =		fwtty_port_dtr_rts,
 	.carrier_raised =	fwtty_port_carrier_raised,
@@ -1570,7 +1557,7 @@ static const struct tty_operations fwtty_ops = {
 	.tiocmget =		fwtty_tiocmget,
 	.tiocmset =		fwtty_tiocmset,
 	.get_icount =		fwtty_get_icount,
-	.proc_fops =		&fwtty_proc_fops,
+	.proc_show =		fwtty_proc_show,
 };
 
 static const struct tty_operations fwloop_ops = {
diff --git a/drivers/staging/ipx/af_ipx.c b/drivers/staging/ipx/af_ipx.c
index 5703dd1..208b5c1 100644
--- a/drivers/staging/ipx/af_ipx.c
+++ b/drivers/staging/ipx/af_ipx.c
@@ -1965,7 +1965,7 @@ static const struct proto_ops ipx_dgram_ops = {
 	.socketpair	= sock_no_socketpair,
 	.accept		= sock_no_accept,
 	.getname	= ipx_getname,
-	.poll		= datagram_poll,
+	.poll_mask	= datagram_poll_mask,
 	.ioctl		= ipx_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ipx_compat_ioctl,
diff --git a/drivers/staging/ipx/ipx_proc.c b/drivers/staging/ipx/ipx_proc.c
index b9232e4..360f0ad 100644
--- a/drivers/staging/ipx/ipx_proc.c
+++ b/drivers/staging/ipx/ipx_proc.c
@@ -244,42 +244,6 @@ static const struct seq_operations ipx_seq_socket_ops = {
 	.show   = ipx_seq_socket_show,
 };
 
-static int ipx_seq_route_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &ipx_seq_route_ops);
-}
-
-static int ipx_seq_interface_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &ipx_seq_interface_ops);
-}
-
-static int ipx_seq_socket_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &ipx_seq_socket_ops);
-}
-
-static const struct file_operations ipx_seq_interface_fops = {
-	.open           = ipx_seq_interface_open,
-	.read           = seq_read,
-	.llseek         = seq_lseek,
-	.release        = seq_release,
-};
-
-static const struct file_operations ipx_seq_route_fops = {
-	.open           = ipx_seq_route_open,
-	.read           = seq_read,
-	.llseek         = seq_lseek,
-	.release        = seq_release,
-};
-
-static const struct file_operations ipx_seq_socket_fops = {
-	.open           = ipx_seq_socket_open,
-	.read           = seq_read,
-	.llseek         = seq_lseek,
-	.release        = seq_release,
-};
-
 static struct proc_dir_entry *ipx_proc_dir;
 
 int __init ipx_proc_init(void)
@@ -291,16 +255,17 @@ int __init ipx_proc_init(void)
 
 	if (!ipx_proc_dir)
 		goto out;
-	p = proc_create("interface", S_IRUGO,
-			ipx_proc_dir, &ipx_seq_interface_fops);
+	p = proc_create_seq("interface", S_IRUGO, ipx_proc_dir,
+			&ipx_seq_interface_ops);
 	if (!p)
 		goto out_interface;
 
-	p = proc_create("route", S_IRUGO, ipx_proc_dir, &ipx_seq_route_fops);
+	p = proc_create_seq("route", S_IRUGO, ipx_proc_dir, &ipx_seq_route_ops);
 	if (!p)
 		goto out_route;
 
-	p = proc_create("socket", S_IRUGO, ipx_proc_dir, &ipx_seq_socket_fops);
+	p = proc_create_seq("socket", S_IRUGO, ipx_proc_dir,
+			&ipx_seq_socket_ops);
 	if (!p)
 		goto out_socket;
 
diff --git a/drivers/staging/ncpfs/dir.c b/drivers/staging/ncpfs/dir.c
index 0c57c5c..072bcb1 100644
--- a/drivers/staging/ncpfs/dir.c
+++ b/drivers/staging/ncpfs/dir.c
@@ -823,12 +823,11 @@ static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, unsig
 	struct ncp_server *server = NCP_SERVER(dir);
 	struct inode *inode = NULL;
 	struct ncp_entry_info finfo;
-	int error, res, len;
+	int res, len;
 	__u8 __name[NCP_MAXPATHLEN + 1];
 
-	error = -EIO;
 	if (!ncp_conn_valid(server))
-		goto finished;
+		return ERR_PTR(-EIO);
 
 	ncp_vdbg("server lookup for %pd2\n", dentry);
 
@@ -847,31 +846,20 @@ static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, unsig
 			res = ncp_obtain_info(server, dir, __name, &(finfo.i));
 	}
 	ncp_vdbg("looked for %pd2, res=%d\n", dentry, res);
-	/*
-	 * If we didn't find an entry, make a negative dentry.
-	 */
-	if (res)
-		goto add_entry;
-
-	/*
-	 * Create an inode for the entry.
-	 */
-	finfo.opened = 0;
-	finfo.ino = iunique(dir->i_sb, 2);
-	finfo.volume = finfo.i.volNumber;
-	error = -EACCES;
-	inode = ncp_iget(dir->i_sb, &finfo);
-
-	if (inode) {
-		ncp_new_dentry(dentry);
-add_entry:
-		d_add(dentry, inode);
-		error = 0;
+	if (!res) {
+		/*
+		 * Entry found; create an inode for it.
+		 */
+		finfo.opened = 0;
+		finfo.ino = iunique(dir->i_sb, 2);
+		finfo.volume = finfo.i.volNumber;
+		inode = ncp_iget(dir->i_sb, &finfo);
+		if (unlikely(!inode))
+			inode = ERR_PTR(-EACCES);
+		else
+			ncp_new_dentry(dentry);
 	}
-
-finished:
-	ncp_vdbg("result=%d\n", error);
-	return ERR_PTR(error);
+	return d_splice_alias(inode, dentry);
 }
 
 /*
diff --git a/drivers/staging/rtl8192u/r8192U_core.c b/drivers/staging/rtl8192u/r8192U_core.c
index d607c59..7a0dbc0 100644
--- a/drivers/staging/rtl8192u/r8192U_core.c
+++ b/drivers/staging/rtl8192u/r8192U_core.c
@@ -646,64 +646,25 @@ static void rtl8192_proc_module_init(void)
 	rtl8192_proc = proc_mkdir(RTL819xU_MODULE_NAME, init_net.proc_net);
 }
 
-/*
- * seq_file wrappers for procfile show routines.
- */
-static int rtl8192_proc_open(struct inode *inode, struct file *file)
-{
-	struct net_device *dev = proc_get_parent_data(inode);
-	int (*show)(struct seq_file *, void *) = PDE_DATA(inode);
-
-	return single_open(file, show, dev);
-}
-
-static const struct file_operations rtl8192_proc_fops = {
-	.open		= rtl8192_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
-/*
- * Table of proc files we need to create.
- */
-struct rtl8192_proc_file {
-	char name[12];
-	int (*show)(struct seq_file *, void *);
-};
-
-static const struct rtl8192_proc_file rtl8192_proc_files[] = {
-	{ "stats-rx",	&proc_get_stats_rx },
-	{ "stats-tx",	&proc_get_stats_tx },
-	{ "stats-ap",	&proc_get_stats_ap },
-	{ "registers",	&proc_get_registers },
-	{ "" }
-};
-
 static void rtl8192_proc_init_one(struct net_device *dev)
 {
-	const struct rtl8192_proc_file *f;
 	struct proc_dir_entry *dir;
 
-	if (rtl8192_proc) {
-		dir = proc_mkdir_data(dev->name, 0, rtl8192_proc, dev);
-		if (!dir) {
-			RT_TRACE(COMP_ERR,
-				 "Unable to initialize /proc/net/rtl8192/%s\n",
-				 dev->name);
-			return;
-		}
+	if (!rtl8192_proc)
+		return;
 
-		for (f = rtl8192_proc_files; f->name[0]; f++) {
-			if (!proc_create_data(f->name, S_IFREG | S_IRUGO, dir,
-					      &rtl8192_proc_fops, f->show)) {
-				RT_TRACE(COMP_ERR,
-					 "Unable to initialize /proc/net/rtl8192/%s/%s\n",
-					 dev->name, f->name);
-				return;
-			}
-		}
-	}
+	dir = proc_mkdir_data(dev->name, 0, rtl8192_proc, dev);
+	if (!dir)
+		return;
+
+	proc_create_single("stats-rx", S_IFREG | S_IRUGO, dir,
+			proc_get_stats_rx);
+	proc_create_single("stats-tx", S_IFREG | S_IRUGO, dir,
+			proc_get_stats_tx);
+	proc_create_single("stats-ap", S_IFREG | S_IRUGO, dir,
+			proc_get_stats_ap);
+	proc_create_single("registers", S_IFREG | S_IRUGO, dir,
+			proc_get_registers);
 }
 
 static void rtl8192_proc_remove_one(struct net_device *dev)
diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c
index 6042901..ce1321a 100644
--- a/drivers/target/target_core_iblock.c
+++ b/drivers/target/target_core_iblock.c
@@ -94,8 +94,8 @@ static int iblock_configure_device(struct se_device *dev)
 		return -EINVAL;
 	}
 
-	ib_dev->ibd_bio_set = bioset_create(IBLOCK_BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
-	if (!ib_dev->ibd_bio_set) {
+	ret = bioset_init(&ib_dev->ibd_bio_set, IBLOCK_BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
+	if (ret) {
 		pr_err("IBLOCK: Unable to create bioset\n");
 		goto out;
 	}
@@ -141,7 +141,7 @@ static int iblock_configure_device(struct se_device *dev)
 
 	bi = bdev_get_integrity(bd);
 	if (bi) {
-		struct bio_set *bs = ib_dev->ibd_bio_set;
+		struct bio_set *bs = &ib_dev->ibd_bio_set;
 
 		if (!strcmp(bi->profile->name, "T10-DIF-TYPE3-IP") ||
 		    !strcmp(bi->profile->name, "T10-DIF-TYPE1-IP")) {
@@ -164,7 +164,7 @@ static int iblock_configure_device(struct se_device *dev)
 				goto out_blkdev_put;
 			}
 			pr_debug("IBLOCK setup BIP bs->bio_integrity_pool: %p\n",
-				 bs->bio_integrity_pool);
+				 &bs->bio_integrity_pool);
 		}
 		dev->dev_attrib.hw_pi_prot_type = dev->dev_attrib.pi_prot_type;
 	}
@@ -174,8 +174,7 @@ static int iblock_configure_device(struct se_device *dev)
 out_blkdev_put:
 	blkdev_put(ib_dev->ibd_bd, FMODE_WRITE|FMODE_READ|FMODE_EXCL);
 out_free_bioset:
-	bioset_free(ib_dev->ibd_bio_set);
-	ib_dev->ibd_bio_set = NULL;
+	bioset_exit(&ib_dev->ibd_bio_set);
 out:
 	return ret;
 }
@@ -199,8 +198,7 @@ static void iblock_destroy_device(struct se_device *dev)
 
 	if (ib_dev->ibd_bd != NULL)
 		blkdev_put(ib_dev->ibd_bd, FMODE_WRITE|FMODE_READ|FMODE_EXCL);
-	if (ib_dev->ibd_bio_set != NULL)
-		bioset_free(ib_dev->ibd_bio_set);
+	bioset_exit(&ib_dev->ibd_bio_set);
 }
 
 static unsigned long long iblock_emulate_read_cap_with_block_size(
@@ -332,7 +330,7 @@ iblock_get_bio(struct se_cmd *cmd, sector_t lba, u32 sg_num, int op,
 	if (sg_num > BIO_MAX_PAGES)
 		sg_num = BIO_MAX_PAGES;
 
-	bio = bio_alloc_bioset(GFP_NOIO, sg_num, ib_dev->ibd_bio_set);
+	bio = bio_alloc_bioset(GFP_NOIO, sg_num, &ib_dev->ibd_bio_set);
 	if (!bio) {
 		pr_err("Unable to allocate memory for bio\n");
 		return NULL;
diff --git a/drivers/target/target_core_iblock.h b/drivers/target/target_core_iblock.h
index b4aeb25..9cc3843 100644
--- a/drivers/target/target_core_iblock.h
+++ b/drivers/target/target_core_iblock.h
@@ -22,7 +22,7 @@ struct iblock_dev {
 	struct se_device dev;
 	unsigned char ibd_udev_path[SE_UDEV_PATH_LEN];
 	u32	ibd_flags;
-	struct bio_set	*ibd_bio_set;
+	struct bio_set	ibd_bio_set;
 	struct block_device *ibd_bd;
 	bool ibd_readonly;
 } ____cacheline_aligned;
diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c
index 6cb933e..668934e 100644
--- a/drivers/target/target_core_pscsi.c
+++ b/drivers/target/target_core_pscsi.c
@@ -986,8 +986,7 @@ pscsi_execute_cmd(struct se_cmd *cmd)
 
 	req = blk_get_request(pdv->pdv_sd->request_queue,
 			cmd->data_direction == DMA_TO_DEVICE ?
-			REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN,
-			GFP_KERNEL);
+			REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, 0);
 	if (IS_ERR(req)) {
 		pr_err("PSCSI: blk_get_request() failed\n");
 		ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
diff --git a/drivers/tty/amiserial.c b/drivers/tty/amiserial.c
index 32d7ce4..34dead6 100644
--- a/drivers/tty/amiserial.c
+++ b/drivers/tty/amiserial.c
@@ -1566,19 +1566,6 @@ static int rs_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int rs_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, rs_proc_show, NULL);
-}
-
-static const struct file_operations rs_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= rs_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 /*
  * ---------------------------------------------------------------------
  * rs_init() and friends
@@ -1620,7 +1607,7 @@ static const struct tty_operations serial_ops = {
 	.tiocmget = rs_tiocmget,
 	.tiocmset = rs_tiocmset,
 	.get_icount = rs_get_icount,
-	.proc_fops = &rs_proc_fops,
+	.proc_show = rs_proc_show,
 };
 
 static int amiga_carrier_raised(struct tty_port *port)
diff --git a/drivers/tty/cyclades.c b/drivers/tty/cyclades.c
index cf0bde3..6d3c580 100644
--- a/drivers/tty/cyclades.c
+++ b/drivers/tty/cyclades.c
@@ -3972,19 +3972,6 @@ static int cyclades_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int cyclades_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, cyclades_proc_show, NULL);
-}
-
-static const struct file_operations cyclades_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= cyclades_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 /* The serial driver boot-time initialization code!
     Hardware I/O ports are mapped to character special devices on a
     first found, first allocated manner.  That is, this code searches
@@ -4024,7 +4011,7 @@ static const struct tty_operations cy_ops = {
 	.tiocmget = cy_tiocmget,
 	.tiocmset = cy_tiocmset,
 	.get_icount = cy_get_icount,
-	.proc_fops = &cyclades_proc_fops,
+	.proc_show = cyclades_proc_show,
 };
 
 static int __init cy_init(void)
diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index 0466f9f..6ff9405 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c
@@ -1829,19 +1829,6 @@ static int uart_proc_show(struct seq_file *m, void *v)
 		uart_line_info(m, drv, i);
 	return 0;
 }
-
-static int uart_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, uart_proc_show, PDE_DATA(inode));
-}
-
-static const struct file_operations uart_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= uart_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
 #endif
 
 #if defined(CONFIG_SERIAL_CORE_CONSOLE) || defined(CONFIG_CONSOLE_POLL)
@@ -2415,7 +2402,7 @@ static const struct tty_operations uart_ops = {
 	.break_ctl	= uart_break_ctl,
 	.wait_until_sent= uart_wait_until_sent,
 #ifdef CONFIG_PROC_FS
-	.proc_fops	= &uart_proc_fops,
+	.proc_show	= uart_proc_show,
 #endif
 	.tiocmget	= uart_tiocmget,
 	.tiocmset	= uart_tiocmset,
diff --git a/drivers/tty/synclink.c b/drivers/tty/synclink.c
index 3c4ad71..fbdf4d0 100644
--- a/drivers/tty/synclink.c
+++ b/drivers/tty/synclink.c
@@ -3534,19 +3534,6 @@ static int mgsl_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int mgsl_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, mgsl_proc_show, NULL);
-}
-
-static const struct file_operations mgsl_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= mgsl_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 /* mgsl_allocate_dma_buffers()
  * 
  * 	Allocate and format DMA buffers (ISA adapter)
@@ -4298,7 +4285,7 @@ static const struct tty_operations mgsl_ops = {
 	.tiocmget = tiocmget,
 	.tiocmset = tiocmset,
 	.get_icount = msgl_get_icount,
-	.proc_fops = &mgsl_proc_fops,
+	.proc_show = mgsl_proc_show,
 };
 
 /*
diff --git a/drivers/tty/synclink_gt.c b/drivers/tty/synclink_gt.c
index 255c496..a940865 100644
--- a/drivers/tty/synclink_gt.c
+++ b/drivers/tty/synclink_gt.c
@@ -1316,19 +1316,6 @@ static int synclink_gt_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int synclink_gt_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, synclink_gt_proc_show, NULL);
-}
-
-static const struct file_operations synclink_gt_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= synclink_gt_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 /*
  * return count of bytes in transmit buffer
  */
@@ -3721,7 +3708,7 @@ static const struct tty_operations ops = {
 	.tiocmget = tiocmget,
 	.tiocmset = tiocmset,
 	.get_icount = get_icount,
-	.proc_fops = &synclink_gt_proc_fops,
+	.proc_show = synclink_gt_proc_show,
 };
 
 static void slgt_cleanup(void)
diff --git a/drivers/tty/synclinkmp.c b/drivers/tty/synclinkmp.c
index 75f11ce..1e4d5b9 100644
--- a/drivers/tty/synclinkmp.c
+++ b/drivers/tty/synclinkmp.c
@@ -1421,19 +1421,6 @@ static int synclinkmp_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int synclinkmp_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, synclinkmp_proc_show, NULL);
-}
-
-static const struct file_operations synclinkmp_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= synclinkmp_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 /* Return the count of bytes in transmit buffer
  */
 static int chars_in_buffer(struct tty_struct *tty)
@@ -3899,7 +3886,7 @@ static const struct tty_operations ops = {
 	.tiocmget = tiocmget,
 	.tiocmset = tiocmset,
 	.get_icount = get_icount,
-	.proc_fops = &synclinkmp_proc_fops,
+	.proc_show = synclinkmp_proc_show,
 };
 
 
diff --git a/drivers/tty/tty_ldisc.c b/drivers/tty/tty_ldisc.c
index fb7329a..fc4c97c 100644
--- a/drivers/tty/tty_ldisc.c
+++ b/drivers/tty/tty_ldisc.c
@@ -229,26 +229,13 @@ static int tty_ldiscs_seq_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static const struct seq_operations tty_ldiscs_seq_ops = {
+const struct seq_operations tty_ldiscs_seq_ops = {
 	.start	= tty_ldiscs_seq_start,
 	.next	= tty_ldiscs_seq_next,
 	.stop	= tty_ldiscs_seq_stop,
 	.show	= tty_ldiscs_seq_show,
 };
 
-static int proc_tty_ldiscs_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &tty_ldiscs_seq_ops);
-}
-
-const struct file_operations tty_ldiscs_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= proc_tty_ldiscs_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
 /**
  *	tty_ldisc_ref_wait	-	wait for the tty ldisc
  *	@tty: tty device
diff --git a/drivers/usb/gadget/udc/at91_udc.c b/drivers/usb/gadget/udc/at91_udc.c
index ad743a8..03959dc 100644
--- a/drivers/usb/gadget/udc/at91_udc.c
+++ b/drivers/usb/gadget/udc/at91_udc.c
@@ -234,22 +234,10 @@ static int proc_udc_show(struct seq_file *s, void *unused)
 	return 0;
 }
 
-static int proc_udc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, proc_udc_show, PDE_DATA(inode));
-}
-
-static const struct file_operations proc_ops = {
-	.owner		= THIS_MODULE,
-	.open		= proc_udc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static void create_debug_file(struct at91_udc *udc)
 {
-	udc->pde = proc_create_data(debug_filename, 0, NULL, &proc_ops, udc);
+	udc->pde = proc_create_single_data(debug_filename, 0, NULL,
+			proc_udc_show, udc);
 }
 
 static void remove_debug_file(struct at91_udc *udc)
diff --git a/drivers/usb/gadget/udc/fsl_udc_core.c b/drivers/usb/gadget/udc/fsl_udc_core.c
index 56b517a..7d8af29 100644
--- a/drivers/usb/gadget/udc/fsl_udc_core.c
+++ b/drivers/usb/gadget/udc/fsl_udc_core.c
@@ -2207,22 +2207,8 @@ static int fsl_proc_read(struct seq_file *m, void *v)
 	return 0;
 }
 
-/*
- * seq_file wrappers for procfile show routines.
- */
-static int fsl_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, fsl_proc_read, NULL);
-}
-
-static const struct file_operations fsl_proc_fops = {
-	.open		= fsl_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
-#define create_proc_file()	proc_create(proc_filename, 0, NULL, &fsl_proc_fops)
+#define create_proc_file() \
+	proc_create_single(proc_filename, 0, NULL, fsl_proc_read)
 #define remove_proc_file()	remove_proc_entry(proc_filename, NULL)
 
 #else				/* !CONFIG_USB_GADGET_DEBUG_FILES */
diff --git a/drivers/usb/gadget/udc/goku_udc.c b/drivers/usb/gadget/udc/goku_udc.c
index 4504d0b..c372122 100644
--- a/drivers/usb/gadget/udc/goku_udc.c
+++ b/drivers/usb/gadget/udc/goku_udc.c
@@ -1241,22 +1241,6 @@ done:
 	local_irq_restore(flags);
 	return 0;
 }
-
-/*
- * seq_file wrappers for procfile show routines.
- */
-static int udc_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, udc_proc_read, PDE_DATA(file_inode(file)));
-}
-
-static const struct file_operations udc_proc_fops = {
-	.open		= udc_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 #endif	/* CONFIG_USB_GADGET_DEBUG_FILES */
 
 /*-------------------------------------------------------------------------*/
@@ -1826,7 +1810,7 @@ static int goku_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 
 
 #ifdef CONFIG_USB_GADGET_DEBUG_FILES
-	proc_create_data(proc_node_name, 0, NULL, &udc_proc_fops, dev);
+	proc_create_single_data(proc_node_name, 0, NULL, udc_proc_read, dev);
 #endif
 
 	retval = usb_add_gadget_udc_release(&pdev->dev, &dev->gadget,
diff --git a/drivers/usb/gadget/udc/omap_udc.c b/drivers/usb/gadget/udc/omap_udc.c
index dc35a54..3a16431 100644
--- a/drivers/usb/gadget/udc/omap_udc.c
+++ b/drivers/usb/gadget/udc/omap_udc.c
@@ -2432,22 +2432,9 @@ static int proc_udc_show(struct seq_file *s, void *_)
 	return 0;
 }
 
-static int proc_udc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, proc_udc_show, NULL);
-}
-
-static const struct file_operations proc_ops = {
-	.owner		= THIS_MODULE,
-	.open		= proc_udc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static void create_proc_file(void)
 {
-	proc_create(proc_filename, 0, NULL, &proc_ops);
+	proc_create_single(proc_filename, 0, NULL, proc_udc_show);
 }
 
 static void remove_proc_file(void)
diff --git a/drivers/usb/serial/usb-serial.c b/drivers/usb/serial/usb-serial.c
index 790e0cb..268ffa6 100644
--- a/drivers/usb/serial/usb-serial.c
+++ b/drivers/usb/serial/usb-serial.c
@@ -476,19 +476,6 @@ static int serial_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int serial_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, serial_proc_show, NULL);
-}
-
-static const struct file_operations serial_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= serial_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int serial_tiocmget(struct tty_struct *tty)
 {
 	struct usb_serial_port *port = tty->driver_data;
@@ -1192,7 +1179,7 @@ static const struct tty_operations serial_ops = {
 	.get_icount =		serial_get_icount,
 	.cleanup =		serial_cleanup,
 	.install =		serial_install,
-	.proc_fops =		&serial_proc_fops,
+	.proc_show =		serial_proc_show,
 };
 
 
diff --git a/drivers/vfio/virqfd.c b/drivers/vfio/virqfd.c
index 085700f..2a1be85 100644
--- a/drivers/vfio/virqfd.c
+++ b/drivers/vfio/virqfd.c
@@ -166,7 +166,7 @@ int vfio_virqfd_enable(void *opaque,
 	init_waitqueue_func_entry(&virqfd->wait, virqfd_wakeup);
 	init_poll_funcptr(&virqfd->pt, virqfd_ptable_queue_proc);
 
-	events = irqfd.file->f_op->poll(irqfd.file, &virqfd->pt);
+	events = vfs_poll(irqfd.file, &virqfd->pt);
 
 	/*
 	 * Check if there was an event already pending on the eventfd
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index f0be5f3..895eaa2 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -208,7 +208,7 @@ int vhost_poll_start(struct vhost_poll *poll, struct file *file)
 	if (poll->wqh)
 		return 0;
 
-	mask = file->f_op->poll(file, &poll->table);
+	mask = vfs_poll(file, &poll->table);
 	if (mask)
 		vhost_poll_wakeup(&poll->wait, 0, 0, poll_to_key(mask));
 	if (mask & EPOLLERR) {
diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c
index f741ba8..924d073 100644
--- a/drivers/video/fbdev/core/fbmem.c
+++ b/drivers/video/fbdev/core/fbmem.c
@@ -713,19 +713,6 @@ static const struct seq_operations proc_fb_seq_ops = {
 	.show	= fb_seq_show,
 };
 
-static int proc_fb_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &proc_fb_seq_ops);
-}
-
-static const struct file_operations fb_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= proc_fb_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
 /*
  * We hold a reference to the fb_info in file->private_data,
  * but if the current registered fb has changed, we don't
@@ -1877,7 +1864,7 @@ fbmem_init(void)
 {
 	int ret;
 
-	if (!proc_create("fb", 0, NULL, &fb_proc_fops))
+	if (!proc_create_seq("fb", 0, NULL, &proc_fb_seq_ops))
 		return -ENOMEM;
 
 	ret = register_chrdev(FB_MAJOR, "fb", &fb_fops);
diff --git a/drivers/video/fbdev/via/viafbdev.c b/drivers/video/fbdev/via/viafbdev.c
index badee04..9b45125 100644
--- a/drivers/video/fbdev/via/viafbdev.c
+++ b/drivers/video/fbdev/via/viafbdev.c
@@ -1475,19 +1475,6 @@ static int viafb_sup_odev_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int viafb_sup_odev_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, viafb_sup_odev_proc_show, NULL);
-}
-
-static const struct file_operations viafb_sup_odev_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= viafb_sup_odev_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static ssize_t odev_update(const char __user *buffer, size_t count, u32 *odev)
 {
 	char buf[64], *ptr = buf;
@@ -1616,8 +1603,8 @@ static void viafb_init_proc(struct viafb_shared *shared)
 				&viafb_vt1636_proc_fops);
 #endif /* CONFIG_FB_VIA_DIRECT_PROCFS */
 
-		proc_create("supported_output_devices", 0, viafb_entry,
-			&viafb_sup_odev_proc_fops);
+		proc_create_single("supported_output_devices", 0, viafb_entry,
+			viafb_sup_odev_proc_show);
 		iga1_entry = proc_mkdir("iga1", viafb_entry);
 		shared->iga1_proc_entry = iga1_entry;
 		proc_create("output_devices", 0, iga1_entry,
diff --git a/drivers/w1/w1_io.c b/drivers/w1/w1_io.c
index 075d120..0364d33 100644
--- a/drivers/w1/w1_io.c
+++ b/drivers/w1/w1_io.c
@@ -194,6 +194,7 @@ static u8 w1_read_bit(struct w1_master *dev)
  *  bit 0 = id_bit
  *  bit 1 = comp_bit
  *  bit 2 = dir_taken
+ *
  * If both bits 0 & 1 are set, the search should be restarted.
  *
  * Return:        bit fields - see above
diff --git a/drivers/zorro/proc.c b/drivers/zorro/proc.c
index df05a26..2e4ca4d 100644
--- a/drivers/zorro/proc.c
+++ b/drivers/zorro/proc.c
@@ -96,19 +96,6 @@ static const struct seq_operations zorro_devices_seq_ops = {
 	.show  = zorro_seq_show,
 };
 
-static int zorro_devices_proc_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &zorro_devices_seq_ops);
-}
-
-static const struct file_operations zorro_devices_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= zorro_devices_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
 static struct proc_dir_entry *proc_bus_zorro_dir;
 
 static int __init zorro_proc_attach_device(unsigned int slot)
@@ -132,8 +119,8 @@ static int __init zorro_proc_init(void)
 
 	if (MACH_IS_AMIGA && AMIGAHW_PRESENT(ZORRO)) {
 		proc_bus_zorro_dir = proc_mkdir("bus/zorro", NULL);
-		proc_create("devices", 0, proc_bus_zorro_dir,
-			    &zorro_devices_proc_fops);
+		proc_create_seq("devices", 0, proc_bus_zorro_dir,
+			    &zorro_devices_seq_ops);
 		for (slot = 0; slot < zorro_num_autocon; slot++)
 			zorro_proc_attach_device(slot);
 	}
diff --git a/drivers/zorro/zorro.c b/drivers/zorro/zorro.c
index 4772847..67fa900 100644
--- a/drivers/zorro/zorro.c
+++ b/drivers/zorro/zorro.c
@@ -101,6 +101,7 @@ static void __init mark_region(unsigned long start, unsigned long end,
 	end = end > Z2RAM_END ? Z2RAM_SIZE : end-Z2RAM_START;
 	while (start < end) {
 		u32 chunk = start>>Z2RAM_CHUNKSHIFT;
+
 		if (flag)
 			set_bit(chunk, zorro_unused_z2ram);
 		else
@@ -117,6 +118,7 @@ static struct resource __init *zorro_find_parent_resource(
 
 	for (i = 0; i < bridge->num_resources; i++) {
 		struct resource *r = &bridge->resource[i];
+
 		if (zorro_resource_start(z) >= r->start &&
 		    zorro_resource_end(z) <= r->end)
 			return r;
@@ -168,6 +170,7 @@ static int __init amiga_zorro_probe(struct platform_device *pdev)
 		if (z->id == ZORRO_PROD_GVP_EPC_BASE) {
 			/* GVP quirk */
 			unsigned long magic = zi->boardaddr + 0x8000;
+
 			z->id |= *(u16 *)ZTWO_VADDR(magic) & GVP_PRODMASK;
 		}
 		z->slotaddr = zi->slotaddr;
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 9ee5341..42e102e 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -823,28 +823,21 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 	if (IS_ERR(dfid))
 		return ERR_CAST(dfid);
 
-	name = dentry->d_name.name;
-	fid = p9_client_walk(dfid, 1, &name, 1);
-	if (IS_ERR(fid)) {
-		if (fid == ERR_PTR(-ENOENT)) {
-			d_add(dentry, NULL);
-			return NULL;
-		}
-		return ERR_CAST(fid);
-	}
 	/*
 	 * Make sure we don't use a wrong inode due to parallel
 	 * unlink. For cached mode create calls request for new
 	 * inode. But with cache disabled, lookup should do this.
 	 */
-	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
+	name = dentry->d_name.name;
+	fid = p9_client_walk(dfid, 1, &name, 1);
+	if (fid == ERR_PTR(-ENOENT))
+		inode = NULL;
+	else if (IS_ERR(fid))
+		inode = ERR_CAST(fid);
+	else if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
 		inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
 	else
 		inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
-	if (IS_ERR(inode)) {
-		p9_client_clunk(fid);
-		return ERR_CAST(inode);
-	}
 	/*
 	 * If we had a rename on the server and a parallel lookup
 	 * for the new name, then make sure we instantiate with
@@ -853,12 +846,14 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 	 * k/b.
 	 */
 	res = d_splice_alias(inode, dentry);
-	if (!res)
-		v9fs_fid_add(dentry, fid);
-	else if (!IS_ERR(res))
-		v9fs_fid_add(res, fid);
-	else
-		p9_client_clunk(fid);
+	if (!IS_ERR(fid)) {
+		if (!res)
+			v9fs_fid_add(dentry, fid);
+		else if (!IS_ERR(res))
+			v9fs_fid_add(res, fid);
+		else
+			p9_client_clunk(fid);
+	}
 	return res;
 }
 
diff --git a/fs/Kconfig b/fs/Kconfig
index bc821a8..ac4ac90 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -196,7 +196,7 @@ config HUGETLBFS
 	help
 	  hugetlbfs is a filesystem backing for HugeTLB pages, based on
 	  ramfs. For architectures that support it, say Y here and read
-	  <file:Documentation/vm/hugetlbpage.txt> for details.
+	  <file:Documentation/admin-guide/mm/hugetlbpage.rst> for details.
 
 	  If unsure, say N.
 
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index 29444c8..e18eff8 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -146,20 +146,6 @@ adfs_dir_lookup_byname(struct inode *inode, const struct qstr *name, struct obje
 
 	obj->parent_id = inode->i_ino;
 
-	/*
-	 * '.' is handled by reserved_lookup() in fs/namei.c
-	 */
-	if (name->len == 2 && name->name[0] == '.' && name->name[1] == '.') {
-		/*
-		 * Currently unable to fill in the rest of 'obj',
-		 * but this is better than nothing.  We need to
-		 * ascend one level to find it's parent.
-		 */
-		obj->name_len = 0;
-		obj->file_id  = obj->parent_id;
-		goto free_out;
-	}
-
 	read_lock(&adfs_dir_lock);
 
 	ret = ops->setpos(&dir, 0);
@@ -266,17 +252,17 @@ adfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
 
 	error = adfs_dir_lookup_byname(dir, &dentry->d_name, &obj);
 	if (error == 0) {
-		error = -EACCES;
 		/*
 		 * This only returns NULL if get_empty_inode
 		 * fails.
 		 */
 		inode = adfs_iget(dir->i_sb, &obj);
-		if (inode)
-			error = 0;
+		if (!inode)
+			inode = ERR_PTR(-EACCES);
+	} else if (error != -ENOENT) {
+		inode = ERR_PTR(error);
 	}
-	d_add(dentry, inode);
-	return ERR_PTR(error);
+	return d_splice_alias(inode, dentry);
 }
 
 /*
diff --git a/fs/affs/super.c b/fs/affs/super.c
index e602619..d1ad11a 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -241,6 +241,7 @@ parse_options(char *options, kuid_t *uid, kgid_t *gid, int *mode, int *reserved,
 			affs_set_opt(*mount_opts, SF_NO_TRUNCATE);
 			break;
 		case Opt_prefix:
+			kfree(*prefix);
 			*prefix = match_strdup(&args[0]);
 			if (!*prefix)
 				return 0;
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 839a222..3aad327 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -62,7 +62,6 @@ static const struct file_operations afs_proc_rootcell_fops = {
 	.llseek		= no_llseek,
 };
 
-static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file);
 static void *afs_proc_cell_volumes_start(struct seq_file *p, loff_t *pos);
 static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v,
 					loff_t *pos);
@@ -76,15 +75,6 @@ static const struct seq_operations afs_proc_cell_volumes_ops = {
 	.show	= afs_proc_cell_volumes_show,
 };
 
-static const struct file_operations afs_proc_cell_volumes_fops = {
-	.open		= afs_proc_cell_volumes_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
-static int afs_proc_cell_vlservers_open(struct inode *inode,
-					struct file *file);
 static void *afs_proc_cell_vlservers_start(struct seq_file *p, loff_t *pos);
 static void *afs_proc_cell_vlservers_next(struct seq_file *p, void *v,
 					  loff_t *pos);
@@ -98,14 +88,6 @@ static const struct seq_operations afs_proc_cell_vlservers_ops = {
 	.show	= afs_proc_cell_vlservers_show,
 };
 
-static const struct file_operations afs_proc_cell_vlservers_fops = {
-	.open		= afs_proc_cell_vlservers_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
-static int afs_proc_servers_open(struct inode *inode, struct file *file);
 static void *afs_proc_servers_start(struct seq_file *p, loff_t *pos);
 static void *afs_proc_servers_next(struct seq_file *p, void *v,
 					loff_t *pos);
@@ -119,13 +101,6 @@ static const struct seq_operations afs_proc_servers_ops = {
 	.show	= afs_proc_servers_show,
 };
 
-static const struct file_operations afs_proc_servers_fops = {
-	.open		= afs_proc_servers_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
 static int afs_proc_sysname_open(struct inode *inode, struct file *file);
 static int afs_proc_sysname_release(struct inode *inode, struct file *file);
 static void *afs_proc_sysname_start(struct seq_file *p, loff_t *pos);
@@ -152,7 +127,7 @@ static const struct file_operations afs_proc_sysname_fops = {
 	.write		= afs_proc_sysname_write,
 };
 
-static const struct file_operations afs_proc_stats_fops;
+static int afs_proc_stats_show(struct seq_file *m, void *v);
 
 /*
  * initialise the /proc/fs/afs/ directory
@@ -167,8 +142,8 @@ int afs_proc_init(struct afs_net *net)
 
 	if (!proc_create("cells", 0644, net->proc_afs, &afs_proc_cells_fops) ||
 	    !proc_create("rootcell", 0644, net->proc_afs, &afs_proc_rootcell_fops) ||
-	    !proc_create("servers", 0644, net->proc_afs, &afs_proc_servers_fops) ||
-	    !proc_create("stats", 0644, net->proc_afs, &afs_proc_stats_fops) ||
+	    !proc_create_seq("servers", 0644, net->proc_afs, &afs_proc_servers_ops) ||
+	    !proc_create_single("stats", 0644, net->proc_afs, afs_proc_stats_show) ||
 	    !proc_create("sysname", 0644, net->proc_afs, &afs_proc_sysname_fops))
 		goto error_tree;
 
@@ -196,16 +171,7 @@ void afs_proc_cleanup(struct afs_net *net)
  */
 static int afs_proc_cells_open(struct inode *inode, struct file *file)
 {
-	struct seq_file *m;
-	int ret;
-
-	ret = seq_open(file, &afs_proc_cells_ops);
-	if (ret < 0)
-		return ret;
-
-	m = file->private_data;
-	m->private = PDE_DATA(inode);
-	return 0;
+	return seq_open(file, &afs_proc_cells_ops);
 }
 
 /*
@@ -430,10 +396,11 @@ int afs_proc_cell_setup(struct afs_net *net, struct afs_cell *cell)
 	if (!dir)
 		goto error_dir;
 
-	if (!proc_create_data("vlservers", 0, dir,
-			      &afs_proc_cell_vlservers_fops, cell) ||
-	    !proc_create_data("volumes", 0, dir,
-			      &afs_proc_cell_volumes_fops, cell))
+	if (!proc_create_seq_data("vlservers", 0, dir,
+			&afs_proc_cell_vlservers_ops, cell))
+		goto error_tree;
+	if (!proc_create_seq_data("volumes", 0, dir, &afs_proc_cell_volumes_ops,
+			cell))
 		goto error_tree;
 
 	_leave(" = 0");
@@ -459,36 +426,13 @@ void afs_proc_cell_remove(struct afs_net *net, struct afs_cell *cell)
 }
 
 /*
- * open "/proc/fs/afs/<cell>/volumes" which provides a summary of extant cells
- */
-static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file)
-{
-	struct afs_cell *cell;
-	struct seq_file *m;
-	int ret;
-
-	cell = PDE_DATA(inode);
-	if (!cell)
-		return -ENOENT;
-
-	ret = seq_open(file, &afs_proc_cell_volumes_ops);
-	if (ret < 0)
-		return ret;
-
-	m = file->private_data;
-	m->private = cell;
-
-	return 0;
-}
-
-/*
  * set up the iterator to start reading from the cells list and return the
  * first item
  */
 static void *afs_proc_cell_volumes_start(struct seq_file *m, loff_t *_pos)
 	__acquires(cell->proc_lock)
 {
-	struct afs_cell *cell = m->private;
+	struct afs_cell *cell = PDE_DATA(file_inode(m->file));
 
 	_enter("cell=%p pos=%Ld", cell, *_pos);
 
@@ -502,7 +446,7 @@ static void *afs_proc_cell_volumes_start(struct seq_file *m, loff_t *_pos)
 static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v,
 					loff_t *_pos)
 {
-	struct afs_cell *cell = p->private;
+	struct afs_cell *cell = PDE_DATA(file_inode(p->file));
 
 	_enter("cell=%p pos=%Ld", cell, *_pos);
 	return seq_list_next(v, &cell->proc_volumes, _pos);
@@ -514,7 +458,7 @@ static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v,
 static void afs_proc_cell_volumes_stop(struct seq_file *p, void *v)
 	__releases(cell->proc_lock)
 {
-	struct afs_cell *cell = p->private;
+	struct afs_cell *cell = PDE_DATA(file_inode(p->file));
 
 	read_unlock(&cell->proc_lock);
 }
@@ -530,7 +474,7 @@ static const char afs_vol_types[3][3] = {
  */
 static int afs_proc_cell_volumes_show(struct seq_file *m, void *v)
 {
-	struct afs_cell *cell = m->private;
+	struct afs_cell *cell = PDE_DATA(file_inode(m->file));
 	struct afs_volume *vol = list_entry(v, struct afs_volume, proc_link);
 
 	/* Display header on line 1 */
@@ -547,30 +491,6 @@ static int afs_proc_cell_volumes_show(struct seq_file *m, void *v)
 }
 
 /*
- * open "/proc/fs/afs/<cell>/vlservers" which provides a list of volume
- * location server
- */
-static int afs_proc_cell_vlservers_open(struct inode *inode, struct file *file)
-{
-	struct afs_cell *cell;
-	struct seq_file *m;
-	int ret;
-
-	cell = PDE_DATA(inode);
-	if (!cell)
-		return -ENOENT;
-
-	ret = seq_open(file, &afs_proc_cell_vlservers_ops);
-	if (ret<0)
-		return ret;
-
-	m = file->private_data;
-	m->private = cell;
-
-	return 0;
-}
-
-/*
  * set up the iterator to start reading from the cells list and return the
  * first item
  */
@@ -578,7 +498,7 @@ static void *afs_proc_cell_vlservers_start(struct seq_file *m, loff_t *_pos)
 	__acquires(rcu)
 {
 	struct afs_addr_list *alist;
-	struct afs_cell *cell = m->private;
+	struct afs_cell *cell = PDE_DATA(file_inode(m->file));
 	loff_t pos = *_pos;
 
 	rcu_read_lock();
@@ -603,7 +523,7 @@ static void *afs_proc_cell_vlservers_next(struct seq_file *p, void *v,
 					  loff_t *_pos)
 {
 	struct afs_addr_list *alist;
-	struct afs_cell *cell = p->private;
+	struct afs_cell *cell = PDE_DATA(file_inode(p->file));
 	loff_t pos;
 
 	alist = rcu_dereference(cell->vl_addrs);
@@ -644,15 +564,6 @@ static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v)
 }
 
 /*
- * open "/proc/fs/afs/servers" which provides a summary of active
- * servers
- */
-static int afs_proc_servers_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &afs_proc_servers_ops);
-}
-
-/*
  * Set up the iterator to start reading from the server list and return the
  * first item.
  */
@@ -931,18 +842,3 @@ static int afs_proc_stats_show(struct seq_file *m, void *v)
 		   atomic_long_read(&net->n_store_bytes));
 	return 0;
 }
-
-/*
- * Open "/proc/fs/afs/stats" to allow reading of the stat counters.
- */
-static int afs_proc_stats_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, afs_proc_stats_show, NULL);
-}
-
-static const struct file_operations afs_proc_stats_fops = {
-	.open		= afs_proc_stats_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release        = single_release,
-};
diff --git a/fs/aio.c b/fs/aio.c
index 8061d97..b850e92 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -5,6 +5,7 @@
  *	Implements an efficient asynchronous io interface.
  *
  *	Copyright 2000, 2001, 2002 Red Hat, Inc.  All Rights Reserved.
+ *	Copyright 2018 Christoph Hellwig.
  *
  *	See ../COPYING for licensing terms.
  */
@@ -46,6 +47,8 @@
 
 #include "internal.h"
 
+#define KIOCB_KEY		0
+
 #define AIO_RING_MAGIC			0xa10a10a1
 #define AIO_RING_COMPAT_FEATURES	1
 #define AIO_RING_INCOMPAT_FEATURES	0
@@ -156,21 +159,29 @@ struct kioctx {
 	unsigned		id;
 };
 
-/*
- * We use ki_cancel == KIOCB_CANCELLED to indicate that a kiocb has been either
- * cancelled or completed (this makes a certain amount of sense because
- * successful cancellation - io_cancel() - does deliver the completion to
- * userspace).
- *
- * And since most things don't implement kiocb cancellation and we'd really like
- * kiocb completion to be lockless when possible, we use ki_cancel to
- * synchronize cancellation and completion - we only set it to KIOCB_CANCELLED
- * with xchg() or cmpxchg(), see batch_complete_aio() and kiocb_cancel().
- */
-#define KIOCB_CANCELLED		((void *) (~0ULL))
+struct fsync_iocb {
+	struct work_struct	work;
+	struct file		*file;
+	bool			datasync;
+};
+
+struct poll_iocb {
+	struct file		*file;
+	__poll_t		events;
+	struct wait_queue_head	*head;
+
+	union {
+		struct wait_queue_entry	wait;
+		struct work_struct	work;
+	};
+};
 
 struct aio_kiocb {
-	struct kiocb		common;
+	union {
+		struct kiocb		rw;
+		struct fsync_iocb	fsync;
+		struct poll_iocb	poll;
+	};
 
 	struct kioctx		*ki_ctx;
 	kiocb_cancel_fn		*ki_cancel;
@@ -264,9 +275,6 @@ static int __init aio_setup(void)
 
 	kiocb_cachep = KMEM_CACHE(aio_kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
 	kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
-
-	pr_debug("sizeof(struct page) = %zu\n", sizeof(struct page));
-
 	return 0;
 }
 __initcall(aio_setup);
@@ -552,42 +560,20 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
 
 void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
 {
-	struct aio_kiocb *req = container_of(iocb, struct aio_kiocb, common);
+	struct aio_kiocb *req = container_of(iocb, struct aio_kiocb, rw);
 	struct kioctx *ctx = req->ki_ctx;
 	unsigned long flags;
 
-	spin_lock_irqsave(&ctx->ctx_lock, flags);
-
-	if (!req->ki_list.next)
-		list_add(&req->ki_list, &ctx->active_reqs);
+	if (WARN_ON_ONCE(!list_empty(&req->ki_list)))
+		return;
 
+	spin_lock_irqsave(&ctx->ctx_lock, flags);
+	list_add_tail(&req->ki_list, &ctx->active_reqs);
 	req->ki_cancel = cancel;
-
 	spin_unlock_irqrestore(&ctx->ctx_lock, flags);
 }
 EXPORT_SYMBOL(kiocb_set_cancel_fn);
 
-static int kiocb_cancel(struct aio_kiocb *kiocb)
-{
-	kiocb_cancel_fn *old, *cancel;
-
-	/*
-	 * Don't want to set kiocb->ki_cancel = KIOCB_CANCELLED unless it
-	 * actually has a cancel function, hence the cmpxchg()
-	 */
-
-	cancel = READ_ONCE(kiocb->ki_cancel);
-	do {
-		if (!cancel || cancel == KIOCB_CANCELLED)
-			return -EINVAL;
-
-		old = cancel;
-		cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED);
-	} while (cancel != old);
-
-	return cancel(&kiocb->common);
-}
-
 /*
  * free_ioctx() should be RCU delayed to synchronize against the RCU
  * protected lookup_ioctx() and also needs process context to call
@@ -634,9 +620,8 @@ static void free_ioctx_users(struct percpu_ref *ref)
 	while (!list_empty(&ctx->active_reqs)) {
 		req = list_first_entry(&ctx->active_reqs,
 				       struct aio_kiocb, ki_list);
-
+		req->ki_cancel(&req->rw);
 		list_del_init(&req->ki_list);
-		kiocb_cancel(req);
 	}
 
 	spin_unlock_irq(&ctx->ctx_lock);
@@ -1042,7 +1027,7 @@ static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx)
 		goto out_put;
 
 	percpu_ref_get(&ctx->reqs);
-
+	INIT_LIST_HEAD(&req->ki_list);
 	req->ki_ctx = ctx;
 	return req;
 out_put:
@@ -1050,15 +1035,6 @@ out_put:
 	return NULL;
 }
 
-static void kiocb_free(struct aio_kiocb *req)
-{
-	if (req->common.ki_filp)
-		fput(req->common.ki_filp);
-	if (req->ki_eventfd != NULL)
-		eventfd_ctx_put(req->ki_eventfd);
-	kmem_cache_free(kiocb_cachep, req);
-}
-
 static struct kioctx *lookup_ioctx(unsigned long ctx_id)
 {
 	struct aio_ring __user *ring  = (void __user *)ctx_id;
@@ -1089,44 +1065,14 @@ out:
 /* aio_complete
  *	Called when the io request on the given iocb is complete.
  */
-static void aio_complete(struct kiocb *kiocb, long res, long res2)
+static void aio_complete(struct aio_kiocb *iocb, long res, long res2)
 {
-	struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, common);
 	struct kioctx	*ctx = iocb->ki_ctx;
 	struct aio_ring	*ring;
 	struct io_event	*ev_page, *event;
 	unsigned tail, pos, head;
 	unsigned long	flags;
 
-	if (kiocb->ki_flags & IOCB_WRITE) {
-		struct file *file = kiocb->ki_filp;
-
-		/*
-		 * Tell lockdep we inherited freeze protection from submission
-		 * thread.
-		 */
-		if (S_ISREG(file_inode(file)->i_mode))
-			__sb_writers_acquired(file_inode(file)->i_sb, SB_FREEZE_WRITE);
-		file_end_write(file);
-	}
-
-	/*
-	 * Special case handling for sync iocbs:
-	 *  - events go directly into the iocb for fast handling
-	 *  - the sync task with the iocb in its stack holds the single iocb
-	 *    ref, no other paths have a way to get another ref
-	 *  - the sync task helpfully left a reference to itself in the iocb
-	 */
-	BUG_ON(is_sync_kiocb(kiocb));
-
-	if (iocb->ki_list.next) {
-		unsigned long flags;
-
-		spin_lock_irqsave(&ctx->ctx_lock, flags);
-		list_del(&iocb->ki_list);
-		spin_unlock_irqrestore(&ctx->ctx_lock, flags);
-	}
-
 	/*
 	 * Add a completion event to the ring buffer. Must be done holding
 	 * ctx->completion_lock to prevent other code from messing with the tail
@@ -1180,11 +1126,12 @@ static void aio_complete(struct kiocb *kiocb, long res, long res2)
 	 * eventfd. The eventfd_signal() function is safe to be called
 	 * from IRQ context.
 	 */
-	if (iocb->ki_eventfd != NULL)
+	if (iocb->ki_eventfd) {
 		eventfd_signal(iocb->ki_eventfd, 1);
+		eventfd_ctx_put(iocb->ki_eventfd);
+	}
 
-	/* everything turned out well, dispose of the aiocb. */
-	kiocb_free(iocb);
+	kmem_cache_free(kiocb_cachep, iocb);
 
 	/*
 	 * We have to order our ring_info tail store above and test
@@ -1250,14 +1197,13 @@ static long aio_read_events_ring(struct kioctx *ctx,
 		if (head == tail)
 			break;
 
-		avail = min(avail, nr - ret);
-		avail = min_t(long, avail, AIO_EVENTS_PER_PAGE -
-			    ((head + AIO_EVENTS_OFFSET) % AIO_EVENTS_PER_PAGE));
-
 		pos = head + AIO_EVENTS_OFFSET;
 		page = ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE];
 		pos %= AIO_EVENTS_PER_PAGE;
 
+		avail = min(avail, nr - ret);
+		avail = min_t(long, avail, AIO_EVENTS_PER_PAGE - pos);
+
 		ev = kmap(page);
 		copy_ret = copy_to_user(event + ret, ev + pos,
 					sizeof(*ev) * avail);
@@ -1328,10 +1274,6 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr,
 		wait_event_interruptible_hrtimeout(ctx->wait,
 				aio_read_events(ctx, min_nr, nr, event, &ret),
 				until);
-
-	if (!ret && signal_pending(current))
-		ret = -EINTR;
-
 	return ret;
 }
 
@@ -1447,6 +1389,58 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
 	return -EINVAL;
 }
 
+static void aio_remove_iocb(struct aio_kiocb *iocb)
+{
+	struct kioctx *ctx = iocb->ki_ctx;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ctx->ctx_lock, flags);
+	list_del(&iocb->ki_list);
+	spin_unlock_irqrestore(&ctx->ctx_lock, flags);
+}
+
+static void aio_complete_rw(struct kiocb *kiocb, long res, long res2)
+{
+	struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, rw);
+
+	if (!list_empty_careful(&iocb->ki_list))
+		aio_remove_iocb(iocb);
+
+	if (kiocb->ki_flags & IOCB_WRITE) {
+		struct inode *inode = file_inode(kiocb->ki_filp);
+
+		/*
+		 * Tell lockdep we inherited freeze protection from submission
+		 * thread.
+		 */
+		if (S_ISREG(inode->i_mode))
+			__sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
+		file_end_write(kiocb->ki_filp);
+	}
+
+	fput(kiocb->ki_filp);
+	aio_complete(iocb, res, res2);
+}
+
+static int aio_prep_rw(struct kiocb *req, struct iocb *iocb)
+{
+	int ret;
+
+	req->ki_filp = fget(iocb->aio_fildes);
+	if (unlikely(!req->ki_filp))
+		return -EBADF;
+	req->ki_complete = aio_complete_rw;
+	req->ki_pos = iocb->aio_offset;
+	req->ki_flags = iocb_flags(req->ki_filp);
+	if (iocb->aio_flags & IOCB_FLAG_RESFD)
+		req->ki_flags |= IOCB_EVENTFD;
+	req->ki_hint = file_write_hint(req->ki_filp);
+	ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags);
+	if (unlikely(ret))
+		fput(req->ki_filp);
+	return ret;
+}
+
 static int aio_setup_rw(int rw, struct iocb *iocb, struct iovec **iovec,
 		bool vectored, bool compat, struct iov_iter *iter)
 {
@@ -1466,11 +1460,11 @@ static int aio_setup_rw(int rw, struct iocb *iocb, struct iovec **iovec,
 	return import_iovec(rw, buf, len, UIO_FASTIOV, iovec, iter);
 }
 
-static inline ssize_t aio_ret(struct kiocb *req, ssize_t ret)
+static inline void aio_rw_done(struct kiocb *req, ssize_t ret)
 {
 	switch (ret) {
 	case -EIOCBQUEUED:
-		return ret;
+		break;
 	case -ERESTARTSYS:
 	case -ERESTARTNOINTR:
 	case -ERESTARTNOHAND:
@@ -1482,85 +1476,270 @@ static inline ssize_t aio_ret(struct kiocb *req, ssize_t ret)
 		ret = -EINTR;
 		/*FALLTHRU*/
 	default:
-		aio_complete(req, ret, 0);
-		return 0;
+		aio_complete_rw(req, ret, 0);
 	}
 }
 
 static ssize_t aio_read(struct kiocb *req, struct iocb *iocb, bool vectored,
 		bool compat)
 {
-	struct file *file = req->ki_filp;
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 	struct iov_iter iter;
+	struct file *file;
 	ssize_t ret;
 
+	ret = aio_prep_rw(req, iocb);
+	if (ret)
+		return ret;
+	file = req->ki_filp;
+
+	ret = -EBADF;
 	if (unlikely(!(file->f_mode & FMODE_READ)))
-		return -EBADF;
+		goto out_fput;
+	ret = -EINVAL;
 	if (unlikely(!file->f_op->read_iter))
-		return -EINVAL;
+		goto out_fput;
 
 	ret = aio_setup_rw(READ, iocb, &iovec, vectored, compat, &iter);
 	if (ret)
-		return ret;
+		goto out_fput;
 	ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter));
 	if (!ret)
-		ret = aio_ret(req, call_read_iter(file, req, &iter));
+		aio_rw_done(req, call_read_iter(file, req, &iter));
 	kfree(iovec);
+out_fput:
+	if (unlikely(ret))
+		fput(file);
 	return ret;
 }
 
 static ssize_t aio_write(struct kiocb *req, struct iocb *iocb, bool vectored,
 		bool compat)
 {
-	struct file *file = req->ki_filp;
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 	struct iov_iter iter;
+	struct file *file;
 	ssize_t ret;
 
+	ret = aio_prep_rw(req, iocb);
+	if (ret)
+		return ret;
+	file = req->ki_filp;
+
+	ret = -EBADF;
 	if (unlikely(!(file->f_mode & FMODE_WRITE)))
-		return -EBADF;
+		goto out_fput;
+	ret = -EINVAL;
 	if (unlikely(!file->f_op->write_iter))
-		return -EINVAL;
+		goto out_fput;
 
 	ret = aio_setup_rw(WRITE, iocb, &iovec, vectored, compat, &iter);
 	if (ret)
-		return ret;
+		goto out_fput;
 	ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter));
 	if (!ret) {
-		req->ki_flags |= IOCB_WRITE;
-		file_start_write(file);
-		ret = aio_ret(req, call_write_iter(file, req, &iter));
 		/*
-		 * We release freeze protection in aio_complete().  Fool lockdep
-		 * by telling it the lock got released so that it doesn't
-		 * complain about held lock when we return to userspace.
+		 * Open-code file_start_write here to grab freeze protection,
+		 * which will be released by another thread in
+		 * aio_complete_rw().  Fool lockdep by telling it the lock got
+		 * released so that it doesn't complain about the held lock when
+		 * we return to userspace.
 		 */
-		if (S_ISREG(file_inode(file)->i_mode))
+		if (S_ISREG(file_inode(file)->i_mode)) {
+			__sb_start_write(file_inode(file)->i_sb, SB_FREEZE_WRITE, true);
 			__sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE);
+		}
+		req->ki_flags |= IOCB_WRITE;
+		aio_rw_done(req, call_write_iter(file, req, &iter));
 	}
 	kfree(iovec);
+out_fput:
+	if (unlikely(ret))
+		fput(file);
 	return ret;
 }
 
+static void aio_fsync_work(struct work_struct *work)
+{
+	struct fsync_iocb *req = container_of(work, struct fsync_iocb, work);
+	int ret;
+
+	ret = vfs_fsync(req->file, req->datasync);
+	fput(req->file);
+	aio_complete(container_of(req, struct aio_kiocb, fsync), ret, 0);
+}
+
+static int aio_fsync(struct fsync_iocb *req, struct iocb *iocb, bool datasync)
+{
+	if (unlikely(iocb->aio_buf || iocb->aio_offset || iocb->aio_nbytes ||
+			iocb->aio_rw_flags))
+		return -EINVAL;
+	req->file = fget(iocb->aio_fildes);
+	if (unlikely(!req->file))
+		return -EBADF;
+	if (unlikely(!req->file->f_op->fsync)) {
+		fput(req->file);
+		return -EINVAL;
+	}
+
+	req->datasync = datasync;
+	INIT_WORK(&req->work, aio_fsync_work);
+	schedule_work(&req->work);
+	return 0;
+}
+
+/* need to use list_del_init so we can check if item was present */
+static inline bool __aio_poll_remove(struct poll_iocb *req)
+{
+	if (list_empty(&req->wait.entry))
+		return false;
+	list_del_init(&req->wait.entry);
+	return true;
+}
+
+static inline void __aio_poll_complete(struct aio_kiocb *iocb, __poll_t mask)
+{
+	fput(iocb->poll.file);
+	aio_complete(iocb, mangle_poll(mask), 0);
+}
+
+static void aio_poll_work(struct work_struct *work)
+{
+	struct aio_kiocb *iocb = container_of(work, struct aio_kiocb, poll.work);
+
+	if (!list_empty_careful(&iocb->ki_list))
+		aio_remove_iocb(iocb);
+	__aio_poll_complete(iocb, iocb->poll.events);
+}
+
+static int aio_poll_cancel(struct kiocb *iocb)
+{
+	struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw);
+	struct poll_iocb *req = &aiocb->poll;
+	struct wait_queue_head *head = req->head;
+	bool found = false;
+
+	spin_lock(&head->lock);
+	found = __aio_poll_remove(req);
+	spin_unlock(&head->lock);
+
+	if (found) {
+		req->events = 0;
+		INIT_WORK(&req->work, aio_poll_work);
+		schedule_work(&req->work);
+	}
+	return 0;
+}
+
+static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
+		void *key)
+{
+	struct poll_iocb *req = container_of(wait, struct poll_iocb, wait);
+	struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll);
+	struct file *file = req->file;
+	__poll_t mask = key_to_poll(key);
+
+	assert_spin_locked(&req->head->lock);
+
+	/* for instances that support it check for an event match first: */
+	if (mask && !(mask & req->events))
+		return 0;
+
+	mask = file->f_op->poll_mask(file, req->events);
+	if (!mask)
+		return 0;
+
+	__aio_poll_remove(req);
+
+	/*
+	 * Try completing without a context switch if we can acquire ctx_lock
+	 * without spinning.  Otherwise we need to defer to a workqueue to
+	 * avoid a deadlock due to the lock order.
+	 */
+	if (spin_trylock(&iocb->ki_ctx->ctx_lock)) {
+		list_del_init(&iocb->ki_list);
+		spin_unlock(&iocb->ki_ctx->ctx_lock);
+
+		__aio_poll_complete(iocb, mask);
+	} else {
+		req->events = mask;
+		INIT_WORK(&req->work, aio_poll_work);
+		schedule_work(&req->work);
+	}
+
+	return 1;
+}
+
+static ssize_t aio_poll(struct aio_kiocb *aiocb, struct iocb *iocb)
+{
+	struct kioctx *ctx = aiocb->ki_ctx;
+	struct poll_iocb *req = &aiocb->poll;
+	__poll_t mask;
+
+	/* reject any unknown events outside the normal event mask. */
+	if ((u16)iocb->aio_buf != iocb->aio_buf)
+		return -EINVAL;
+	/* reject fields that are not defined for poll */
+	if (iocb->aio_offset || iocb->aio_nbytes || iocb->aio_rw_flags)
+		return -EINVAL;
+
+	req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP;
+	req->file = fget(iocb->aio_fildes);
+	if (unlikely(!req->file))
+		return -EBADF;
+	if (!file_has_poll_mask(req->file))
+		goto out_fail;
+
+	req->head = req->file->f_op->get_poll_head(req->file, req->events);
+	if (!req->head)
+		goto out_fail;
+	if (IS_ERR(req->head)) {
+		mask = EPOLLERR;
+		goto done;
+	}
+
+	init_waitqueue_func_entry(&req->wait, aio_poll_wake);
+	aiocb->ki_cancel = aio_poll_cancel;
+
+	spin_lock_irq(&ctx->ctx_lock);
+	spin_lock(&req->head->lock);
+	mask = req->file->f_op->poll_mask(req->file, req->events);
+	if (!mask) {
+		__add_wait_queue(req->head, &req->wait);
+		list_add_tail(&aiocb->ki_list, &ctx->active_reqs);
+	}
+	spin_unlock(&req->head->lock);
+	spin_unlock_irq(&ctx->ctx_lock);
+done:
+	if (mask)
+		__aio_poll_complete(aiocb, mask);
+	return 0;
+out_fail:
+	fput(req->file);
+	return -EINVAL; /* same as no support for IOCB_CMD_POLL */
+}
+
 static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
-			 struct iocb *iocb, bool compat)
+			 bool compat)
 {
 	struct aio_kiocb *req;
-	struct file *file;
+	struct iocb iocb;
 	ssize_t ret;
 
+	if (unlikely(copy_from_user(&iocb, user_iocb, sizeof(iocb))))
+		return -EFAULT;
+
 	/* enforce forwards compatibility on users */
-	if (unlikely(iocb->aio_reserved2)) {
+	if (unlikely(iocb.aio_reserved2)) {
 		pr_debug("EINVAL: reserve field set\n");
 		return -EINVAL;
 	}
 
 	/* prevent overflows */
 	if (unlikely(
-	    (iocb->aio_buf != (unsigned long)iocb->aio_buf) ||
-	    (iocb->aio_nbytes != (size_t)iocb->aio_nbytes) ||
-	    ((ssize_t)iocb->aio_nbytes < 0)
+	    (iocb.aio_buf != (unsigned long)iocb.aio_buf) ||
+	    (iocb.aio_nbytes != (size_t)iocb.aio_nbytes) ||
+	    ((ssize_t)iocb.aio_nbytes < 0)
 	   )) {
 		pr_debug("EINVAL: overflow check\n");
 		return -EINVAL;
@@ -1570,37 +1749,19 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 	if (unlikely(!req))
 		return -EAGAIN;
 
-	req->common.ki_filp = file = fget(iocb->aio_fildes);
-	if (unlikely(!req->common.ki_filp)) {
-		ret = -EBADF;
-		goto out_put_req;
-	}
-	req->common.ki_pos = iocb->aio_offset;
-	req->common.ki_complete = aio_complete;
-	req->common.ki_flags = iocb_flags(req->common.ki_filp);
-	req->common.ki_hint = file_write_hint(file);
-
-	if (iocb->aio_flags & IOCB_FLAG_RESFD) {
+	if (iocb.aio_flags & IOCB_FLAG_RESFD) {
 		/*
 		 * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an
 		 * instance of the file* now. The file descriptor must be
 		 * an eventfd() fd, and will be signaled for each completed
 		 * event using the eventfd_signal() function.
 		 */
-		req->ki_eventfd = eventfd_ctx_fdget((int) iocb->aio_resfd);
+		req->ki_eventfd = eventfd_ctx_fdget((int) iocb.aio_resfd);
 		if (IS_ERR(req->ki_eventfd)) {
 			ret = PTR_ERR(req->ki_eventfd);
 			req->ki_eventfd = NULL;
 			goto out_put_req;
 		}
-
-		req->common.ki_flags |= IOCB_EVENTFD;
-	}
-
-	ret = kiocb_set_rw_flags(&req->common, iocb->aio_rw_flags);
-	if (unlikely(ret)) {
-		pr_debug("EINVAL: aio_rw_flags\n");
-		goto out_put_req;
 	}
 
 	ret = put_user(KIOCB_KEY, &user_iocb->aio_key);
@@ -1610,41 +1771,67 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 	}
 
 	req->ki_user_iocb = user_iocb;
-	req->ki_user_data = iocb->aio_data;
+	req->ki_user_data = iocb.aio_data;
 
-	get_file(file);
-	switch (iocb->aio_lio_opcode) {
+	switch (iocb.aio_lio_opcode) {
 	case IOCB_CMD_PREAD:
-		ret = aio_read(&req->common, iocb, false, compat);
+		ret = aio_read(&req->rw, &iocb, false, compat);
 		break;
 	case IOCB_CMD_PWRITE:
-		ret = aio_write(&req->common, iocb, false, compat);
+		ret = aio_write(&req->rw, &iocb, false, compat);
 		break;
 	case IOCB_CMD_PREADV:
-		ret = aio_read(&req->common, iocb, true, compat);
+		ret = aio_read(&req->rw, &iocb, true, compat);
 		break;
 	case IOCB_CMD_PWRITEV:
-		ret = aio_write(&req->common, iocb, true, compat);
+		ret = aio_write(&req->rw, &iocb, true, compat);
+		break;
+	case IOCB_CMD_FSYNC:
+		ret = aio_fsync(&req->fsync, &iocb, false);
+		break;
+	case IOCB_CMD_FDSYNC:
+		ret = aio_fsync(&req->fsync, &iocb, true);
+		break;
+	case IOCB_CMD_POLL:
+		ret = aio_poll(req, &iocb);
 		break;
 	default:
-		pr_debug("invalid aio operation %d\n", iocb->aio_lio_opcode);
+		pr_debug("invalid aio operation %d\n", iocb.aio_lio_opcode);
 		ret = -EINVAL;
 		break;
 	}
-	fput(file);
 
-	if (ret && ret != -EIOCBQUEUED)
+	/*
+	 * If ret is 0, we'd either done aio_complete() ourselves or have
+	 * arranged for that to be done asynchronously.  Anything non-zero
+	 * means that we need to destroy req ourselves.
+	 */
+	if (ret)
 		goto out_put_req;
 	return 0;
 out_put_req:
 	put_reqs_available(ctx, 1);
 	percpu_ref_put(&ctx->reqs);
-	kiocb_free(req);
+	if (req->ki_eventfd)
+		eventfd_ctx_put(req->ki_eventfd);
+	kmem_cache_free(kiocb_cachep, req);
 	return ret;
 }
 
-static long do_io_submit(aio_context_t ctx_id, long nr,
-			  struct iocb __user *__user *iocbpp, bool compat)
+/* sys_io_submit:
+ *	Queue the nr iocbs pointed to by iocbpp for processing.  Returns
+ *	the number of iocbs queued.  May return -EINVAL if the aio_context
+ *	specified by ctx_id is invalid, if nr is < 0, if the iocb at
+ *	*iocbpp[0] is not properly initialized, if the operation specified
+ *	is invalid for the file descriptor in the iocb.  May fail with
+ *	-EFAULT if any of the data structures point to invalid data.  May
+ *	fail with -EBADF if the file descriptor specified in the first
+ *	iocb is invalid.  May fail with -EAGAIN if insufficient resources
+ *	are available to queue any iocbs.  Will return 0 if nr is 0.  Will
+ *	fail with -ENOSYS if not implemented.
+ */
+SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
+		struct iocb __user * __user *, iocbpp)
 {
 	struct kioctx *ctx;
 	long ret = 0;
@@ -1654,39 +1841,25 @@ static long do_io_submit(aio_context_t ctx_id, long nr,
 	if (unlikely(nr < 0))
 		return -EINVAL;
 
-	if (unlikely(nr > LONG_MAX/sizeof(*iocbpp)))
-		nr = LONG_MAX/sizeof(*iocbpp);
-
-	if (unlikely(!access_ok(VERIFY_READ, iocbpp, (nr*sizeof(*iocbpp)))))
-		return -EFAULT;
-
 	ctx = lookup_ioctx(ctx_id);
 	if (unlikely(!ctx)) {
 		pr_debug("EINVAL: invalid context id\n");
 		return -EINVAL;
 	}
 
-	blk_start_plug(&plug);
+	if (nr > ctx->nr_events)
+		nr = ctx->nr_events;
 
-	/*
-	 * AKPM: should this return a partial result if some of the IOs were
-	 * successfully submitted?
-	 */
-	for (i=0; i<nr; i++) {
+	blk_start_plug(&plug);
+	for (i = 0; i < nr; i++) {
 		struct iocb __user *user_iocb;
-		struct iocb tmp;
 
-		if (unlikely(__get_user(user_iocb, iocbpp + i))) {
+		if (unlikely(get_user(user_iocb, iocbpp + i))) {
 			ret = -EFAULT;
 			break;
 		}
 
-		if (unlikely(copy_from_user(&tmp, user_iocb, sizeof(tmp)))) {
-			ret = -EFAULT;
-			break;
-		}
-
-		ret = io_submit_one(ctx, user_iocb, &tmp, compat);
+		ret = io_submit_one(ctx, user_iocb, false);
 		if (ret)
 			break;
 	}
@@ -1696,59 +1869,44 @@ static long do_io_submit(aio_context_t ctx_id, long nr,
 	return i ? i : ret;
 }
 
-/* sys_io_submit:
- *	Queue the nr iocbs pointed to by iocbpp for processing.  Returns
- *	the number of iocbs queued.  May return -EINVAL if the aio_context
- *	specified by ctx_id is invalid, if nr is < 0, if the iocb at
- *	*iocbpp[0] is not properly initialized, if the operation specified
- *	is invalid for the file descriptor in the iocb.  May fail with
- *	-EFAULT if any of the data structures point to invalid data.  May
- *	fail with -EBADF if the file descriptor specified in the first
- *	iocb is invalid.  May fail with -EAGAIN if insufficient resources
- *	are available to queue any iocbs.  Will return 0 if nr is 0.  Will
- *	fail with -ENOSYS if not implemented.
- */
-SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
-		struct iocb __user * __user *, iocbpp)
-{
-	return do_io_submit(ctx_id, nr, iocbpp, 0);
-}
-
 #ifdef CONFIG_COMPAT
-static inline long
-copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64)
+COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id,
+		       int, nr, compat_uptr_t __user *, iocbpp)
 {
-	compat_uptr_t uptr;
-	int i;
+	struct kioctx *ctx;
+	long ret = 0;
+	int i = 0;
+	struct blk_plug plug;
 
-	for (i = 0; i < nr; ++i) {
-		if (get_user(uptr, ptr32 + i))
-			return -EFAULT;
-		if (put_user(compat_ptr(uptr), ptr64 + i))
-			return -EFAULT;
+	if (unlikely(nr < 0))
+		return -EINVAL;
+
+	ctx = lookup_ioctx(ctx_id);
+	if (unlikely(!ctx)) {
+		pr_debug("EINVAL: invalid context id\n");
+		return -EINVAL;
 	}
-	return 0;
-}
 
-#define MAX_AIO_SUBMITS 	(PAGE_SIZE/sizeof(struct iocb *))
+	if (nr > ctx->nr_events)
+		nr = ctx->nr_events;
 
-COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id,
-		       int, nr, u32 __user *, iocb)
-{
-	struct iocb __user * __user *iocb64;
-	long ret;
+	blk_start_plug(&plug);
+	for (i = 0; i < nr; i++) {
+		compat_uptr_t user_iocb;
 
-	if (unlikely(nr < 0))
-		return -EINVAL;
+		if (unlikely(get_user(user_iocb, iocbpp + i))) {
+			ret = -EFAULT;
+			break;
+		}
 
-	if (nr > MAX_AIO_SUBMITS)
-		nr = MAX_AIO_SUBMITS;
+		ret = io_submit_one(ctx, compat_ptr(user_iocb), true);
+		if (ret)
+			break;
+	}
+	blk_finish_plug(&plug);
 
-	iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64));
-	ret = copy_iocb(nr, iocb, iocb64);
-	if (!ret)
-		ret = do_io_submit(ctx_id, nr, iocb64, 1);
-	return ret;
+	percpu_ref_put(&ctx->users);
+	return i ? i : ret;
 }
 #endif
 
@@ -1756,15 +1914,12 @@ COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id,
  *	Finds a given iocb for cancellation.
  */
 static struct aio_kiocb *
-lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, u32 key)
+lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb)
 {
 	struct aio_kiocb *kiocb;
 
 	assert_spin_locked(&ctx->ctx_lock);
 
-	if (key != KIOCB_KEY)
-		return NULL;
-
 	/* TODO: use a hash or array, this sucks. */
 	list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) {
 		if (kiocb->ki_user_iocb == iocb)
@@ -1788,25 +1943,24 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
 {
 	struct kioctx *ctx;
 	struct aio_kiocb *kiocb;
+	int ret = -EINVAL;
 	u32 key;
-	int ret;
 
-	ret = get_user(key, &iocb->aio_key);
-	if (unlikely(ret))
+	if (unlikely(get_user(key, &iocb->aio_key)))
 		return -EFAULT;
+	if (unlikely(key != KIOCB_KEY))
+		return -EINVAL;
 
 	ctx = lookup_ioctx(ctx_id);
 	if (unlikely(!ctx))
 		return -EINVAL;
 
 	spin_lock_irq(&ctx->ctx_lock);
-
-	kiocb = lookup_kiocb(ctx, iocb, key);
-	if (kiocb)
-		ret = kiocb_cancel(kiocb);
-	else
-		ret = -EINVAL;
-
+	kiocb = lookup_kiocb(ctx, iocb);
+	if (kiocb) {
+		ret = kiocb->ki_cancel(&kiocb->rw);
+		list_del_init(&kiocb->ki_list);
+	}
 	spin_unlock_irq(&ctx->ctx_lock);
 
 	if (!ret) {
@@ -1861,13 +2015,60 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
 		struct timespec __user *, timeout)
 {
 	struct timespec64	ts;
+	int			ret;
+
+	if (timeout && unlikely(get_timespec64(&ts, timeout)))
+		return -EFAULT;
+
+	ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
+	if (!ret && signal_pending(current))
+		ret = -EINTR;
+	return ret;
+}
+
+SYSCALL_DEFINE6(io_pgetevents,
+		aio_context_t, ctx_id,
+		long, min_nr,
+		long, nr,
+		struct io_event __user *, events,
+		struct timespec __user *, timeout,
+		const struct __aio_sigset __user *, usig)
+{
+	struct __aio_sigset	ksig = { NULL, };
+	sigset_t		ksigmask, sigsaved;
+	struct timespec64	ts;
+	int ret;
+
+	if (timeout && unlikely(get_timespec64(&ts, timeout)))
+		return -EFAULT;
+
+	if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
+		return -EFAULT;
 
-	if (timeout) {
-		if (unlikely(get_timespec64(&ts, timeout)))
+	if (ksig.sigmask) {
+		if (ksig.sigsetsize != sizeof(sigset_t))
+			return -EINVAL;
+		if (copy_from_user(&ksigmask, ksig.sigmask, sizeof(ksigmask)))
 			return -EFAULT;
+		sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
+		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+	}
+
+	ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
+	if (signal_pending(current)) {
+		if (ksig.sigmask) {
+			current->saved_sigmask = sigsaved;
+			set_restore_sigmask();
+		}
+
+		if (!ret)
+			ret = -ERESTARTNOHAND;
+	} else {
+		if (ksig.sigmask)
+			sigprocmask(SIG_SETMASK, &sigsaved, NULL);
 	}
 
-	return do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
+	return ret;
 }
 
 #ifdef CONFIG_COMPAT
@@ -1878,13 +2079,64 @@ COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id,
 		       struct compat_timespec __user *, timeout)
 {
 	struct timespec64 t;
+	int ret;
+
+	if (timeout && compat_get_timespec64(&t, timeout))
+		return -EFAULT;
+
+	ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
+	if (!ret && signal_pending(current))
+		ret = -EINTR;
+	return ret;
+}
+
+
+struct __compat_aio_sigset {
+	compat_sigset_t __user	*sigmask;
+	compat_size_t		sigsetsize;
+};
+
+COMPAT_SYSCALL_DEFINE6(io_pgetevents,
+		compat_aio_context_t, ctx_id,
+		compat_long_t, min_nr,
+		compat_long_t, nr,
+		struct io_event __user *, events,
+		struct compat_timespec __user *, timeout,
+		const struct __compat_aio_sigset __user *, usig)
+{
+	struct __compat_aio_sigset ksig = { NULL, };
+	sigset_t ksigmask, sigsaved;
+	struct timespec64 t;
+	int ret;
+
+	if (timeout && compat_get_timespec64(&t, timeout))
+		return -EFAULT;
+
+	if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
+		return -EFAULT;
 
-	if (timeout) {
-		if (compat_get_timespec64(&t, timeout))
+	if (ksig.sigmask) {
+		if (ksig.sigsetsize != sizeof(compat_sigset_t))
+			return -EINVAL;
+		if (get_compat_sigset(&ksigmask, ksig.sigmask))
 			return -EFAULT;
+		sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
+		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+	}
 
+	ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
+	if (signal_pending(current)) {
+		if (ksig.sigmask) {
+			current->saved_sigmask = sigsaved;
+			set_restore_sigmask();
+		}
+		if (!ret)
+			ret = -ERESTARTNOHAND;
+	} else {
+		if (ksig.sigmask)
+			sigprocmask(SIG_SETMASK, &sigsaved, NULL);
 	}
 
-	return do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
+	return ret;
 }
 #endif
diff --git a/fs/attr.c b/fs/attr.c
index 12ffdb6..d0b4d34 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -18,6 +18,32 @@
 #include <linux/evm.h>
 #include <linux/ima.h>
 
+static bool chown_ok(const struct inode *inode, kuid_t uid)
+{
+	if (uid_eq(current_fsuid(), inode->i_uid) &&
+	    uid_eq(uid, inode->i_uid))
+		return true;
+	if (capable_wrt_inode_uidgid(inode, CAP_CHOWN))
+		return true;
+	if (uid_eq(inode->i_uid, INVALID_UID) &&
+	    ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN))
+		return true;
+	return false;
+}
+
+static bool chgrp_ok(const struct inode *inode, kgid_t gid)
+{
+	if (uid_eq(current_fsuid(), inode->i_uid) &&
+	    (in_group_p(gid) || gid_eq(gid, inode->i_gid)))
+		return true;
+	if (capable_wrt_inode_uidgid(inode, CAP_CHOWN))
+		return true;
+	if (gid_eq(inode->i_gid, INVALID_GID) &&
+	    ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN))
+		return true;
+	return false;
+}
+
 /**
  * setattr_prepare - check if attribute changes to a dentry are allowed
  * @dentry:	dentry to check
@@ -52,17 +78,11 @@ int setattr_prepare(struct dentry *dentry, struct iattr *attr)
 		goto kill_priv;
 
 	/* Make sure a caller can chown. */
-	if ((ia_valid & ATTR_UID) &&
-	    (!uid_eq(current_fsuid(), inode->i_uid) ||
-	     !uid_eq(attr->ia_uid, inode->i_uid)) &&
-	    !capable_wrt_inode_uidgid(inode, CAP_CHOWN))
+	if ((ia_valid & ATTR_UID) && !chown_ok(inode, attr->ia_uid))
 		return -EPERM;
 
 	/* Make sure caller can chgrp. */
-	if ((ia_valid & ATTR_GID) &&
-	    (!uid_eq(current_fsuid(), inode->i_uid) ||
-	    (!in_group_p(attr->ia_gid) && !gid_eq(attr->ia_gid, inode->i_gid))) &&
-	    !capable_wrt_inode_uidgid(inode, CAP_CHOWN))
+	if ((ia_valid & ATTR_GID) && !chgrp_ok(inode, attr->ia_gid))
 		return -EPERM;
 
 	/* Make sure a caller can chmod. */
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index ee832ca..f32f21c 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -21,10 +21,9 @@
 #define dprintf(x...)
 #endif
 
-static int bfs_add_entry(struct inode *dir, const unsigned char *name,
-						int namelen, int ino);
+static int bfs_add_entry(struct inode *dir, const struct qstr *child, int ino);
 static struct buffer_head *bfs_find_entry(struct inode *dir,
-				const unsigned char *name, int namelen,
+				const struct qstr *child,
 				struct bfs_dirent **res_dir);
 
 static int bfs_readdir(struct file *f, struct dir_context *ctx)
@@ -111,8 +110,7 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
         mark_inode_dirty(inode);
 	bfs_dump_imap("create", s);
 
-	err = bfs_add_entry(dir, dentry->d_name.name, dentry->d_name.len,
-							inode->i_ino);
+	err = bfs_add_entry(dir, &dentry->d_name, inode->i_ino);
 	if (err) {
 		inode_dec_link_count(inode);
 		mutex_unlock(&info->bfs_lock);
@@ -136,19 +134,14 @@ static struct dentry *bfs_lookup(struct inode *dir, struct dentry *dentry,
 		return ERR_PTR(-ENAMETOOLONG);
 
 	mutex_lock(&info->bfs_lock);
-	bh = bfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, &de);
+	bh = bfs_find_entry(dir, &dentry->d_name, &de);
 	if (bh) {
 		unsigned long ino = (unsigned long)le16_to_cpu(de->ino);
 		brelse(bh);
 		inode = bfs_iget(dir->i_sb, ino);
-		if (IS_ERR(inode)) {
-			mutex_unlock(&info->bfs_lock);
-			return ERR_CAST(inode);
-		}
 	}
 	mutex_unlock(&info->bfs_lock);
-	d_add(dentry, inode);
-	return NULL;
+	return d_splice_alias(inode, dentry);
 }
 
 static int bfs_link(struct dentry *old, struct inode *dir,
@@ -159,8 +152,7 @@ static int bfs_link(struct dentry *old, struct inode *dir,
 	int err;
 
 	mutex_lock(&info->bfs_lock);
-	err = bfs_add_entry(dir, new->d_name.name, new->d_name.len,
-							inode->i_ino);
+	err = bfs_add_entry(dir, &new->d_name, inode->i_ino);
 	if (err) {
 		mutex_unlock(&info->bfs_lock);
 		return err;
@@ -183,7 +175,7 @@ static int bfs_unlink(struct inode *dir, struct dentry *dentry)
 	struct bfs_sb_info *info = BFS_SB(inode->i_sb);
 
 	mutex_lock(&info->bfs_lock);
-	bh = bfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, &de);
+	bh = bfs_find_entry(dir, &dentry->d_name, &de);
 	if (!bh || (le16_to_cpu(de->ino) != inode->i_ino))
 		goto out_brelse;
 
@@ -228,27 +220,21 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	info = BFS_SB(old_inode->i_sb);
 
 	mutex_lock(&info->bfs_lock);
-	old_bh = bfs_find_entry(old_dir, 
-				old_dentry->d_name.name, 
-				old_dentry->d_name.len, &old_de);
+	old_bh = bfs_find_entry(old_dir, &old_dentry->d_name, &old_de);
 
 	if (!old_bh || (le16_to_cpu(old_de->ino) != old_inode->i_ino))
 		goto end_rename;
 
 	error = -EPERM;
 	new_inode = d_inode(new_dentry);
-	new_bh = bfs_find_entry(new_dir, 
-				new_dentry->d_name.name, 
-				new_dentry->d_name.len, &new_de);
+	new_bh = bfs_find_entry(new_dir, &new_dentry->d_name, &new_de);
 
 	if (new_bh && !new_inode) {
 		brelse(new_bh);
 		new_bh = NULL;
 	}
 	if (!new_bh) {
-		error = bfs_add_entry(new_dir, 
-					new_dentry->d_name.name,
-					new_dentry->d_name.len,
+		error = bfs_add_entry(new_dir, &new_dentry->d_name,
 					old_inode->i_ino);
 		if (error)
 			goto end_rename;
@@ -278,9 +264,10 @@ const struct inode_operations bfs_dir_inops = {
 	.rename			= bfs_rename,
 };
 
-static int bfs_add_entry(struct inode *dir, const unsigned char *name,
-							int namelen, int ino)
+static int bfs_add_entry(struct inode *dir, const struct qstr *child, int ino)
 {
+	const unsigned char *name = child->name;
+	int namelen = child->len;
 	struct buffer_head *bh;
 	struct bfs_dirent *de;
 	int block, sblock, eblock, off, pos;
@@ -332,12 +319,14 @@ static inline int bfs_namecmp(int len, const unsigned char *name,
 }
 
 static struct buffer_head *bfs_find_entry(struct inode *dir,
-			const unsigned char *name, int namelen,
+			const struct qstr *child,
 			struct bfs_dirent **res_dir)
 {
 	unsigned long block = 0, offset = 0;
 	struct buffer_head *bh = NULL;
 	struct bfs_dirent *de;
+	const unsigned char *name = child->name;
+	int namelen = child->len;
 
 	*res_dir = NULL;
 	if (namelen > BFS_NAMELEN)
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 7ec920e..bef6934 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -272,7 +272,7 @@ struct blkdev_dio {
 	struct bio		bio;
 };
 
-static struct bio_set *blkdev_dio_pool __read_mostly;
+static struct bio_set blkdev_dio_pool;
 
 static void blkdev_bio_end_io(struct bio *bio)
 {
@@ -334,7 +334,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 	    (bdev_logical_block_size(bdev) - 1))
 		return -EINVAL;
 
-	bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, blkdev_dio_pool);
+	bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, &blkdev_dio_pool);
 	bio_get(bio); /* extra ref for the completion handler */
 
 	dio = container_of(bio, struct blkdev_dio, bio);
@@ -432,10 +432,7 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 
 static __init int blkdev_init(void)
 {
-	blkdev_dio_pool = bioset_create(4, offsetof(struct blkdev_dio, bio), BIOSET_NEED_BVECS);
-	if (!blkdev_dio_pool)
-		return -ENOMEM;
-	return 0;
+	return bioset_init(&blkdev_dio_pool, 4, offsetof(struct blkdev_dio, bio), BIOSET_NEED_BVECS);
 }
 module_init(blkdev_init);
 
@@ -1322,27 +1319,30 @@ static void flush_disk(struct block_device *bdev, bool kill_dirty)
  * check_disk_size_change - checks for disk size change and adjusts bdev size.
  * @disk: struct gendisk to check
  * @bdev: struct bdev to adjust.
+ * @verbose: if %true log a message about a size change if there is any
  *
  * This routine checks to see if the bdev size does not match the disk size
  * and adjusts it if it differs. When shrinking the bdev size, its all caches
  * are freed.
  */
-void check_disk_size_change(struct gendisk *disk, struct block_device *bdev)
+void check_disk_size_change(struct gendisk *disk, struct block_device *bdev,
+		bool verbose)
 {
 	loff_t disk_size, bdev_size;
 
 	disk_size = (loff_t)get_capacity(disk) << 9;
 	bdev_size = i_size_read(bdev->bd_inode);
 	if (disk_size != bdev_size) {
-		printk(KERN_INFO
-		       "%s: detected capacity change from %lld to %lld\n",
-		       disk->disk_name, bdev_size, disk_size);
+		if (verbose) {
+			printk(KERN_INFO
+			       "%s: detected capacity change from %lld to %lld\n",
+			       disk->disk_name, bdev_size, disk_size);
+		}
 		i_size_write(bdev->bd_inode, disk_size);
 		if (bdev_size > disk_size)
 			flush_disk(bdev, false);
 	}
 }
-EXPORT_SYMBOL(check_disk_size_change);
 
 /**
  * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back
@@ -1364,7 +1364,7 @@ int revalidate_disk(struct gendisk *disk)
 		return ret;
 
 	mutex_lock(&bdev->bd_mutex);
-	check_disk_size_change(disk, bdev);
+	check_disk_size_change(disk, bdev, ret == 0);
 	bdev->bd_invalidated = 0;
 	mutex_unlock(&bdev->bd_mutex);
 	bdput(bdev);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 234bae5..7e07534 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -19,17 +19,17 @@
  * ordered operations list so that we make sure to flush out any
  * new data the application may have written before commit.
  */
-#define BTRFS_INODE_ORDERED_DATA_CLOSE		0
-#define BTRFS_INODE_ORPHAN_META_RESERVED	1
-#define BTRFS_INODE_DUMMY			2
-#define BTRFS_INODE_IN_DEFRAG			3
-#define BTRFS_INODE_HAS_ORPHAN_ITEM		4
-#define BTRFS_INODE_HAS_ASYNC_EXTENT		5
-#define BTRFS_INODE_NEEDS_FULL_SYNC		6
-#define BTRFS_INODE_COPY_EVERYTHING		7
-#define BTRFS_INODE_IN_DELALLOC_LIST		8
-#define BTRFS_INODE_READDIO_NEED_LOCK		9
-#define BTRFS_INODE_HAS_PROPS		        10
+enum {
+	BTRFS_INODE_ORDERED_DATA_CLOSE = 0,
+	BTRFS_INODE_DUMMY,
+	BTRFS_INODE_IN_DEFRAG,
+	BTRFS_INODE_HAS_ASYNC_EXTENT,
+	BTRFS_INODE_NEEDS_FULL_SYNC,
+	BTRFS_INODE_COPY_EVERYTHING,
+	BTRFS_INODE_IN_DELALLOC_LIST,
+	BTRFS_INODE_READDIO_NEED_LOCK,
+	BTRFS_INODE_HAS_PROPS,
+};
 
 /* in memory btrfs inode */
 struct btrfs_inode {
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 1061575..d3e447b 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -990,12 +990,7 @@ static void __free_workspace(int type, struct list_head *workspace,
 		btrfs_compress_op[idx]->free_workspace(workspace);
 	atomic_dec(total_ws);
 wake:
-	/*
-	 * Make sure counter is updated before we wake up waiters.
-	 */
-	smp_mb();
-	if (waitqueue_active(ws_wait))
-		wake_up(ws_wait);
+	cond_wake_up(ws_wait);
 }
 
 static void free_workspace(int type, struct list_head *ws)
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index cc605f7..ddda9b8 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -6,6 +6,8 @@
 #ifndef BTRFS_COMPRESSION_H
 #define BTRFS_COMPRESSION_H
 
+#include <linux/sizes.h>
+
 /*
  * We want to make sure that amount of RAM required to uncompress an extent is
  * reasonable, so we limit the total size in ram of a compressed extent to
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 8c68961..4bc326d 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2330,7 +2330,7 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
 			no_skips = 1;
 
 		t = path->nodes[i];
-		if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
+		if (i >= lowest_unlock && i > skip_level) {
 			btrfs_tree_unlock_rw(t, path->locks[i]);
 			path->locks[i] = 0;
 			if (write_lock_level &&
@@ -2432,7 +2432,6 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
 	btrfs_unlock_up_safe(p, level + 1);
 	btrfs_set_path_blocking(p);
 
-	free_extent_buffer(tmp);
 	if (p->reada != READA_NONE)
 		reada_for_search(fs_info, p, level, slot, key->objectid);
 
@@ -2446,7 +2445,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
 		 * and give up so that our caller doesn't loop forever
 		 * on our EAGAINs.
 		 */
-		if (!btrfs_buffer_uptodate(tmp, 0, 0))
+		if (!extent_buffer_uptodate(tmp))
 			ret = -EIO;
 		free_extent_buffer(tmp);
 	} else {
@@ -2599,6 +2598,78 @@ int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
 	return 0;
 }
 
+static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
+							struct btrfs_path *p,
+							int write_lock_level)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct extent_buffer *b;
+	int root_lock;
+	int level = 0;
+
+	/* We try very hard to do read locks on the root */
+	root_lock = BTRFS_READ_LOCK;
+
+	if (p->search_commit_root) {
+		/* The commit roots are read only so we always do read locks */
+		if (p->need_commit_sem)
+			down_read(&fs_info->commit_root_sem);
+		b = root->commit_root;
+		extent_buffer_get(b);
+		level = btrfs_header_level(b);
+		if (p->need_commit_sem)
+			up_read(&fs_info->commit_root_sem);
+		/*
+		 * Ensure that all callers have set skip_locking when
+		 * p->search_commit_root = 1.
+		 */
+		ASSERT(p->skip_locking == 1);
+
+		goto out;
+	}
+
+	if (p->skip_locking) {
+		b = btrfs_root_node(root);
+		level = btrfs_header_level(b);
+		goto out;
+	}
+
+	/*
+	 * If the level is set to maximum, we can skip trying to get the read
+	 * lock.
+	 */
+	if (write_lock_level < BTRFS_MAX_LEVEL) {
+		/*
+		 * We don't know the level of the root node until we actually
+		 * have it read locked
+		 */
+		b = btrfs_read_lock_root_node(root);
+		level = btrfs_header_level(b);
+		if (level > write_lock_level)
+			goto out;
+
+		/* Whoops, must trade for write lock */
+		btrfs_tree_read_unlock(b);
+		free_extent_buffer(b);
+	}
+
+	b = btrfs_lock_root_node(root);
+	root_lock = BTRFS_WRITE_LOCK;
+
+	/* The level might have changed, check again */
+	level = btrfs_header_level(b);
+
+out:
+	p->nodes[level] = b;
+	if (!p->skip_locking)
+		p->locks[level] = root_lock;
+	/*
+	 * Callers are responsible for dropping b's references.
+	 */
+	return b;
+}
+
+
 /*
  * btrfs_search_slot - look for a key in a tree and perform necessary
  * modifications to preserve tree invariants.
@@ -2635,7 +2706,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	int err;
 	int level;
 	int lowest_unlock = 1;
-	int root_lock;
 	/* everything at write_lock_level or lower must be write locked */
 	int write_lock_level = 0;
 	u8 lowest_level = 0;
@@ -2673,50 +2743,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 
 again:
 	prev_cmp = -1;
-	/*
-	 * we try very hard to do read locks on the root
-	 */
-	root_lock = BTRFS_READ_LOCK;
-	level = 0;
-	if (p->search_commit_root) {
-		/*
-		 * the commit roots are read only
-		 * so we always do read locks
-		 */
-		if (p->need_commit_sem)
-			down_read(&fs_info->commit_root_sem);
-		b = root->commit_root;
-		extent_buffer_get(b);
-		level = btrfs_header_level(b);
-		if (p->need_commit_sem)
-			up_read(&fs_info->commit_root_sem);
-		if (!p->skip_locking)
-			btrfs_tree_read_lock(b);
-	} else {
-		if (p->skip_locking) {
-			b = btrfs_root_node(root);
-			level = btrfs_header_level(b);
-		} else {
-			/* we don't know the level of the root node
-			 * until we actually have it read locked
-			 */
-			b = btrfs_read_lock_root_node(root);
-			level = btrfs_header_level(b);
-			if (level <= write_lock_level) {
-				/* whoops, must trade for write lock */
-				btrfs_tree_read_unlock(b);
-				free_extent_buffer(b);
-				b = btrfs_lock_root_node(root);
-				root_lock = BTRFS_WRITE_LOCK;
-
-				/* the level might have changed, check again */
-				level = btrfs_header_level(b);
-			}
-		}
-	}
-	p->nodes[level] = b;
-	if (!p->skip_locking)
-		p->locks[level] = root_lock;
+	b = btrfs_search_slot_get_root(root, p, write_lock_level);
 
 	while (b) {
 		level = btrfs_header_level(b);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0d422c9..f4bf787 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -739,6 +739,12 @@ struct btrfs_delayed_root;
  */
 #define BTRFS_FS_NEED_ASYNC_COMMIT		17
 
+/*
+ * Indicate that balance has been set up from the ioctl and is in the main
+ * phase. The fs_info::balance_ctl is initialized.
+ */
+#define BTRFS_FS_BALANCE_RUNNING		18
+
 struct btrfs_fs_info {
 	u8 fsid[BTRFS_FSID_SIZE];
 	u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
@@ -838,7 +844,6 @@ struct btrfs_fs_info {
 	struct mutex transaction_kthread_mutex;
 	struct mutex cleaner_mutex;
 	struct mutex chunk_mutex;
-	struct mutex volume_mutex;
 
 	/*
 	 * this is taken to make sure we don't set block groups ro after
@@ -1004,7 +1009,6 @@ struct btrfs_fs_info {
 	/* restriper state */
 	spinlock_t balance_lock;
 	struct mutex balance_mutex;
-	atomic_t balance_running;
 	atomic_t balance_pause_req;
 	atomic_t balance_cancel_req;
 	struct btrfs_balance_control *balance_ctl;
@@ -1219,9 +1223,6 @@ struct btrfs_root {
 	spinlock_t log_extents_lock[2];
 	struct list_head logged_list[2];
 
-	spinlock_t orphan_lock;
-	atomic_t orphan_inodes;
-	struct btrfs_block_rsv *orphan_block_rsv;
 	int orphan_cleanup_state;
 
 	spinlock_t inode_lock;
@@ -2764,13 +2765,9 @@ void btrfs_delalloc_release_space(struct inode *inode,
 void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
 					    u64 len);
 void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
-int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
-				  struct btrfs_inode *inode);
-void btrfs_orphan_release_metadata(struct btrfs_inode *inode);
 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
 				     struct btrfs_block_rsv *rsv,
-				     int nitems,
-				     u64 *qgroup_reserved, bool use_global_rsv);
+				     int nitems, bool use_global_rsv);
 void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
 				      struct btrfs_block_rsv *rsv);
 void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
@@ -2828,7 +2825,7 @@ void btrfs_wait_for_snapshot_creation(struct btrfs_root *root);
 void check_system_chunk(struct btrfs_trans_handle *trans,
 			struct btrfs_fs_info *fs_info, const u64 type);
 u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
-		       struct btrfs_fs_info *info, u64 start, u64 end);
+		       u64 start, u64 end);
 
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
@@ -3042,11 +3039,9 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root);
 
 /* uuid-tree.c */
-int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans,
-			struct btrfs_fs_info *fs_info, u8 *uuid, u8 type,
+int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
 			u64 subid);
-int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans,
-			struct btrfs_fs_info *fs_info, u8 *uuid, u8 type,
+int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
 			u64 subid);
 int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info,
 			    int (*check_func)(struct btrfs_fs_info *, u8 *, u8,
@@ -3163,18 +3158,6 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
 				     struct extent_map *em);
 
 /* inode.c */
-struct btrfs_delalloc_work {
-	struct inode *inode;
-	int delay_iput;
-	struct completion completion;
-	struct list_head list;
-	struct btrfs_work work;
-};
-
-struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
-						    int delay_iput);
-void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work);
-
 struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
 		struct page *page, size_t pg_offset, u64 start,
 		u64 len, int create);
@@ -3193,10 +3176,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
 int btrfs_add_link(struct btrfs_trans_handle *trans,
 		   struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
 		   const char *name, int name_len, int add_backref, u64 index);
-int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
-			struct btrfs_root *root,
-			struct inode *dir, u64 objectid,
-			const char *name, int name_len);
+int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry);
 int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
 			int front);
 int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
@@ -3204,9 +3184,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 			       struct inode *inode, u64 new_size,
 			       u32 min_type);
 
-int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
-int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
-			       int nr);
+int btrfs_start_delalloc_inodes(struct btrfs_root *root);
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr);
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
 			      unsigned int extra_bits,
 			      struct extent_state **cached_state, int dedupe);
@@ -3240,10 +3219,7 @@ int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
 int btrfs_orphan_add(struct btrfs_trans_handle *trans,
 		struct btrfs_inode *inode);
 int btrfs_orphan_cleanup(struct btrfs_root *root);
-void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
-			      struct btrfs_root *root);
 int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size);
-void btrfs_invalidate_inodes(struct btrfs_root *root);
 void btrfs_add_delayed_iput(struct inode *inode);
 void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info);
 int btrfs_prealloc_file_range(struct inode *inode, int mode,
@@ -3262,14 +3238,14 @@ void btrfs_test_inode_set_ops(struct inode *inode);
 long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
 long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
 int btrfs_ioctl_get_supported_features(void __user *arg);
-void btrfs_update_iflags(struct inode *inode);
+void btrfs_sync_inode_flags_to_i_flags(struct inode *inode);
 int btrfs_is_empty_uuid(u8 *uuid);
 int btrfs_defrag_file(struct inode *inode, struct file *file,
 		      struct btrfs_ioctl_defrag_range_args *range,
 		      u64 newer_than, unsigned long max_pages);
 void btrfs_get_block_group_info(struct list_head *groups_list,
 				struct btrfs_ioctl_space_info *space);
-void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
+void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
 			       struct btrfs_ioctl_balance_args *bargs);
 ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen,
 			   struct file *dst_file, u64 dst_loff);
@@ -3767,4 +3743,26 @@ static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
 	return 0;
 }
 
+static inline void cond_wake_up(struct wait_queue_head *wq)
+{
+	/*
+	 * This implies a full smp_mb barrier, see comments for
+	 * waitqueue_active why.
+	 */
+	if (wq_has_sleeper(wq))
+		wake_up(wq);
+}
+
+static inline void cond_wake_up_nomb(struct wait_queue_head *wq)
+{
+	/*
+	 * Special case for conditional wakeup where the barrier required for
+	 * waitqueue_active is implied by some of the preceding code. Eg. one
+	 * of such atomic operations (atomic_dec_and_return, ...), or a
+	 * unlock/lock sequence, etc.
+	 */
+	if (waitqueue_active(wq))
+		wake_up(wq);
+}
+
 #endif
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index a8d492db..fe6caa7 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -460,13 +460,10 @@ static void finish_one_item(struct btrfs_delayed_root *delayed_root)
 {
 	int seq = atomic_inc_return(&delayed_root->items_seq);
 
-	/*
-	 * atomic_dec_return implies a barrier for waitqueue_active
-	 */
+	/* atomic_dec_return implies a barrier */
 	if ((atomic_dec_return(&delayed_root->items) <
-	    BTRFS_DELAYED_BACKGROUND || seq % BTRFS_DELAYED_BATCH == 0) &&
-	    waitqueue_active(&delayed_root->wait))
-		wake_up(&delayed_root->wait);
+	    BTRFS_DELAYED_BACKGROUND || seq % BTRFS_DELAYED_BATCH == 0))
+		cond_wake_up_nomb(&delayed_root->wait);
 }
 
 static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index e1b0651..03dec67 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -286,10 +286,10 @@ static bool merge_ref(struct btrfs_trans_handle *trans,
 }
 
 void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
-			      struct btrfs_fs_info *fs_info,
 			      struct btrfs_delayed_ref_root *delayed_refs,
 			      struct btrfs_delayed_ref_head *head)
 {
+	struct btrfs_fs_info *fs_info = trans->fs_info;
 	struct btrfs_delayed_ref_node *ref;
 	struct rb_node *node;
 	u64 seq = 0;
@@ -323,9 +323,7 @@ again:
 	}
 }
 
-int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
-			    struct btrfs_delayed_ref_root *delayed_refs,
-			    u64 seq)
+int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq)
 {
 	struct seq_list *elem;
 	int ret = 0;
@@ -336,10 +334,9 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
 					struct seq_list, list);
 		if (seq >= elem->seq) {
 			btrfs_debug(fs_info,
-				"holding back delayed_ref %#x.%x, lowest is %#x.%x (%p)",
+				"holding back delayed_ref %#x.%x, lowest is %#x.%x",
 				(u32)(seq >> 32), (u32)seq,
-				(u32)(elem->seq >> 32), (u32)elem->seq,
-				delayed_refs);
+				(u32)(elem->seq >> 32), (u32)elem->seq);
 			ret = 1;
 		}
 	}
@@ -529,33 +526,20 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs,
 	spin_unlock(&existing->lock);
 }
 
-/*
- * helper function to actually insert a head node into the rbtree.
- * this does all the dirty work in terms of maintaining the correct
- * overall modification count.
- */
-static noinline struct btrfs_delayed_ref_head *
-add_delayed_ref_head(struct btrfs_fs_info *fs_info,
-		     struct btrfs_trans_handle *trans,
-		     struct btrfs_delayed_ref_head *head_ref,
-		     struct btrfs_qgroup_extent_record *qrecord,
-		     u64 bytenr, u64 num_bytes, u64 ref_root, u64 reserved,
-		     int action, int is_data, int is_system,
-		     int *qrecord_inserted_ret,
-		     int *old_ref_mod, int *new_ref_mod)
-
+static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
+				  struct btrfs_qgroup_extent_record *qrecord,
+				  u64 bytenr, u64 num_bytes, u64 ref_root,
+				  u64 reserved, int action, bool is_data,
+				  bool is_system)
 {
-	struct btrfs_delayed_ref_head *existing;
-	struct btrfs_delayed_ref_root *delayed_refs;
 	int count_mod = 1;
 	int must_insert_reserved = 0;
-	int qrecord_inserted = 0;
 
 	/* If reserved is provided, it must be a data extent. */
 	BUG_ON(!is_data && reserved);
 
 	/*
-	 * the head node stores the sum of all the mods, so dropping a ref
+	 * The head node stores the sum of all the mods, so dropping a ref
 	 * should drop the sum in the head node by one.
 	 */
 	if (action == BTRFS_UPDATE_DELAYED_HEAD)
@@ -564,12 +548,11 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
 		count_mod = -1;
 
 	/*
-	 * BTRFS_ADD_DELAYED_EXTENT means that we need to update
-	 * the reserved accounting when the extent is finally added, or
-	 * if a later modification deletes the delayed ref without ever
-	 * inserting the extent into the extent allocation tree.
-	 * ref->must_insert_reserved is the flag used to record
-	 * that accounting mods are required.
+	 * BTRFS_ADD_DELAYED_EXTENT means that we need to update the reserved
+	 * accounting when the extent is finally added, or if a later
+	 * modification deletes the delayed ref without ever inserting the
+	 * extent into the extent allocation tree.  ref->must_insert_reserved
+	 * is the flag used to record that accounting mods are required.
 	 *
 	 * Once we record must_insert_reserved, switch the action to
 	 * BTRFS_ADD_DELAYED_REF because other special casing is not required.
@@ -579,8 +562,6 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
 	else
 		must_insert_reserved = 0;
 
-	delayed_refs = &trans->transaction->delayed_refs;
-
 	refcount_set(&head_ref->refs, 1);
 	head_ref->bytenr = bytenr;
 	head_ref->num_bytes = num_bytes;
@@ -598,7 +579,6 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
 	spin_lock_init(&head_ref->lock);
 	mutex_init(&head_ref->mutex);
 
-	/* Record qgroup extent info if provided */
 	if (qrecord) {
 		if (ref_root && reserved) {
 			head_ref->qgroup_ref_root = ref_root;
@@ -608,20 +588,44 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
 		qrecord->bytenr = bytenr;
 		qrecord->num_bytes = num_bytes;
 		qrecord->old_roots = NULL;
+	}
+}
+
+/*
+ * helper function to actually insert a head node into the rbtree.
+ * this does all the dirty work in terms of maintaining the correct
+ * overall modification count.
+ */
+static noinline struct btrfs_delayed_ref_head *
+add_delayed_ref_head(struct btrfs_trans_handle *trans,
+		     struct btrfs_delayed_ref_head *head_ref,
+		     struct btrfs_qgroup_extent_record *qrecord,
+		     int action, int *qrecord_inserted_ret,
+		     int *old_ref_mod, int *new_ref_mod)
+{
+	struct btrfs_delayed_ref_head *existing;
+	struct btrfs_delayed_ref_root *delayed_refs;
+	int qrecord_inserted = 0;
 
-		if(btrfs_qgroup_trace_extent_nolock(fs_info,
+	delayed_refs = &trans->transaction->delayed_refs;
+
+	/* Record qgroup extent info if provided */
+	if (qrecord) {
+		if (btrfs_qgroup_trace_extent_nolock(trans->fs_info,
 					delayed_refs, qrecord))
 			kfree(qrecord);
 		else
 			qrecord_inserted = 1;
 	}
 
-	trace_add_delayed_ref_head(fs_info, head_ref, action);
+	trace_add_delayed_ref_head(trans->fs_info, head_ref, action);
 
 	existing = htree_insert(&delayed_refs->href_root,
 				&head_ref->href_node);
 	if (existing) {
-		WARN_ON(ref_root && reserved && existing->qgroup_ref_root
+		WARN_ON(qrecord && head_ref->qgroup_ref_root
+			&& head_ref->qgroup_reserved
+			&& existing->qgroup_ref_root
 			&& existing->qgroup_reserved);
 		update_existing_head_ref(delayed_refs, existing, head_ref,
 					 old_ref_mod);
@@ -634,8 +638,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
 	} else {
 		if (old_ref_mod)
 			*old_ref_mod = 0;
-		if (is_data && count_mod < 0)
-			delayed_refs->pending_csums += num_bytes;
+		if (head_ref->is_data && head_ref->ref_mod < 0)
+			delayed_refs->pending_csums += head_ref->num_bytes;
 		delayed_refs->num_heads++;
 		delayed_refs->num_heads_ready++;
 		atomic_inc(&delayed_refs->num_entries);
@@ -645,90 +649,48 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
 		*qrecord_inserted_ret = qrecord_inserted;
 	if (new_ref_mod)
 		*new_ref_mod = head_ref->total_ref_mod;
-	return head_ref;
-}
-
-/*
- * helper to insert a delayed tree ref into the rbtree.
- */
-static noinline void
-add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
-		     struct btrfs_trans_handle *trans,
-		     struct btrfs_delayed_ref_head *head_ref,
-		     struct btrfs_delayed_ref_node *ref, u64 bytenr,
-		     u64 num_bytes, u64 parent, u64 ref_root, int level,
-		     int action)
-{
-	struct btrfs_delayed_tree_ref *full_ref;
-	struct btrfs_delayed_ref_root *delayed_refs;
-	u64 seq = 0;
-	int ret;
-
-	if (action == BTRFS_ADD_DELAYED_EXTENT)
-		action = BTRFS_ADD_DELAYED_REF;
 
-	if (is_fstree(ref_root))
-		seq = atomic64_read(&fs_info->tree_mod_seq);
-	delayed_refs = &trans->transaction->delayed_refs;
-
-	/* first set the basic ref node struct up */
-	refcount_set(&ref->refs, 1);
-	ref->bytenr = bytenr;
-	ref->num_bytes = num_bytes;
-	ref->ref_mod = 1;
-	ref->action = action;
-	ref->is_head = 0;
-	ref->in_tree = 1;
-	ref->seq = seq;
-	RB_CLEAR_NODE(&ref->ref_node);
-	INIT_LIST_HEAD(&ref->add_list);
-
-	full_ref = btrfs_delayed_node_to_tree_ref(ref);
-	full_ref->parent = parent;
-	full_ref->root = ref_root;
-	if (parent)
-		ref->type = BTRFS_SHARED_BLOCK_REF_KEY;
-	else
-		ref->type = BTRFS_TREE_BLOCK_REF_KEY;
-	full_ref->level = level;
-
-	trace_add_delayed_tree_ref(fs_info, ref, full_ref, action);
-
-	ret = insert_delayed_ref(trans, delayed_refs, head_ref, ref);
-
-	/*
-	 * XXX: memory should be freed at the same level allocated.
-	 * But bad practice is anywhere... Follow it now. Need cleanup.
-	 */
-	if (ret > 0)
-		kmem_cache_free(btrfs_delayed_tree_ref_cachep, full_ref);
+	return head_ref;
 }
 
 /*
- * helper to insert a delayed data ref into the rbtree.
+ * init_delayed_ref_common - Initialize the structure which represents a
+ *			     modification to a an extent.
+ *
+ * @fs_info:    Internal to the mounted filesystem mount structure.
+ *
+ * @ref:	The structure which is going to be initialized.
+ *
+ * @bytenr:	The logical address of the extent for which a modification is
+ *		going to be recorded.
+ *
+ * @num_bytes:  Size of the extent whose modification is being recorded.
+ *
+ * @ref_root:	The id of the root where this modification has originated, this
+ *		can be either one of the well-known metadata trees or the
+ *		subvolume id which references this extent.
+ *
+ * @action:	Can be one of BTRFS_ADD_DELAYED_REF/BTRFS_DROP_DELAYED_REF or
+ *		BTRFS_ADD_DELAYED_EXTENT
+ *
+ * @ref_type:	Holds the type of the extent which is being recorded, can be
+ *		one of BTRFS_SHARED_BLOCK_REF_KEY/BTRFS_TREE_BLOCK_REF_KEY
+ *		when recording a metadata extent or BTRFS_SHARED_DATA_REF_KEY/
+ *		BTRFS_EXTENT_DATA_REF_KEY when recording data extent
  */
-static noinline void
-add_delayed_data_ref(struct btrfs_fs_info *fs_info,
-		     struct btrfs_trans_handle *trans,
-		     struct btrfs_delayed_ref_head *head_ref,
-		     struct btrfs_delayed_ref_node *ref, u64 bytenr,
-		     u64 num_bytes, u64 parent, u64 ref_root, u64 owner,
-		     u64 offset, int action)
+static void init_delayed_ref_common(struct btrfs_fs_info *fs_info,
+				    struct btrfs_delayed_ref_node *ref,
+				    u64 bytenr, u64 num_bytes, u64 ref_root,
+				    int action, u8 ref_type)
 {
-	struct btrfs_delayed_data_ref *full_ref;
-	struct btrfs_delayed_ref_root *delayed_refs;
 	u64 seq = 0;
-	int ret;
 
 	if (action == BTRFS_ADD_DELAYED_EXTENT)
 		action = BTRFS_ADD_DELAYED_REF;
 
-	delayed_refs = &trans->transaction->delayed_refs;
-
 	if (is_fstree(ref_root))
 		seq = atomic64_read(&fs_info->tree_mod_seq);
 
-	/* first set the basic ref node struct up */
 	refcount_set(&ref->refs, 1);
 	ref->bytenr = bytenr;
 	ref->num_bytes = num_bytes;
@@ -737,25 +699,9 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 	ref->is_head = 0;
 	ref->in_tree = 1;
 	ref->seq = seq;
+	ref->type = ref_type;
 	RB_CLEAR_NODE(&ref->ref_node);
 	INIT_LIST_HEAD(&ref->add_list);
-
-	full_ref = btrfs_delayed_node_to_data_ref(ref);
-	full_ref->parent = parent;
-	full_ref->root = ref_root;
-	if (parent)
-		ref->type = BTRFS_SHARED_DATA_REF_KEY;
-	else
-		ref->type = BTRFS_EXTENT_DATA_REF_KEY;
-
-	full_ref->objectid = owner;
-	full_ref->offset = offset;
-
-	trace_add_delayed_data_ref(fs_info, ref, full_ref, action);
-
-	ret = insert_delayed_ref(trans, delayed_refs, head_ref, ref);
-	if (ret > 0)
-		kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref);
 }
 
 /*
@@ -775,13 +721,25 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 	struct btrfs_delayed_ref_root *delayed_refs;
 	struct btrfs_qgroup_extent_record *record = NULL;
 	int qrecord_inserted;
-	int is_system = (ref_root == BTRFS_CHUNK_TREE_OBJECTID);
+	bool is_system = (ref_root == BTRFS_CHUNK_TREE_OBJECTID);
+	int ret;
+	u8 ref_type;
 
 	BUG_ON(extent_op && extent_op->is_data);
 	ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
 	if (!ref)
 		return -ENOMEM;
 
+	if (parent)
+		ref_type = BTRFS_SHARED_BLOCK_REF_KEY;
+	else
+		ref_type = BTRFS_TREE_BLOCK_REF_KEY;
+	init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
+				ref_root, action, ref_type);
+	ref->root = ref_root;
+	ref->parent = parent;
+	ref->level = level;
+
 	head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
 	if (!head_ref)
 		goto free_ref;
@@ -793,6 +751,8 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 			goto free_head_ref;
 	}
 
+	init_delayed_ref_head(head_ref, record, bytenr, num_bytes,
+			      ref_root, 0, action, false, is_system);
 	head_ref->extent_op = extent_op;
 
 	delayed_refs = &trans->transaction->delayed_refs;
@@ -802,15 +762,19 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 	 * insert both the head node and the new ref without dropping
 	 * the spin lock
 	 */
-	head_ref = add_delayed_ref_head(fs_info, trans, head_ref, record,
-					bytenr, num_bytes, 0, 0, action, 0,
-					is_system, &qrecord_inserted,
+	head_ref = add_delayed_ref_head(trans, head_ref, record,
+					action, &qrecord_inserted,
 					old_ref_mod, new_ref_mod);
 
-	add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr,
-			     num_bytes, parent, ref_root, level, action);
+	ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
 	spin_unlock(&delayed_refs->lock);
 
+	trace_add_delayed_tree_ref(fs_info, &ref->node, ref,
+				   action == BTRFS_ADD_DELAYED_EXTENT ?
+				   BTRFS_ADD_DELAYED_REF : action);
+	if (ret > 0)
+		kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
+
 	if (qrecord_inserted)
 		btrfs_qgroup_trace_extent_post(fs_info, record);
 
@@ -839,11 +803,25 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 	struct btrfs_delayed_ref_root *delayed_refs;
 	struct btrfs_qgroup_extent_record *record = NULL;
 	int qrecord_inserted;
+	int ret;
+	u8 ref_type;
 
 	ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS);
 	if (!ref)
 		return -ENOMEM;
 
+	if (parent)
+	        ref_type = BTRFS_SHARED_DATA_REF_KEY;
+	else
+	        ref_type = BTRFS_EXTENT_DATA_REF_KEY;
+	init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
+				ref_root, action, ref_type);
+	ref->root = ref_root;
+	ref->parent = parent;
+	ref->objectid = owner;
+	ref->offset = offset;
+
+
 	head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
 	if (!head_ref) {
 		kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
@@ -861,6 +839,8 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 		}
 	}
 
+	init_delayed_ref_head(head_ref, record, bytenr, num_bytes, ref_root,
+			      reserved, action, true, false);
 	head_ref->extent_op = NULL;
 
 	delayed_refs = &trans->transaction->delayed_refs;
@@ -870,16 +850,20 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 	 * insert both the head node and the new ref without dropping
 	 * the spin lock
 	 */
-	head_ref = add_delayed_ref_head(fs_info, trans, head_ref, record,
-					bytenr, num_bytes, ref_root, reserved,
-					action, 1, 0, &qrecord_inserted,
+	head_ref = add_delayed_ref_head(trans, head_ref, record,
+					action, &qrecord_inserted,
 					old_ref_mod, new_ref_mod);
 
-	add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr,
-				   num_bytes, parent, ref_root, owner, offset,
-				   action);
+	ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
 	spin_unlock(&delayed_refs->lock);
 
+	trace_add_delayed_data_ref(trans->fs_info, &ref->node, ref,
+				   action == BTRFS_ADD_DELAYED_EXTENT ?
+				   BTRFS_ADD_DELAYED_REF : action);
+	if (ret > 0)
+		kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
+
+
 	if (qrecord_inserted)
 		return btrfs_qgroup_trace_extent_post(fs_info, record);
 	return 0;
@@ -897,19 +881,16 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
 	if (!head_ref)
 		return -ENOMEM;
 
+	init_delayed_ref_head(head_ref, NULL, bytenr, num_bytes, 0, 0,
+			      BTRFS_UPDATE_DELAYED_HEAD, extent_op->is_data,
+			      false);
 	head_ref->extent_op = extent_op;
 
 	delayed_refs = &trans->transaction->delayed_refs;
 	spin_lock(&delayed_refs->lock);
 
-	/*
-	 * extent_ops just modify the flags of an extent and they don't result
-	 * in ref count changes, hence it's safe to pass false/0 for is_system
-	 * argument
-	 */
-	add_delayed_ref_head(fs_info, trans, head_ref, NULL, bytenr,
-			     num_bytes, 0, 0, BTRFS_UPDATE_DELAYED_HEAD,
-			     extent_op->is_data, 0, NULL, NULL, NULL);
+	add_delayed_ref_head(trans, head_ref, NULL, BTRFS_UPDATE_DELAYED_HEAD,
+			     NULL, NULL, NULL);
 
 	spin_unlock(&delayed_refs->lock);
 	return 0;
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 7f00db5..ea1aecb 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -251,7 +251,6 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
 				u64 bytenr, u64 num_bytes,
 				struct btrfs_delayed_extent_op *extent_op);
 void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
-			      struct btrfs_fs_info *fs_info,
 			      struct btrfs_delayed_ref_root *delayed_refs,
 			      struct btrfs_delayed_ref_head *head);
 
@@ -269,9 +268,7 @@ static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head)
 struct btrfs_delayed_ref_head *
 btrfs_select_ref_head(struct btrfs_trans_handle *trans);
 
-int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
-			    struct btrfs_delayed_ref_root *delayed_refs,
-			    u64 seq);
+int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq);
 
 /*
  * helper functions to cast a node into its container
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index f82be26..e2ba041 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -33,8 +33,6 @@ static void btrfs_dev_replace_update_device_in_mapping_tree(
 						struct btrfs_device *srcdev,
 						struct btrfs_device *tgtdev);
 static int btrfs_dev_replace_kthread(void *data);
-static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info);
-
 
 int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
 {
@@ -179,6 +177,105 @@ out:
 }
 
 /*
+ * Initialize a new device for device replace target from a given source dev
+ * and path.
+ *
+ * Return 0 and new device in @device_out, otherwise return < 0
+ */
+static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
+				  const char *device_path,
+				  struct btrfs_device *srcdev,
+				  struct btrfs_device **device_out)
+{
+	struct btrfs_device *device;
+	struct block_device *bdev;
+	struct list_head *devices;
+	struct rcu_string *name;
+	u64 devid = BTRFS_DEV_REPLACE_DEVID;
+	int ret = 0;
+
+	*device_out = NULL;
+	if (fs_info->fs_devices->seeding) {
+		btrfs_err(fs_info, "the filesystem is a seed filesystem!");
+		return -EINVAL;
+	}
+
+	bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
+				  fs_info->bdev_holder);
+	if (IS_ERR(bdev)) {
+		btrfs_err(fs_info, "target device %s is invalid!", device_path);
+		return PTR_ERR(bdev);
+	}
+
+	filemap_write_and_wait(bdev->bd_inode->i_mapping);
+
+	devices = &fs_info->fs_devices->devices;
+	list_for_each_entry(device, devices, dev_list) {
+		if (device->bdev == bdev) {
+			btrfs_err(fs_info,
+				  "target device is in the filesystem!");
+			ret = -EEXIST;
+			goto error;
+		}
+	}
+
+
+	if (i_size_read(bdev->bd_inode) <
+	    btrfs_device_get_total_bytes(srcdev)) {
+		btrfs_err(fs_info,
+			  "target device is smaller than source device!");
+		ret = -EINVAL;
+		goto error;
+	}
+
+
+	device = btrfs_alloc_device(NULL, &devid, NULL);
+	if (IS_ERR(device)) {
+		ret = PTR_ERR(device);
+		goto error;
+	}
+
+	name = rcu_string_strdup(device_path, GFP_KERNEL);
+	if (!name) {
+		btrfs_free_device(device);
+		ret = -ENOMEM;
+		goto error;
+	}
+	rcu_assign_pointer(device->name, name);
+
+	mutex_lock(&fs_info->fs_devices->device_list_mutex);
+	set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
+	device->generation = 0;
+	device->io_width = fs_info->sectorsize;
+	device->io_align = fs_info->sectorsize;
+	device->sector_size = fs_info->sectorsize;
+	device->total_bytes = btrfs_device_get_total_bytes(srcdev);
+	device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
+	device->bytes_used = btrfs_device_get_bytes_used(srcdev);
+	device->commit_total_bytes = srcdev->commit_total_bytes;
+	device->commit_bytes_used = device->bytes_used;
+	device->fs_info = fs_info;
+	device->bdev = bdev;
+	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
+	set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
+	device->mode = FMODE_EXCL;
+	device->dev_stats_valid = 1;
+	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
+	device->fs_devices = fs_info->fs_devices;
+	list_add(&device->dev_list, &fs_info->fs_devices->devices);
+	fs_info->fs_devices->num_devices++;
+	fs_info->fs_devices->open_devices++;
+	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+
+	*device_out = device;
+	return 0;
+
+error:
+	blkdev_put(bdev, FMODE_EXCL);
+	return ret;
+}
+
+/*
  * called from commit_transaction. Writes changed device replace state to
  * disk.
  */
@@ -317,18 +414,13 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
 	struct btrfs_device *tgt_device = NULL;
 	struct btrfs_device *src_device = NULL;
 
-	/* the disk copy procedure reuses the scrub code */
-	mutex_lock(&fs_info->volume_mutex);
 	ret = btrfs_find_device_by_devspec(fs_info, srcdevid,
 					    srcdev_name, &src_device);
-	if (ret) {
-		mutex_unlock(&fs_info->volume_mutex);
+	if (ret)
 		return ret;
-	}
 
 	ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name,
 					    src_device, &tgt_device);
-	mutex_unlock(&fs_info->volume_mutex);
 	if (ret)
 		return ret;
 
@@ -360,7 +452,6 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
 	dev_replace->cont_reading_from_srcdev_mode = read_src;
 	WARN_ON(!src_device);
 	dev_replace->srcdev = src_device;
-	WARN_ON(!tgt_device);
 	dev_replace->tgtdev = tgt_device;
 
 	btrfs_info_in_rcu(fs_info,
@@ -503,7 +594,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 	 * flush all outstanding I/O and inode extent mappings before the
 	 * copy operation is declared as being finished
 	 */
-	ret = btrfs_start_delalloc_roots(fs_info, 0, -1);
+	ret = btrfs_start_delalloc_roots(fs_info, -1);
 	if (ret) {
 		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 		return ret;
@@ -518,7 +609,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 	ret = btrfs_commit_transaction(trans);
 	WARN_ON(ret);
 
-	mutex_lock(&uuid_mutex);
 	/* keep away write_all_supers() during the finishing procedure */
 	mutex_lock(&fs_info->fs_devices->device_list_mutex);
 	mutex_lock(&fs_info->chunk_mutex);
@@ -545,7 +635,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 		btrfs_dev_replace_write_unlock(dev_replace);
 		mutex_unlock(&fs_info->chunk_mutex);
 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
-		mutex_unlock(&uuid_mutex);
 		btrfs_rm_dev_replace_blocked(fs_info);
 		if (tgt_device)
 			btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
@@ -596,7 +685,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 	 */
 	mutex_unlock(&fs_info->chunk_mutex);
 	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
-	mutex_unlock(&uuid_mutex);
 
 	/* replace the sysfs entry */
 	btrfs_sysfs_rm_device_link(fs_info->fs_devices, src_device);
@@ -800,7 +888,17 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
 	}
 	btrfs_dev_replace_write_unlock(dev_replace);
 
-	WARN_ON(test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags));
+	/*
+	 * This could collide with a paused balance, but the exclusive op logic
+	 * should never allow both to start and pause. We don't want to allow
+	 * dev-replace to start anyway.
+	 */
+	if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+		btrfs_info(fs_info,
+		"cannot resume dev-replace, other exclusive operation running");
+		return 0;
+	}
+
 	task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");
 	return PTR_ERR_OR_ZERO(task);
 }
@@ -810,6 +908,7 @@ static int btrfs_dev_replace_kthread(void *data)
 	struct btrfs_fs_info *fs_info = data;
 	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 	u64 progress;
+	int ret;
 
 	progress = btrfs_dev_replace_progress(fs_info);
 	progress = div_u64(progress, 10);
@@ -820,23 +919,14 @@ static int btrfs_dev_replace_kthread(void *data)
 		btrfs_dev_name(dev_replace->tgtdev),
 		(unsigned int)progress);
 
-	btrfs_dev_replace_continue_on_mount(fs_info);
-	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
-
-	return 0;
-}
-
-static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info)
-{
-	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
-	int ret;
-
 	ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid,
 			      dev_replace->committed_cursor_left,
 			      btrfs_device_get_total_bytes(dev_replace->srcdev),
 			      &dev_replace->scrub_progress, 0, 1);
 	ret = btrfs_dev_replace_finishing(fs_info, ret);
 	WARN_ON(ret);
+
+	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
 	return 0;
 }
 
@@ -916,9 +1006,9 @@ void btrfs_dev_replace_clear_lock_blocking(
 	ASSERT(atomic_read(&dev_replace->read_locks) > 0);
 	ASSERT(atomic_read(&dev_replace->blocking_readers) > 0);
 	read_lock(&dev_replace->lock);
-	if (atomic_dec_and_test(&dev_replace->blocking_readers) &&
-	    waitqueue_active(&dev_replace->read_lock_wq))
-		wake_up(&dev_replace->read_lock_wq);
+	/* Barrier implied by atomic_dec_and_test */
+	if (atomic_dec_and_test(&dev_replace->blocking_readers))
+		cond_wake_up_nomb(&dev_replace->read_lock_wq);
 }
 
 void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
@@ -929,9 +1019,7 @@ void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
 void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
 {
 	percpu_counter_sub(&fs_info->bio_counter, amount);
-
-	if (waitqueue_active(&fs_info->replace_wait))
-		wake_up(&fs_info->replace_wait);
+	cond_wake_up_nomb(&fs_info->replace_wait);
 }
 
 void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index c3504b4..205092d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -55,7 +55,6 @@
 static const struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
 static void free_fs_root(struct btrfs_root *root);
-static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info);
 static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
 static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
 				      struct btrfs_fs_info *fs_info);
@@ -416,7 +415,7 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
 
 static int verify_level_key(struct btrfs_fs_info *fs_info,
 			    struct extent_buffer *eb, int level,
-			    struct btrfs_key *first_key)
+			    struct btrfs_key *first_key, u64 parent_transid)
 {
 	int found_level;
 	struct btrfs_key found_key;
@@ -454,10 +453,11 @@ static int verify_level_key(struct btrfs_fs_info *fs_info,
 	if (ret) {
 		WARN_ON(1);
 		btrfs_err(fs_info,
-"tree first key mismatch detected, bytenr=%llu key expected=(%llu, %u, %llu) has=(%llu, %u, %llu)",
-			  eb->start, first_key->objectid, first_key->type,
-			  first_key->offset, found_key.objectid,
-			  found_key.type, found_key.offset);
+"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)",
+			  eb->start, parent_transid, first_key->objectid,
+			  first_key->type, first_key->offset,
+			  found_key.objectid, found_key.type,
+			  found_key.offset);
 	}
 #endif
 	return ret;
@@ -493,7 +493,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info,
 						   parent_transid, 0))
 				ret = -EIO;
 			else if (verify_level_key(fs_info, eb, level,
-						  first_key))
+						  first_key, parent_transid))
 				ret = -EUCLEAN;
 			else
 				break;
@@ -1185,7 +1185,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
 	root->inode_tree = RB_ROOT;
 	INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
 	root->block_rsv = NULL;
-	root->orphan_block_rsv = NULL;
 
 	INIT_LIST_HEAD(&root->dirty_list);
 	INIT_LIST_HEAD(&root->root_list);
@@ -1195,7 +1194,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
 	INIT_LIST_HEAD(&root->ordered_root);
 	INIT_LIST_HEAD(&root->logged_list[0]);
 	INIT_LIST_HEAD(&root->logged_list[1]);
-	spin_lock_init(&root->orphan_lock);
 	spin_lock_init(&root->inode_lock);
 	spin_lock_init(&root->delalloc_lock);
 	spin_lock_init(&root->ordered_extent_lock);
@@ -1216,7 +1214,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
 	atomic_set(&root->log_commit[1], 0);
 	atomic_set(&root->log_writers, 0);
 	atomic_set(&root->log_batch, 0);
-	atomic_set(&root->orphan_inodes, 0);
 	refcount_set(&root->refs, 1);
 	atomic_set(&root->will_be_snapshotted, 0);
 	root->log_transid = 0;
@@ -2164,7 +2161,6 @@ static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
 {
 	spin_lock_init(&fs_info->balance_lock);
 	mutex_init(&fs_info->balance_mutex);
-	atomic_set(&fs_info->balance_running, 0);
 	atomic_set(&fs_info->balance_pause_req, 0);
 	atomic_set(&fs_info->balance_cancel_req, 0);
 	fs_info->balance_ctl = NULL;
@@ -2442,6 +2438,211 @@ out:
 	return ret;
 }
 
+/*
+ * Real super block validation
+ * NOTE: super csum type and incompat features will not be checked here.
+ *
+ * @sb:		super block to check
+ * @mirror_num:	the super block number to check its bytenr:
+ * 		0	the primary (1st) sb
+ * 		1, 2	2nd and 3rd backup copy
+ * 	       -1	skip bytenr check
+ */
+static int validate_super(struct btrfs_fs_info *fs_info,
+			    struct btrfs_super_block *sb, int mirror_num)
+{
+	u64 nodesize = btrfs_super_nodesize(sb);
+	u64 sectorsize = btrfs_super_sectorsize(sb);
+	int ret = 0;
+
+	if (btrfs_super_magic(sb) != BTRFS_MAGIC) {
+		btrfs_err(fs_info, "no valid FS found");
+		ret = -EINVAL;
+	}
+	if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) {
+		btrfs_err(fs_info, "unrecognized or unsupported super flag: %llu",
+				btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP);
+		ret = -EINVAL;
+	}
+	if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) {
+		btrfs_err(fs_info, "tree_root level too big: %d >= %d",
+				btrfs_super_root_level(sb), BTRFS_MAX_LEVEL);
+		ret = -EINVAL;
+	}
+	if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) {
+		btrfs_err(fs_info, "chunk_root level too big: %d >= %d",
+				btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL);
+		ret = -EINVAL;
+	}
+	if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) {
+		btrfs_err(fs_info, "log_root level too big: %d >= %d",
+				btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL);
+		ret = -EINVAL;
+	}
+
+	/*
+	 * Check sectorsize and nodesize first, other check will need it.
+	 * Check all possible sectorsize(4K, 8K, 16K, 32K, 64K) here.
+	 */
+	if (!is_power_of_2(sectorsize) || sectorsize < 4096 ||
+	    sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) {
+		btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize);
+		ret = -EINVAL;
+	}
+	/* Only PAGE SIZE is supported yet */
+	if (sectorsize != PAGE_SIZE) {
+		btrfs_err(fs_info,
+			"sectorsize %llu not supported yet, only support %lu",
+			sectorsize, PAGE_SIZE);
+		ret = -EINVAL;
+	}
+	if (!is_power_of_2(nodesize) || nodesize < sectorsize ||
+	    nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) {
+		btrfs_err(fs_info, "invalid nodesize %llu", nodesize);
+		ret = -EINVAL;
+	}
+	if (nodesize != le32_to_cpu(sb->__unused_leafsize)) {
+		btrfs_err(fs_info, "invalid leafsize %u, should be %llu",
+			  le32_to_cpu(sb->__unused_leafsize), nodesize);
+		ret = -EINVAL;
+	}
+
+	/* Root alignment check */
+	if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) {
+		btrfs_warn(fs_info, "tree_root block unaligned: %llu",
+			   btrfs_super_root(sb));
+		ret = -EINVAL;
+	}
+	if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) {
+		btrfs_warn(fs_info, "chunk_root block unaligned: %llu",
+			   btrfs_super_chunk_root(sb));
+		ret = -EINVAL;
+	}
+	if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) {
+		btrfs_warn(fs_info, "log_root block unaligned: %llu",
+			   btrfs_super_log_root(sb));
+		ret = -EINVAL;
+	}
+
+	if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_FSID_SIZE) != 0) {
+		btrfs_err(fs_info,
+			   "dev_item UUID does not match fsid: %pU != %pU",
+			   fs_info->fsid, sb->dev_item.fsid);
+		ret = -EINVAL;
+	}
+
+	/*
+	 * Hint to catch really bogus numbers, bitflips or so, more exact checks are
+	 * done later
+	 */
+	if (btrfs_super_bytes_used(sb) < 6 * btrfs_super_nodesize(sb)) {
+		btrfs_err(fs_info, "bytes_used is too small %llu",
+			  btrfs_super_bytes_used(sb));
+		ret = -EINVAL;
+	}
+	if (!is_power_of_2(btrfs_super_stripesize(sb))) {
+		btrfs_err(fs_info, "invalid stripesize %u",
+			  btrfs_super_stripesize(sb));
+		ret = -EINVAL;
+	}
+	if (btrfs_super_num_devices(sb) > (1UL << 31))
+		btrfs_warn(fs_info, "suspicious number of devices: %llu",
+			   btrfs_super_num_devices(sb));
+	if (btrfs_super_num_devices(sb) == 0) {
+		btrfs_err(fs_info, "number of devices is 0");
+		ret = -EINVAL;
+	}
+
+	if (mirror_num >= 0 &&
+	    btrfs_super_bytenr(sb) != btrfs_sb_offset(mirror_num)) {
+		btrfs_err(fs_info, "super offset mismatch %llu != %u",
+			  btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET);
+		ret = -EINVAL;
+	}
+
+	/*
+	 * Obvious sys_chunk_array corruptions, it must hold at least one key
+	 * and one chunk
+	 */
+	if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
+		btrfs_err(fs_info, "system chunk array too big %u > %u",
+			  btrfs_super_sys_array_size(sb),
+			  BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
+		ret = -EINVAL;
+	}
+	if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key)
+			+ sizeof(struct btrfs_chunk)) {
+		btrfs_err(fs_info, "system chunk array too small %u < %zu",
+			  btrfs_super_sys_array_size(sb),
+			  sizeof(struct btrfs_disk_key)
+			  + sizeof(struct btrfs_chunk));
+		ret = -EINVAL;
+	}
+
+	/*
+	 * The generation is a global counter, we'll trust it more than the others
+	 * but it's still possible that it's the one that's wrong.
+	 */
+	if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb))
+		btrfs_warn(fs_info,
+			"suspicious: generation < chunk_root_generation: %llu < %llu",
+			btrfs_super_generation(sb),
+			btrfs_super_chunk_root_generation(sb));
+	if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb)
+	    && btrfs_super_cache_generation(sb) != (u64)-1)
+		btrfs_warn(fs_info,
+			"suspicious: generation < cache_generation: %llu < %llu",
+			btrfs_super_generation(sb),
+			btrfs_super_cache_generation(sb));
+
+	return ret;
+}
+
+/*
+ * Validation of super block at mount time.
+ * Some checks already done early at mount time, like csum type and incompat
+ * flags will be skipped.
+ */
+static int btrfs_validate_mount_super(struct btrfs_fs_info *fs_info)
+{
+	return validate_super(fs_info, fs_info->super_copy, 0);
+}
+
+/*
+ * Validation of super block at write time.
+ * Some checks like bytenr check will be skipped as their values will be
+ * overwritten soon.
+ * Extra checks like csum type and incompat flags will be done here.
+ */
+static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info,
+				      struct btrfs_super_block *sb)
+{
+	int ret;
+
+	ret = validate_super(fs_info, sb, -1);
+	if (ret < 0)
+		goto out;
+	if (btrfs_super_csum_type(sb) != BTRFS_CSUM_TYPE_CRC32) {
+		ret = -EUCLEAN;
+		btrfs_err(fs_info, "invalid csum type, has %u want %u",
+			  btrfs_super_csum_type(sb), BTRFS_CSUM_TYPE_CRC32);
+		goto out;
+	}
+	if (btrfs_super_incompat_flags(sb) & ~BTRFS_FEATURE_INCOMPAT_SUPP) {
+		ret = -EUCLEAN;
+		btrfs_err(fs_info,
+		"invalid incompat flags, has 0x%llx valid mask 0x%llx",
+			  btrfs_super_incompat_flags(sb),
+			  (unsigned long long)BTRFS_FEATURE_INCOMPAT_SUPP);
+		goto out;
+	}
+out:
+	if (ret < 0)
+		btrfs_err(fs_info,
+		"super block corruption detected before writing it to disk");
+	return ret;
+}
+
 int open_ctree(struct super_block *sb,
 	       struct btrfs_fs_devices *fs_devices,
 	       char *options)
@@ -2601,7 +2802,6 @@ int open_ctree(struct super_block *sb,
 	mutex_init(&fs_info->chunk_mutex);
 	mutex_init(&fs_info->transaction_kthread_mutex);
 	mutex_init(&fs_info->cleaner_mutex);
-	mutex_init(&fs_info->volume_mutex);
 	mutex_init(&fs_info->ro_block_group_mutex);
 	init_rwsem(&fs_info->commit_root_sem);
 	init_rwsem(&fs_info->cleanup_work_sem);
@@ -2668,7 +2868,7 @@ int open_ctree(struct super_block *sb,
 
 	memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE);
 
-	ret = btrfs_check_super_valid(fs_info);
+	ret = btrfs_validate_mount_super(fs_info);
 	if (ret) {
 		btrfs_err(fs_info, "superblock contains fatal errors");
 		err = -EINVAL;
@@ -3523,7 +3723,7 @@ int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
 	for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
 		if (raid_type == BTRFS_RAID_SINGLE)
 			continue;
-		if (!(flags & btrfs_raid_group[raid_type]))
+		if (!(flags & btrfs_raid_array[raid_type].bg_flag))
 			continue;
 		min_tolerated = min(min_tolerated,
 				    btrfs_raid_array[raid_type].
@@ -3603,6 +3803,14 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
 		flags = btrfs_super_flags(sb);
 		btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
 
+		ret = btrfs_validate_write_super(fs_info, sb);
+		if (ret < 0) {
+			mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+			btrfs_handle_fs_error(fs_info, -EUCLEAN,
+				"unexpected superblock corruption detected");
+			return -EUCLEAN;
+		}
+
 		ret = write_dev_supers(dev, sb, max_mirrors);
 		if (ret)
 			total_errors++;
@@ -3674,8 +3882,6 @@ static void free_fs_root(struct btrfs_root *root)
 {
 	iput(root->ino_cache_inode);
 	WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
-	btrfs_free_block_rsv(root->fs_info, root->orphan_block_rsv);
-	root->orphan_block_rsv = NULL;
 	if (root->anon_dev)
 		free_anon_bdev(root->anon_dev);
 	if (root->subv_writers)
@@ -3766,7 +3972,6 @@ int btrfs_commit_super(struct btrfs_fs_info *fs_info)
 
 void close_ctree(struct btrfs_fs_info *fs_info)
 {
-	struct btrfs_root *root = fs_info->tree_root;
 	int ret;
 
 	set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags);
@@ -3862,9 +4067,6 @@ void close_ctree(struct btrfs_fs_info *fs_info)
 	btrfs_free_stripe_hash_table(fs_info);
 	btrfs_free_ref_cache(fs_info);
 
-	__btrfs_free_block_rsv(root->orphan_block_rsv);
-	root->orphan_block_rsv = NULL;
-
 	while (!list_empty(&fs_info->pinned_chunks)) {
 		struct extent_map *em;
 
@@ -3975,155 +4177,6 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level,
 					      level, first_key);
 }
 
-static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info)
-{
-	struct btrfs_super_block *sb = fs_info->super_copy;
-	u64 nodesize = btrfs_super_nodesize(sb);
-	u64 sectorsize = btrfs_super_sectorsize(sb);
-	int ret = 0;
-
-	if (btrfs_super_magic(sb) != BTRFS_MAGIC) {
-		btrfs_err(fs_info, "no valid FS found");
-		ret = -EINVAL;
-	}
-	if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) {
-		btrfs_err(fs_info, "unrecognized or unsupported super flag: %llu",
-				btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP);
-		ret = -EINVAL;
-	}
-	if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) {
-		btrfs_err(fs_info, "tree_root level too big: %d >= %d",
-				btrfs_super_root_level(sb), BTRFS_MAX_LEVEL);
-		ret = -EINVAL;
-	}
-	if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) {
-		btrfs_err(fs_info, "chunk_root level too big: %d >= %d",
-				btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL);
-		ret = -EINVAL;
-	}
-	if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) {
-		btrfs_err(fs_info, "log_root level too big: %d >= %d",
-				btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL);
-		ret = -EINVAL;
-	}
-
-	/*
-	 * Check sectorsize and nodesize first, other check will need it.
-	 * Check all possible sectorsize(4K, 8K, 16K, 32K, 64K) here.
-	 */
-	if (!is_power_of_2(sectorsize) || sectorsize < 4096 ||
-	    sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) {
-		btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize);
-		ret = -EINVAL;
-	}
-	/* Only PAGE SIZE is supported yet */
-	if (sectorsize != PAGE_SIZE) {
-		btrfs_err(fs_info,
-			"sectorsize %llu not supported yet, only support %lu",
-			sectorsize, PAGE_SIZE);
-		ret = -EINVAL;
-	}
-	if (!is_power_of_2(nodesize) || nodesize < sectorsize ||
-	    nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) {
-		btrfs_err(fs_info, "invalid nodesize %llu", nodesize);
-		ret = -EINVAL;
-	}
-	if (nodesize != le32_to_cpu(sb->__unused_leafsize)) {
-		btrfs_err(fs_info, "invalid leafsize %u, should be %llu",
-			  le32_to_cpu(sb->__unused_leafsize), nodesize);
-		ret = -EINVAL;
-	}
-
-	/* Root alignment check */
-	if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) {
-		btrfs_warn(fs_info, "tree_root block unaligned: %llu",
-			   btrfs_super_root(sb));
-		ret = -EINVAL;
-	}
-	if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) {
-		btrfs_warn(fs_info, "chunk_root block unaligned: %llu",
-			   btrfs_super_chunk_root(sb));
-		ret = -EINVAL;
-	}
-	if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) {
-		btrfs_warn(fs_info, "log_root block unaligned: %llu",
-			   btrfs_super_log_root(sb));
-		ret = -EINVAL;
-	}
-
-	if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_FSID_SIZE) != 0) {
-		btrfs_err(fs_info,
-			   "dev_item UUID does not match fsid: %pU != %pU",
-			   fs_info->fsid, sb->dev_item.fsid);
-		ret = -EINVAL;
-	}
-
-	/*
-	 * Hint to catch really bogus numbers, bitflips or so, more exact checks are
-	 * done later
-	 */
-	if (btrfs_super_bytes_used(sb) < 6 * btrfs_super_nodesize(sb)) {
-		btrfs_err(fs_info, "bytes_used is too small %llu",
-			  btrfs_super_bytes_used(sb));
-		ret = -EINVAL;
-	}
-	if (!is_power_of_2(btrfs_super_stripesize(sb))) {
-		btrfs_err(fs_info, "invalid stripesize %u",
-			  btrfs_super_stripesize(sb));
-		ret = -EINVAL;
-	}
-	if (btrfs_super_num_devices(sb) > (1UL << 31))
-		btrfs_warn(fs_info, "suspicious number of devices: %llu",
-			   btrfs_super_num_devices(sb));
-	if (btrfs_super_num_devices(sb) == 0) {
-		btrfs_err(fs_info, "number of devices is 0");
-		ret = -EINVAL;
-	}
-
-	if (btrfs_super_bytenr(sb) != BTRFS_SUPER_INFO_OFFSET) {
-		btrfs_err(fs_info, "super offset mismatch %llu != %u",
-			  btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET);
-		ret = -EINVAL;
-	}
-
-	/*
-	 * Obvious sys_chunk_array corruptions, it must hold at least one key
-	 * and one chunk
-	 */
-	if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
-		btrfs_err(fs_info, "system chunk array too big %u > %u",
-			  btrfs_super_sys_array_size(sb),
-			  BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
-		ret = -EINVAL;
-	}
-	if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key)
-			+ sizeof(struct btrfs_chunk)) {
-		btrfs_err(fs_info, "system chunk array too small %u < %zu",
-			  btrfs_super_sys_array_size(sb),
-			  sizeof(struct btrfs_disk_key)
-			  + sizeof(struct btrfs_chunk));
-		ret = -EINVAL;
-	}
-
-	/*
-	 * The generation is a global counter, we'll trust it more than the others
-	 * but it's still possible that it's the one that's wrong.
-	 */
-	if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb))
-		btrfs_warn(fs_info,
-			"suspicious: generation < chunk_root_generation: %llu < %llu",
-			btrfs_super_generation(sb),
-			btrfs_super_chunk_root_generation(sb));
-	if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb)
-	    && btrfs_super_cache_generation(sb) != (u64)-1)
-		btrfs_warn(fs_info,
-			"suspicious: generation < cache_generation: %llu < %llu",
-			btrfs_super_generation(sb),
-			btrfs_super_cache_generation(sb));
-
-	return ret;
-}
-
 static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
 {
 	/* cleanup FS via transaction */
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 51b5e2da..3d9fe58 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -66,10 +66,8 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
 				      u64 flags, u64 owner, u64 offset,
 				      struct btrfs_key *ins, int ref_mod);
 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
-				     struct btrfs_fs_info *fs_info,
-				     u64 parent, u64 root_objectid,
-				     u64 flags, struct btrfs_disk_key *key,
-				     int level, struct btrfs_key *ins);
+				     struct btrfs_delayed_ref_node *node,
+				     struct btrfs_delayed_extent_op *extent_op);
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 			  struct btrfs_fs_info *fs_info, u64 flags,
 			  int force);
@@ -256,7 +254,7 @@ static int exclude_super_stripes(struct btrfs_fs_info *fs_info,
 	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
 		bytenr = btrfs_sb_offset(i);
 		ret = btrfs_rmap_block(fs_info, cache->key.objectid,
-				       bytenr, 0, &logical, &nr, &stripe_len);
+				       bytenr, &logical, &nr, &stripe_len);
 		if (ret)
 			return ret;
 
@@ -343,8 +341,9 @@ static void fragment_free_space(struct btrfs_block_group_cache *block_group)
  * since their free space will be released as soon as the transaction commits.
  */
 u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
-		       struct btrfs_fs_info *info, u64 start, u64 end)
+		       u64 start, u64 end)
 {
+	struct btrfs_fs_info *info = block_group->fs_info;
 	u64 extent_start, extent_end, size, total_added = 0;
 	int ret;
 
@@ -489,8 +488,7 @@ next:
 
 		if (key.type == BTRFS_EXTENT_ITEM_KEY ||
 		    key.type == BTRFS_METADATA_ITEM_KEY) {
-			total_found += add_new_free_space(block_group,
-							  fs_info, last,
+			total_found += add_new_free_space(block_group, last,
 							  key.objectid);
 			if (key.type == BTRFS_METADATA_ITEM_KEY)
 				last = key.objectid +
@@ -508,7 +506,7 @@ next:
 	}
 	ret = 0;
 
-	total_found += add_new_free_space(block_group, fs_info, last,
+	total_found += add_new_free_space(block_group, last,
 					  block_group->key.objectid +
 					  block_group->key.offset);
 	caching_ctl->progress = (u64)-1;
@@ -744,12 +742,12 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
 }
 
 static void add_pinned_bytes(struct btrfs_fs_info *fs_info, s64 num_bytes,
-			     u64 owner, u64 root_objectid)
+			     bool metadata, u64 root_objectid)
 {
 	struct btrfs_space_info *space_info;
 	u64 flags;
 
-	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+	if (metadata) {
 		if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
 			flags = BTRFS_BLOCK_GROUP_SYSTEM;
 		else
@@ -2200,8 +2198,11 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 						 &old_ref_mod, &new_ref_mod);
 	}
 
-	if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0)
-		add_pinned_bytes(fs_info, -num_bytes, owner, root_objectid);
+	if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0) {
+		bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID;
+
+		add_pinned_bytes(fs_info, -num_bytes, metadata, root_objectid);
+	}
 
 	return ret;
 }
@@ -2428,10 +2429,8 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
 {
 	int ret = 0;
 	struct btrfs_delayed_tree_ref *ref;
-	struct btrfs_key ins;
 	u64 parent = 0;
 	u64 ref_root = 0;
-	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
 
 	ref = btrfs_delayed_node_to_tree_ref(node);
 	trace_run_delayed_tree_ref(fs_info, node, ref, node->action);
@@ -2440,15 +2439,6 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
 		parent = ref->parent;
 	ref_root = ref->root;
 
-	ins.objectid = node->bytenr;
-	if (skinny_metadata) {
-		ins.offset = ref->level;
-		ins.type = BTRFS_METADATA_ITEM_KEY;
-	} else {
-		ins.offset = node->num_bytes;
-		ins.type = BTRFS_EXTENT_ITEM_KEY;
-	}
-
 	if (node->ref_mod != 1) {
 		btrfs_err(fs_info,
 	"btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
@@ -2458,11 +2448,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
 	}
 	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
 		BUG_ON(!extent_op || !extent_op->update_flags);
-		ret = alloc_reserved_tree_block(trans, fs_info,
-						parent, ref_root,
-						extent_op->flags_to_set,
-						&extent_op->key,
-						ref->level, &ins);
+		ret = alloc_reserved_tree_block(trans, node, extent_op);
 	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
 		ret = __btrfs_inc_extent_ref(trans, fs_info, node,
 					     parent, ref_root,
@@ -2594,8 +2580,8 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
 	delayed_refs->num_heads--;
 	rb_erase(&head->href_node, &delayed_refs->href_root);
 	RB_CLEAR_NODE(&head->href_node);
-	spin_unlock(&delayed_refs->lock);
 	spin_unlock(&head->lock);
+	spin_unlock(&delayed_refs->lock);
 	atomic_dec(&delayed_refs->num_entries);
 
 	trace_run_delayed_ref_head(fs_info, head, 0);
@@ -2700,17 +2686,12 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 		 * insert_inline_extent_backref()).
 		 */
 		spin_lock(&locked_ref->lock);
-		btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
-					 locked_ref);
+		btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
 
-		/*
-		 * locked_ref is the head node, so we have to go one
-		 * node back for any delayed ref updates
-		 */
 		ref = select_delayed_ref(locked_ref);
 
 		if (ref && ref->seq &&
-		    btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
+		    btrfs_check_delayed_seq(fs_info, ref->seq)) {
 			spin_unlock(&locked_ref->lock);
 			unselect_delayed_ref_head(delayed_refs, locked_ref);
 			locked_ref = NULL;
@@ -3291,7 +3272,7 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
 
 	path = btrfs_alloc_path();
 	if (!path)
-		return -ENOENT;
+		return -ENOMEM;
 
 	do {
 		ret = check_committed_ref(root, path, objectid,
@@ -4026,8 +4007,7 @@ static const char *alloc_name(u64 flags)
 	};
 }
 
-static int create_space_info(struct btrfs_fs_info *info, u64 flags,
-			     struct btrfs_space_info **new)
+static int create_space_info(struct btrfs_fs_info *info, u64 flags)
 {
 
 	struct btrfs_space_info *space_info;
@@ -4065,7 +4045,6 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags,
 		return ret;
 	}
 
-	*new = space_info;
 	list_add_rcu(&space_info->list, &info->space_info);
 	if (flags & BTRFS_BLOCK_GROUP_DATA)
 		info->data_sinfo = space_info;
@@ -4122,7 +4101,7 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
  * returns target flags in extended format or 0 if restripe for this
  * chunk_type is not in progress
  *
- * should be called with either volume_mutex or balance_lock held
+ * should be called with balance_lock held
  */
 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
 {
@@ -4178,7 +4157,7 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
 	/* First, mask out the RAID levels which aren't possible */
 	for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
 		if (num_devices >= btrfs_raid_array[raid_type].devs_min)
-			allowed |= btrfs_raid_group[raid_type];
+			allowed |= btrfs_raid_array[raid_type].bg_flag;
 	}
 	allowed &= flags;
 
@@ -4341,7 +4320,7 @@ commit_trans:
 			need_commit--;
 
 			if (need_commit > 0) {
-				btrfs_start_delalloc_roots(fs_info, 0, -1);
+				btrfs_start_delalloc_roots(fs_info, -1);
 				btrfs_wait_ordered_roots(fs_info, U64_MAX, 0,
 							 (u64)-1);
 			}
@@ -4678,12 +4657,14 @@ again:
 	trans->allocating_chunk = false;
 
 	spin_lock(&space_info->lock);
-	if (ret < 0 && ret != -ENOSPC)
-		goto out;
-	if (ret)
-		space_info->full = 1;
-	else
+	if (ret < 0) {
+		if (ret == -ENOSPC)
+			space_info->full = 1;
+		else
+			goto out;
+	} else {
 		ret = 1;
+	}
 
 	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
 out:
@@ -4792,7 +4773,7 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
 		 * the filesystem is readonly(all dirty pages are written to
 		 * the disk).
 		 */
-		btrfs_start_delalloc_roots(fs_info, 0, nr_items);
+		btrfs_start_delalloc_roots(fs_info, nr_items);
 		if (!current->journal_info)
 			btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
 	}
@@ -5949,44 +5930,6 @@ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
 	trans->chunk_bytes_reserved = 0;
 }
 
-/* Can only return 0 or -ENOSPC */
-int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
-				  struct btrfs_inode *inode)
-{
-	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
-	struct btrfs_root *root = inode->root;
-	/*
-	 * We always use trans->block_rsv here as we will have reserved space
-	 * for our orphan when starting the transaction, using get_block_rsv()
-	 * here will sometimes make us choose the wrong block rsv as we could be
-	 * doing a reloc inode for a non refcounted root.
-	 */
-	struct btrfs_block_rsv *src_rsv = trans->block_rsv;
-	struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
-
-	/*
-	 * We need to hold space in order to delete our orphan item once we've
-	 * added it, so this takes the reservation so we can release it later
-	 * when we are truly done with the orphan item.
-	 */
-	u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
-
-	trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode),
-			num_bytes, 1);
-	return btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
-}
-
-void btrfs_orphan_release_metadata(struct btrfs_inode *inode)
-{
-	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
-	struct btrfs_root *root = inode->root;
-	u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
-
-	trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode),
-			num_bytes, 0);
-	btrfs_block_rsv_release(fs_info, root->orphan_block_rsv, num_bytes);
-}
-
 /*
  * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
  * root: the root of the parent directory
@@ -6004,7 +5947,6 @@ void btrfs_orphan_release_metadata(struct btrfs_inode *inode)
 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
 				     struct btrfs_block_rsv *rsv,
 				     int items,
-				     u64 *qgroup_reserved,
 				     bool use_global_rsv)
 {
 	u64 num_bytes;
@@ -6022,8 +5964,6 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
 		num_bytes = 0;
 	}
 
-	*qgroup_reserved = num_bytes;
-
 	num_bytes = btrfs_calc_trans_metadata_size(fs_info, items);
 	rsv->space_info = __find_space_info(fs_info,
 					    BTRFS_BLOCK_GROUP_METADATA);
@@ -6033,8 +5973,8 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
 	if (ret == -ENOSPC && use_global_rsv)
 		ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1);
 
-	if (ret && *qgroup_reserved)
-		btrfs_qgroup_free_meta_prealloc(root, *qgroup_reserved);
+	if (ret && num_bytes)
+		btrfs_qgroup_free_meta_prealloc(root, num_bytes);
 
 	return ret;
 }
@@ -6354,6 +6294,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 			spin_lock(&info->unused_bgs_lock);
 			if (list_empty(&cache->bg_list)) {
 				btrfs_get_block_group(cache);
+				trace_btrfs_add_unused_block_group(cache);
 				list_add_tail(&cache->bg_list,
 					      &info->unused_bgs);
 			}
@@ -6511,6 +6452,7 @@ int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info,
 	struct btrfs_key key;
 	int found_type;
 	int i;
+	int ret = 0;
 
 	if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
 		return 0;
@@ -6527,10 +6469,12 @@ int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info,
 			continue;
 		key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
 		key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
-		__exclude_logged_extent(fs_info, key.objectid, key.offset);
+		ret = __exclude_logged_extent(fs_info, key.objectid, key.offset);
+		if (ret)
+			break;
 	}
 
-	return 0;
+	return ret;
 }
 
 static void
@@ -7122,7 +7066,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 			}
 		}
 
-		ret = add_to_free_space_tree(trans, info, bytenr, num_bytes);
+		ret = add_to_free_space_tree(trans, bytenr, num_bytes);
 		if (ret) {
 			btrfs_abort_transaction(trans, ret);
 			goto out;
@@ -7266,7 +7210,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
 	}
 out:
 	if (pin)
-		add_pinned_bytes(fs_info, buf->len, btrfs_header_level(buf),
+		add_pinned_bytes(fs_info, buf->len, true,
 				 root->root_key.objectid);
 
 	if (last_ref) {
@@ -7320,8 +7264,11 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
 						 &old_ref_mod, &new_ref_mod);
 	}
 
-	if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0)
-		add_pinned_bytes(fs_info, num_bytes, owner, root_objectid);
+	if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0) {
+		bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID;
+
+		add_pinned_bytes(fs_info, num_bytes, metadata, root_objectid);
+	}
 
 	return ret;
 }
@@ -7373,24 +7320,6 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
 	return ret;
 }
 
-static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = {
-	[BTRFS_RAID_RAID10]	= "raid10",
-	[BTRFS_RAID_RAID1]	= "raid1",
-	[BTRFS_RAID_DUP]	= "dup",
-	[BTRFS_RAID_RAID0]	= "raid0",
-	[BTRFS_RAID_SINGLE]	= "single",
-	[BTRFS_RAID_RAID5]	= "raid5",
-	[BTRFS_RAID_RAID6]	= "raid6",
-};
-
-static const char *get_raid_name(enum btrfs_raid_types type)
-{
-	if (type >= BTRFS_NR_RAID_TYPES)
-		return NULL;
-
-	return btrfs_raid_type_names[type];
-}
-
 enum btrfs_loop_type {
 	LOOP_CACHING_NOWAIT = 0,
 	LOOP_CACHING_WAIT = 1,
@@ -7662,7 +7591,7 @@ have_block_group:
 			if (offset) {
 				/* we have a block, we're done */
 				spin_unlock(&last_ptr->refill_lock);
-				trace_btrfs_reserve_extent_cluster(fs_info,
+				trace_btrfs_reserve_extent_cluster(
 						used_block_group,
 						search_start, num_bytes);
 				if (used_block_group != block_group) {
@@ -7735,7 +7664,7 @@ refill_cluster:
 				if (offset) {
 					/* we found one, proceed */
 					spin_unlock(&last_ptr->refill_lock);
-					trace_btrfs_reserve_extent_cluster(fs_info,
+					trace_btrfs_reserve_extent_cluster(
 						block_group, search_start,
 						num_bytes);
 					goto checks;
@@ -7835,8 +7764,7 @@ checks:
 		ins->objectid = search_start;
 		ins->offset = num_bytes;
 
-		trace_btrfs_reserve_extent(fs_info, block_group,
-					   search_start, num_bytes);
+		trace_btrfs_reserve_extent(block_group, search_start, num_bytes);
 		btrfs_release_block_group(block_group, delalloc);
 		break;
 loop:
@@ -8184,8 +8112,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 	btrfs_free_path(path);
 
-	ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
-					  ins->offset);
+	ret = remove_from_free_space_tree(trans, ins->objectid, ins->offset);
 	if (ret)
 		return ret;
 
@@ -8200,37 +8127,52 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
 }
 
 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
-				     struct btrfs_fs_info *fs_info,
-				     u64 parent, u64 root_objectid,
-				     u64 flags, struct btrfs_disk_key *key,
-				     int level, struct btrfs_key *ins)
+				     struct btrfs_delayed_ref_node *node,
+				     struct btrfs_delayed_extent_op *extent_op)
 {
+	struct btrfs_fs_info *fs_info = trans->fs_info;
 	int ret;
 	struct btrfs_extent_item *extent_item;
+	struct btrfs_key extent_key;
 	struct btrfs_tree_block_info *block_info;
 	struct btrfs_extent_inline_ref *iref;
 	struct btrfs_path *path;
 	struct extent_buffer *leaf;
+	struct btrfs_delayed_tree_ref *ref;
 	u32 size = sizeof(*extent_item) + sizeof(*iref);
-	u64 num_bytes = ins->offset;
+	u64 num_bytes;
+	u64 flags = extent_op->flags_to_set;
 	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
 
-	if (!skinny_metadata)
+	ref = btrfs_delayed_node_to_tree_ref(node);
+
+	extent_key.objectid = node->bytenr;
+	if (skinny_metadata) {
+		extent_key.offset = ref->level;
+		extent_key.type = BTRFS_METADATA_ITEM_KEY;
+		num_bytes = fs_info->nodesize;
+	} else {
+		extent_key.offset = node->num_bytes;
+		extent_key.type = BTRFS_EXTENT_ITEM_KEY;
 		size += sizeof(*block_info);
+		num_bytes = node->num_bytes;
+	}
 
 	path = btrfs_alloc_path();
 	if (!path) {
-		btrfs_free_and_pin_reserved_extent(fs_info, ins->objectid,
+		btrfs_free_and_pin_reserved_extent(fs_info,
+						   extent_key.objectid,
 						   fs_info->nodesize);
 		return -ENOMEM;
 	}
 
 	path->leave_spinning = 1;
 	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
-				      ins, size);
+				      &extent_key, size);
 	if (ret) {
 		btrfs_free_path(path);
-		btrfs_free_and_pin_reserved_extent(fs_info, ins->objectid,
+		btrfs_free_and_pin_reserved_extent(fs_info,
+						   extent_key.objectid,
 						   fs_info->nodesize);
 		return ret;
 	}
@@ -8245,42 +8187,41 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 
 	if (skinny_metadata) {
 		iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
-		num_bytes = fs_info->nodesize;
 	} else {
 		block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
-		btrfs_set_tree_block_key(leaf, block_info, key);
-		btrfs_set_tree_block_level(leaf, block_info, level);
+		btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
+		btrfs_set_tree_block_level(leaf, block_info, ref->level);
 		iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
 	}
 
-	if (parent > 0) {
+	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
 		BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
 		btrfs_set_extent_inline_ref_type(leaf, iref,
 						 BTRFS_SHARED_BLOCK_REF_KEY);
-		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
+		btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent);
 	} else {
 		btrfs_set_extent_inline_ref_type(leaf, iref,
 						 BTRFS_TREE_BLOCK_REF_KEY);
-		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
+		btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root);
 	}
 
 	btrfs_mark_buffer_dirty(leaf);
 	btrfs_free_path(path);
 
-	ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
+	ret = remove_from_free_space_tree(trans, extent_key.objectid,
 					  num_bytes);
 	if (ret)
 		return ret;
 
-	ret = update_block_group(trans, fs_info, ins->objectid,
+	ret = update_block_group(trans, fs_info, extent_key.objectid,
 				 fs_info->nodesize, 1);
 	if (ret) { /* -ENOENT, logic error */
 		btrfs_err(fs_info, "update block group failed for %llu %llu",
-			ins->objectid, ins->offset);
+			extent_key.objectid, extent_key.offset);
 		BUG();
 	}
 
-	trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid,
+	trace_btrfs_reserved_extent_alloc(fs_info, extent_key.objectid,
 					  fs_info->nodesize);
 	return ret;
 }
@@ -10173,8 +10114,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
 		} else if (btrfs_block_group_used(&cache->item) == 0) {
 			cache->last_byte_to_unpin = (u64)-1;
 			cache->cached = BTRFS_CACHE_FINISHED;
-			add_new_free_space(cache, info,
-					   found_key.objectid,
+			add_new_free_space(cache, found_key.objectid,
 					   found_key.objectid +
 					   found_key.offset);
 			free_excluded_extents(info, cache);
@@ -10204,6 +10144,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
 			/* Should always be true but just in case. */
 			if (list_empty(&cache->bg_list)) {
 				btrfs_get_block_group(cache);
+				trace_btrfs_add_unused_block_group(cache);
 				list_add_tail(&cache->bg_list,
 					      &info->unused_bgs);
 			}
@@ -10269,7 +10210,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
 					       key.offset);
 		if (ret)
 			btrfs_abort_transaction(trans, ret);
-		add_block_group_free_space(trans, fs_info, block_group);
+		add_block_group_free_space(trans, block_group);
 		/* already aborted the transaction if it failed. */
 next:
 		list_del_init(&block_group->bg_list);
@@ -10310,7 +10251,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 		return ret;
 	}
 
-	add_new_free_space(cache, fs_info, chunk_offset, chunk_offset + size);
+	add_new_free_space(cache, chunk_offset, chunk_offset + size);
 
 	free_excluded_extents(fs_info, cache);
 
@@ -10391,6 +10332,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	BUG_ON(!block_group);
 	BUG_ON(!block_group->ro);
 
+	trace_btrfs_remove_block_group(block_group);
 	/*
 	 * Free the reserved super bytes from this block group before
 	 * remove it.
@@ -10648,7 +10590,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 
 	mutex_unlock(&fs_info->chunk_mutex);
 
-	ret = remove_block_group_free_space(trans, fs_info, block_group);
+	ret = remove_block_group_free_space(trans, block_group);
 	if (ret)
 		goto out;
 
@@ -10755,6 +10697,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 			 * the ro check in case balance is currently acting on
 			 * this block group.
 			 */
+			trace_btrfs_skip_unused_block_group(block_group);
 			spin_unlock(&block_group->lock);
 			up_write(&space_info->groups_sem);
 			goto next;
@@ -10877,7 +10820,6 @@ next:
 
 int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
 {
-	struct btrfs_space_info *space_info;
 	struct btrfs_super_block *disk_super;
 	u64 features;
 	u64 flags;
@@ -10893,21 +10835,21 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
 		mixed = 1;
 
 	flags = BTRFS_BLOCK_GROUP_SYSTEM;
-	ret = create_space_info(fs_info, flags, &space_info);
+	ret = create_space_info(fs_info, flags);
 	if (ret)
 		goto out;
 
 	if (mixed) {
 		flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
-		ret = create_space_info(fs_info, flags, &space_info);
+		ret = create_space_info(fs_info, flags);
 	} else {
 		flags = BTRFS_BLOCK_GROUP_METADATA;
-		ret = create_space_info(fs_info, flags, &space_info);
+		ret = create_space_info(fs_info, flags);
 		if (ret)
 			goto out;
 
 		flags = BTRFS_BLOCK_GROUP_DATA;
-		ret = create_space_info(fs_info, flags, &space_info);
+		ret = create_space_info(fs_info, flags);
 	}
 out:
 	return ret;
@@ -11092,12 +11034,7 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
 void btrfs_end_write_no_snapshotting(struct btrfs_root *root)
 {
 	percpu_counter_dec(&root->subv_writers->counter);
-	/*
-	 * Make sure counter is updated before we wake up waiters.
-	 */
-	smp_mb();
-	if (waitqueue_active(&root->subv_writers->wait))
-		wake_up(&root->subv_writers->wait);
+	cond_wake_up(&root->subv_writers->wait);
 }
 
 int btrfs_start_write_no_snapshotting(struct btrfs_root *root)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index e99b329..51fc015 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -26,7 +26,7 @@
 
 static struct kmem_cache *extent_state_cache;
 static struct kmem_cache *extent_buffer_cache;
-static struct bio_set *btrfs_bioset;
+static struct bio_set btrfs_bioset;
 
 static inline bool extent_state_in_tree(const struct extent_state *state)
 {
@@ -162,20 +162,18 @@ int __init extent_io_init(void)
 	if (!extent_buffer_cache)
 		goto free_state_cache;
 
-	btrfs_bioset = bioset_create(BIO_POOL_SIZE,
-				     offsetof(struct btrfs_io_bio, bio),
-				     BIOSET_NEED_BVECS);
-	if (!btrfs_bioset)
+	if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
+			offsetof(struct btrfs_io_bio, bio),
+			BIOSET_NEED_BVECS))
 		goto free_buffer_cache;
 
-	if (bioset_integrity_create(btrfs_bioset, BIO_POOL_SIZE))
+	if (bioset_integrity_create(&btrfs_bioset, BIO_POOL_SIZE))
 		goto free_bioset;
 
 	return 0;
 
 free_bioset:
-	bioset_free(btrfs_bioset);
-	btrfs_bioset = NULL;
+	bioset_exit(&btrfs_bioset);
 
 free_buffer_cache:
 	kmem_cache_destroy(extent_buffer_cache);
@@ -198,8 +196,7 @@ void __cold extent_io_exit(void)
 	rcu_barrier();
 	kmem_cache_destroy(extent_state_cache);
 	kmem_cache_destroy(extent_buffer_cache);
-	if (btrfs_bioset)
-		bioset_free(btrfs_bioset);
+	bioset_exit(&btrfs_bioset);
 }
 
 void extent_io_tree_init(struct extent_io_tree *tree,
@@ -2679,7 +2676,7 @@ struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte)
 {
 	struct bio *bio;
 
-	bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, btrfs_bioset);
+	bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &btrfs_bioset);
 	bio_set_dev(bio, bdev);
 	bio->bi_iter.bi_sector = first_byte >> 9;
 	btrfs_io_bio_init(btrfs_io_bio(bio));
@@ -2692,7 +2689,7 @@ struct bio *btrfs_bio_clone(struct bio *bio)
 	struct bio *new;
 
 	/* Bio allocation backed by a bioset does not fail */
-	new = bio_clone_fast(bio, GFP_NOFS, btrfs_bioset);
+	new = bio_clone_fast(bio, GFP_NOFS, &btrfs_bioset);
 	btrfs_bio = btrfs_io_bio(new);
 	btrfs_io_bio_init(btrfs_bio);
 	btrfs_bio->iter = bio->bi_iter;
@@ -2704,7 +2701,7 @@ struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs)
 	struct bio *bio;
 
 	/* Bio allocation backed by a bioset does not fail */
-	bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, btrfs_bioset);
+	bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset);
 	btrfs_io_bio_init(btrfs_io_bio(bio));
 	return bio;
 }
@@ -2715,7 +2712,7 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
 	struct btrfs_io_bio *btrfs_bio;
 
 	/* this will never fail when it's backed by a bioset */
-	bio = bio_clone_fast(orig, GFP_NOFS, btrfs_bioset);
+	bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset);
 	ASSERT(bio);
 
 	btrfs_bio = btrfs_io_bio(bio);
@@ -4109,14 +4106,13 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
 	return ret;
 }
 
-int extent_writepages(struct extent_io_tree *tree,
-		      struct address_space *mapping,
+int extent_writepages(struct address_space *mapping,
 		      struct writeback_control *wbc)
 {
 	int ret = 0;
 	struct extent_page_data epd = {
 		.bio = NULL,
-		.tree = tree,
+		.tree = &BTRFS_I(mapping->host)->io_tree,
 		.extent_locked = 0,
 		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
 	};
@@ -4126,9 +4122,8 @@ int extent_writepages(struct extent_io_tree *tree,
 	return ret;
 }
 
-int extent_readpages(struct extent_io_tree *tree,
-		     struct address_space *mapping,
-		     struct list_head *pages, unsigned nr_pages)
+int extent_readpages(struct address_space *mapping, struct list_head *pages,
+		     unsigned nr_pages)
 {
 	struct bio *bio = NULL;
 	unsigned page_idx;
@@ -4136,6 +4131,7 @@ int extent_readpages(struct extent_io_tree *tree,
 	struct page *pagepool[16];
 	struct page *page;
 	struct extent_map *em_cached = NULL;
+	struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
 	int nr = 0;
 	u64 prev_em_start = (u64)-1;
 
@@ -4202,8 +4198,7 @@ int extent_invalidatepage(struct extent_io_tree *tree,
  * are locked or under IO and drops the related state bits if it is safe
  * to drop the page.
  */
-static int try_release_extent_state(struct extent_map_tree *map,
-				    struct extent_io_tree *tree,
+static int try_release_extent_state(struct extent_io_tree *tree,
 				    struct page *page, gfp_t mask)
 {
 	u64 start = page_offset(page);
@@ -4238,13 +4233,13 @@ static int try_release_extent_state(struct extent_map_tree *map,
  * in the range corresponding to the page, both state records and extent
  * map records are removed
  */
-int try_release_extent_mapping(struct extent_map_tree *map,
-			       struct extent_io_tree *tree, struct page *page,
-			       gfp_t mask)
+int try_release_extent_mapping(struct page *page, gfp_t mask)
 {
 	struct extent_map *em;
 	u64 start = page_offset(page);
 	u64 end = start + PAGE_SIZE - 1;
+	struct extent_io_tree *tree = &BTRFS_I(page->mapping->host)->io_tree;
+	struct extent_map_tree *map = &BTRFS_I(page->mapping->host)->extent_tree;
 
 	if (gfpflags_allow_blocking(mask) &&
 	    page->mapping->host->i_size > SZ_16M) {
@@ -4278,7 +4273,7 @@ int try_release_extent_mapping(struct extent_map_tree *map,
 			free_extent_map(em);
 		}
 	}
-	return try_release_extent_state(map, tree, page, mask);
+	return try_release_extent_state(tree, page, mask);
 }
 
 /*
@@ -5620,46 +5615,6 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
 	}
 }
 
-void le_bitmap_set(u8 *map, unsigned int start, int len)
-{
-	u8 *p = map + BIT_BYTE(start);
-	const unsigned int size = start + len;
-	int bits_to_set = BITS_PER_BYTE - (start % BITS_PER_BYTE);
-	u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(start);
-
-	while (len - bits_to_set >= 0) {
-		*p |= mask_to_set;
-		len -= bits_to_set;
-		bits_to_set = BITS_PER_BYTE;
-		mask_to_set = ~0;
-		p++;
-	}
-	if (len) {
-		mask_to_set &= BITMAP_LAST_BYTE_MASK(size);
-		*p |= mask_to_set;
-	}
-}
-
-void le_bitmap_clear(u8 *map, unsigned int start, int len)
-{
-	u8 *p = map + BIT_BYTE(start);
-	const unsigned int size = start + len;
-	int bits_to_clear = BITS_PER_BYTE - (start % BITS_PER_BYTE);
-	u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(start);
-
-	while (len - bits_to_clear >= 0) {
-		*p &= ~mask_to_clear;
-		len -= bits_to_clear;
-		bits_to_clear = BITS_PER_BYTE;
-		mask_to_clear = ~0;
-		p++;
-	}
-	if (len) {
-		mask_to_clear &= BITMAP_LAST_BYTE_MASK(size);
-		*p &= ~mask_to_clear;
-	}
-}
-
 /*
  * eb_bitmap_offset() - calculate the page and offset of the byte containing the
  * given bit number
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index a530096..0bfd4ae 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -79,14 +79,6 @@
 #define BITMAP_LAST_BYTE_MASK(nbits) \
 	(BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1)))
 
-static inline int le_test_bit(int nr, const u8 *addr)
-{
-	return 1U & (addr[BIT_BYTE(nr)] >> (nr & (BITS_PER_BYTE-1)));
-}
-
-void le_bitmap_set(u8 *map, unsigned int start, int len);
-void le_bitmap_clear(u8 *map, unsigned int start, int len);
-
 struct extent_state;
 struct btrfs_root;
 struct btrfs_inode;
@@ -278,9 +270,7 @@ typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode,
 					  int create);
 
 void extent_io_tree_init(struct extent_io_tree *tree, void *private_data);
-int try_release_extent_mapping(struct extent_map_tree *map,
-			       struct extent_io_tree *tree, struct page *page,
-			       gfp_t mask);
+int try_release_extent_mapping(struct page *page, gfp_t mask);
 int try_release_extent_buffer(struct page *page);
 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 		     struct extent_state **cached);
@@ -421,14 +411,12 @@ int extent_invalidatepage(struct extent_io_tree *tree,
 int extent_write_full_page(struct page *page, struct writeback_control *wbc);
 int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
 			      int mode);
-int extent_writepages(struct extent_io_tree *tree,
-		      struct address_space *mapping,
+int extent_writepages(struct address_space *mapping,
 		      struct writeback_control *wbc);
 int btree_write_cache_pages(struct address_space *mapping,
 			    struct writeback_control *wbc);
-int extent_readpages(struct extent_io_tree *tree,
-		     struct address_space *mapping,
-		     struct list_head *pages, unsigned nr_pages);
+int extent_readpages(struct address_space *mapping, struct list_head *pages,
+		     unsigned nr_pages);
 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		__u64 start, __u64 len);
 void set_page_extent_mapped(struct page *page);
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 1b8a078..6648d55 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -518,6 +518,7 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree,
 
 /**
  * btrfs_add_extent_mapping - add extent mapping into em_tree
+ * @fs_info - used for tracepoint
  * @em_tree - the extent tree into which we want to insert the extent mapping
  * @em_in   - extent we are inserting
  * @start   - start of the logical range btrfs_get_extent() is requesting
@@ -535,7 +536,8 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree,
  * Return 0 on success, otherwise -EEXIST.
  *
  */
-int btrfs_add_extent_mapping(struct extent_map_tree *em_tree,
+int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
+			     struct extent_map_tree *em_tree,
 			     struct extent_map **em_in, u64 start, u64 len)
 {
 	int ret;
@@ -553,7 +555,7 @@ int btrfs_add_extent_mapping(struct extent_map_tree *em_tree,
 
 		existing = search_extent_mapping(em_tree, start, len);
 
-		trace_btrfs_handle_em_exist(existing, em, start, len);
+		trace_btrfs_handle_em_exist(fs_info, existing, em, start, len);
 
 		/*
 		 * existing will always be non-NULL, since there must be
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 5fcb80a..25d985e 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -92,7 +92,8 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen
 void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em);
 struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
 					 u64 start, u64 len);
-int btrfs_add_extent_mapping(struct extent_map_tree *em_tree,
+int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
+			     struct extent_map_tree *em_tree,
 			     struct extent_map **em_in, u64 start, u64 len);
 
 #endif
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index e5b569b..d5f80cb 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -253,10 +253,8 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
 	truncate_pagecache(inode, 0);
 
 	/*
-	 * We don't need an orphan item because truncating the free space cache
-	 * will never be split across transactions.
-	 * We don't need to check for -EAGAIN because we're a free space
-	 * cache inode
+	 * We skip the throttling logic for free space cache inodes, so we don't
+	 * need to check for -EAGAIN.
 	 */
 	ret = btrfs_truncate_inode_items(trans, root, inode,
 					 0, BTRFS_EXTENT_DATA_KEY);
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index 32a0f6c..b5950aa 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -12,7 +12,6 @@
 #include "transaction.h"
 
 static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
-					struct btrfs_fs_info *fs_info,
 					struct btrfs_block_group_cache *block_group,
 					struct btrfs_path *path);
 
@@ -45,11 +44,10 @@ void set_free_space_tree_thresholds(struct btrfs_block_group_cache *cache)
 }
 
 static int add_new_free_space_info(struct btrfs_trans_handle *trans,
-				   struct btrfs_fs_info *fs_info,
 				   struct btrfs_block_group_cache *block_group,
 				   struct btrfs_path *path)
 {
-	struct btrfs_root *root = fs_info->free_space_root;
+	struct btrfs_root *root = trans->fs_info->free_space_root;
 	struct btrfs_free_space_info *info;
 	struct btrfs_key key;
 	struct extent_buffer *leaf;
@@ -138,10 +136,11 @@ static inline u32 free_space_bitmap_size(u64 size, u32 sectorsize)
 	return DIV_ROUND_UP((u32)div_u64(size, sectorsize), BITS_PER_BYTE);
 }
 
-static u8 *alloc_bitmap(u32 bitmap_size)
+static unsigned long *alloc_bitmap(u32 bitmap_size)
 {
-	u8 *ret;
+	unsigned long *ret;
 	unsigned int nofs_flag;
+	u32 bitmap_rounded_size = round_up(bitmap_size, sizeof(unsigned long));
 
 	/*
 	 * GFP_NOFS doesn't work with kvmalloc(), but we really can't recurse
@@ -152,21 +151,42 @@ static u8 *alloc_bitmap(u32 bitmap_size)
 	 * know that recursion is unsafe.
 	 */
 	nofs_flag = memalloc_nofs_save();
-	ret = kvzalloc(bitmap_size, GFP_KERNEL);
+	ret = kvzalloc(bitmap_rounded_size, GFP_KERNEL);
 	memalloc_nofs_restore(nofs_flag);
 	return ret;
 }
 
+static void le_bitmap_set(unsigned long *map, unsigned int start, int len)
+{
+	u8 *p = ((u8 *)map) + BIT_BYTE(start);
+	const unsigned int size = start + len;
+	int bits_to_set = BITS_PER_BYTE - (start % BITS_PER_BYTE);
+	u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(start);
+
+	while (len - bits_to_set >= 0) {
+		*p |= mask_to_set;
+		len -= bits_to_set;
+		bits_to_set = BITS_PER_BYTE;
+		mask_to_set = ~0;
+		p++;
+	}
+	if (len) {
+		mask_to_set &= BITMAP_LAST_BYTE_MASK(size);
+		*p |= mask_to_set;
+	}
+}
+
 int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
-				  struct btrfs_fs_info *fs_info,
 				  struct btrfs_block_group_cache *block_group,
 				  struct btrfs_path *path)
 {
+	struct btrfs_fs_info *fs_info = trans->fs_info;
 	struct btrfs_root *root = fs_info->free_space_root;
 	struct btrfs_free_space_info *info;
 	struct btrfs_key key, found_key;
 	struct extent_buffer *leaf;
-	u8 *bitmap, *bitmap_cursor;
+	unsigned long *bitmap;
+	char *bitmap_cursor;
 	u64 start, end;
 	u64 bitmap_range, i;
 	u32 bitmap_size, flags, expected_extent_count;
@@ -255,7 +275,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
 		goto out;
 	}
 
-	bitmap_cursor = bitmap;
+	bitmap_cursor = (char *)bitmap;
 	bitmap_range = fs_info->sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS;
 	i = start;
 	while (i < end) {
@@ -296,21 +316,18 @@ out:
 }
 
 int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
-				  struct btrfs_fs_info *fs_info,
 				  struct btrfs_block_group_cache *block_group,
 				  struct btrfs_path *path)
 {
+	struct btrfs_fs_info *fs_info = trans->fs_info;
 	struct btrfs_root *root = fs_info->free_space_root;
 	struct btrfs_free_space_info *info;
 	struct btrfs_key key, found_key;
 	struct extent_buffer *leaf;
-	u8 *bitmap;
+	unsigned long *bitmap;
 	u64 start, end;
-	/* Initialize to silence GCC. */
-	u64 extent_start = 0;
-	u64 offset;
 	u32 bitmap_size, flags, expected_extent_count;
-	int prev_bit = 0, bit, bitnr;
+	unsigned long nrbits, start_bit, end_bit;
 	u32 extent_count = 0;
 	int done = 0, nr;
 	int ret;
@@ -348,7 +365,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
 				break;
 			} else if (found_key.type == BTRFS_FREE_SPACE_BITMAP_KEY) {
 				unsigned long ptr;
-				u8 *bitmap_cursor;
+				char *bitmap_cursor;
 				u32 bitmap_pos, data_size;
 
 				ASSERT(found_key.objectid >= start);
@@ -358,7 +375,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
 				bitmap_pos = div_u64(found_key.objectid - start,
 						     fs_info->sectorsize *
 						     BITS_PER_BYTE);
-				bitmap_cursor = bitmap + bitmap_pos;
+				bitmap_cursor = ((char *)bitmap) + bitmap_pos;
 				data_size = free_space_bitmap_size(found_key.offset,
 								   fs_info->sectorsize);
 
@@ -392,32 +409,16 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(leaf);
 	btrfs_release_path(path);
 
-	offset = start;
-	bitnr = 0;
-	while (offset < end) {
-		bit = !!le_test_bit(bitnr, bitmap);
-		if (prev_bit == 0 && bit == 1) {
-			extent_start = offset;
-		} else if (prev_bit == 1 && bit == 0) {
-			key.objectid = extent_start;
-			key.type = BTRFS_FREE_SPACE_EXTENT_KEY;
-			key.offset = offset - extent_start;
-
-			ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
-			if (ret)
-				goto out;
-			btrfs_release_path(path);
+	nrbits = div_u64(block_group->key.offset, block_group->fs_info->sectorsize);
+	start_bit = find_next_bit_le(bitmap, nrbits, 0);
 
-			extent_count++;
-		}
-		prev_bit = bit;
-		offset += fs_info->sectorsize;
-		bitnr++;
-	}
-	if (prev_bit == 1) {
-		key.objectid = extent_start;
+	while (start_bit < nrbits) {
+		end_bit = find_next_zero_bit_le(bitmap, nrbits, start_bit);
+		ASSERT(start_bit < end_bit);
+
+		key.objectid = start + start_bit * block_group->fs_info->sectorsize;
 		key.type = BTRFS_FREE_SPACE_EXTENT_KEY;
-		key.offset = end - extent_start;
+		key.offset = (end_bit - start_bit) * block_group->fs_info->sectorsize;
 
 		ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
 		if (ret)
@@ -425,6 +426,8 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
 		btrfs_release_path(path);
 
 		extent_count++;
+
+		start_bit = find_next_bit_le(bitmap, nrbits, end_bit);
 	}
 
 	if (extent_count != expected_extent_count) {
@@ -446,7 +449,6 @@ out:
 }
 
 static int update_free_space_extent_count(struct btrfs_trans_handle *trans,
-					  struct btrfs_fs_info *fs_info,
 					  struct btrfs_block_group_cache *block_group,
 					  struct btrfs_path *path,
 					  int new_extents)
@@ -459,7 +461,8 @@ static int update_free_space_extent_count(struct btrfs_trans_handle *trans,
 	if (new_extents == 0)
 		return 0;
 
-	info = search_free_space_info(trans, fs_info, block_group, path, 1);
+	info = search_free_space_info(trans, trans->fs_info, block_group, path,
+				      1);
 	if (IS_ERR(info)) {
 		ret = PTR_ERR(info);
 		goto out;
@@ -474,12 +477,10 @@ static int update_free_space_extent_count(struct btrfs_trans_handle *trans,
 
 	if (!(flags & BTRFS_FREE_SPACE_USING_BITMAPS) &&
 	    extent_count > block_group->bitmap_high_thresh) {
-		ret = convert_free_space_to_bitmaps(trans, fs_info, block_group,
-						    path);
+		ret = convert_free_space_to_bitmaps(trans, block_group, path);
 	} else if ((flags & BTRFS_FREE_SPACE_USING_BITMAPS) &&
 		   extent_count < block_group->bitmap_low_thresh) {
-		ret = convert_free_space_to_extents(trans, fs_info, block_group,
-						    path);
+		ret = convert_free_space_to_extents(trans, block_group, path);
 	}
 
 out:
@@ -576,12 +577,11 @@ static int free_space_next_bitmap(struct btrfs_trans_handle *trans,
  * the bitmap.
  */
 static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
-				    struct btrfs_fs_info *fs_info,
 				    struct btrfs_block_group_cache *block_group,
 				    struct btrfs_path *path,
 				    u64 start, u64 size, int remove)
 {
-	struct btrfs_root *root = fs_info->free_space_root;
+	struct btrfs_root *root = block_group->fs_info->free_space_root;
 	struct btrfs_key key;
 	u64 end = start + size;
 	u64 cur_start, cur_size;
@@ -682,7 +682,7 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
 	}
 
 	btrfs_release_path(path);
-	ret = update_free_space_extent_count(trans, fs_info, block_group, path,
+	ret = update_free_space_extent_count(trans, block_group, path,
 					     new_extents);
 
 out:
@@ -690,12 +690,11 @@ out:
 }
 
 static int remove_free_space_extent(struct btrfs_trans_handle *trans,
-				    struct btrfs_fs_info *fs_info,
 				    struct btrfs_block_group_cache *block_group,
 				    struct btrfs_path *path,
 				    u64 start, u64 size)
 {
-	struct btrfs_root *root = fs_info->free_space_root;
+	struct btrfs_root *root = trans->fs_info->free_space_root;
 	struct btrfs_key key;
 	u64 found_start, found_end;
 	u64 end = start + size;
@@ -769,7 +768,7 @@ static int remove_free_space_extent(struct btrfs_trans_handle *trans,
 	}
 
 	btrfs_release_path(path);
-	ret = update_free_space_extent_count(trans, fs_info, block_group, path,
+	ret = update_free_space_extent_count(trans, block_group, path,
 					     new_extents);
 
 out:
@@ -777,7 +776,6 @@ out:
 }
 
 int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
-				  struct btrfs_fs_info *fs_info,
 				  struct btrfs_block_group_cache *block_group,
 				  struct btrfs_path *path, u64 start, u64 size)
 {
@@ -786,36 +784,35 @@ int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
 	int ret;
 
 	if (block_group->needs_free_space) {
-		ret = __add_block_group_free_space(trans, fs_info, block_group,
-						   path);
+		ret = __add_block_group_free_space(trans, block_group, path);
 		if (ret)
 			return ret;
 	}
 
-	info = search_free_space_info(NULL, fs_info, block_group, path, 0);
+	info = search_free_space_info(NULL, trans->fs_info, block_group, path,
+				      0);
 	if (IS_ERR(info))
 		return PTR_ERR(info);
 	flags = btrfs_free_space_flags(path->nodes[0], info);
 	btrfs_release_path(path);
 
 	if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) {
-		return modify_free_space_bitmap(trans, fs_info, block_group,
-						path, start, size, 1);
+		return modify_free_space_bitmap(trans, block_group, path,
+						start, size, 1);
 	} else {
-		return remove_free_space_extent(trans, fs_info, block_group,
-						path, start, size);
+		return remove_free_space_extent(trans, block_group, path,
+						start, size);
 	}
 }
 
 int remove_from_free_space_tree(struct btrfs_trans_handle *trans,
-				struct btrfs_fs_info *fs_info,
 				u64 start, u64 size)
 {
 	struct btrfs_block_group_cache *block_group;
 	struct btrfs_path *path;
 	int ret;
 
-	if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+	if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE))
 		return 0;
 
 	path = btrfs_alloc_path();
@@ -824,7 +821,7 @@ int remove_from_free_space_tree(struct btrfs_trans_handle *trans,
 		goto out;
 	}
 
-	block_group = btrfs_lookup_block_group(fs_info, start);
+	block_group = btrfs_lookup_block_group(trans->fs_info, start);
 	if (!block_group) {
 		ASSERT(0);
 		ret = -ENOENT;
@@ -832,8 +829,8 @@ int remove_from_free_space_tree(struct btrfs_trans_handle *trans,
 	}
 
 	mutex_lock(&block_group->free_space_lock);
-	ret = __remove_from_free_space_tree(trans, fs_info, block_group, path,
-					    start, size);
+	ret = __remove_from_free_space_tree(trans, block_group, path, start,
+					    size);
 	mutex_unlock(&block_group->free_space_lock);
 
 	btrfs_put_block_group(block_group);
@@ -845,12 +842,11 @@ out:
 }
 
 static int add_free_space_extent(struct btrfs_trans_handle *trans,
-				 struct btrfs_fs_info *fs_info,
 				 struct btrfs_block_group_cache *block_group,
 				 struct btrfs_path *path,
 				 u64 start, u64 size)
 {
-	struct btrfs_root *root = fs_info->free_space_root;
+	struct btrfs_root *root = trans->fs_info->free_space_root;
 	struct btrfs_key key, new_key;
 	u64 found_start, found_end;
 	u64 end = start + size;
@@ -965,7 +961,7 @@ insert:
 		goto out;
 
 	btrfs_release_path(path);
-	ret = update_free_space_extent_count(trans, fs_info, block_group, path,
+	ret = update_free_space_extent_count(trans, block_group, path,
 					     new_extents);
 
 out:
@@ -973,17 +969,16 @@ out:
 }
 
 int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
-			     struct btrfs_fs_info *fs_info,
 			     struct btrfs_block_group_cache *block_group,
 			     struct btrfs_path *path, u64 start, u64 size)
 {
+	struct btrfs_fs_info *fs_info = trans->fs_info;
 	struct btrfs_free_space_info *info;
 	u32 flags;
 	int ret;
 
 	if (block_group->needs_free_space) {
-		ret = __add_block_group_free_space(trans, fs_info, block_group,
-						   path);
+		ret = __add_block_group_free_space(trans, block_group, path);
 		if (ret)
 			return ret;
 	}
@@ -995,23 +990,22 @@ int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
 	btrfs_release_path(path);
 
 	if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) {
-		return modify_free_space_bitmap(trans, fs_info, block_group,
-						path, start, size, 0);
+		return modify_free_space_bitmap(trans, block_group, path,
+						start, size, 0);
 	} else {
-		return add_free_space_extent(trans, fs_info, block_group, path,
-					     start, size);
+		return add_free_space_extent(trans, block_group, path, start,
+					     size);
 	}
 }
 
 int add_to_free_space_tree(struct btrfs_trans_handle *trans,
-			   struct btrfs_fs_info *fs_info,
 			   u64 start, u64 size)
 {
 	struct btrfs_block_group_cache *block_group;
 	struct btrfs_path *path;
 	int ret;
 
-	if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+	if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE))
 		return 0;
 
 	path = btrfs_alloc_path();
@@ -1020,7 +1014,7 @@ int add_to_free_space_tree(struct btrfs_trans_handle *trans,
 		goto out;
 	}
 
-	block_group = btrfs_lookup_block_group(fs_info, start);
+	block_group = btrfs_lookup_block_group(trans->fs_info, start);
 	if (!block_group) {
 		ASSERT(0);
 		ret = -ENOENT;
@@ -1028,8 +1022,7 @@ int add_to_free_space_tree(struct btrfs_trans_handle *trans,
 	}
 
 	mutex_lock(&block_group->free_space_lock);
-	ret = __add_to_free_space_tree(trans, fs_info, block_group, path, start,
-				       size);
+	ret = __add_to_free_space_tree(trans, block_group, path, start, size);
 	mutex_unlock(&block_group->free_space_lock);
 
 	btrfs_put_block_group(block_group);
@@ -1046,10 +1039,9 @@ out:
  * through the normal add/remove hooks.
  */
 static int populate_free_space_tree(struct btrfs_trans_handle *trans,
-				    struct btrfs_fs_info *fs_info,
 				    struct btrfs_block_group_cache *block_group)
 {
-	struct btrfs_root *extent_root = fs_info->extent_root;
+	struct btrfs_root *extent_root = trans->fs_info->extent_root;
 	struct btrfs_path *path, *path2;
 	struct btrfs_key key;
 	u64 start, end;
@@ -1066,7 +1058,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 	}
 
-	ret = add_new_free_space_info(trans, fs_info, block_group, path2);
+	ret = add_new_free_space_info(trans, block_group, path2);
 	if (ret)
 		goto out;
 
@@ -1099,7 +1091,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
 				break;
 
 			if (start < key.objectid) {
-				ret = __add_to_free_space_tree(trans, fs_info,
+				ret = __add_to_free_space_tree(trans,
 							       block_group,
 							       path2, start,
 							       key.objectid -
@@ -1109,7 +1101,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
 			}
 			start = key.objectid;
 			if (key.type == BTRFS_METADATA_ITEM_KEY)
-				start += fs_info->nodesize;
+				start += trans->fs_info->nodesize;
 			else
 				start += key.offset;
 		} else if (key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
@@ -1124,8 +1116,8 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
 			break;
 	}
 	if (start < end) {
-		ret = __add_to_free_space_tree(trans, fs_info, block_group,
-					       path2, start, end - start);
+		ret = __add_to_free_space_tree(trans, block_group, path2,
+					       start, end - start);
 		if (ret)
 			goto out_locked;
 	}
@@ -1165,7 +1157,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
 	while (node) {
 		block_group = rb_entry(node, struct btrfs_block_group_cache,
 				       cache_node);
-		ret = populate_free_space_tree(trans, fs_info, block_group);
+		ret = populate_free_space_tree(trans, block_group);
 		if (ret)
 			goto abort;
 		node = rb_next(node);
@@ -1269,7 +1261,6 @@ abort:
 }
 
 static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
-					struct btrfs_fs_info *fs_info,
 					struct btrfs_block_group_cache *block_group,
 					struct btrfs_path *path)
 {
@@ -1277,19 +1268,19 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
 
 	block_group->needs_free_space = 0;
 
-	ret = add_new_free_space_info(trans, fs_info, block_group, path);
+	ret = add_new_free_space_info(trans, block_group, path);
 	if (ret)
 		return ret;
 
-	return __add_to_free_space_tree(trans, fs_info, block_group, path,
+	return __add_to_free_space_tree(trans, block_group, path,
 					block_group->key.objectid,
 					block_group->key.offset);
 }
 
 int add_block_group_free_space(struct btrfs_trans_handle *trans,
-			       struct btrfs_fs_info *fs_info,
 			       struct btrfs_block_group_cache *block_group)
 {
+	struct btrfs_fs_info *fs_info = trans->fs_info;
 	struct btrfs_path *path = NULL;
 	int ret = 0;
 
@@ -1306,7 +1297,7 @@ int add_block_group_free_space(struct btrfs_trans_handle *trans,
 		goto out;
 	}
 
-	ret = __add_block_group_free_space(trans, fs_info, block_group, path);
+	ret = __add_block_group_free_space(trans, block_group, path);
 
 out:
 	btrfs_free_path(path);
@@ -1317,10 +1308,9 @@ out:
 }
 
 int remove_block_group_free_space(struct btrfs_trans_handle *trans,
-				  struct btrfs_fs_info *fs_info,
 				  struct btrfs_block_group_cache *block_group)
 {
-	struct btrfs_root *root = fs_info->free_space_root;
+	struct btrfs_root *root = trans->fs_info->free_space_root;
 	struct btrfs_path *path;
 	struct btrfs_key key, found_key;
 	struct extent_buffer *leaf;
@@ -1328,7 +1318,7 @@ int remove_block_group_free_space(struct btrfs_trans_handle *trans,
 	int done = 0, nr;
 	int ret;
 
-	if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+	if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE))
 		return 0;
 
 	if (block_group->needs_free_space) {
@@ -1439,7 +1429,6 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
 				extent_start = offset;
 			} else if (prev_bit == 1 && bit == 0) {
 				total_found += add_new_free_space(block_group,
-								  fs_info,
 								  extent_start,
 								  offset);
 				if (total_found > CACHING_CTL_WAKE_UP) {
@@ -1453,8 +1442,8 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
 		}
 	}
 	if (prev_bit == 1) {
-		total_found += add_new_free_space(block_group, fs_info,
-						  extent_start, end);
+		total_found += add_new_free_space(block_group, extent_start,
+						  end);
 		extent_count++;
 	}
 
@@ -1511,8 +1500,7 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
 
 		caching_ctl->progress = key.objectid;
 
-		total_found += add_new_free_space(block_group, fs_info,
-						  key.objectid,
+		total_found += add_new_free_space(block_group, key.objectid,
 						  key.objectid + key.offset);
 		if (total_found > CACHING_CTL_WAKE_UP) {
 			total_found = 0;
diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h
index 874b4fe..3133651d 100644
--- a/fs/btrfs/free-space-tree.h
+++ b/fs/btrfs/free-space-tree.h
@@ -19,16 +19,12 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info);
 int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info);
 int load_free_space_tree(struct btrfs_caching_control *caching_ctl);
 int add_block_group_free_space(struct btrfs_trans_handle *trans,
-			       struct btrfs_fs_info *fs_info,
 			       struct btrfs_block_group_cache *block_group);
 int remove_block_group_free_space(struct btrfs_trans_handle *trans,
-				  struct btrfs_fs_info *fs_info,
 				  struct btrfs_block_group_cache *block_group);
 int add_to_free_space_tree(struct btrfs_trans_handle *trans,
-			   struct btrfs_fs_info *fs_info,
 			   u64 start, u64 size);
 int remove_from_free_space_tree(struct btrfs_trans_handle *trans,
-				struct btrfs_fs_info *fs_info,
 				u64 start, u64 size);
 
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
@@ -38,19 +34,15 @@ search_free_space_info(struct btrfs_trans_handle *trans,
 		       struct btrfs_block_group_cache *block_group,
 		       struct btrfs_path *path, int cow);
 int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
-			     struct btrfs_fs_info *fs_info,
 			     struct btrfs_block_group_cache *block_group,
 			     struct btrfs_path *path, u64 start, u64 size);
 int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
-				  struct btrfs_fs_info *fs_info,
 				  struct btrfs_block_group_cache *block_group,
 				  struct btrfs_path *path, u64 start, u64 size);
 int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
-				  struct btrfs_fs_info *fs_info,
 				  struct btrfs_block_group_cache *block_group,
 				  struct btrfs_path *path);
 int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
-				  struct btrfs_fs_info *fs_info,
 				  struct btrfs_block_group_cache *block_group,
 				  struct btrfs_path *path);
 int free_space_test_bit(struct btrfs_block_group_cache *block_group,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 0b86cf1..89b2082 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1018,8 +1018,10 @@ static noinline int cow_file_range(struct inode *inode,
 				  ram_size, /* ram_bytes */
 				  BTRFS_COMPRESS_NONE, /* compress_type */
 				  BTRFS_ORDERED_REGULAR /* type */);
-		if (IS_ERR(em))
+		if (IS_ERR(em)) {
+			ret = PTR_ERR(em);
 			goto out_reserve;
+		}
 		free_extent_map(em);
 
 		ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
@@ -1156,13 +1158,10 @@ static noinline void async_cow_submit(struct btrfs_work *work)
 	nr_pages = (async_cow->end - async_cow->start + PAGE_SIZE) >>
 		PAGE_SHIFT;
 
-	/*
-	 * atomic_sub_return implies a barrier for waitqueue_active
-	 */
+	/* atomic_sub_return implies a barrier */
 	if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
-	    5 * SZ_1M &&
-	    waitqueue_active(&fs_info->async_submit_wait))
-		wake_up(&fs_info->async_submit_wait);
+	    5 * SZ_1M)
+		cond_wake_up_nomb(&fs_info->async_submit_wait);
 
 	if (async_cow->inode)
 		submit_compressed_extents(async_cow->inode, async_cow);
@@ -1373,6 +1372,13 @@ next_slot:
 			    btrfs_file_extent_encryption(leaf, fi) ||
 			    btrfs_file_extent_other_encoding(leaf, fi))
 				goto out_check;
+			/*
+			 * Do the same check as in btrfs_cross_ref_exist but
+			 * without the unnecessary search.
+			 */
+			if (btrfs_file_extent_generation(leaf, fi) <=
+			    btrfs_root_last_snapshot(&root->root_item))
+				goto out_check;
 			if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
 				goto out_check;
 			if (btrfs_extent_readonly(fs_info, disk_bytenr))
@@ -1754,6 +1760,7 @@ void __btrfs_del_delalloc_inode(struct btrfs_root *root,
 			  &inode->runtime_flags);
 		root->nr_delalloc_inodes--;
 		if (!root->nr_delalloc_inodes) {
+			ASSERT(list_empty(&root->delalloc_inodes));
 			spin_lock(&fs_info->delalloc_root_lock);
 			BUG_ON(list_empty(&root->delalloc_root));
 			list_del_init(&root->delalloc_root);
@@ -3158,6 +3165,9 @@ out:
 	/* once for the tree */
 	btrfs_put_ordered_extent(ordered_extent);
 
+	/* Try to release some metadata so we don't get an OOM but don't wait */
+	btrfs_btree_balance_dirty_nodelay(fs_info);
+
 	return ret;
 }
 
@@ -3300,177 +3310,31 @@ void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
 }
 
 /*
- * This is called in transaction commit time. If there are no orphan
- * files in the subvolume, it removes orphan item and frees block_rsv
- * structure.
- */
-void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
-			      struct btrfs_root *root)
-{
-	struct btrfs_fs_info *fs_info = root->fs_info;
-	struct btrfs_block_rsv *block_rsv;
-	int ret;
-
-	if (atomic_read(&root->orphan_inodes) ||
-	    root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
-		return;
-
-	spin_lock(&root->orphan_lock);
-	if (atomic_read(&root->orphan_inodes)) {
-		spin_unlock(&root->orphan_lock);
-		return;
-	}
-
-	if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) {
-		spin_unlock(&root->orphan_lock);
-		return;
-	}
-
-	block_rsv = root->orphan_block_rsv;
-	root->orphan_block_rsv = NULL;
-	spin_unlock(&root->orphan_lock);
-
-	if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state) &&
-	    btrfs_root_refs(&root->root_item) > 0) {
-		ret = btrfs_del_orphan_item(trans, fs_info->tree_root,
-					    root->root_key.objectid);
-		if (ret)
-			btrfs_abort_transaction(trans, ret);
-		else
-			clear_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
-				  &root->state);
-	}
-
-	if (block_rsv) {
-		WARN_ON(block_rsv->size > 0);
-		btrfs_free_block_rsv(fs_info, block_rsv);
-	}
-}
-
-/*
- * This creates an orphan entry for the given inode in case something goes
- * wrong in the middle of an unlink/truncate.
- *
- * NOTE: caller of this function should reserve 5 units of metadata for
- *	 this function.
+ * This creates an orphan entry for the given inode in case something goes wrong
+ * in the middle of an unlink.
  */
 int btrfs_orphan_add(struct btrfs_trans_handle *trans,
-		struct btrfs_inode *inode)
+		     struct btrfs_inode *inode)
 {
-	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
-	struct btrfs_root *root = inode->root;
-	struct btrfs_block_rsv *block_rsv = NULL;
-	int reserve = 0;
-	bool insert = false;
 	int ret;
 
-	if (!root->orphan_block_rsv) {
-		block_rsv = btrfs_alloc_block_rsv(fs_info,
-						  BTRFS_BLOCK_RSV_TEMP);
-		if (!block_rsv)
-			return -ENOMEM;
-	}
-
-	if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
-			      &inode->runtime_flags))
-		insert = true;
-
-	if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
-			      &inode->runtime_flags))
-		reserve = 1;
-
-	spin_lock(&root->orphan_lock);
-	/* If someone has created ->orphan_block_rsv, be happy to use it. */
-	if (!root->orphan_block_rsv) {
-		root->orphan_block_rsv = block_rsv;
-	} else if (block_rsv) {
-		btrfs_free_block_rsv(fs_info, block_rsv);
-		block_rsv = NULL;
-	}
-
-	if (insert)
-		atomic_inc(&root->orphan_inodes);
-	spin_unlock(&root->orphan_lock);
-
-	/* grab metadata reservation from transaction handle */
-	if (reserve) {
-		ret = btrfs_orphan_reserve_metadata(trans, inode);
-		ASSERT(!ret);
-		if (ret) {
-			/*
-			 * dec doesn't need spin_lock as ->orphan_block_rsv
-			 * would be released only if ->orphan_inodes is
-			 * zero.
-			 */
-			atomic_dec(&root->orphan_inodes);
-			clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
-				  &inode->runtime_flags);
-			if (insert)
-				clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
-					  &inode->runtime_flags);
-			return ret;
-		}
-	}
-
-	/* insert an orphan item to track this unlinked/truncated file */
-	if (insert) {
-		ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
-		if (ret) {
-			if (reserve) {
-				clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
-					  &inode->runtime_flags);
-				btrfs_orphan_release_metadata(inode);
-			}
-			/*
-			 * btrfs_orphan_commit_root may race with us and set
-			 * ->orphan_block_rsv to zero, in order to avoid that,
-			 * decrease ->orphan_inodes after everything is done.
-			 */
-			atomic_dec(&root->orphan_inodes);
-			if (ret != -EEXIST) {
-				clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
-					  &inode->runtime_flags);
-				btrfs_abort_transaction(trans, ret);
-				return ret;
-			}
-		}
-		ret = 0;
+	ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode));
+	if (ret && ret != -EEXIST) {
+		btrfs_abort_transaction(trans, ret);
+		return ret;
 	}
 
 	return 0;
 }
 
 /*
- * We have done the truncate/delete so we can go ahead and remove the orphan
- * item for this particular inode.
+ * We have done the delete so we can go ahead and remove the orphan item for
+ * this particular inode.
  */
 static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
 			    struct btrfs_inode *inode)
 {
-	struct btrfs_root *root = inode->root;
-	int delete_item = 0;
-	int ret = 0;
-
-	if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
-			       &inode->runtime_flags))
-		delete_item = 1;
-
-	if (delete_item && trans)
-		ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode));
-
-	if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
-			       &inode->runtime_flags))
-		btrfs_orphan_release_metadata(inode);
-
-	/*
-	 * btrfs_orphan_commit_root may race with us and set ->orphan_block_rsv
-	 * to zero, in order to avoid that, decrease ->orphan_inodes after
-	 * everything is done.
-	 */
-	if (delete_item)
-		atomic_dec(&root->orphan_inodes);
-
-	return ret;
+	return btrfs_del_orphan_item(trans, inode->root, btrfs_ino(inode));
 }
 
 /*
@@ -3486,7 +3350,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 	struct btrfs_trans_handle *trans;
 	struct inode *inode;
 	u64 last_objectid = 0;
-	int ret = 0, nr_unlink = 0, nr_truncate = 0;
+	int ret = 0, nr_unlink = 0;
 
 	if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
 		return 0;
@@ -3586,12 +3450,31 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 				key.offset = found_key.objectid - 1;
 				continue;
 			}
+
 		}
+
 		/*
-		 * Inode is already gone but the orphan item is still there,
-		 * kill the orphan item.
+		 * If we have an inode with links, there are a couple of
+		 * possibilities. Old kernels (before v3.12) used to create an
+		 * orphan item for truncate indicating that there were possibly
+		 * extent items past i_size that needed to be deleted. In v3.12,
+		 * truncate was changed to update i_size in sync with the extent
+		 * items, but the (useless) orphan item was still created. Since
+		 * v4.18, we don't create the orphan item for truncate at all.
+		 *
+		 * So, this item could mean that we need to do a truncate, but
+		 * only if this filesystem was last used on a pre-v3.12 kernel
+		 * and was not cleanly unmounted. The odds of that are quite
+		 * slim, and it's a pain to do the truncate now, so just delete
+		 * the orphan item.
+		 *
+		 * It's also possible that this orphan item was supposed to be
+		 * deleted but wasn't. The inode number may have been reused,
+		 * but either way, we can delete the orphan item.
 		 */
-		if (ret == -ENOENT) {
+		if (ret == -ENOENT || inode->i_nlink) {
+			if (!ret)
+				iput(inode);
 			trans = btrfs_start_transaction(root, 1);
 			if (IS_ERR(trans)) {
 				ret = PTR_ERR(trans);
@@ -3607,42 +3490,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 			continue;
 		}
 
-		/*
-		 * add this inode to the orphan list so btrfs_orphan_del does
-		 * the proper thing when we hit it
-		 */
-		set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
-			&BTRFS_I(inode)->runtime_flags);
-		atomic_inc(&root->orphan_inodes);
-
-		/* if we have links, this was a truncate, lets do that */
-		if (inode->i_nlink) {
-			if (WARN_ON(!S_ISREG(inode->i_mode))) {
-				iput(inode);
-				continue;
-			}
-			nr_truncate++;
-
-			/* 1 for the orphan item deletion. */
-			trans = btrfs_start_transaction(root, 1);
-			if (IS_ERR(trans)) {
-				iput(inode);
-				ret = PTR_ERR(trans);
-				goto out;
-			}
-			ret = btrfs_orphan_add(trans, BTRFS_I(inode));
-			btrfs_end_transaction(trans);
-			if (ret) {
-				iput(inode);
-				goto out;
-			}
-
-			ret = btrfs_truncate(inode, false);
-			if (ret)
-				btrfs_orphan_del(NULL, BTRFS_I(inode));
-		} else {
-			nr_unlink++;
-		}
+		nr_unlink++;
 
 		/* this will do delete_inode and everything for us */
 		iput(inode);
@@ -3654,12 +3502,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 
 	root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
 
-	if (root->orphan_block_rsv)
-		btrfs_block_rsv_release(fs_info, root->orphan_block_rsv,
-					(u64)-1);
-
-	if (root->orphan_block_rsv ||
-	    test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
+	if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
 		trans = btrfs_join_transaction(root);
 		if (!IS_ERR(trans))
 			btrfs_end_transaction(trans);
@@ -3667,8 +3510,6 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 
 	if (nr_unlink)
 		btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink);
-	if (nr_truncate)
-		btrfs_debug(fs_info, "truncated %d orphans", nr_truncate);
 
 out:
 	if (ret)
@@ -3931,7 +3772,7 @@ cache_acl:
 		break;
 	}
 
-	btrfs_update_iflags(inode);
+	btrfs_sync_inode_flags_to_i_flags(inode);
 	return 0;
 
 make_bad:
@@ -4245,7 +4086,7 @@ out:
 	return ret;
 }
 
-int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
+static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root,
 			struct inode *dir, u64 objectid,
 			const char *name, int name_len)
@@ -4326,6 +4167,262 @@ out:
 	return ret;
 }
 
+/*
+ * Helper to check if the subvolume references other subvolumes or if it's
+ * default.
+ */
+static noinline int may_destroy_subvol(struct btrfs_root *root)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_path *path;
+	struct btrfs_dir_item *di;
+	struct btrfs_key key;
+	u64 dir_id;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	/* Make sure this root isn't set as the default subvol */
+	dir_id = btrfs_super_root_dir(fs_info->super_copy);
+	di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path,
+				   dir_id, "default", 7, 0);
+	if (di && !IS_ERR(di)) {
+		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
+		if (key.objectid == root->root_key.objectid) {
+			ret = -EPERM;
+			btrfs_err(fs_info,
+				  "deleting default subvolume %llu is not allowed",
+				  key.objectid);
+			goto out;
+		}
+		btrfs_release_path(path);
+	}
+
+	key.objectid = root->root_key.objectid;
+	key.type = BTRFS_ROOT_REF_KEY;
+	key.offset = (u64)-1;
+
+	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+	BUG_ON(ret == 0);
+
+	ret = 0;
+	if (path->slots[0] > 0) {
+		path->slots[0]--;
+		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+		if (key.objectid == root->root_key.objectid &&
+		    key.type == BTRFS_ROOT_REF_KEY)
+			ret = -ENOTEMPTY;
+	}
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/* Delete all dentries for inodes belonging to the root */
+static void btrfs_prune_dentries(struct btrfs_root *root)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct rb_node *node;
+	struct rb_node *prev;
+	struct btrfs_inode *entry;
+	struct inode *inode;
+	u64 objectid = 0;
+
+	if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+		WARN_ON(btrfs_root_refs(&root->root_item) != 0);
+
+	spin_lock(&root->inode_lock);
+again:
+	node = root->inode_tree.rb_node;
+	prev = NULL;
+	while (node) {
+		prev = node;
+		entry = rb_entry(node, struct btrfs_inode, rb_node);
+
+		if (objectid < btrfs_ino(BTRFS_I(&entry->vfs_inode)))
+			node = node->rb_left;
+		else if (objectid > btrfs_ino(BTRFS_I(&entry->vfs_inode)))
+			node = node->rb_right;
+		else
+			break;
+	}
+	if (!node) {
+		while (prev) {
+			entry = rb_entry(prev, struct btrfs_inode, rb_node);
+			if (objectid <= btrfs_ino(BTRFS_I(&entry->vfs_inode))) {
+				node = prev;
+				break;
+			}
+			prev = rb_next(prev);
+		}
+	}
+	while (node) {
+		entry = rb_entry(node, struct btrfs_inode, rb_node);
+		objectid = btrfs_ino(BTRFS_I(&entry->vfs_inode)) + 1;
+		inode = igrab(&entry->vfs_inode);
+		if (inode) {
+			spin_unlock(&root->inode_lock);
+			if (atomic_read(&inode->i_count) > 1)
+				d_prune_aliases(inode);
+			/*
+			 * btrfs_drop_inode will have it removed from the inode
+			 * cache when its usage count hits zero.
+			 */
+			iput(inode);
+			cond_resched();
+			spin_lock(&root->inode_lock);
+			goto again;
+		}
+
+		if (cond_resched_lock(&root->inode_lock))
+			goto again;
+
+		node = rb_next(node);
+	}
+	spin_unlock(&root->inode_lock);
+}
+
+int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
+{
+	struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
+	struct btrfs_root *root = BTRFS_I(dir)->root;
+	struct inode *inode = d_inode(dentry);
+	struct btrfs_root *dest = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_block_rsv block_rsv;
+	u64 root_flags;
+	int ret;
+	int err;
+
+	/*
+	 * Don't allow to delete a subvolume with send in progress. This is
+	 * inside the inode lock so the error handling that has to drop the bit
+	 * again is not run concurrently.
+	 */
+	spin_lock(&dest->root_item_lock);
+	root_flags = btrfs_root_flags(&dest->root_item);
+	if (dest->send_in_progress == 0) {
+		btrfs_set_root_flags(&dest->root_item,
+				root_flags | BTRFS_ROOT_SUBVOL_DEAD);
+		spin_unlock(&dest->root_item_lock);
+	} else {
+		spin_unlock(&dest->root_item_lock);
+		btrfs_warn(fs_info,
+			   "attempt to delete subvolume %llu during send",
+			   dest->root_key.objectid);
+		return -EPERM;
+	}
+
+	down_write(&fs_info->subvol_sem);
+
+	err = may_destroy_subvol(dest);
+	if (err)
+		goto out_up_write;
+
+	btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
+	/*
+	 * One for dir inode,
+	 * two for dir entries,
+	 * two for root ref/backref.
+	 */
+	err = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
+	if (err)
+		goto out_up_write;
+
+	trans = btrfs_start_transaction(root, 0);
+	if (IS_ERR(trans)) {
+		err = PTR_ERR(trans);
+		goto out_release;
+	}
+	trans->block_rsv = &block_rsv;
+	trans->bytes_reserved = block_rsv.size;
+
+	btrfs_record_snapshot_destroy(trans, BTRFS_I(dir));
+
+	ret = btrfs_unlink_subvol(trans, root, dir,
+				dest->root_key.objectid,
+				dentry->d_name.name,
+				dentry->d_name.len);
+	if (ret) {
+		err = ret;
+		btrfs_abort_transaction(trans, ret);
+		goto out_end_trans;
+	}
+
+	btrfs_record_root_in_trans(trans, dest);
+
+	memset(&dest->root_item.drop_progress, 0,
+		sizeof(dest->root_item.drop_progress));
+	dest->root_item.drop_level = 0;
+	btrfs_set_root_refs(&dest->root_item, 0);
+
+	if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
+		ret = btrfs_insert_orphan_item(trans,
+					fs_info->tree_root,
+					dest->root_key.objectid);
+		if (ret) {
+			btrfs_abort_transaction(trans, ret);
+			err = ret;
+			goto out_end_trans;
+		}
+	}
+
+	ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid,
+				  BTRFS_UUID_KEY_SUBVOL,
+				  dest->root_key.objectid);
+	if (ret && ret != -ENOENT) {
+		btrfs_abort_transaction(trans, ret);
+		err = ret;
+		goto out_end_trans;
+	}
+	if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) {
+		ret = btrfs_uuid_tree_remove(trans,
+					  dest->root_item.received_uuid,
+					  BTRFS_UUID_KEY_RECEIVED_SUBVOL,
+					  dest->root_key.objectid);
+		if (ret && ret != -ENOENT) {
+			btrfs_abort_transaction(trans, ret);
+			err = ret;
+			goto out_end_trans;
+		}
+	}
+
+out_end_trans:
+	trans->block_rsv = NULL;
+	trans->bytes_reserved = 0;
+	ret = btrfs_end_transaction(trans);
+	if (ret && !err)
+		err = ret;
+	inode->i_flags |= S_DEAD;
+out_release:
+	btrfs_subvolume_release_metadata(fs_info, &block_rsv);
+out_up_write:
+	up_write(&fs_info->subvol_sem);
+	if (err) {
+		spin_lock(&dest->root_item_lock);
+		root_flags = btrfs_root_flags(&dest->root_item);
+		btrfs_set_root_flags(&dest->root_item,
+				root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
+		spin_unlock(&dest->root_item_lock);
+	} else {
+		d_invalidate(dentry);
+		btrfs_prune_dentries(dest);
+		ASSERT(dest->send_in_progress == 0);
+
+		/* the last ref */
+		if (dest->ino_cache_inode) {
+			iput(dest->ino_cache_inode);
+			dest->ino_cache_inode = NULL;
+		}
+	}
+
+	return err;
+}
+
 static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 {
 	struct inode *inode = d_inode(dentry);
@@ -4337,7 +4434,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
 		return -ENOTEMPTY;
 	if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID)
-		return -EPERM;
+		return btrfs_delete_subvolume(dir, dentry);
 
 	trans = __unlink_start_trans(dir);
 	if (IS_ERR(trans))
@@ -4449,7 +4546,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 	int pending_del_slot = 0;
 	int extent_type = -1;
 	int ret;
-	int err = 0;
 	u64 ino = btrfs_ino(BTRFS_I(inode));
 	u64 bytes_deleted = 0;
 	bool be_nice = false;
@@ -4501,22 +4597,19 @@ search_again:
 	 * up a huge file in a single leaf.  Most of the time that
 	 * bytes_deleted is > 0, it will be huge by the time we get here
 	 */
-	if (be_nice && bytes_deleted > SZ_32M) {
-		if (btrfs_should_end_transaction(trans)) {
-			err = -EAGAIN;
-			goto error;
-		}
+	if (be_nice && bytes_deleted > SZ_32M &&
+	    btrfs_should_end_transaction(trans)) {
+		ret = -EAGAIN;
+		goto out;
 	}
 
-
 	path->leave_spinning = 1;
 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
-	if (ret < 0) {
-		err = ret;
+	if (ret < 0)
 		goto out;
-	}
 
 	if (ret > 0) {
+		ret = 0;
 		/* there are no items in the tree for us to truncate, we're
 		 * done
 		 */
@@ -4627,7 +4720,7 @@ search_again:
 				 * We have to bail so the last_size is set to
 				 * just before this extent.
 				 */
-				err = NEED_TRUNCATE_BLOCK;
+				ret = NEED_TRUNCATE_BLOCK;
 				break;
 			}
 
@@ -4666,7 +4759,10 @@ delete:
 						extent_num_bytes, 0,
 						btrfs_header_owner(leaf),
 						ino, extent_offset);
-			BUG_ON(ret);
+			if (ret) {
+				btrfs_abort_transaction(trans, ret);
+				break;
+			}
 			if (btrfs_should_throttle_delayed_refs(trans, fs_info))
 				btrfs_async_run_delayed_refs(fs_info,
 					trans->delayed_ref_updates * 2,
@@ -4694,7 +4790,7 @@ delete:
 						pending_del_nr);
 				if (ret) {
 					btrfs_abort_transaction(trans, ret);
-					goto error;
+					break;
 				}
 				pending_del_nr = 0;
 			}
@@ -4705,8 +4801,8 @@ delete:
 					trans->delayed_ref_updates = 0;
 					ret = btrfs_run_delayed_refs(trans,
 								   updates * 2);
-					if (ret && !err)
-						err = ret;
+					if (ret)
+						break;
 				}
 			}
 			/*
@@ -4714,8 +4810,8 @@ delete:
 			 * and let the transaction restart
 			 */
 			if (should_end) {
-				err = -EAGAIN;
-				goto error;
+				ret = -EAGAIN;
+				break;
 			}
 			goto search_again;
 		} else {
@@ -4723,32 +4819,37 @@ delete:
 		}
 	}
 out:
-	if (pending_del_nr) {
-		ret = btrfs_del_items(trans, root, path, pending_del_slot,
+	if (ret >= 0 && pending_del_nr) {
+		int err;
+
+		err = btrfs_del_items(trans, root, path, pending_del_slot,
 				      pending_del_nr);
-		if (ret)
-			btrfs_abort_transaction(trans, ret);
+		if (err) {
+			btrfs_abort_transaction(trans, err);
+			ret = err;
+		}
 	}
-error:
 	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
 		ASSERT(last_size >= new_size);
-		if (!err && last_size > new_size)
+		if (!ret && last_size > new_size)
 			last_size = new_size;
 		btrfs_ordered_update_i_size(inode, last_size, NULL);
 	}
 
 	btrfs_free_path(path);
 
-	if (be_nice && bytes_deleted > SZ_32M) {
+	if (be_nice && bytes_deleted > SZ_32M && (ret >= 0 || ret == -EAGAIN)) {
 		unsigned long updates = trans->delayed_ref_updates;
+		int err;
+
 		if (updates) {
 			trans->delayed_ref_updates = 0;
-			ret = btrfs_run_delayed_refs(trans, updates * 2);
-			if (ret && !err)
-				err = ret;
+			err = btrfs_run_delayed_refs(trans, updates * 2);
+			if (err)
+				ret = err;
 		}
 	}
-	return err;
+	return ret;
 }
 
 /*
@@ -5090,30 +5191,6 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
 			set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
 				&BTRFS_I(inode)->runtime_flags);
 
-		/*
-		 * 1 for the orphan item we're going to add
-		 * 1 for the orphan item deletion.
-		 */
-		trans = btrfs_start_transaction(root, 2);
-		if (IS_ERR(trans))
-			return PTR_ERR(trans);
-
-		/*
-		 * We need to do this in case we fail at _any_ point during the
-		 * actual truncate.  Once we do the truncate_setsize we could
-		 * invalidate pages which forces any outstanding ordered io to
-		 * be instantly completed which will give us extents that need
-		 * to be truncated.  If we fail to get an orphan inode down we
-		 * could have left over extents that were never meant to live,
-		 * so we need to guarantee from this point on that everything
-		 * will be consistent.
-		 */
-		ret = btrfs_orphan_add(trans, BTRFS_I(inode));
-		btrfs_end_transaction(trans);
-		if (ret)
-			return ret;
-
-		/* we don't support swapfiles, so vmtruncate shouldn't fail */
 		truncate_setsize(inode, newsize);
 
 		/* Disable nonlocked read DIO to avoid the end less truncate */
@@ -5125,29 +5202,16 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
 		if (ret && inode->i_nlink) {
 			int err;
 
-			/* To get a stable disk_i_size */
-			err = btrfs_wait_ordered_range(inode, 0, (u64)-1);
-			if (err) {
-				btrfs_orphan_del(NULL, BTRFS_I(inode));
-				return err;
-			}
-
 			/*
-			 * failed to truncate, disk_i_size is only adjusted down
-			 * as we remove extents, so it should represent the true
-			 * size of the inode, so reset the in memory size and
-			 * delete our orphan entry.
+			 * Truncate failed, so fix up the in-memory size. We
+			 * adjusted disk_i_size down as we removed extents, so
+			 * wait for disk_i_size to be stable and then update the
+			 * in-memory size to match.
 			 */
-			trans = btrfs_join_transaction(root);
-			if (IS_ERR(trans)) {
-				btrfs_orphan_del(NULL, BTRFS_I(inode));
-				return ret;
-			}
-			i_size_write(inode, BTRFS_I(inode)->disk_i_size);
-			err = btrfs_orphan_del(trans, BTRFS_I(inode));
+			err = btrfs_wait_ordered_range(inode, 0, (u64)-1);
 			if (err)
-				btrfs_abort_transaction(trans, err);
-			btrfs_end_transaction(trans);
+				return err;
+			i_size_write(inode, BTRFS_I(inode)->disk_i_size);
 		}
 	}
 
@@ -5277,13 +5341,52 @@ static void evict_inode_truncate_pages(struct inode *inode)
 	spin_unlock(&io_tree->lock);
 }
 
+static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
+							struct btrfs_block_rsv *rsv,
+							u64 min_size)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+	int failures = 0;
+
+	for (;;) {
+		struct btrfs_trans_handle *trans;
+		int ret;
+
+		ret = btrfs_block_rsv_refill(root, rsv, min_size,
+					     BTRFS_RESERVE_FLUSH_LIMIT);
+
+		if (ret && ++failures > 2) {
+			btrfs_warn(fs_info,
+				   "could not allocate space for a delete; will truncate on mount");
+			return ERR_PTR(-ENOSPC);
+		}
+
+		trans = btrfs_join_transaction(root);
+		if (IS_ERR(trans) || !ret)
+			return trans;
+
+		/*
+		 * Try to steal from the global reserve if there is space for
+		 * it.
+		 */
+		if (!btrfs_check_space_for_delayed_refs(trans, fs_info) &&
+		    !btrfs_block_rsv_migrate(global_rsv, rsv, min_size, 0))
+			return trans;
+
+		/* If not, commit and try again. */
+		ret = btrfs_commit_transaction(trans);
+		if (ret)
+			return ERR_PTR(ret);
+	}
+}
+
 void btrfs_evict_inode(struct inode *inode)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct btrfs_block_rsv *rsv, *global_rsv;
-	int steal_from_global = 0;
+	struct btrfs_block_rsv *rsv;
 	u64 min_size;
 	int ret;
 
@@ -5304,21 +5407,16 @@ void btrfs_evict_inode(struct inode *inode)
 	     btrfs_is_free_space_inode(BTRFS_I(inode))))
 		goto no_delete;
 
-	if (is_bad_inode(inode)) {
-		btrfs_orphan_del(NULL, BTRFS_I(inode));
+	if (is_bad_inode(inode))
 		goto no_delete;
-	}
 	/* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
 	if (!special_file(inode->i_mode))
 		btrfs_wait_ordered_range(inode, 0, (u64)-1);
 
 	btrfs_free_io_failure_record(BTRFS_I(inode), 0, (u64)-1);
 
-	if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
-		BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
-				 &BTRFS_I(inode)->runtime_flags));
+	if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
 		goto no_delete;
-	}
 
 	if (inode->i_nlink > 0) {
 		BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
@@ -5327,130 +5425,63 @@ void btrfs_evict_inode(struct inode *inode)
 	}
 
 	ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode));
-	if (ret) {
-		btrfs_orphan_del(NULL, BTRFS_I(inode));
+	if (ret)
 		goto no_delete;
-	}
 
 	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
-	if (!rsv) {
-		btrfs_orphan_del(NULL, BTRFS_I(inode));
+	if (!rsv)
 		goto no_delete;
-	}
 	rsv->size = min_size;
 	rsv->failfast = 1;
-	global_rsv = &fs_info->global_block_rsv;
 
 	btrfs_i_size_write(BTRFS_I(inode), 0);
 
-	/*
-	 * This is a bit simpler than btrfs_truncate since we've already
-	 * reserved our space for our orphan item in the unlink, so we just
-	 * need to reserve some slack space in case we add bytes and update
-	 * inode item when doing the truncate.
-	 */
 	while (1) {
-		ret = btrfs_block_rsv_refill(root, rsv, min_size,
-					     BTRFS_RESERVE_FLUSH_LIMIT);
-
-		/*
-		 * Try and steal from the global reserve since we will
-		 * likely not use this space anyway, we want to try as
-		 * hard as possible to get this to work.
-		 */
-		if (ret)
-			steal_from_global++;
-		else
-			steal_from_global = 0;
-		ret = 0;
-
-		/*
-		 * steal_from_global == 0: we reserved stuff, hooray!
-		 * steal_from_global == 1: we didn't reserve stuff, boo!
-		 * steal_from_global == 2: we've committed, still not a lot of
-		 * room but maybe we'll have room in the global reserve this
-		 * time.
-		 * steal_from_global == 3: abandon all hope!
-		 */
-		if (steal_from_global > 2) {
-			btrfs_warn(fs_info,
-				   "Could not get space for a delete, will truncate on mount %d",
-				   ret);
-			btrfs_orphan_del(NULL, BTRFS_I(inode));
-			btrfs_free_block_rsv(fs_info, rsv);
-			goto no_delete;
-		}
-
-		trans = btrfs_join_transaction(root);
-		if (IS_ERR(trans)) {
-			btrfs_orphan_del(NULL, BTRFS_I(inode));
-			btrfs_free_block_rsv(fs_info, rsv);
-			goto no_delete;
-		}
-
-		/*
-		 * We can't just steal from the global reserve, we need to make
-		 * sure there is room to do it, if not we need to commit and try
-		 * again.
-		 */
-		if (steal_from_global) {
-			if (!btrfs_check_space_for_delayed_refs(trans, fs_info))
-				ret = btrfs_block_rsv_migrate(global_rsv, rsv,
-							      min_size, 0);
-			else
-				ret = -ENOSPC;
-		}
-
-		/*
-		 * Couldn't steal from the global reserve, we have too much
-		 * pending stuff built up, commit the transaction and try it
-		 * again.
-		 */
-		if (ret) {
-			ret = btrfs_commit_transaction(trans);
-			if (ret) {
-				btrfs_orphan_del(NULL, BTRFS_I(inode));
-				btrfs_free_block_rsv(fs_info, rsv);
-				goto no_delete;
-			}
-			continue;
-		} else {
-			steal_from_global = 0;
-		}
+		trans = evict_refill_and_join(root, rsv, min_size);
+		if (IS_ERR(trans))
+			goto free_rsv;
 
 		trans->block_rsv = rsv;
 
 		ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
-		if (ret != -ENOSPC && ret != -EAGAIN)
-			break;
-
 		trans->block_rsv = &fs_info->trans_block_rsv;
 		btrfs_end_transaction(trans);
-		trans = NULL;
 		btrfs_btree_balance_dirty(fs_info);
+		if (ret && ret != -ENOSPC && ret != -EAGAIN)
+			goto free_rsv;
+		else if (!ret)
+			break;
 	}
 
-	btrfs_free_block_rsv(fs_info, rsv);
-
 	/*
-	 * Errors here aren't a big deal, it just means we leave orphan items
-	 * in the tree.  They will be cleaned up on the next mount.
+	 * Errors here aren't a big deal, it just means we leave orphan items in
+	 * the tree. They will be cleaned up on the next mount. If the inode
+	 * number gets reused, cleanup deletes the orphan item without doing
+	 * anything, and unlink reuses the existing orphan item.
+	 *
+	 * If it turns out that we are dropping too many of these, we might want
+	 * to add a mechanism for retrying these after a commit.
 	 */
-	if (ret == 0) {
-		trans->block_rsv = root->orphan_block_rsv;
+	trans = evict_refill_and_join(root, rsv, min_size);
+	if (!IS_ERR(trans)) {
+		trans->block_rsv = rsv;
 		btrfs_orphan_del(trans, BTRFS_I(inode));
-	} else {
-		btrfs_orphan_del(NULL, BTRFS_I(inode));
+		trans->block_rsv = &fs_info->trans_block_rsv;
+		btrfs_end_transaction(trans);
 	}
 
-	trans->block_rsv = &fs_info->trans_block_rsv;
 	if (!(root == fs_info->tree_root ||
 	      root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
 		btrfs_return_ino(root, btrfs_ino(BTRFS_I(inode)));
 
-	btrfs_end_transaction(trans);
-	btrfs_btree_balance_dirty(fs_info);
+free_rsv:
+	btrfs_free_block_rsv(fs_info, rsv);
 no_delete:
+	/*
+	 * If we didn't successfully delete, the orphan item will still be in
+	 * the tree and we'll retry on the next mount. Again, we might also want
+	 * to retry these periodically in the future.
+	 */
 	btrfs_remove_delayed_node(BTRFS_I(inode));
 	clear_inode(inode);
 }
@@ -5626,69 +5657,6 @@ static void inode_tree_del(struct inode *inode)
 	}
 }
 
-void btrfs_invalidate_inodes(struct btrfs_root *root)
-{
-	struct btrfs_fs_info *fs_info = root->fs_info;
-	struct rb_node *node;
-	struct rb_node *prev;
-	struct btrfs_inode *entry;
-	struct inode *inode;
-	u64 objectid = 0;
-
-	if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
-		WARN_ON(btrfs_root_refs(&root->root_item) != 0);
-
-	spin_lock(&root->inode_lock);
-again:
-	node = root->inode_tree.rb_node;
-	prev = NULL;
-	while (node) {
-		prev = node;
-		entry = rb_entry(node, struct btrfs_inode, rb_node);
-
-		if (objectid < btrfs_ino(BTRFS_I(&entry->vfs_inode)))
-			node = node->rb_left;
-		else if (objectid > btrfs_ino(BTRFS_I(&entry->vfs_inode)))
-			node = node->rb_right;
-		else
-			break;
-	}
-	if (!node) {
-		while (prev) {
-			entry = rb_entry(prev, struct btrfs_inode, rb_node);
-			if (objectid <= btrfs_ino(BTRFS_I(&entry->vfs_inode))) {
-				node = prev;
-				break;
-			}
-			prev = rb_next(prev);
-		}
-	}
-	while (node) {
-		entry = rb_entry(node, struct btrfs_inode, rb_node);
-		objectid = btrfs_ino(BTRFS_I(&entry->vfs_inode)) + 1;
-		inode = igrab(&entry->vfs_inode);
-		if (inode) {
-			spin_unlock(&root->inode_lock);
-			if (atomic_read(&inode->i_count) > 1)
-				d_prune_aliases(inode);
-			/*
-			 * btrfs_drop_inode will have it removed from
-			 * the inode cache when its usage count
-			 * hits zero.
-			 */
-			iput(inode);
-			cond_resched();
-			spin_lock(&root->inode_lock);
-			goto again;
-		}
-
-		if (cond_resched_lock(&root->inode_lock))
-			goto again;
-
-		node = rb_next(node);
-	}
-	spin_unlock(&root->inode_lock);
-}
 
 static int btrfs_init_locked_inode(struct inode *inode, void *p)
 {
@@ -5850,11 +5818,6 @@ static int btrfs_dentry_delete(const struct dentry *dentry)
 	return 0;
 }
 
-static void btrfs_dentry_release(struct dentry *dentry)
-{
-	kfree(dentry->d_fsdata);
-}
-
 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
 				   unsigned int flags)
 {
@@ -6270,7 +6233,7 @@ static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
 			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
 	}
 
-	btrfs_update_iflags(inode);
+	btrfs_sync_inode_flags_to_i_flags(inode);
 }
 
 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
@@ -6705,8 +6668,9 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	 * 2 items for inode and inode ref
 	 * 2 items for dir items
 	 * 1 item for parent inode
+	 * 1 item for orphan item deletion if O_TMPFILE
 	 */
-	trans = btrfs_start_transaction(root, 5);
+	trans = btrfs_start_transaction(root, inode->i_nlink ? 5 : 6);
 	if (IS_ERR(trans)) {
 		err = PTR_ERR(trans);
 		trans = NULL;
@@ -7083,7 +7047,7 @@ insert:
 
 	err = 0;
 	write_lock(&em_tree->lock);
-	err = btrfs_add_extent_mapping(em_tree, &em, start, len);
+	err = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
 	write_unlock(&em_tree->lock);
 out:
 
@@ -7368,6 +7332,14 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
 	    btrfs_file_extent_other_encoding(leaf, fi))
 		goto out;
 
+	/*
+	 * Do the same check as in btrfs_cross_ref_exist but without the
+	 * unnecessary search.
+	 */
+	if (btrfs_file_extent_generation(leaf, fi) <=
+	    btrfs_root_last_snapshot(&root->root_item))
+		goto out;
+
 	backref_offset = btrfs_file_extent_offset(leaf, fi);
 
 	if (orig_start) {
@@ -7568,6 +7540,125 @@ static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
 	return em;
 }
 
+
+static int btrfs_get_blocks_direct_read(struct extent_map *em,
+					struct buffer_head *bh_result,
+					struct inode *inode,
+					u64 start, u64 len)
+{
+	if (em->block_start == EXTENT_MAP_HOLE ||
+			test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+		return -ENOENT;
+
+	len = min(len, em->len - (start - em->start));
+
+	bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
+		inode->i_blkbits;
+	bh_result->b_size = len;
+	bh_result->b_bdev = em->bdev;
+	set_buffer_mapped(bh_result);
+
+	return 0;
+}
+
+static int btrfs_get_blocks_direct_write(struct extent_map **map,
+					 struct buffer_head *bh_result,
+					 struct inode *inode,
+					 struct btrfs_dio_data *dio_data,
+					 u64 start, u64 len)
+{
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+	struct extent_map *em = *map;
+	int ret = 0;
+
+	/*
+	 * We don't allocate a new extent in the following cases
+	 *
+	 * 1) The inode is marked as NODATACOW. In this case we'll just use the
+	 * existing extent.
+	 * 2) The extent is marked as PREALLOC. We're good to go here and can
+	 * just use the extent.
+	 *
+	 */
+	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
+	    ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
+	     em->block_start != EXTENT_MAP_HOLE)) {
+		int type;
+		u64 block_start, orig_start, orig_block_len, ram_bytes;
+
+		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+			type = BTRFS_ORDERED_PREALLOC;
+		else
+			type = BTRFS_ORDERED_NOCOW;
+		len = min(len, em->len - (start - em->start));
+		block_start = em->block_start + (start - em->start);
+
+		if (can_nocow_extent(inode, start, &len, &orig_start,
+				     &orig_block_len, &ram_bytes) == 1 &&
+		    btrfs_inc_nocow_writers(fs_info, block_start)) {
+			struct extent_map *em2;
+
+			em2 = btrfs_create_dio_extent(inode, start, len,
+						      orig_start, block_start,
+						      len, orig_block_len,
+						      ram_bytes, type);
+			btrfs_dec_nocow_writers(fs_info, block_start);
+			if (type == BTRFS_ORDERED_PREALLOC) {
+				free_extent_map(em);
+				*map = em = em2;
+			}
+
+			if (em2 && IS_ERR(em2)) {
+				ret = PTR_ERR(em2);
+				goto out;
+			}
+			/*
+			 * For inode marked NODATACOW or extent marked PREALLOC,
+			 * use the existing or preallocated extent, so does not
+			 * need to adjust btrfs_space_info's bytes_may_use.
+			 */
+			btrfs_free_reserved_data_space_noquota(inode, start,
+							       len);
+			goto skip_cow;
+		}
+	}
+
+	/* this will cow the extent */
+	len = bh_result->b_size;
+	free_extent_map(em);
+	*map = em = btrfs_new_extent_direct(inode, start, len);
+	if (IS_ERR(em)) {
+		ret = PTR_ERR(em);
+		goto out;
+	}
+
+	len = min(len, em->len - (start - em->start));
+
+skip_cow:
+	bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
+		inode->i_blkbits;
+	bh_result->b_size = len;
+	bh_result->b_bdev = em->bdev;
+	set_buffer_mapped(bh_result);
+
+	if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+		set_buffer_new(bh_result);
+
+	/*
+	 * Need to update the i_size under the extent lock so buffered
+	 * readers will get the updated i_size when we unlock.
+	 */
+	if (!dio_data->overwrite && start + len > i_size_read(inode))
+		i_size_write(inode, start + len);
+
+	WARN_ON(dio_data->reserve < len);
+	dio_data->reserve -= len;
+	dio_data->unsubmitted_oe_range_end = start + len;
+	current->journal_info = dio_data;
+out:
+	return ret;
+}
+
 static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 				   struct buffer_head *bh_result, int create)
 {
@@ -7636,116 +7727,36 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 		goto unlock_err;
 	}
 
-	/* Just a good old fashioned hole, return */
-	if (!create && (em->block_start == EXTENT_MAP_HOLE ||
-			test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
-		free_extent_map(em);
-		goto unlock_err;
-	}
-
-	/*
-	 * We don't allocate a new extent in the following cases
-	 *
-	 * 1) The inode is marked as NODATACOW.  In this case we'll just use the
-	 * existing extent.
-	 * 2) The extent is marked as PREALLOC.  We're good to go here and can
-	 * just use the extent.
-	 *
-	 */
-	if (!create) {
-		len = min(len, em->len - (start - em->start));
-		lockstart = start + len;
-		goto unlock;
-	}
-
-	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
-	    ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
-	     em->block_start != EXTENT_MAP_HOLE)) {
-		int type;
-		u64 block_start, orig_start, orig_block_len, ram_bytes;
-
-		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
-			type = BTRFS_ORDERED_PREALLOC;
-		else
-			type = BTRFS_ORDERED_NOCOW;
-		len = min(len, em->len - (start - em->start));
-		block_start = em->block_start + (start - em->start);
-
-		if (can_nocow_extent(inode, start, &len, &orig_start,
-				     &orig_block_len, &ram_bytes) == 1 &&
-		    btrfs_inc_nocow_writers(fs_info, block_start)) {
-			struct extent_map *em2;
-
-			em2 = btrfs_create_dio_extent(inode, start, len,
-						      orig_start, block_start,
-						      len, orig_block_len,
-						      ram_bytes, type);
-			btrfs_dec_nocow_writers(fs_info, block_start);
-			if (type == BTRFS_ORDERED_PREALLOC) {
-				free_extent_map(em);
-				em = em2;
-			}
-			if (em2 && IS_ERR(em2)) {
-				ret = PTR_ERR(em2);
-				goto unlock_err;
-			}
-			/*
-			 * For inode marked NODATACOW or extent marked PREALLOC,
-			 * use the existing or preallocated extent, so does not
-			 * need to adjust btrfs_space_info's bytes_may_use.
-			 */
-			btrfs_free_reserved_data_space_noquota(inode,
-					start, len);
-			goto unlock;
-		}
-	}
-
-	/*
-	 * this will cow the extent, reset the len in case we changed
-	 * it above
-	 */
-	len = bh_result->b_size;
-	free_extent_map(em);
-	em = btrfs_new_extent_direct(inode, start, len);
-	if (IS_ERR(em)) {
-		ret = PTR_ERR(em);
-		goto unlock_err;
-	}
-	len = min(len, em->len - (start - em->start));
-unlock:
-	bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
-		inode->i_blkbits;
-	bh_result->b_size = len;
-	bh_result->b_bdev = em->bdev;
-	set_buffer_mapped(bh_result);
 	if (create) {
-		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
-			set_buffer_new(bh_result);
+		ret = btrfs_get_blocks_direct_write(&em, bh_result, inode,
+						    dio_data, start, len);
+		if (ret < 0)
+			goto unlock_err;
 
+		/* clear and unlock the entire range */
+		clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+				 unlock_bits, 1, 0, &cached_state);
+	} else {
+		ret = btrfs_get_blocks_direct_read(em, bh_result, inode,
+						   start, len);
+		/* Can be negative only if we read from a hole */
+		if (ret < 0) {
+			ret = 0;
+			free_extent_map(em);
+			goto unlock_err;
+		}
 		/*
-		 * Need to update the i_size under the extent lock so buffered
-		 * readers will get the updated i_size when we unlock.
+		 * We need to unlock only the end area that we aren't using.
+		 * The rest is going to be unlocked by the endio routine.
 		 */
-		if (!dio_data->overwrite && start + len > i_size_read(inode))
-			i_size_write(inode, start + len);
-
-		WARN_ON(dio_data->reserve < len);
-		dio_data->reserve -= len;
-		dio_data->unsubmitted_oe_range_end = start + len;
-		current->journal_info = dio_data;
-	}
-
-	/*
-	 * In the case of write we need to clear and unlock the entire range,
-	 * in the case of read we need to unlock only the end area that we
-	 * aren't using if there is any left over space.
-	 */
-	if (lockstart < lockend) {
-		clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
-				 lockend, unlock_bits, 1, 0,
-				 &cached_state);
-	} else {
-		free_extent_state(cached_state);
+		lockstart = start + bh_result->b_size;
+		if (lockstart < lockend) {
+			clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+					 lockend, unlock_bits, 1, 0,
+					 &cached_state);
+		} else {
+			free_extent_state(cached_state);
+		}
 	}
 
 	free_extent_map(em);
@@ -8131,7 +8142,6 @@ static void __endio_write_update_ordered(struct inode *inode,
 	u64 ordered_offset = offset;
 	u64 ordered_bytes = bytes;
 	u64 last_offset;
-	int ret;
 
 	if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
 		wq = fs_info->endio_freespace_worker;
@@ -8141,32 +8151,31 @@ static void __endio_write_update_ordered(struct inode *inode,
 		func = btrfs_endio_write_helper;
 	}
 
-again:
-	last_offset = ordered_offset;
-	ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
-						   &ordered_offset,
-						   ordered_bytes,
-						   uptodate);
-	if (!ret)
-		goto out_test;
-
-	btrfs_init_work(&ordered->work, func, finish_ordered_fn, NULL, NULL);
-	btrfs_queue_work(wq, &ordered->work);
-out_test:
-	/*
-	 * If btrfs_dec_test_ordered_pending does not find any ordered extent
-	 * in the range, we can exit.
-	 */
-	if (ordered_offset == last_offset)
-		return;
-	/*
-	 * our bio might span multiple ordered extents.  If we haven't
-	 * completed the accounting for the whole dio, go back and try again
-	 */
-	if (ordered_offset < offset + bytes) {
-		ordered_bytes = offset + bytes - ordered_offset;
-		ordered = NULL;
-		goto again;
+	while (ordered_offset < offset + bytes) {
+		last_offset = ordered_offset;
+		if (btrfs_dec_test_first_ordered_pending(inode, &ordered,
+							   &ordered_offset,
+							   ordered_bytes,
+							   uptodate)) {
+			btrfs_init_work(&ordered->work, func,
+					finish_ordered_fn,
+					NULL, NULL);
+			btrfs_queue_work(wq, &ordered->work);
+		}
+		/*
+		 * If btrfs_dec_test_ordered_pending does not find any ordered
+		 * extent in the range, we can exit.
+		 */
+		if (ordered_offset == last_offset)
+			return;
+		/*
+		 * Our bio might span multiple ordered extents. In this case
+		 * we keep goin until we have accounted the whole dio.
+		 */
+		if (ordered_offset < offset + bytes) {
+			ordered_bytes = offset + bytes - ordered_offset;
+			ordered = NULL;
+		}
 	}
 }
 
@@ -8705,29 +8714,19 @@ static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
 static int btrfs_writepages(struct address_space *mapping,
 			    struct writeback_control *wbc)
 {
-	struct extent_io_tree *tree;
-
-	tree = &BTRFS_I(mapping->host)->io_tree;
-	return extent_writepages(tree, mapping, wbc);
+	return extent_writepages(mapping, wbc);
 }
 
 static int
 btrfs_readpages(struct file *file, struct address_space *mapping,
 		struct list_head *pages, unsigned nr_pages)
 {
-	struct extent_io_tree *tree;
-	tree = &BTRFS_I(mapping->host)->io_tree;
-	return extent_readpages(tree, mapping, pages, nr_pages);
+	return extent_readpages(mapping, pages, nr_pages);
 }
+
 static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
 {
-	struct extent_io_tree *tree;
-	struct extent_map_tree *map;
-	int ret;
-
-	tree = &BTRFS_I(page->mapping->host)->io_tree;
-	map = &BTRFS_I(page->mapping->host)->extent_tree;
-	ret = try_release_extent_mapping(map, tree, page, gfp_flags);
+	int ret = try_release_extent_mapping(page, gfp_flags);
 	if (ret == 1) {
 		ClearPagePrivate(page);
 		set_page_private(page, 0);
@@ -8868,8 +8867,8 @@ again:
  *
  * We are not allowed to take the i_mutex here so we have to play games to
  * protect against truncate races as the page could now be beyond EOF.  Because
- * vmtruncate() writes the inode size before removing pages, once we have the
- * page lock we can determine safely if the page is beyond EOF. If it is not
+ * truncate_setsize() writes the inode size before removing pages, once we have
+ * the page lock we can determine safely if the page is beyond EOF. If it is not
  * beyond EOF, then the page is guaranteed safe against truncation until we
  * unlock the page.
  */
@@ -9031,8 +9030,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_block_rsv *rsv;
-	int ret = 0;
-	int err = 0;
+	int ret;
 	struct btrfs_trans_handle *trans;
 	u64 mask = fs_info->sectorsize - 1;
 	u64 min_size = btrfs_calc_trunc_metadata_size(fs_info, 1);
@@ -9045,39 +9043,31 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
 	}
 
 	/*
-	 * Yes ladies and gentlemen, this is indeed ugly.  The fact is we have
-	 * 3 things going on here
+	 * Yes ladies and gentlemen, this is indeed ugly.  We have a couple of
+	 * things going on here:
 	 *
-	 * 1) We need to reserve space for our orphan item and the space to
-	 * delete our orphan item.  Lord knows we don't want to have a dangling
-	 * orphan item because we didn't reserve space to remove it.
+	 * 1) We need to reserve space to update our inode.
 	 *
-	 * 2) We need to reserve space to update our inode.
-	 *
-	 * 3) We need to have something to cache all the space that is going to
+	 * 2) We need to have something to cache all the space that is going to
 	 * be free'd up by the truncate operation, but also have some slack
 	 * space reserved in case it uses space during the truncate (thank you
 	 * very much snapshotting).
 	 *
-	 * And we need these to all be separate.  The fact is we can use a lot of
+	 * And we need these to be separate.  The fact is we can use a lot of
 	 * space doing the truncate, and we have no earthly idea how much space
 	 * we will use, so we need the truncate reservation to be separate so it
-	 * doesn't end up using space reserved for updating the inode or
-	 * removing the orphan item.  We also need to be able to stop the
-	 * transaction and start a new one, which means we need to be able to
-	 * update the inode several times, and we have no idea of knowing how
-	 * many times that will be, so we can't just reserve 1 item for the
-	 * entirety of the operation, so that has to be done separately as well.
-	 * Then there is the orphan item, which does indeed need to be held on
-	 * to for the whole operation, and we need nobody to touch this reserved
-	 * space except the orphan code.
+	 * doesn't end up using space reserved for updating the inode.  We also
+	 * need to be able to stop the transaction and start a new one, which
+	 * means we need to be able to update the inode several times, and we
+	 * have no idea of knowing how many times that will be, so we can't just
+	 * reserve 1 item for the entirety of the operation, so that has to be
+	 * done separately as well.
 	 *
 	 * So that leaves us with
 	 *
-	 * 1) root->orphan_block_rsv - for the orphan deletion.
-	 * 2) rsv - for the truncate reservation, which we will steal from the
+	 * 1) rsv - for the truncate reservation, which we will steal from the
 	 * transaction reservation.
-	 * 3) fs_info->trans_block_rsv - this will have 1 items worth left for
+	 * 2) fs_info->trans_block_rsv - this will have 1 items worth left for
 	 * updating the inode.
 	 */
 	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
@@ -9092,7 +9082,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
 	 */
 	trans = btrfs_start_transaction(root, 2);
 	if (IS_ERR(trans)) {
-		err = PTR_ERR(trans);
+		ret = PTR_ERR(trans);
 		goto out;
 	}
 
@@ -9116,24 +9106,19 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
 						 inode->i_size,
 						 BTRFS_EXTENT_DATA_KEY);
 		trans->block_rsv = &fs_info->trans_block_rsv;
-		if (ret != -ENOSPC && ret != -EAGAIN) {
-			if (ret < 0)
-				err = ret;
+		if (ret != -ENOSPC && ret != -EAGAIN)
 			break;
-		}
 
 		ret = btrfs_update_inode(trans, root, inode);
-		if (ret) {
-			err = ret;
+		if (ret)
 			break;
-		}
 
 		btrfs_end_transaction(trans);
 		btrfs_btree_balance_dirty(fs_info);
 
 		trans = btrfs_start_transaction(root, 2);
 		if (IS_ERR(trans)) {
-			ret = err = PTR_ERR(trans);
+			ret = PTR_ERR(trans);
 			trans = NULL;
 			break;
 		}
@@ -9166,29 +9151,23 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
 		btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
 	}
 
-	if (ret == 0 && inode->i_nlink > 0) {
-		trans->block_rsv = root->orphan_block_rsv;
-		ret = btrfs_orphan_del(trans, BTRFS_I(inode));
-		if (ret)
-			err = ret;
-	}
-
 	if (trans) {
+		int ret2;
+
 		trans->block_rsv = &fs_info->trans_block_rsv;
-		ret = btrfs_update_inode(trans, root, inode);
-		if (ret && !err)
-			err = ret;
+		ret2 = btrfs_update_inode(trans, root, inode);
+		if (ret2 && !ret)
+			ret = ret2;
 
-		ret = btrfs_end_transaction(trans);
+		ret2 = btrfs_end_transaction(trans);
+		if (ret2 && !ret)
+			ret = ret2;
 		btrfs_btree_balance_dirty(fs_info);
 	}
 out:
 	btrfs_free_block_rsv(fs_info, rsv);
 
-	if (ret && !err)
-		err = ret;
-
-	return err;
+	return ret;
 }
 
 /*
@@ -9324,13 +9303,6 @@ void btrfs_destroy_inode(struct inode *inode)
 	if (!root)
 		goto free;
 
-	if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
-		     &BTRFS_I(inode)->runtime_flags)) {
-		btrfs_info(fs_info, "inode %llu still on the orphan list",
-			   btrfs_ino(BTRFS_I(inode)));
-		atomic_dec(&root->orphan_inodes);
-	}
-
 	while (1) {
 		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
 		if (!ordered)
@@ -9964,6 +9936,13 @@ static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
 	return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
 }
 
+struct btrfs_delalloc_work {
+	struct inode *inode;
+	struct completion completion;
+	struct list_head list;
+	struct btrfs_work work;
+};
+
 static void btrfs_run_delalloc_work(struct btrfs_work *work)
 {
 	struct btrfs_delalloc_work *delalloc_work;
@@ -9977,15 +9956,11 @@ static void btrfs_run_delalloc_work(struct btrfs_work *work)
 				&BTRFS_I(inode)->runtime_flags))
 		filemap_flush(inode->i_mapping);
 
-	if (delalloc_work->delay_iput)
-		btrfs_add_delayed_iput(inode);
-	else
-		iput(inode);
+	iput(inode);
 	complete(&delalloc_work->completion);
 }
 
-struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
-						    int delay_iput)
+static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode)
 {
 	struct btrfs_delalloc_work *work;
 
@@ -9996,7 +9971,6 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
 	init_completion(&work->completion);
 	INIT_LIST_HEAD(&work->list);
 	work->inode = inode;
-	work->delay_iput = delay_iput;
 	WARN_ON_ONCE(!inode);
 	btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
 			btrfs_run_delalloc_work, NULL, NULL);
@@ -10004,18 +9978,11 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
 	return work;
 }
 
-void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
-{
-	wait_for_completion(&work->completion);
-	kfree(work);
-}
-
 /*
  * some fairly slow code that needs optimization. This walks the list
  * of all the inodes with pending delalloc and forces them to disk.
  */
-static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
-				   int nr)
+static int start_delalloc_inodes(struct btrfs_root *root, int nr)
 {
 	struct btrfs_inode *binode;
 	struct inode *inode;
@@ -10043,12 +10010,9 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
 		}
 		spin_unlock(&root->delalloc_lock);
 
-		work = btrfs_alloc_delalloc_work(inode, delay_iput);
+		work = btrfs_alloc_delalloc_work(inode);
 		if (!work) {
-			if (delay_iput)
-				btrfs_add_delayed_iput(inode);
-			else
-				iput(inode);
+			iput(inode);
 			ret = -ENOMEM;
 			goto out;
 		}
@@ -10066,10 +10030,11 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
 out:
 	list_for_each_entry_safe(work, next, &works, list) {
 		list_del_init(&work->list);
-		btrfs_wait_and_free_delalloc_work(work);
+		wait_for_completion(&work->completion);
+		kfree(work);
 	}
 
-	if (!list_empty_careful(&splice)) {
+	if (!list_empty(&splice)) {
 		spin_lock(&root->delalloc_lock);
 		list_splice_tail(&splice, &root->delalloc_inodes);
 		spin_unlock(&root->delalloc_lock);
@@ -10078,7 +10043,7 @@ out:
 	return ret;
 }
 
-int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
+int btrfs_start_delalloc_inodes(struct btrfs_root *root)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	int ret;
@@ -10086,14 +10051,13 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
 		return -EROFS;
 
-	ret = __start_delalloc_inodes(root, delay_iput, -1);
+	ret = start_delalloc_inodes(root, -1);
 	if (ret > 0)
 		ret = 0;
 	return ret;
 }
 
-int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
-			       int nr)
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr)
 {
 	struct btrfs_root *root;
 	struct list_head splice;
@@ -10116,7 +10080,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
 			       &fs_info->delalloc_roots);
 		spin_unlock(&fs_info->delalloc_root_lock);
 
-		ret = __start_delalloc_inodes(root, delay_iput, nr);
+		ret = start_delalloc_inodes(root, nr);
 		btrfs_put_fs_root(root);
 		if (ret < 0)
 			goto out;
@@ -10131,7 +10095,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
 
 	ret = 0;
 out:
-	if (!list_empty_careful(&splice)) {
+	if (!list_empty(&splice)) {
 		spin_lock(&fs_info->delalloc_root_lock);
 		list_splice_tail(&splice, &fs_info->delalloc_roots);
 		spin_unlock(&fs_info->delalloc_root_lock);
@@ -10669,5 +10633,4 @@ static const struct inode_operations btrfs_symlink_inode_operations = {
 
 const struct dentry_operations btrfs_dentry_operations = {
 	.d_delete	= btrfs_dentry_delete,
-	.d_release	= btrfs_dentry_release,
 };
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 632e26d..d29992f 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -93,20 +93,22 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
 		       int no_time_update);
 
 /* Mask out flags that are inappropriate for the given type of inode. */
-static unsigned int btrfs_mask_flags(umode_t mode, unsigned int flags)
+static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode,
+		unsigned int flags)
 {
-	if (S_ISDIR(mode))
+	if (S_ISDIR(inode->i_mode))
 		return flags;
-	else if (S_ISREG(mode))
+	else if (S_ISREG(inode->i_mode))
 		return flags & ~FS_DIRSYNC_FL;
 	else
 		return flags & (FS_NODUMP_FL | FS_NOATIME_FL);
 }
 
 /*
- * Export inode flags to the format expected by the FS_IOC_GETFLAGS ioctl.
+ * Export internal inode flags to the format expected by the FS_IOC_GETFLAGS
+ * ioctl.
  */
-static unsigned int btrfs_flags_to_ioctl(unsigned int flags)
+static unsigned int btrfs_inode_flags_to_fsflags(unsigned int flags)
 {
 	unsigned int iflags = 0;
 
@@ -136,20 +138,20 @@ static unsigned int btrfs_flags_to_ioctl(unsigned int flags)
 /*
  * Update inode->i_flags based on the btrfs internal flags.
  */
-void btrfs_update_iflags(struct inode *inode)
+void btrfs_sync_inode_flags_to_i_flags(struct inode *inode)
 {
-	struct btrfs_inode *ip = BTRFS_I(inode);
+	struct btrfs_inode *binode = BTRFS_I(inode);
 	unsigned int new_fl = 0;
 
-	if (ip->flags & BTRFS_INODE_SYNC)
+	if (binode->flags & BTRFS_INODE_SYNC)
 		new_fl |= S_SYNC;
-	if (ip->flags & BTRFS_INODE_IMMUTABLE)
+	if (binode->flags & BTRFS_INODE_IMMUTABLE)
 		new_fl |= S_IMMUTABLE;
-	if (ip->flags & BTRFS_INODE_APPEND)
+	if (binode->flags & BTRFS_INODE_APPEND)
 		new_fl |= S_APPEND;
-	if (ip->flags & BTRFS_INODE_NOATIME)
+	if (binode->flags & BTRFS_INODE_NOATIME)
 		new_fl |= S_NOATIME;
-	if (ip->flags & BTRFS_INODE_DIRSYNC)
+	if (binode->flags & BTRFS_INODE_DIRSYNC)
 		new_fl |= S_DIRSYNC;
 
 	set_mask_bits(&inode->i_flags,
@@ -159,15 +161,16 @@ void btrfs_update_iflags(struct inode *inode)
 
 static int btrfs_ioctl_getflags(struct file *file, void __user *arg)
 {
-	struct btrfs_inode *ip = BTRFS_I(file_inode(file));
-	unsigned int flags = btrfs_flags_to_ioctl(ip->flags);
+	struct btrfs_inode *binode = BTRFS_I(file_inode(file));
+	unsigned int flags = btrfs_inode_flags_to_fsflags(binode->flags);
 
 	if (copy_to_user(arg, &flags, sizeof(flags)))
 		return -EFAULT;
 	return 0;
 }
 
-static int check_flags(unsigned int flags)
+/* Check if @flags are a supported and valid set of FS_*_FL flags */
+static int check_fsflags(unsigned int flags)
 {
 	if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \
 		      FS_NOATIME_FL | FS_NODUMP_FL | \
@@ -186,13 +189,13 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
 {
 	struct inode *inode = file_inode(file);
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-	struct btrfs_inode *ip = BTRFS_I(inode);
-	struct btrfs_root *root = ip->root;
+	struct btrfs_inode *binode = BTRFS_I(inode);
+	struct btrfs_root *root = binode->root;
 	struct btrfs_trans_handle *trans;
-	unsigned int flags, oldflags;
+	unsigned int fsflags, old_fsflags;
 	int ret;
-	u64 ip_oldflags;
-	unsigned int i_oldflags;
+	u64 old_flags;
+	unsigned int old_i_flags;
 	umode_t mode;
 
 	if (!inode_owner_or_capable(inode))
@@ -201,10 +204,10 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
 	if (btrfs_root_readonly(root))
 		return -EROFS;
 
-	if (copy_from_user(&flags, arg, sizeof(flags)))
+	if (copy_from_user(&fsflags, arg, sizeof(fsflags)))
 		return -EFAULT;
 
-	ret = check_flags(flags);
+	ret = check_fsflags(fsflags);
 	if (ret)
 		return ret;
 
@@ -214,44 +217,44 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
 
 	inode_lock(inode);
 
-	ip_oldflags = ip->flags;
-	i_oldflags = inode->i_flags;
+	old_flags = binode->flags;
+	old_i_flags = inode->i_flags;
 	mode = inode->i_mode;
 
-	flags = btrfs_mask_flags(inode->i_mode, flags);
-	oldflags = btrfs_flags_to_ioctl(ip->flags);
-	if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
+	fsflags = btrfs_mask_fsflags_for_type(inode, fsflags);
+	old_fsflags = btrfs_inode_flags_to_fsflags(binode->flags);
+	if ((fsflags ^ old_fsflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
 		if (!capable(CAP_LINUX_IMMUTABLE)) {
 			ret = -EPERM;
 			goto out_unlock;
 		}
 	}
 
-	if (flags & FS_SYNC_FL)
-		ip->flags |= BTRFS_INODE_SYNC;
+	if (fsflags & FS_SYNC_FL)
+		binode->flags |= BTRFS_INODE_SYNC;
 	else
-		ip->flags &= ~BTRFS_INODE_SYNC;
-	if (flags & FS_IMMUTABLE_FL)
-		ip->flags |= BTRFS_INODE_IMMUTABLE;
+		binode->flags &= ~BTRFS_INODE_SYNC;
+	if (fsflags & FS_IMMUTABLE_FL)
+		binode->flags |= BTRFS_INODE_IMMUTABLE;
 	else
-		ip->flags &= ~BTRFS_INODE_IMMUTABLE;
-	if (flags & FS_APPEND_FL)
-		ip->flags |= BTRFS_INODE_APPEND;
+		binode->flags &= ~BTRFS_INODE_IMMUTABLE;
+	if (fsflags & FS_APPEND_FL)
+		binode->flags |= BTRFS_INODE_APPEND;
 	else
-		ip->flags &= ~BTRFS_INODE_APPEND;
-	if (flags & FS_NODUMP_FL)
-		ip->flags |= BTRFS_INODE_NODUMP;
+		binode->flags &= ~BTRFS_INODE_APPEND;
+	if (fsflags & FS_NODUMP_FL)
+		binode->flags |= BTRFS_INODE_NODUMP;
 	else
-		ip->flags &= ~BTRFS_INODE_NODUMP;
-	if (flags & FS_NOATIME_FL)
-		ip->flags |= BTRFS_INODE_NOATIME;
+		binode->flags &= ~BTRFS_INODE_NODUMP;
+	if (fsflags & FS_NOATIME_FL)
+		binode->flags |= BTRFS_INODE_NOATIME;
 	else
-		ip->flags &= ~BTRFS_INODE_NOATIME;
-	if (flags & FS_DIRSYNC_FL)
-		ip->flags |= BTRFS_INODE_DIRSYNC;
+		binode->flags &= ~BTRFS_INODE_NOATIME;
+	if (fsflags & FS_DIRSYNC_FL)
+		binode->flags |= BTRFS_INODE_DIRSYNC;
 	else
-		ip->flags &= ~BTRFS_INODE_DIRSYNC;
-	if (flags & FS_NOCOW_FL) {
+		binode->flags &= ~BTRFS_INODE_DIRSYNC;
+	if (fsflags & FS_NOCOW_FL) {
 		if (S_ISREG(mode)) {
 			/*
 			 * It's safe to turn csums off here, no extents exist.
@@ -259,10 +262,10 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
 			 * status of the file and will not set it.
 			 */
 			if (inode->i_size == 0)
-				ip->flags |= BTRFS_INODE_NODATACOW
-					   | BTRFS_INODE_NODATASUM;
+				binode->flags |= BTRFS_INODE_NODATACOW
+					      | BTRFS_INODE_NODATASUM;
 		} else {
-			ip->flags |= BTRFS_INODE_NODATACOW;
+			binode->flags |= BTRFS_INODE_NODATACOW;
 		}
 	} else {
 		/*
@@ -270,10 +273,10 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
 		 */
 		if (S_ISREG(mode)) {
 			if (inode->i_size == 0)
-				ip->flags &= ~(BTRFS_INODE_NODATACOW
+				binode->flags &= ~(BTRFS_INODE_NODATACOW
 				             | BTRFS_INODE_NODATASUM);
 		} else {
-			ip->flags &= ~BTRFS_INODE_NODATACOW;
+			binode->flags &= ~BTRFS_INODE_NODATACOW;
 		}
 	}
 
@@ -282,18 +285,18 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
 	 * flag may be changed automatically if compression code won't make
 	 * things smaller.
 	 */
-	if (flags & FS_NOCOMP_FL) {
-		ip->flags &= ~BTRFS_INODE_COMPRESS;
-		ip->flags |= BTRFS_INODE_NOCOMPRESS;
+	if (fsflags & FS_NOCOMP_FL) {
+		binode->flags &= ~BTRFS_INODE_COMPRESS;
+		binode->flags |= BTRFS_INODE_NOCOMPRESS;
 
 		ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0);
 		if (ret && ret != -ENODATA)
 			goto out_drop;
-	} else if (flags & FS_COMPR_FL) {
+	} else if (fsflags & FS_COMPR_FL) {
 		const char *comp;
 
-		ip->flags |= BTRFS_INODE_COMPRESS;
-		ip->flags &= ~BTRFS_INODE_NOCOMPRESS;
+		binode->flags |= BTRFS_INODE_COMPRESS;
+		binode->flags &= ~BTRFS_INODE_NOCOMPRESS;
 
 		comp = btrfs_compress_type2str(fs_info->compress_type);
 		if (!comp || comp[0] == 0)
@@ -308,7 +311,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
 		ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0);
 		if (ret && ret != -ENODATA)
 			goto out_drop;
-		ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
+		binode->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
 	}
 
 	trans = btrfs_start_transaction(root, 1);
@@ -317,7 +320,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
 		goto out_drop;
 	}
 
-	btrfs_update_iflags(inode);
+	btrfs_sync_inode_flags_to_i_flags(inode);
 	inode_inc_iversion(inode);
 	inode->i_ctime = current_time(inode);
 	ret = btrfs_update_inode(trans, root, inode);
@@ -325,8 +328,8 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
 	btrfs_end_transaction(trans);
  out_drop:
 	if (ret) {
-		ip->flags = ip_oldflags;
-		inode->i_flags = i_oldflags;
+		binode->flags = old_flags;
+		inode->i_flags = old_i_flags;
 	}
 
  out_unlock:
@@ -335,6 +338,148 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
 	return ret;
 }
 
+/*
+ * Translate btrfs internal inode flags to xflags as expected by the
+ * FS_IOC_FSGETXATT ioctl. Filter only the supported ones, unknown flags are
+ * silently dropped.
+ */
+static unsigned int btrfs_inode_flags_to_xflags(unsigned int flags)
+{
+	unsigned int xflags = 0;
+
+	if (flags & BTRFS_INODE_APPEND)
+		xflags |= FS_XFLAG_APPEND;
+	if (flags & BTRFS_INODE_IMMUTABLE)
+		xflags |= FS_XFLAG_IMMUTABLE;
+	if (flags & BTRFS_INODE_NOATIME)
+		xflags |= FS_XFLAG_NOATIME;
+	if (flags & BTRFS_INODE_NODUMP)
+		xflags |= FS_XFLAG_NODUMP;
+	if (flags & BTRFS_INODE_SYNC)
+		xflags |= FS_XFLAG_SYNC;
+
+	return xflags;
+}
+
+/* Check if @flags are a supported and valid set of FS_XFLAGS_* flags */
+static int check_xflags(unsigned int flags)
+{
+	if (flags & ~(FS_XFLAG_APPEND | FS_XFLAG_IMMUTABLE | FS_XFLAG_NOATIME |
+		      FS_XFLAG_NODUMP | FS_XFLAG_SYNC))
+		return -EOPNOTSUPP;
+	return 0;
+}
+
+/*
+ * Set the xflags from the internal inode flags. The remaining items of fsxattr
+ * are zeroed.
+ */
+static int btrfs_ioctl_fsgetxattr(struct file *file, void __user *arg)
+{
+	struct btrfs_inode *binode = BTRFS_I(file_inode(file));
+	struct fsxattr fa;
+
+	memset(&fa, 0, sizeof(fa));
+	fa.fsx_xflags = btrfs_inode_flags_to_xflags(binode->flags);
+
+	if (copy_to_user(arg, &fa, sizeof(fa)))
+		return -EFAULT;
+
+	return 0;
+}
+
+static int btrfs_ioctl_fssetxattr(struct file *file, void __user *arg)
+{
+	struct inode *inode = file_inode(file);
+	struct btrfs_inode *binode = BTRFS_I(inode);
+	struct btrfs_root *root = binode->root;
+	struct btrfs_trans_handle *trans;
+	struct fsxattr fa;
+	unsigned old_flags;
+	unsigned old_i_flags;
+	int ret = 0;
+
+	if (!inode_owner_or_capable(inode))
+		return -EPERM;
+
+	if (btrfs_root_readonly(root))
+		return -EROFS;
+
+	memset(&fa, 0, sizeof(fa));
+	if (copy_from_user(&fa, arg, sizeof(fa)))
+		return -EFAULT;
+
+	ret = check_xflags(fa.fsx_xflags);
+	if (ret)
+		return ret;
+
+	if (fa.fsx_extsize != 0 || fa.fsx_projid != 0 || fa.fsx_cowextsize != 0)
+		return -EOPNOTSUPP;
+
+	ret = mnt_want_write_file(file);
+	if (ret)
+		return ret;
+
+	inode_lock(inode);
+
+	old_flags = binode->flags;
+	old_i_flags = inode->i_flags;
+
+	/* We need the capabilities to change append-only or immutable inode */
+	if (((old_flags & (BTRFS_INODE_APPEND | BTRFS_INODE_IMMUTABLE)) ||
+	     (fa.fsx_xflags & (FS_XFLAG_APPEND | FS_XFLAG_IMMUTABLE))) &&
+	    !capable(CAP_LINUX_IMMUTABLE)) {
+		ret = -EPERM;
+		goto out_unlock;
+	}
+
+	if (fa.fsx_xflags & FS_XFLAG_SYNC)
+		binode->flags |= BTRFS_INODE_SYNC;
+	else
+		binode->flags &= ~BTRFS_INODE_SYNC;
+	if (fa.fsx_xflags & FS_XFLAG_IMMUTABLE)
+		binode->flags |= BTRFS_INODE_IMMUTABLE;
+	else
+		binode->flags &= ~BTRFS_INODE_IMMUTABLE;
+	if (fa.fsx_xflags & FS_XFLAG_APPEND)
+		binode->flags |= BTRFS_INODE_APPEND;
+	else
+		binode->flags &= ~BTRFS_INODE_APPEND;
+	if (fa.fsx_xflags & FS_XFLAG_NODUMP)
+		binode->flags |= BTRFS_INODE_NODUMP;
+	else
+		binode->flags &= ~BTRFS_INODE_NODUMP;
+	if (fa.fsx_xflags & FS_XFLAG_NOATIME)
+		binode->flags |= BTRFS_INODE_NOATIME;
+	else
+		binode->flags &= ~BTRFS_INODE_NOATIME;
+
+	/* 1 item for the inode */
+	trans = btrfs_start_transaction(root, 1);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto out_unlock;
+	}
+
+	btrfs_sync_inode_flags_to_i_flags(inode);
+	inode_inc_iversion(inode);
+	inode->i_ctime = current_time(inode);
+	ret = btrfs_update_inode(trans, root, inode);
+
+	btrfs_end_transaction(trans);
+
+out_unlock:
+	if (ret) {
+		binode->flags = old_flags;
+		inode->i_flags = old_i_flags;
+	}
+
+	inode_unlock(inode);
+	mnt_drop_write_file(file);
+
+	return ret;
+}
+
 static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
 {
 	struct inode *inode = file_inode(file);
@@ -424,7 +569,6 @@ static noinline int create_subvol(struct inode *dir,
 	u64 objectid;
 	u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
 	u64 index = 0;
-	u64 qgroup_reserved;
 	uuid_le new_uuid;
 
 	root_item = kzalloc(sizeof(*root_item), GFP_KERNEL);
@@ -449,8 +593,7 @@ static noinline int create_subvol(struct inode *dir,
 	 * The same as the snapshot creation, please see the comment
 	 * of create_snapshot().
 	 */
-	ret = btrfs_subvolume_reserve_metadata(root, &block_rsv,
-					       8, &qgroup_reserved, false);
+	ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 8, false);
 	if (ret)
 		goto fail_free;
 
@@ -573,7 +716,7 @@ static noinline int create_subvol(struct inode *dir,
 				 btrfs_ino(BTRFS_I(dir)), index, name, namelen);
 	BUG_ON(ret);
 
-	ret = btrfs_uuid_tree_add(trans, fs_info, root_item->uuid,
+	ret = btrfs_uuid_tree_add(trans, root_item->uuid,
 				  BTRFS_UUID_KEY_SUBVOL, objectid);
 	if (ret)
 		btrfs_abort_transaction(trans, ret);
@@ -640,7 +783,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
 	wait_event(root->subv_writers->wait,
 		   percpu_counter_sum(&root->subv_writers->counter) == 0);
 
-	ret = btrfs_start_delalloc_inodes(root, 0);
+	ret = btrfs_start_delalloc_inodes(root);
 	if (ret)
 		goto dec_and_free;
 
@@ -658,7 +801,6 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
 	 */
 	ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root,
 					&pending_snapshot->block_rsv, 8,
-					&pending_snapshot->qgroup_reserved,
 					false);
 	if (ret)
 		goto dec_and_free;
@@ -1457,7 +1599,6 @@ static noinline int btrfs_ioctl_resize(struct file *file,
 		return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
 	}
 
-	mutex_lock(&fs_info->volume_mutex);
 	vol_args = memdup_user(arg, sizeof(*vol_args));
 	if (IS_ERR(vol_args)) {
 		ret = PTR_ERR(vol_args);
@@ -1565,7 +1706,6 @@ static noinline int btrfs_ioctl_resize(struct file *file,
 out_free:
 	kfree(vol_args);
 out:
-	mutex_unlock(&fs_info->volume_mutex);
 	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
 	mnt_drop_write_file(file);
 	return ret;
@@ -1832,60 +1972,6 @@ out:
 	return ret;
 }
 
-/*
- * helper to check if the subvolume references other subvolumes
- */
-static noinline int may_destroy_subvol(struct btrfs_root *root)
-{
-	struct btrfs_fs_info *fs_info = root->fs_info;
-	struct btrfs_path *path;
-	struct btrfs_dir_item *di;
-	struct btrfs_key key;
-	u64 dir_id;
-	int ret;
-
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-
-	/* Make sure this root isn't set as the default subvol */
-	dir_id = btrfs_super_root_dir(fs_info->super_copy);
-	di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path,
-				   dir_id, "default", 7, 0);
-	if (di && !IS_ERR(di)) {
-		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
-		if (key.objectid == root->root_key.objectid) {
-			ret = -EPERM;
-			btrfs_err(fs_info,
-				  "deleting default subvolume %llu is not allowed",
-				  key.objectid);
-			goto out;
-		}
-		btrfs_release_path(path);
-	}
-
-	key.objectid = root->root_key.objectid;
-	key.type = BTRFS_ROOT_REF_KEY;
-	key.offset = (u64)-1;
-
-	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
-	if (ret < 0)
-		goto out;
-	BUG_ON(ret == 0);
-
-	ret = 0;
-	if (path->slots[0] > 0) {
-		path->slots[0]--;
-		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
-		if (key.objectid == root->root_key.objectid &&
-		    key.type == BTRFS_ROOT_REF_KEY)
-			ret = -ENOTEMPTY;
-	}
-out:
-	btrfs_free_path(path);
-	return ret;
-}
-
 static noinline int key_in_sk(struct btrfs_key *key,
 			      struct btrfs_ioctl_search_key *sk)
 {
@@ -2066,7 +2152,7 @@ static noinline int search_ioctl(struct inode *inode,
 		root = btrfs_read_fs_root_no_name(info, &key);
 		if (IS_ERR(root)) {
 			btrfs_free_path(path);
-			return -ENOENT;
+			return PTR_ERR(root);
 		}
 	}
 
@@ -2200,8 +2286,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
 	key.offset = (u64)-1;
 	root = btrfs_read_fs_root_no_name(info, &key);
 	if (IS_ERR(root)) {
-		btrfs_err(info, "could not find root %llu", tree_id);
-		ret = -ENOENT;
+		ret = PTR_ERR(root);
 		goto out;
 	}
 
@@ -2256,6 +2341,165 @@ out:
 	return ret;
 }
 
+static int btrfs_search_path_in_tree_user(struct inode *inode,
+				struct btrfs_ioctl_ino_lookup_user_args *args)
+{
+	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+	struct super_block *sb = inode->i_sb;
+	struct btrfs_key upper_limit = BTRFS_I(inode)->location;
+	u64 treeid = BTRFS_I(inode)->root->root_key.objectid;
+	u64 dirid = args->dirid;
+	unsigned long item_off;
+	unsigned long item_len;
+	struct btrfs_inode_ref *iref;
+	struct btrfs_root_ref *rref;
+	struct btrfs_root *root;
+	struct btrfs_path *path;
+	struct btrfs_key key, key2;
+	struct extent_buffer *leaf;
+	struct inode *temp_inode;
+	char *ptr;
+	int slot;
+	int len;
+	int total_len = 0;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	/*
+	 * If the bottom subvolume does not exist directly under upper_limit,
+	 * construct the path in from the bottom up.
+	 */
+	if (dirid != upper_limit.objectid) {
+		ptr = &args->path[BTRFS_INO_LOOKUP_USER_PATH_MAX - 1];
+
+		key.objectid = treeid;
+		key.type = BTRFS_ROOT_ITEM_KEY;
+		key.offset = (u64)-1;
+		root = btrfs_read_fs_root_no_name(fs_info, &key);
+		if (IS_ERR(root)) {
+			ret = PTR_ERR(root);
+			goto out;
+		}
+
+		key.objectid = dirid;
+		key.type = BTRFS_INODE_REF_KEY;
+		key.offset = (u64)-1;
+		while (1) {
+			ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+			if (ret < 0) {
+				goto out;
+			} else if (ret > 0) {
+				ret = btrfs_previous_item(root, path, dirid,
+							  BTRFS_INODE_REF_KEY);
+				if (ret < 0) {
+					goto out;
+				} else if (ret > 0) {
+					ret = -ENOENT;
+					goto out;
+				}
+			}
+
+			leaf = path->nodes[0];
+			slot = path->slots[0];
+			btrfs_item_key_to_cpu(leaf, &key, slot);
+
+			iref = btrfs_item_ptr(leaf, slot, struct btrfs_inode_ref);
+			len = btrfs_inode_ref_name_len(leaf, iref);
+			ptr -= len + 1;
+			total_len += len + 1;
+			if (ptr < args->path) {
+				ret = -ENAMETOOLONG;
+				goto out;
+			}
+
+			*(ptr + len) = '/';
+			read_extent_buffer(leaf, ptr,
+					(unsigned long)(iref + 1), len);
+
+			/* Check the read+exec permission of this directory */
+			ret = btrfs_previous_item(root, path, dirid,
+						  BTRFS_INODE_ITEM_KEY);
+			if (ret < 0) {
+				goto out;
+			} else if (ret > 0) {
+				ret = -ENOENT;
+				goto out;
+			}
+
+			leaf = path->nodes[0];
+			slot = path->slots[0];
+			btrfs_item_key_to_cpu(leaf, &key2, slot);
+			if (key2.objectid != dirid) {
+				ret = -ENOENT;
+				goto out;
+			}
+
+			temp_inode = btrfs_iget(sb, &key2, root, NULL);
+			ret = inode_permission(temp_inode, MAY_READ | MAY_EXEC);
+			iput(temp_inode);
+			if (ret) {
+				ret = -EACCES;
+				goto out;
+			}
+
+			if (key.offset == upper_limit.objectid)
+				break;
+			if (key.objectid == BTRFS_FIRST_FREE_OBJECTID) {
+				ret = -EACCES;
+				goto out;
+			}
+
+			btrfs_release_path(path);
+			key.objectid = key.offset;
+			key.offset = (u64)-1;
+			dirid = key.objectid;
+		}
+
+		memmove(args->path, ptr, total_len);
+		args->path[total_len] = '\0';
+		btrfs_release_path(path);
+	}
+
+	/* Get the bottom subvolume's name from ROOT_REF */
+	root = fs_info->tree_root;
+	key.objectid = treeid;
+	key.type = BTRFS_ROOT_REF_KEY;
+	key.offset = args->treeid;
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0) {
+		goto out;
+	} else if (ret > 0) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	leaf = path->nodes[0];
+	slot = path->slots[0];
+	btrfs_item_key_to_cpu(leaf, &key, slot);
+
+	item_off = btrfs_item_ptr_offset(leaf, slot);
+	item_len = btrfs_item_size_nr(leaf, slot);
+	/* Check if dirid in ROOT_REF corresponds to passed dirid */
+	rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref);
+	if (args->dirid != btrfs_root_ref_dirid(leaf, rref)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* Copy subvolume's name */
+	item_off += sizeof(struct btrfs_root_ref);
+	item_len -= sizeof(struct btrfs_root_ref);
+	read_extent_buffer(leaf, args->name, item_off, item_len);
+	args->name[item_len] = 0;
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
 static noinline int btrfs_ioctl_ino_lookup(struct file *file,
 					   void __user *argp)
 {
@@ -2298,6 +2542,265 @@ out:
 	return ret;
 }
 
+/*
+ * Version of ino_lookup ioctl (unprivileged)
+ *
+ * The main differences from ino_lookup ioctl are:
+ *
+ *   1. Read + Exec permission will be checked using inode_permission() during
+ *      path construction. -EACCES will be returned in case of failure.
+ *   2. Path construction will be stopped at the inode number which corresponds
+ *      to the fd with which this ioctl is called. If constructed path does not
+ *      exist under fd's inode, -EACCES will be returned.
+ *   3. The name of bottom subvolume is also searched and filled.
+ */
+static int btrfs_ioctl_ino_lookup_user(struct file *file, void __user *argp)
+{
+	struct btrfs_ioctl_ino_lookup_user_args *args;
+	struct inode *inode;
+	int ret;
+
+	args = memdup_user(argp, sizeof(*args));
+	if (IS_ERR(args))
+		return PTR_ERR(args);
+
+	inode = file_inode(file);
+
+	if (args->dirid == BTRFS_FIRST_FREE_OBJECTID &&
+	    BTRFS_I(inode)->location.objectid != BTRFS_FIRST_FREE_OBJECTID) {
+		/*
+		 * The subvolume does not exist under fd with which this is
+		 * called
+		 */
+		kfree(args);
+		return -EACCES;
+	}
+
+	ret = btrfs_search_path_in_tree_user(inode, args);
+
+	if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
+		ret = -EFAULT;
+
+	kfree(args);
+	return ret;
+}
+
+/* Get the subvolume information in BTRFS_ROOT_ITEM and BTRFS_ROOT_BACKREF */
+static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp)
+{
+	struct btrfs_ioctl_get_subvol_info_args *subvol_info;
+	struct btrfs_fs_info *fs_info;
+	struct btrfs_root *root;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct btrfs_root_item *root_item;
+	struct btrfs_root_ref *rref;
+	struct extent_buffer *leaf;
+	unsigned long item_off;
+	unsigned long item_len;
+	struct inode *inode;
+	int slot;
+	int ret = 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	subvol_info = kzalloc(sizeof(*subvol_info), GFP_KERNEL);
+	if (!subvol_info) {
+		btrfs_free_path(path);
+		return -ENOMEM;
+	}
+
+	inode = file_inode(file);
+	fs_info = BTRFS_I(inode)->root->fs_info;
+
+	/* Get root_item of inode's subvolume */
+	key.objectid = BTRFS_I(inode)->root->root_key.objectid;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+	key.offset = (u64)-1;
+	root = btrfs_read_fs_root_no_name(fs_info, &key);
+	if (IS_ERR(root)) {
+		ret = PTR_ERR(root);
+		goto out;
+	}
+	root_item = &root->root_item;
+
+	subvol_info->treeid = key.objectid;
+
+	subvol_info->generation = btrfs_root_generation(root_item);
+	subvol_info->flags = btrfs_root_flags(root_item);
+
+	memcpy(subvol_info->uuid, root_item->uuid, BTRFS_UUID_SIZE);
+	memcpy(subvol_info->parent_uuid, root_item->parent_uuid,
+						    BTRFS_UUID_SIZE);
+	memcpy(subvol_info->received_uuid, root_item->received_uuid,
+						    BTRFS_UUID_SIZE);
+
+	subvol_info->ctransid = btrfs_root_ctransid(root_item);
+	subvol_info->ctime.sec = btrfs_stack_timespec_sec(&root_item->ctime);
+	subvol_info->ctime.nsec = btrfs_stack_timespec_nsec(&root_item->ctime);
+
+	subvol_info->otransid = btrfs_root_otransid(root_item);
+	subvol_info->otime.sec = btrfs_stack_timespec_sec(&root_item->otime);
+	subvol_info->otime.nsec = btrfs_stack_timespec_nsec(&root_item->otime);
+
+	subvol_info->stransid = btrfs_root_stransid(root_item);
+	subvol_info->stime.sec = btrfs_stack_timespec_sec(&root_item->stime);
+	subvol_info->stime.nsec = btrfs_stack_timespec_nsec(&root_item->stime);
+
+	subvol_info->rtransid = btrfs_root_rtransid(root_item);
+	subvol_info->rtime.sec = btrfs_stack_timespec_sec(&root_item->rtime);
+	subvol_info->rtime.nsec = btrfs_stack_timespec_nsec(&root_item->rtime);
+
+	if (key.objectid != BTRFS_FS_TREE_OBJECTID) {
+		/* Search root tree for ROOT_BACKREF of this subvolume */
+		root = fs_info->tree_root;
+
+		key.type = BTRFS_ROOT_BACKREF_KEY;
+		key.offset = 0;
+		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+		if (ret < 0) {
+			goto out;
+		} else if (path->slots[0] >=
+			   btrfs_header_nritems(path->nodes[0])) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0) {
+				goto out;
+			} else if (ret > 0) {
+				ret = -EUCLEAN;
+				goto out;
+			}
+		}
+
+		leaf = path->nodes[0];
+		slot = path->slots[0];
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+		if (key.objectid == subvol_info->treeid &&
+		    key.type == BTRFS_ROOT_BACKREF_KEY) {
+			subvol_info->parent_id = key.offset;
+
+			rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref);
+			subvol_info->dirid = btrfs_root_ref_dirid(leaf, rref);
+
+			item_off = btrfs_item_ptr_offset(leaf, slot)
+					+ sizeof(struct btrfs_root_ref);
+			item_len = btrfs_item_size_nr(leaf, slot)
+					- sizeof(struct btrfs_root_ref);
+			read_extent_buffer(leaf, subvol_info->name,
+					   item_off, item_len);
+		} else {
+			ret = -ENOENT;
+			goto out;
+		}
+	}
+
+	if (copy_to_user(argp, subvol_info, sizeof(*subvol_info)))
+		ret = -EFAULT;
+
+out:
+	btrfs_free_path(path);
+	kzfree(subvol_info);
+	return ret;
+}
+
+/*
+ * Return ROOT_REF information of the subvolume containing this inode
+ * except the subvolume name.
+ */
+static int btrfs_ioctl_get_subvol_rootref(struct file *file, void __user *argp)
+{
+	struct btrfs_ioctl_get_subvol_rootref_args *rootrefs;
+	struct btrfs_root_ref *rref;
+	struct btrfs_root *root;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct extent_buffer *leaf;
+	struct inode *inode;
+	u64 objectid;
+	int slot;
+	int ret;
+	u8 found;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	rootrefs = memdup_user(argp, sizeof(*rootrefs));
+	if (IS_ERR(rootrefs)) {
+		btrfs_free_path(path);
+		return PTR_ERR(rootrefs);
+	}
+
+	inode = file_inode(file);
+	root = BTRFS_I(inode)->root->fs_info->tree_root;
+	objectid = BTRFS_I(inode)->root->root_key.objectid;
+
+	key.objectid = objectid;
+	key.type = BTRFS_ROOT_REF_KEY;
+	key.offset = rootrefs->min_treeid;
+	found = 0;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0) {
+		goto out;
+	} else if (path->slots[0] >=
+		   btrfs_header_nritems(path->nodes[0])) {
+		ret = btrfs_next_leaf(root, path);
+		if (ret < 0) {
+			goto out;
+		} else if (ret > 0) {
+			ret = -EUCLEAN;
+			goto out;
+		}
+	}
+	while (1) {
+		leaf = path->nodes[0];
+		slot = path->slots[0];
+
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+		if (key.objectid != objectid || key.type != BTRFS_ROOT_REF_KEY) {
+			ret = 0;
+			goto out;
+		}
+
+		if (found == BTRFS_MAX_ROOTREF_BUFFER_NUM) {
+			ret = -EOVERFLOW;
+			goto out;
+		}
+
+		rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref);
+		rootrefs->rootref[found].treeid = key.offset;
+		rootrefs->rootref[found].dirid =
+				  btrfs_root_ref_dirid(leaf, rref);
+		found++;
+
+		ret = btrfs_next_item(root, path);
+		if (ret < 0) {
+			goto out;
+		} else if (ret > 0) {
+			ret = -EUCLEAN;
+			goto out;
+		}
+	}
+
+out:
+	if (!ret || ret == -EOVERFLOW) {
+		rootrefs->num_items = found;
+		/* update min_treeid for next search */
+		if (found)
+			rootrefs->min_treeid =
+				rootrefs->rootref[found - 1].treeid + 1;
+		if (copy_to_user(argp, rootrefs, sizeof(*rootrefs)))
+			ret = -EFAULT;
+	}
+
+	kfree(rootrefs);
+	btrfs_free_path(path);
+
+	return ret;
+}
+
 static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 					     void __user *arg)
 {
@@ -2309,12 +2812,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 	struct btrfs_root *root = BTRFS_I(dir)->root;
 	struct btrfs_root *dest = NULL;
 	struct btrfs_ioctl_vol_args *vol_args;
-	struct btrfs_trans_handle *trans;
-	struct btrfs_block_rsv block_rsv;
-	u64 root_flags;
-	u64 qgroup_reserved;
 	int namelen;
-	int ret;
 	int err = 0;
 
 	if (!S_ISDIR(dir->i_mode))
@@ -2398,133 +2896,11 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 	}
 
 	inode_lock(inode);
-
-	/*
-	 * Don't allow to delete a subvolume with send in progress. This is
-	 * inside the i_mutex so the error handling that has to drop the bit
-	 * again is not run concurrently.
-	 */
-	spin_lock(&dest->root_item_lock);
-	root_flags = btrfs_root_flags(&dest->root_item);
-	if (dest->send_in_progress == 0) {
-		btrfs_set_root_flags(&dest->root_item,
-				root_flags | BTRFS_ROOT_SUBVOL_DEAD);
-		spin_unlock(&dest->root_item_lock);
-	} else {
-		spin_unlock(&dest->root_item_lock);
-		btrfs_warn(fs_info,
-			   "Attempt to delete subvolume %llu during send",
-			   dest->root_key.objectid);
-		err = -EPERM;
-		goto out_unlock_inode;
-	}
-
-	down_write(&fs_info->subvol_sem);
-
-	err = may_destroy_subvol(dest);
-	if (err)
-		goto out_up_write;
-
-	btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
-	/*
-	 * One for dir inode, two for dir entries, two for root
-	 * ref/backref.
-	 */
-	err = btrfs_subvolume_reserve_metadata(root, &block_rsv,
-					       5, &qgroup_reserved, true);
-	if (err)
-		goto out_up_write;
-
-	trans = btrfs_start_transaction(root, 0);
-	if (IS_ERR(trans)) {
-		err = PTR_ERR(trans);
-		goto out_release;
-	}
-	trans->block_rsv = &block_rsv;
-	trans->bytes_reserved = block_rsv.size;
-
-	btrfs_record_snapshot_destroy(trans, BTRFS_I(dir));
-
-	ret = btrfs_unlink_subvol(trans, root, dir,
-				dest->root_key.objectid,
-				dentry->d_name.name,
-				dentry->d_name.len);
-	if (ret) {
-		err = ret;
-		btrfs_abort_transaction(trans, ret);
-		goto out_end_trans;
-	}
-
-	btrfs_record_root_in_trans(trans, dest);
-
-	memset(&dest->root_item.drop_progress, 0,
-		sizeof(dest->root_item.drop_progress));
-	dest->root_item.drop_level = 0;
-	btrfs_set_root_refs(&dest->root_item, 0);
-
-	if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
-		ret = btrfs_insert_orphan_item(trans,
-					fs_info->tree_root,
-					dest->root_key.objectid);
-		if (ret) {
-			btrfs_abort_transaction(trans, ret);
-			err = ret;
-			goto out_end_trans;
-		}
-	}
-
-	ret = btrfs_uuid_tree_rem(trans, fs_info, dest->root_item.uuid,
-				  BTRFS_UUID_KEY_SUBVOL,
-				  dest->root_key.objectid);
-	if (ret && ret != -ENOENT) {
-		btrfs_abort_transaction(trans, ret);
-		err = ret;
-		goto out_end_trans;
-	}
-	if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) {
-		ret = btrfs_uuid_tree_rem(trans, fs_info,
-					  dest->root_item.received_uuid,
-					  BTRFS_UUID_KEY_RECEIVED_SUBVOL,
-					  dest->root_key.objectid);
-		if (ret && ret != -ENOENT) {
-			btrfs_abort_transaction(trans, ret);
-			err = ret;
-			goto out_end_trans;
-		}
-	}
-
-out_end_trans:
-	trans->block_rsv = NULL;
-	trans->bytes_reserved = 0;
-	ret = btrfs_end_transaction(trans);
-	if (ret && !err)
-		err = ret;
-	inode->i_flags |= S_DEAD;
-out_release:
-	btrfs_subvolume_release_metadata(fs_info, &block_rsv);
-out_up_write:
-	up_write(&fs_info->subvol_sem);
-	if (err) {
-		spin_lock(&dest->root_item_lock);
-		root_flags = btrfs_root_flags(&dest->root_item);
-		btrfs_set_root_flags(&dest->root_item,
-				root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
-		spin_unlock(&dest->root_item_lock);
-	}
-out_unlock_inode:
+	err = btrfs_delete_subvolume(dir, dentry);
 	inode_unlock(inode);
-	if (!err) {
-		d_invalidate(dentry);
-		btrfs_invalidate_inodes(dest);
+	if (!err)
 		d_delete(dentry);
-		ASSERT(dest->send_in_progress == 0);
 
-		/* the last ref */
-		if (dest->ino_cache_inode) {
-			iput(dest->ino_cache_inode);
-			dest->ino_cache_inode = NULL;
-		}
-	}
 out_dput:
 	dput(dentry);
 out_unlock_dir:
@@ -2613,7 +2989,6 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
 	if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags))
 		return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
 
-	mutex_lock(&fs_info->volume_mutex);
 	vol_args = memdup_user(arg, sizeof(*vol_args));
 	if (IS_ERR(vol_args)) {
 		ret = PTR_ERR(vol_args);
@@ -2628,7 +3003,6 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
 
 	kfree(vol_args);
 out:
-	mutex_unlock(&fs_info->volume_mutex);
 	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
 	return ret;
 }
@@ -2654,8 +3028,10 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
 	}
 
 	/* Check for compatibility reject unknown flags */
-	if (vol_args->flags & ~BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED)
-		return -EOPNOTSUPP;
+	if (vol_args->flags & ~BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED) {
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
 
 	if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
 		ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
@@ -2954,8 +3330,6 @@ static void btrfs_cmp_data_free(struct cmp_pages *cmp)
 			put_page(pg);
 		}
 	}
-	kfree(cmp->src_pages);
-	kfree(cmp->dst_pages);
 }
 
 static int btrfs_cmp_data_prepare(struct inode *src, u64 loff,
@@ -2964,40 +3338,14 @@ static int btrfs_cmp_data_prepare(struct inode *src, u64 loff,
 {
 	int ret;
 	int num_pages = PAGE_ALIGN(len) >> PAGE_SHIFT;
-	struct page **src_pgarr, **dst_pgarr;
 
-	/*
-	 * We must gather up all the pages before we initiate our
-	 * extent locking. We use an array for the page pointers. Size
-	 * of the array is bounded by len, which is in turn bounded by
-	 * BTRFS_MAX_DEDUPE_LEN.
-	 */
-	src_pgarr = kcalloc(num_pages, sizeof(struct page *), GFP_KERNEL);
-	dst_pgarr = kcalloc(num_pages, sizeof(struct page *), GFP_KERNEL);
-	if (!src_pgarr || !dst_pgarr) {
-		kfree(src_pgarr);
-		kfree(dst_pgarr);
-		return -ENOMEM;
-	}
 	cmp->num_pages = num_pages;
-	cmp->src_pages = src_pgarr;
-	cmp->dst_pages = dst_pgarr;
-
-	/*
-	 * If deduping ranges in the same inode, locking rules make it mandatory
-	 * to always lock pages in ascending order to avoid deadlocks with
-	 * concurrent tasks (such as starting writeback/delalloc).
-	 */
-	if (src == dst && dst_loff < loff) {
-		swap(src_pgarr, dst_pgarr);
-		swap(loff, dst_loff);
-	}
 
-	ret = gather_extent_pages(src, src_pgarr, cmp->num_pages, loff);
+	ret = gather_extent_pages(src, cmp->src_pages, num_pages, loff);
 	if (ret)
 		goto out;
 
-	ret = gather_extent_pages(dst, dst_pgarr, cmp->num_pages, dst_loff);
+	ret = gather_extent_pages(dst, cmp->dst_pages, num_pages, dst_loff);
 
 out:
 	if (ret)
@@ -3067,31 +3415,23 @@ static int extent_same_check_offsets(struct inode *inode, u64 off, u64 *plen,
 	return 0;
 }
 
-static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
-			     struct inode *dst, u64 dst_loff)
+static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 olen,
+				   struct inode *dst, u64 dst_loff,
+				   struct cmp_pages *cmp)
 {
 	int ret;
 	u64 len = olen;
-	struct cmp_pages cmp;
 	bool same_inode = (src == dst);
 	u64 same_lock_start = 0;
 	u64 same_lock_len = 0;
 
-	if (len == 0)
-		return 0;
-
-	if (same_inode)
-		inode_lock(src);
-	else
-		btrfs_double_inode_lock(src, dst);
-
 	ret = extent_same_check_offsets(src, loff, &len, olen);
 	if (ret)
-		goto out_unlock;
+		return ret;
 
 	ret = extent_same_check_offsets(dst, dst_loff, &len, olen);
 	if (ret)
-		goto out_unlock;
+		return ret;
 
 	if (same_inode) {
 		/*
@@ -3108,32 +3448,21 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
 		 * allow an unaligned length so long as it ends at
 		 * i_size.
 		 */
-		if (len != olen) {
-			ret = -EINVAL;
-			goto out_unlock;
-		}
+		if (len != olen)
+			return -EINVAL;
 
 		/* Check for overlapping ranges */
-		if (dst_loff + len > loff && dst_loff < loff + len) {
-			ret = -EINVAL;
-			goto out_unlock;
-		}
+		if (dst_loff + len > loff && dst_loff < loff + len)
+			return -EINVAL;
 
 		same_lock_start = min_t(u64, loff, dst_loff);
 		same_lock_len = max_t(u64, loff, dst_loff) + len - same_lock_start;
 	}
 
-	/* don't make the dst file partly checksummed */
-	if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
-	    (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM)) {
-		ret = -EINVAL;
-		goto out_unlock;
-	}
-
 again:
-	ret = btrfs_cmp_data_prepare(src, loff, dst, dst_loff, olen, &cmp);
+	ret = btrfs_cmp_data_prepare(src, loff, dst, dst_loff, olen, cmp);
 	if (ret)
-		goto out_unlock;
+		return ret;
 
 	if (same_inode)
 		ret = lock_extent_range(src, same_lock_start, same_lock_len,
@@ -3154,7 +3483,7 @@ again:
 		 * Ranges in the io trees already unlocked. Now unlock all
 		 * pages before waiting for all IO to complete.
 		 */
-		btrfs_cmp_data_free(&cmp);
+		btrfs_cmp_data_free(cmp);
 		if (same_inode) {
 			btrfs_wait_ordered_range(src, same_lock_start,
 						 same_lock_len);
@@ -3167,12 +3496,12 @@ again:
 	ASSERT(ret == 0);
 	if (WARN_ON(ret)) {
 		/* ranges in the io trees already unlocked */
-		btrfs_cmp_data_free(&cmp);
+		btrfs_cmp_data_free(cmp);
 		return ret;
 	}
 
 	/* pass original length for comparison so we stay within i_size */
-	ret = btrfs_cmp_data(olen, &cmp);
+	ret = btrfs_cmp_data(olen, cmp);
 	if (ret == 0)
 		ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1);
 
@@ -3182,18 +3511,91 @@ again:
 	else
 		btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
 
-	btrfs_cmp_data_free(&cmp);
+	btrfs_cmp_data_free(cmp);
+
+	return ret;
+}
+
+#define BTRFS_MAX_DEDUPE_LEN	SZ_16M
+
+static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
+			     struct inode *dst, u64 dst_loff)
+{
+	int ret;
+	struct cmp_pages cmp;
+	int num_pages = PAGE_ALIGN(BTRFS_MAX_DEDUPE_LEN) >> PAGE_SHIFT;
+	bool same_inode = (src == dst);
+	u64 i, tail_len, chunk_count;
+
+	if (olen == 0)
+		return 0;
+
+	if (same_inode)
+		inode_lock(src);
+	else
+		btrfs_double_inode_lock(src, dst);
+
+	/* don't make the dst file partly checksummed */
+	if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
+	    (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM)) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	tail_len = olen % BTRFS_MAX_DEDUPE_LEN;
+	chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN);
+	if (chunk_count == 0)
+		num_pages = PAGE_ALIGN(tail_len) >> PAGE_SHIFT;
+
+	/*
+	 * If deduping ranges in the same inode, locking rules make it
+	 * mandatory to always lock pages in ascending order to avoid deadlocks
+	 * with concurrent tasks (such as starting writeback/delalloc).
+	 */
+	if (same_inode && dst_loff < loff)
+		swap(loff, dst_loff);
+
+	/*
+	 * We must gather up all the pages before we initiate our extent
+	 * locking. We use an array for the page pointers. Size of the array is
+	 * bounded by len, which is in turn bounded by BTRFS_MAX_DEDUPE_LEN.
+	 */
+	cmp.src_pages = kvmalloc_array(num_pages, sizeof(struct page *),
+				       GFP_KERNEL | __GFP_ZERO);
+	cmp.dst_pages = kvmalloc_array(num_pages, sizeof(struct page *),
+				       GFP_KERNEL | __GFP_ZERO);
+	if (!cmp.src_pages || !cmp.dst_pages) {
+		ret = -ENOMEM;
+		goto out_free;
+	}
+
+	for (i = 0; i < chunk_count; i++) {
+		ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN,
+					      dst, dst_loff, &cmp);
+		if (ret)
+			goto out_unlock;
+
+		loff += BTRFS_MAX_DEDUPE_LEN;
+		dst_loff += BTRFS_MAX_DEDUPE_LEN;
+	}
+
+	if (tail_len > 0)
+		ret = btrfs_extent_same_range(src, loff, tail_len, dst,
+					      dst_loff, &cmp);
+
 out_unlock:
 	if (same_inode)
 		inode_unlock(src);
 	else
 		btrfs_double_inode_unlock(src, dst);
 
+out_free:
+	kvfree(cmp.src_pages);
+	kvfree(cmp.dst_pages);
+
 	return ret;
 }
 
-#define BTRFS_MAX_DEDUPE_LEN	SZ_16M
-
 ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen,
 				struct file *dst_file, u64 dst_loff)
 {
@@ -3202,9 +3604,6 @@ ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen,
 	u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
 	ssize_t res;
 
-	if (olen > BTRFS_MAX_DEDUPE_LEN)
-		olen = BTRFS_MAX_DEDUPE_LEN;
-
 	if (WARN_ON_ONCE(bs < PAGE_SIZE)) {
 		/*
 		 * Btrfs does not support blocksize < page_size. As a
@@ -3826,11 +4225,6 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
 	    src->i_sb != inode->i_sb)
 		return -EXDEV;
 
-	/* don't make the dst file partly checksummed */
-	if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
-	    (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM))
-		return -EINVAL;
-
 	if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
 		return -EISDIR;
 
@@ -3840,6 +4234,13 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
 		inode_lock(src);
 	}
 
+	/* don't make the dst file partly checksummed */
+	if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
+	    (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
 	/* determine range to clone */
 	ret = -EINVAL;
 	if (off + len > src->i_size || off + len < off)
@@ -4007,8 +4408,8 @@ out:
 	return ret;
 }
 
-void btrfs_get_block_group_info(struct list_head *groups_list,
-				struct btrfs_ioctl_space_info *space)
+static void get_block_group_info(struct list_head *groups_list,
+				 struct btrfs_ioctl_space_info *space)
 {
 	struct btrfs_block_group_cache *block_group;
 
@@ -4124,8 +4525,8 @@ static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info,
 		down_read(&info->groups_sem);
 		for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
 			if (!list_empty(&info->block_groups[c])) {
-				btrfs_get_block_group_info(
-					&info->block_groups[c], &space);
+				get_block_group_info(&info->block_groups[c],
+						     &space);
 				memcpy(dest, &space, sizeof(space));
 				dest++;
 				space_args.total_spaces++;
@@ -4490,14 +4891,14 @@ out_loi:
 	return ret;
 }
 
-void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
+void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
 			       struct btrfs_ioctl_balance_args *bargs)
 {
 	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
 
 	bargs->flags = bctl->flags;
 
-	if (atomic_read(&fs_info->balance_running))
+	if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags))
 		bargs->state |= BTRFS_BALANCE_STATE_RUNNING;
 	if (atomic_read(&fs_info->balance_pause_req))
 		bargs->state |= BTRFS_BALANCE_STATE_PAUSE_REQ;
@@ -4508,13 +4909,9 @@ void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
 	memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta));
 	memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys));
 
-	if (lock) {
-		spin_lock(&fs_info->balance_lock);
-		memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
-		spin_unlock(&fs_info->balance_lock);
-	} else {
-		memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
-	}
+	spin_lock(&fs_info->balance_lock);
+	memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
+	spin_unlock(&fs_info->balance_lock);
 }
 
 static long btrfs_ioctl_balance(struct file *file, void __user *arg)
@@ -4535,7 +4932,6 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
 
 again:
 	if (!test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
-		mutex_lock(&fs_info->volume_mutex);
 		mutex_lock(&fs_info->balance_mutex);
 		need_unlock = true;
 		goto locked;
@@ -4550,21 +4946,22 @@ again:
 	mutex_lock(&fs_info->balance_mutex);
 	if (fs_info->balance_ctl) {
 		/* this is either (2) or (3) */
-		if (!atomic_read(&fs_info->balance_running)) {
+		if (!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
 			mutex_unlock(&fs_info->balance_mutex);
-			if (!mutex_trylock(&fs_info->volume_mutex))
-				goto again;
+			/*
+			 * Lock released to allow other waiters to continue,
+			 * we'll reexamine the status again.
+			 */
 			mutex_lock(&fs_info->balance_mutex);
 
 			if (fs_info->balance_ctl &&
-			    !atomic_read(&fs_info->balance_running)) {
+			    !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
 				/* this is (3) */
 				need_unlock = false;
 				goto locked;
 			}
 
 			mutex_unlock(&fs_info->balance_mutex);
-			mutex_unlock(&fs_info->volume_mutex);
 			goto again;
 		} else {
 			/* this is (2) */
@@ -4617,7 +5014,6 @@ locked:
 		goto out_bargs;
 	}
 
-	bctl->fs_info = fs_info;
 	if (arg) {
 		memcpy(&bctl->data, &bargs->data, sizeof(bctl->data));
 		memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta));
@@ -4636,14 +5032,14 @@ locked:
 
 do_balance:
 	/*
-	 * Ownership of bctl and filesystem flag BTRFS_FS_EXCL_OP
-	 * goes to to btrfs_balance.  bctl is freed in __cancel_balance,
-	 * or, if restriper was paused all the way until unmount, in
-	 * free_fs_info.  The flag is cleared in __cancel_balance.
+	 * Ownership of bctl and filesystem flag BTRFS_FS_EXCL_OP goes to
+	 * btrfs_balance.  bctl is freed in reset_balance_state, or, if
+	 * restriper was paused all the way until unmount, in free_fs_info.
+	 * The flag should be cleared after reset_balance_state.
 	 */
 	need_unlock = false;
 
-	ret = btrfs_balance(bctl, bargs);
+	ret = btrfs_balance(fs_info, bctl, bargs);
 	bctl = NULL;
 
 	if (arg) {
@@ -4657,7 +5053,6 @@ out_bargs:
 	kfree(bargs);
 out_unlock:
 	mutex_unlock(&fs_info->balance_mutex);
-	mutex_unlock(&fs_info->volume_mutex);
 	if (need_unlock)
 		clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
 out:
@@ -4701,7 +5096,7 @@ static long btrfs_ioctl_balance_progress(struct btrfs_fs_info *fs_info,
 		goto out;
 	}
 
-	update_ioctl_balance_args(fs_info, 1, bargs);
+	btrfs_update_ioctl_balance_args(fs_info, bargs);
 
 	if (copy_to_user(arg, bargs, sizeof(*bargs)))
 		ret = -EFAULT;
@@ -5038,8 +5433,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file,
 				       BTRFS_UUID_SIZE);
 	if (received_uuid_changed &&
 	    !btrfs_is_empty_uuid(root_item->received_uuid)) {
-		ret = btrfs_uuid_tree_rem(trans, fs_info,
-					  root_item->received_uuid,
+		ret = btrfs_uuid_tree_remove(trans, root_item->received_uuid,
 					  BTRFS_UUID_KEY_RECEIVED_SUBVOL,
 					  root->root_key.objectid);
 		if (ret && ret != -ENOENT) {
@@ -5063,7 +5457,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file,
 		goto out;
 	}
 	if (received_uuid_changed && !btrfs_is_empty_uuid(sa->uuid)) {
-		ret = btrfs_uuid_tree_add(trans, fs_info, sa->uuid,
+		ret = btrfs_uuid_tree_add(trans, sa->uuid,
 					  BTRFS_UUID_KEY_RECEIVED_SUBVOL,
 					  root->root_key.objectid);
 		if (ret < 0 && ret != -EEXIST) {
@@ -5497,7 +5891,7 @@ long btrfs_ioctl(struct file *file, unsigned int
 	case BTRFS_IOC_SYNC: {
 		int ret;
 
-		ret = btrfs_start_delalloc_roots(fs_info, 0, -1);
+		ret = btrfs_start_delalloc_roots(fs_info, -1);
 		if (ret)
 			return ret;
 		ret = btrfs_sync_fs(inode->i_sb, 1);
@@ -5565,6 +5959,16 @@ long btrfs_ioctl(struct file *file, unsigned int
 		return btrfs_ioctl_get_features(file, argp);
 	case BTRFS_IOC_SET_FEATURES:
 		return btrfs_ioctl_set_features(file, argp);
+	case FS_IOC_FSGETXATTR:
+		return btrfs_ioctl_fsgetxattr(file, argp);
+	case FS_IOC_FSSETXATTR:
+		return btrfs_ioctl_fssetxattr(file, argp);
+	case BTRFS_IOC_GET_SUBVOL_INFO:
+		return btrfs_ioctl_get_subvol_info(file, argp);
+	case BTRFS_IOC_GET_SUBVOL_ROOTREF:
+		return btrfs_ioctl_get_subvol_rootref(file, argp);
+	case BTRFS_IOC_INO_LOOKUP_USER:
+		return btrfs_ioctl_ino_lookup_user(file, argp);
 	}
 
 	return -ENOTTY;
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index e4faefa..1da768e 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -66,22 +66,16 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
 		write_lock(&eb->lock);
 		WARN_ON(atomic_read(&eb->spinning_writers));
 		atomic_inc(&eb->spinning_writers);
-		/*
-		 * atomic_dec_and_test implies a barrier for waitqueue_active
-		 */
-		if (atomic_dec_and_test(&eb->blocking_writers) &&
-		    waitqueue_active(&eb->write_lock_wq))
-			wake_up(&eb->write_lock_wq);
+		/* atomic_dec_and_test implies a barrier */
+		if (atomic_dec_and_test(&eb->blocking_writers))
+			cond_wake_up_nomb(&eb->write_lock_wq);
 	} else if (rw == BTRFS_READ_LOCK_BLOCKING) {
 		BUG_ON(atomic_read(&eb->blocking_readers) == 0);
 		read_lock(&eb->lock);
 		atomic_inc(&eb->spinning_readers);
-		/*
-		 * atomic_dec_and_test implies a barrier for waitqueue_active
-		 */
-		if (atomic_dec_and_test(&eb->blocking_readers) &&
-		    waitqueue_active(&eb->read_lock_wq))
-			wake_up(&eb->read_lock_wq);
+		/* atomic_dec_and_test implies a barrier */
+		if (atomic_dec_and_test(&eb->blocking_readers))
+			cond_wake_up_nomb(&eb->read_lock_wq);
 	}
 }
 
@@ -221,12 +215,9 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
 	}
 	btrfs_assert_tree_read_locked(eb);
 	WARN_ON(atomic_read(&eb->blocking_readers) == 0);
-	/*
-	 * atomic_dec_and_test implies a barrier for waitqueue_active
-	 */
-	if (atomic_dec_and_test(&eb->blocking_readers) &&
-	    waitqueue_active(&eb->read_lock_wq))
-		wake_up(&eb->read_lock_wq);
+	/* atomic_dec_and_test implies a barrier */
+	if (atomic_dec_and_test(&eb->blocking_readers))
+		cond_wake_up_nomb(&eb->read_lock_wq);
 	atomic_dec(&eb->read_locks);
 }
 
@@ -275,12 +266,9 @@ void btrfs_tree_unlock(struct extent_buffer *eb)
 	if (blockers) {
 		WARN_ON(atomic_read(&eb->spinning_writers));
 		atomic_dec(&eb->blocking_writers);
-		/*
-		 * Make sure counter is updated before we wake up waiters.
-		 */
+		/* Use the lighter barrier after atomic */
 		smp_mb__after_atomic();
-		if (waitqueue_active(&eb->write_lock_wq))
-			wake_up(&eb->write_lock_wq);
+		cond_wake_up_nomb(&eb->write_lock_wq);
 	} else {
 		WARN_ON(atomic_read(&eb->spinning_writers) != 1);
 		atomic_dec(&eb->spinning_writers);
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 0667ea0..b6a4cc1 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -17,6 +17,43 @@
 
 #define LZO_LEN	4
 
+/*
+ * Btrfs LZO compression format
+ *
+ * Regular and inlined LZO compressed data extents consist of:
+ *
+ * 1.  Header
+ *     Fixed size. LZO_LEN (4) bytes long, LE32.
+ *     Records the total size (including the header) of compressed data.
+ *
+ * 2.  Segment(s)
+ *     Variable size. Each segment includes one segment header, followd by data
+ *     payload.
+ *     One regular LZO compressed extent can have one or more segments.
+ *     For inlined LZO compressed extent, only one segment is allowed.
+ *     One segment represents at most one page of uncompressed data.
+ *
+ * 2.1 Segment header
+ *     Fixed size. LZO_LEN (4) bytes long, LE32.
+ *     Records the total size of the segment (not including the header).
+ *     Segment header never crosses page boundary, thus it's possible to
+ *     have at most 3 padding zeros at the end of the page.
+ *
+ * 2.2 Data Payload
+ *     Variable size. Size up limit should be lzo1x_worst_compress(PAGE_SIZE)
+ *     which is 4419 for a 4KiB page.
+ *
+ * Example:
+ * Page 1:
+ *          0     0x2   0x4   0x6   0x8   0xa   0xc   0xe     0x10
+ * 0x0000   |  Header   | SegHdr 01 | Data payload 01 ...     |
+ * ...
+ * 0x0ff0   | SegHdr  N | Data payload  N     ...          |00|
+ *                                                          ^^ padding zeros
+ * Page 2:
+ * 0x1000   | SegHdr N+1| Data payload N+1 ...                |
+ */
+
 struct workspace {
 	void *mem;
 	void *buf;	/* where decompressed data goes */
@@ -258,6 +295,7 @@ static int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
 	unsigned long working_bytes;
 	size_t in_len;
 	size_t out_len;
+	const size_t max_segment_len = lzo1x_worst_compress(PAGE_SIZE);
 	unsigned long in_offset;
 	unsigned long in_page_bytes_left;
 	unsigned long tot_in;
@@ -271,10 +309,22 @@ static int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
 
 	data_in = kmap(pages_in[0]);
 	tot_len = read_compress_length(data_in);
+	/*
+	 * Compressed data header check.
+	 *
+	 * The real compressed size can't exceed the maximum extent length, and
+	 * all pages should be used (whole unused page with just the segment
+	 * header is not possible).  If this happens it means the compressed
+	 * extent is corrupted.
+	 */
+	if (tot_len > min_t(size_t, BTRFS_MAX_COMPRESSED, srclen) ||
+	    tot_len < srclen - PAGE_SIZE) {
+		ret = -EUCLEAN;
+		goto done;
+	}
 
 	tot_in = LZO_LEN;
 	in_offset = LZO_LEN;
-	tot_len = min_t(size_t, srclen, tot_len);
 	in_page_bytes_left = PAGE_SIZE - LZO_LEN;
 
 	tot_out = 0;
@@ -285,6 +335,17 @@ static int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
 		in_offset += LZO_LEN;
 		tot_in += LZO_LEN;
 
+		/*
+		 * Segment header check.
+		 *
+		 * The segment length must not exceed the maximum LZO
+		 * compression size, nor the total compressed size.
+		 */
+		if (in_len > max_segment_len || tot_in + in_len > tot_len) {
+			ret = -EUCLEAN;
+			goto done;
+		}
+
 		tot_in += in_len;
 		working_bytes = in_len;
 		may_late_unmap = need_unmap = false;
@@ -335,7 +396,7 @@ cont:
 			}
 		}
 
-		out_len = lzo1x_worst_compress(PAGE_SIZE);
+		out_len = max_segment_len;
 		ret = lzo1x_decompress_safe(buf, in_len, workspace->buf,
 					    &out_len);
 		if (need_unmap)
@@ -369,15 +430,24 @@ static int lzo_decompress(struct list_head *ws, unsigned char *data_in,
 	struct workspace *workspace = list_entry(ws, struct workspace, list);
 	size_t in_len;
 	size_t out_len;
+	size_t max_segment_len = lzo1x_worst_compress(PAGE_SIZE);
 	int ret = 0;
 	char *kaddr;
 	unsigned long bytes;
 
-	BUG_ON(srclen < LZO_LEN);
+	if (srclen < LZO_LEN || srclen > max_segment_len + LZO_LEN * 2)
+		return -EUCLEAN;
 
+	in_len = read_compress_length(data_in);
+	if (in_len != srclen)
+		return -EUCLEAN;
 	data_in += LZO_LEN;
 
 	in_len = read_compress_length(data_in);
+	if (in_len != srclen - LZO_LEN * 2) {
+		ret = -EUCLEAN;
+		goto out;
+	}
 	data_in += LZO_LEN;
 
 	out_len = PAGE_SIZE;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 6db8bb2..2e1a169 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -343,11 +343,8 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode,
 
 	if (entry->bytes_left == 0) {
 		ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
-		/*
-		 * Implicit memory barrier after test_and_set_bit
-		 */
-		if (waitqueue_active(&entry->wait))
-			wake_up(&entry->wait);
+		/* test_and_set_bit implies a barrier */
+		cond_wake_up_nomb(&entry->wait);
 	} else {
 		ret = 1;
 	}
@@ -410,11 +407,8 @@ have_entry:
 
 	if (entry->bytes_left == 0) {
 		ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
-		/*
-		 * Implicit memory barrier after test_and_set_bit
-		 */
-		if (waitqueue_active(&entry->wait))
-			wake_up(&entry->wait);
+		/* test_and_set_bit implies a barrier */
+		cond_wake_up_nomb(&entry->wait);
 	} else {
 		ret = 1;
 	}
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 21a831d..a4e11cf 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -166,6 +166,25 @@ static void print_uuid_item(struct extent_buffer *l, unsigned long offset,
 	}
 }
 
+/*
+ * Helper to output refs and locking status of extent buffer.  Useful to debug
+ * race condition related problems.
+ */
+static void print_eb_refs_lock(struct extent_buffer *eb)
+{
+#ifdef CONFIG_BTRFS_DEBUG
+	btrfs_info(eb->fs_info,
+"refs %u lock (w:%d r:%d bw:%d br:%d sw:%d sr:%d) lock_owner %u current %u",
+		   atomic_read(&eb->refs), atomic_read(&eb->write_locks),
+		   atomic_read(&eb->read_locks),
+		   atomic_read(&eb->blocking_writers),
+		   atomic_read(&eb->blocking_readers),
+		   atomic_read(&eb->spinning_writers),
+		   atomic_read(&eb->spinning_readers),
+		   eb->lock_owner, current->pid);
+#endif
+}
+
 void btrfs_print_leaf(struct extent_buffer *l)
 {
 	struct btrfs_fs_info *fs_info;
@@ -193,6 +212,7 @@ void btrfs_print_leaf(struct extent_buffer *l)
 		   "leaf %llu gen %llu total ptrs %d free space %d owner %llu",
 		   btrfs_header_bytenr(l), btrfs_header_generation(l), nr,
 		   btrfs_leaf_free_space(fs_info, l), btrfs_header_owner(l));
+	print_eb_refs_lock(l);
 	for (i = 0 ; i < nr ; i++) {
 		item = btrfs_item_nr(i);
 		btrfs_item_key_to_cpu(l, &key, i);
@@ -347,6 +367,7 @@ void btrfs_print_tree(struct extent_buffer *c, bool follow)
 		   btrfs_header_bytenr(c), level, btrfs_header_generation(c),
 		   nr, (u32)BTRFS_NODEPTRS_PER_BLOCK(fs_info) - nr,
 		   btrfs_header_owner(c));
+	print_eb_refs_lock(c);
 	for (i = 0; i < nr; i++) {
 		btrfs_node_key_to_cpu(c, &key, i);
 		pr_info("\tkey %d (%llu %u %llu) block %llu gen %llu\n",
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 9fb758d..1874a6d 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1882,8 +1882,8 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
 		cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq);
 		cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq);
 
-		trace_qgroup_update_counters(fs_info, qg->qgroupid,
-					     cur_old_count, cur_new_count);
+		trace_qgroup_update_counters(fs_info, qg, cur_old_count,
+					     cur_new_count);
 
 		/* Rfer update part */
 		if (cur_old_count == 0 && cur_new_count > 0) {
@@ -2014,8 +2014,8 @@ btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans,
 
 	BUG_ON(!fs_info->quota_root);
 
-	trace_btrfs_qgroup_account_extent(fs_info, bytenr, num_bytes,
-					  nr_old_roots, nr_new_roots);
+	trace_btrfs_qgroup_account_extent(fs_info, trans->transid, bytenr,
+					num_bytes, nr_old_roots, nr_new_roots);
 
 	qgroups = ulist_alloc(GFP_NOFS);
 	if (!qgroups) {
@@ -2580,6 +2580,21 @@ out:
 }
 
 /*
+ * Check if the leaf is the last leaf. Which means all node pointers
+ * are at their last position.
+ */
+static bool is_last_leaf(struct btrfs_path *path)
+{
+	int i;
+
+	for (i = 1; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
+		if (path->slots[i] != btrfs_header_nritems(path->nodes[i]) - 1)
+			return false;
+	}
+	return true;
+}
+
+/*
  * returns < 0 on error, 0 when more leafs are to be scanned.
  * returns 1 when done.
  */
@@ -2590,8 +2605,8 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
 	struct btrfs_key found;
 	struct extent_buffer *scratch_leaf = NULL;
 	struct ulist *roots = NULL;
-	struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
 	u64 num_bytes;
+	bool done;
 	int slot;
 	int ret;
 
@@ -2620,12 +2635,12 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
 		mutex_unlock(&fs_info->qgroup_rescan_lock);
 		return ret;
 	}
+	done = is_last_leaf(path);
 
 	btrfs_item_key_to_cpu(path->nodes[0], &found,
 			      btrfs_header_nritems(path->nodes[0]) - 1);
 	fs_info->qgroup_rescan_progress.objectid = found.objectid + 1;
 
-	btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
 	scratch_leaf = btrfs_clone_extent_buffer(path->nodes[0]);
 	if (!scratch_leaf) {
 		ret = -ENOMEM;
@@ -2664,8 +2679,9 @@ out:
 		btrfs_tree_read_unlock_blocking(scratch_leaf);
 		free_extent_buffer(scratch_leaf);
 	}
-	btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
 
+	if (done && !ret)
+		ret = 1;
 	return ret;
 }
 
@@ -2681,6 +2697,12 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
 	path = btrfs_alloc_path();
 	if (!path)
 		goto out;
+	/*
+	 * Rescan should only search for commit root, and any later difference
+	 * should be recorded by qgroup
+	 */
+	path->search_commit_root = 1;
+	path->skip_locking = 1;
 
 	err = 0;
 	while (!err && !btrfs_fs_closing(fs_info)) {
@@ -2760,26 +2782,36 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
 {
 	int ret = 0;
 
-	if (!init_flags &&
-	    (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) ||
-	     !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))) {
-		ret = -EINVAL;
-		goto err;
+	if (!init_flags) {
+		/* we're resuming qgroup rescan at mount time */
+		if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN))
+			btrfs_warn(fs_info,
+			"qgroup rescan init failed, qgroup is not enabled");
+		else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
+			btrfs_warn(fs_info,
+			"qgroup rescan init failed, qgroup rescan is not queued");
+		return -EINVAL;
 	}
 
 	mutex_lock(&fs_info->qgroup_rescan_lock);
 	spin_lock(&fs_info->qgroup_lock);
 
 	if (init_flags) {
-		if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
+		if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
+			btrfs_warn(fs_info,
+				   "qgroup rescan is already in progress");
 			ret = -EINPROGRESS;
-		else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
+		} else if (!(fs_info->qgroup_flags &
+			     BTRFS_QGROUP_STATUS_FLAG_ON)) {
+			btrfs_warn(fs_info,
+			"qgroup rescan init failed, qgroup is not enabled");
 			ret = -EINVAL;
+		}
 
 		if (ret) {
 			spin_unlock(&fs_info->qgroup_lock);
 			mutex_unlock(&fs_info->qgroup_rescan_lock);
-			goto err;
+			return ret;
 		}
 		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
 	}
@@ -2798,13 +2830,6 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
 	btrfs_init_work(&fs_info->qgroup_rescan_work,
 			btrfs_qgroup_rescan_helper,
 			btrfs_qgroup_rescan_worker, NULL, NULL);
-
-	if (ret) {
-err:
-		btrfs_info(fs_info, "qgroup_rescan_init failed with %d", ret);
-		return ret;
-	}
-
 	return 0;
 }
 
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 9abd950..5e4ad134 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -163,6 +163,12 @@ struct btrfs_raid_bio {
 	 * bitmap to record which horizontal stripe has data
 	 */
 	unsigned long *dbitmap;
+
+	/* allocated with real_stripes-many pointers for finish_*() calls */
+	void **finish_pointers;
+
+	/* allocated with stripe_npages-many bits for finish_*() calls */
+	unsigned long *finish_pbitmap;
 };
 
 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
@@ -981,9 +987,14 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
 	int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
 	void *p;
 
-	rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2 +
-		       DIV_ROUND_UP(stripe_npages, BITS_PER_LONG) *
-		       sizeof(long), GFP_NOFS);
+	rbio = kzalloc(sizeof(*rbio) +
+		       sizeof(*rbio->stripe_pages) * num_pages +
+		       sizeof(*rbio->bio_pages) * num_pages +
+		       sizeof(*rbio->finish_pointers) * real_stripes +
+		       sizeof(*rbio->dbitmap) * BITS_TO_LONGS(stripe_npages) +
+		       sizeof(*rbio->finish_pbitmap) *
+				BITS_TO_LONGS(stripe_npages),
+		       GFP_NOFS);
 	if (!rbio)
 		return ERR_PTR(-ENOMEM);
 
@@ -1005,13 +1016,20 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
 	atomic_set(&rbio->stripes_pending, 0);
 
 	/*
-	 * the stripe_pages and bio_pages array point to the extra
+	 * the stripe_pages, bio_pages, etc arrays point to the extra
 	 * memory we allocated past the end of the rbio
 	 */
 	p = rbio + 1;
-	rbio->stripe_pages = p;
-	rbio->bio_pages = p + sizeof(struct page *) * num_pages;
-	rbio->dbitmap = p + sizeof(struct page *) * num_pages * 2;
+#define CONSUME_ALLOC(ptr, count)	do {				\
+		ptr = p;						\
+		p = (unsigned char *)p + sizeof(*(ptr)) * (count);	\
+	} while (0)
+	CONSUME_ALLOC(rbio->stripe_pages, num_pages);
+	CONSUME_ALLOC(rbio->bio_pages, num_pages);
+	CONSUME_ALLOC(rbio->finish_pointers, real_stripes);
+	CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_npages));
+	CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_npages));
+#undef  CONSUME_ALLOC
 
 	if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
 		nr_data = real_stripes - 1;
@@ -1180,7 +1198,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio)
 static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
 {
 	struct btrfs_bio *bbio = rbio->bbio;
-	void *pointers[rbio->real_stripes];
+	void **pointers = rbio->finish_pointers;
 	int nr_data = rbio->nr_data;
 	int stripe;
 	int pagenr;
@@ -2350,8 +2368,8 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
 					 int need_check)
 {
 	struct btrfs_bio *bbio = rbio->bbio;
-	void *pointers[rbio->real_stripes];
-	DECLARE_BITMAP(pbitmap, rbio->stripe_npages);
+	void **pointers = rbio->finish_pointers;
+	unsigned long *pbitmap = rbio->finish_pbitmap;
 	int nr_data = rbio->nr_data;
 	int stripe;
 	int pagenr;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index b041b94..879b76f 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -4299,7 +4299,7 @@ out:
 	return inode;
 }
 
-static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
+static struct reloc_control *alloc_reloc_control(void)
 {
 	struct reloc_control *rc;
 
@@ -4344,7 +4344,7 @@ static void describe_relocation(struct btrfs_fs_info *fs_info,
 		DESCRIBE_FLAG(RAID5,    "raid5");
 		DESCRIBE_FLAG(RAID6,    "raid6");
 		if (flags)
-			snprintf(buf, buf - bp + sizeof(buf), "|0x%llx", flags);
+			snprintf(bp, buf - bp + sizeof(buf), "|0x%llx", flags);
 #undef DESCRIBE_FLAG
 	}
 
@@ -4366,7 +4366,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
 	int rw = 0;
 	int err = 0;
 
-	rc = alloc_reloc_control(fs_info);
+	rc = alloc_reloc_control();
 	if (!rc)
 		return -ENOMEM;
 
@@ -4562,7 +4562,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
 	if (list_empty(&reloc_roots))
 		goto out;
 
-	rc = alloc_reloc_control(fs_info);
+	rc = alloc_reloc_control();
 	if (!rc) {
 		err = -ENOMEM;
 		goto out;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 52b39a0..a590058 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3984,6 +3984,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
 			spin_lock(&fs_info->unused_bgs_lock);
 			if (list_empty(&cache->bg_list)) {
 				btrfs_get_block_group(cache);
+				trace_btrfs_add_unused_block_group(cache);
 				list_add_tail(&cache->bg_list,
 					      &fs_info->unused_bgs);
 			}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index c0074d2..c47f62b 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -235,6 +235,7 @@ struct orphan_dir_info {
 	struct rb_node node;
 	u64 ino;
 	u64 gen;
+	u64 last_dir_index_offset;
 };
 
 struct name_cache_entry {
@@ -2844,12 +2845,6 @@ add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
 	struct rb_node *parent = NULL;
 	struct orphan_dir_info *entry, *odi;
 
-	odi = kmalloc(sizeof(*odi), GFP_KERNEL);
-	if (!odi)
-		return ERR_PTR(-ENOMEM);
-	odi->ino = dir_ino;
-	odi->gen = 0;
-
 	while (*p) {
 		parent = *p;
 		entry = rb_entry(parent, struct orphan_dir_info, node);
@@ -2858,11 +2853,17 @@ add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
 		} else if (dir_ino > entry->ino) {
 			p = &(*p)->rb_right;
 		} else {
-			kfree(odi);
 			return entry;
 		}
 	}
 
+	odi = kmalloc(sizeof(*odi), GFP_KERNEL);
+	if (!odi)
+		return ERR_PTR(-ENOMEM);
+	odi->ino = dir_ino;
+	odi->gen = 0;
+	odi->last_dir_index_offset = 0;
+
 	rb_link_node(&odi->node, parent, p);
 	rb_insert_color(&odi->node, &sctx->orphan_dirs);
 	return odi;
@@ -2917,6 +2918,7 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
 	struct btrfs_key found_key;
 	struct btrfs_key loc;
 	struct btrfs_dir_item *di;
+	struct orphan_dir_info *odi = NULL;
 
 	/*
 	 * Don't try to rmdir the top/root subvolume dir.
@@ -2931,6 +2933,11 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
 	key.objectid = dir;
 	key.type = BTRFS_DIR_INDEX_KEY;
 	key.offset = 0;
+
+	odi = get_orphan_dir_info(sctx, dir);
+	if (odi)
+		key.offset = odi->last_dir_index_offset;
+
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 	if (ret < 0)
 		goto out;
@@ -2958,30 +2965,33 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
 
 		dm = get_waiting_dir_move(sctx, loc.objectid);
 		if (dm) {
-			struct orphan_dir_info *odi;
-
 			odi = add_orphan_dir_info(sctx, dir);
 			if (IS_ERR(odi)) {
 				ret = PTR_ERR(odi);
 				goto out;
 			}
 			odi->gen = dir_gen;
+			odi->last_dir_index_offset = found_key.offset;
 			dm->rmdir_ino = dir;
 			ret = 0;
 			goto out;
 		}
 
 		if (loc.objectid > send_progress) {
-			struct orphan_dir_info *odi;
-
-			odi = get_orphan_dir_info(sctx, dir);
-			free_orphan_dir_info(sctx, odi);
+			odi = add_orphan_dir_info(sctx, dir);
+			if (IS_ERR(odi)) {
+				ret = PTR_ERR(odi);
+				goto out;
+			}
+			odi->gen = dir_gen;
+			odi->last_dir_index_offset = found_key.offset;
 			ret = 0;
 			goto out;
 		}
 
 		path->slots[0]++;
 	}
+	free_orphan_dir_info(sctx, odi);
 
 	ret = 1;
 
@@ -3259,13 +3269,16 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
 
 	if (rmdir_ino) {
 		struct orphan_dir_info *odi;
+		u64 gen;
 
 		odi = get_orphan_dir_info(sctx, rmdir_ino);
 		if (!odi) {
 			/* already deleted */
 			goto finish;
 		}
-		ret = can_rmdir(sctx, rmdir_ino, odi->gen, sctx->cur_ino);
+		gen = odi->gen;
+
+		ret = can_rmdir(sctx, rmdir_ino, gen, sctx->cur_ino);
 		if (ret < 0)
 			goto out;
 		if (!ret)
@@ -3276,13 +3289,12 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
 			ret = -ENOMEM;
 			goto out;
 		}
-		ret = get_cur_path(sctx, rmdir_ino, odi->gen, name);
+		ret = get_cur_path(sctx, rmdir_ino, gen, name);
 		if (ret < 0)
 			goto out;
 		ret = send_rmdir(sctx, name);
 		if (ret < 0)
 			goto out;
-		free_orphan_dir_info(sctx, odi);
 	}
 
 finish:
@@ -6454,7 +6466,7 @@ static void btrfs_root_dec_send_in_progress(struct btrfs_root* root)
 	 */
 	if (root->send_in_progress < 0)
 		btrfs_err(root->fs_info,
-			  "send_in_progres unbalanced %d root %llu",
+			  "send_in_progress unbalanced %d root %llu",
 			  root->send_in_progress, root->root_key.objectid);
 	spin_unlock(&root->root_item_lock);
 }
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 0628092..81107ad 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -323,6 +323,7 @@ enum {
 	Opt_ssd, Opt_nossd,
 	Opt_ssd_spread, Opt_nossd_spread,
 	Opt_subvol,
+	Opt_subvol_empty,
 	Opt_subvolid,
 	Opt_thread_pool,
 	Opt_treelog, Opt_notreelog,
@@ -388,6 +389,7 @@ static const match_table_t tokens = {
 	{Opt_ssd_spread, "ssd_spread"},
 	{Opt_nossd_spread, "nossd_spread"},
 	{Opt_subvol, "subvol=%s"},
+	{Opt_subvol_empty, "subvol="},
 	{Opt_subvolid, "subvolid=%s"},
 	{Opt_thread_pool, "thread_pool=%u"},
 	{Opt_treelog, "treelog"},
@@ -461,6 +463,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
 			btrfs_set_opt(info->mount_opt, DEGRADED);
 			break;
 		case Opt_subvol:
+		case Opt_subvol_empty:
 		case Opt_subvolid:
 		case Opt_subvolrootid:
 		case Opt_device:
@@ -1782,10 +1785,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 	}
 
 	ret = btrfs_parse_options(fs_info, data, *flags);
-	if (ret) {
-		ret = -EINVAL;
+	if (ret)
 		goto restore;
-	}
 
 	btrfs_remount_begin(fs_info, old_opts, *flags);
 	btrfs_resize_thread_pool(fs_info,
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 4848a43..4a4e960 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -210,12 +210,42 @@ static struct attribute *btrfs_supported_feature_attrs[] = {
 	NULL
 };
 
+/*
+ * Features which depend on feature bits and may differ between each fs.
+ *
+ * /sys/fs/btrfs/features lists all available features of this kernel while
+ * /sys/fs/btrfs/UUID/features shows features of the fs which are enabled or
+ * can be changed online.
+ */
 static const struct attribute_group btrfs_feature_attr_group = {
 	.name = "features",
 	.is_visible = btrfs_feature_visible,
 	.attrs = btrfs_supported_feature_attrs,
 };
 
+static ssize_t rmdir_subvol_show(struct kobject *kobj,
+				 struct kobj_attribute *ka, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "0\n");
+}
+BTRFS_ATTR(static_feature, rmdir_subvol, rmdir_subvol_show);
+
+static struct attribute *btrfs_supported_static_feature_attrs[] = {
+	BTRFS_ATTR_PTR(static_feature, rmdir_subvol),
+	NULL
+};
+
+/*
+ * Features which only depend on kernel version.
+ *
+ * These are listed in /sys/fs/btrfs/features along with
+ * btrfs_feature_attr_group
+ */
+static const struct attribute_group btrfs_static_feature_attr_group = {
+	.name = "features",
+	.attrs = btrfs_supported_static_feature_attrs,
+};
+
 static ssize_t btrfs_show_u64(u64 *value_ptr, spinlock_t *lock, char *buf)
 {
 	u64 val;
@@ -514,10 +544,11 @@ static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj)
 }
 
 #define NUM_FEATURE_BITS 64
-static char btrfs_unknown_feature_names[3][NUM_FEATURE_BITS][13];
-static struct btrfs_feature_attr btrfs_feature_attrs[3][NUM_FEATURE_BITS];
+#define BTRFS_FEATURE_NAME_MAX 13
+static char btrfs_unknown_feature_names[FEAT_MAX][NUM_FEATURE_BITS][BTRFS_FEATURE_NAME_MAX];
+static struct btrfs_feature_attr btrfs_feature_attrs[FEAT_MAX][NUM_FEATURE_BITS];
 
-static const u64 supported_feature_masks[3] = {
+static const u64 supported_feature_masks[FEAT_MAX] = {
 	[FEAT_COMPAT]    = BTRFS_FEATURE_COMPAT_SUPP,
 	[FEAT_COMPAT_RO] = BTRFS_FEATURE_COMPAT_RO_SUPP,
 	[FEAT_INCOMPAT]  = BTRFS_FEATURE_INCOMPAT_SUPP,
@@ -589,7 +620,7 @@ void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs)
 		return;
 	}
 
-	list_for_each_entry(fs_devs, fs_uuids, list) {
+	list_for_each_entry(fs_devs, fs_uuids, fs_list) {
 		__btrfs_sysfs_remove_fsid(fs_devs);
 	}
 }
@@ -609,7 +640,7 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info)
 	btrfs_sysfs_rm_device_link(fs_info->fs_devices, NULL);
 }
 
-const char * const btrfs_feature_set_names[3] = {
+const char * const btrfs_feature_set_names[FEAT_MAX] = {
 	[FEAT_COMPAT]	 = "compat",
 	[FEAT_COMPAT_RO] = "compat_ro",
 	[FEAT_INCOMPAT]	 = "incompat",
@@ -673,7 +704,7 @@ static void init_feature_attrs(void)
 			if (fa->kobj_attr.attr.name)
 				continue;
 
-			snprintf(name, 13, "%s:%u",
+			snprintf(name, BTRFS_FEATURE_NAME_MAX, "%s:%u",
 				 btrfs_feature_set_names[set], i);
 
 			fa->kobj_attr.attr.name = name;
@@ -900,8 +931,15 @@ int __init btrfs_init_sysfs(void)
 	ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
 	if (ret)
 		goto out2;
+	ret = sysfs_merge_group(&btrfs_kset->kobj,
+				&btrfs_static_feature_attr_group);
+	if (ret)
+		goto out_remove_group;
 
 	return 0;
+
+out_remove_group:
+	sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
 out2:
 	debugfs_remove_recursive(btrfs_debugfs_root_dentry);
 out1:
@@ -912,6 +950,8 @@ out1:
 
 void __cold btrfs_exit_sysfs(void)
 {
+	sysfs_unmerge_group(&btrfs_kset->kobj,
+			    &btrfs_static_feature_attr_group);
 	sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
 	kset_unregister(btrfs_kset);
 	debugfs_remove_recursive(btrfs_debugfs_root_dentry);
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index b567560..c6ee600 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -9,7 +9,7 @@
 extern u64 btrfs_debugfs_test;
 
 enum btrfs_feature_set {
-	FEAT_COMPAT,
+	FEAT_COMPAT = 0,
 	FEAT_COMPAT_RO,
 	FEAT_INCOMPAT,
 	FEAT_MAX
@@ -77,7 +77,7 @@ attr_to_btrfs_feature_attr(struct attribute *attr)
 }
 
 char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags);
-extern const char * const btrfs_feature_set_names[3];
+extern const char * const btrfs_feature_set_names[FEAT_MAX];
 extern struct kobj_type space_info_ktype;
 extern struct kobj_type btrfs_raid_ktype;
 int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices,
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 30ed438..db72b3b 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -219,11 +219,13 @@ void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache)
 	kfree(cache);
 }
 
-void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans)
+void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans,
+			    struct btrfs_fs_info *fs_info)
 {
 	memset(trans, 0, sizeof(*trans));
 	trans->transid = 1;
 	trans->type = __TRANS_DUMMY;
+	trans->fs_info = fs_info;
 }
 
 int btrfs_run_sanity_tests(void)
diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h
index a5a0b95..70ff9f9 100644
--- a/fs/btrfs/tests/btrfs-tests.h
+++ b/fs/btrfs/tests/btrfs-tests.h
@@ -9,7 +9,8 @@
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 int btrfs_run_sanity_tests(void);
 
-#define test_msg(fmt, ...) pr_info("BTRFS: selftest: " fmt, ##__VA_ARGS__)
+#define test_msg(fmt, ...) pr_info("BTRFS: selftest: " fmt "\n", ##__VA_ARGS__)
+#define test_err(fmt, ...) pr_err("BTRFS: selftest: " fmt "\n", ##__VA_ARGS__)
 
 struct btrfs_root;
 struct btrfs_trans_handle;
@@ -28,7 +29,8 @@ void btrfs_free_dummy_root(struct btrfs_root *root);
 struct btrfs_block_group_cache *
 btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info, unsigned long length);
 void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache);
-void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans);
+void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans,
+			    struct btrfs_fs_info *fs_info);
 #else
 static inline int btrfs_run_sanity_tests(void)
 {
diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c
index 31e8a9e..7d72eab 100644
--- a/fs/btrfs/tests/extent-buffer-tests.c
+++ b/fs/btrfs/tests/extent-buffer-tests.c
@@ -26,31 +26,31 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
 	u32 value_len = strlen(value);
 	int ret = 0;
 
-	test_msg("Running btrfs_split_item tests\n");
+	test_msg("running btrfs_split_item tests");
 
 	fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
 	if (!fs_info) {
-		test_msg("Could not allocate fs_info\n");
+		test_err("could not allocate fs_info");
 		return -ENOMEM;
 	}
 
 	root = btrfs_alloc_dummy_root(fs_info);
 	if (IS_ERR(root)) {
-		test_msg("Could not allocate root\n");
+		test_err("could not allocate root");
 		ret = PTR_ERR(root);
 		goto out;
 	}
 
 	path = btrfs_alloc_path();
 	if (!path) {
-		test_msg("Could not allocate path\n");
+		test_err("could not allocate path");
 		ret = -ENOMEM;
 		goto out;
 	}
 
 	path->nodes[0] = eb = alloc_dummy_extent_buffer(fs_info, nodesize);
 	if (!eb) {
-		test_msg("Could not allocate dummy buffer\n");
+		test_err("could not allocate dummy buffer");
 		ret = -ENOMEM;
 		goto out;
 	}
@@ -75,7 +75,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
 	 */
 	ret = btrfs_split_item(NULL, root, path, &key, 17);
 	if (ret) {
-		test_msg("Split item failed %d\n", ret);
+		test_err("split item failed %d", ret);
 		goto out;
 	}
 
@@ -86,14 +86,14 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
 	btrfs_item_key_to_cpu(eb, &key, 0);
 	if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
 	    key.offset != 0) {
-		test_msg("Invalid key at slot 0\n");
+		test_err("invalid key at slot 0");
 		ret = -EINVAL;
 		goto out;
 	}
 
 	item = btrfs_item_nr(0);
 	if (btrfs_item_size(eb, item) != strlen(split1)) {
-		test_msg("Invalid len in the first split\n");
+		test_err("invalid len in the first split");
 		ret = -EINVAL;
 		goto out;
 	}
@@ -101,8 +101,8 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
 	read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 0),
 			   strlen(split1));
 	if (memcmp(buf, split1, strlen(split1))) {
-		test_msg("Data in the buffer doesn't match what it should "
-			 "in the first split have='%.*s' want '%s'\n",
+		test_err(
+"data in the buffer doesn't match what it should in the first split have='%.*s' want '%s'",
 			 (int)strlen(split1), buf, split1);
 		ret = -EINVAL;
 		goto out;
@@ -111,14 +111,14 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
 	btrfs_item_key_to_cpu(eb, &key, 1);
 	if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
 	    key.offset != 3) {
-		test_msg("Invalid key at slot 1\n");
+		test_err("invalid key at slot 1");
 		ret = -EINVAL;
 		goto out;
 	}
 
 	item = btrfs_item_nr(1);
 	if (btrfs_item_size(eb, item) != strlen(split2)) {
-		test_msg("Invalid len in the second split\n");
+		test_err("invalid len in the second split");
 		ret = -EINVAL;
 		goto out;
 	}
@@ -126,8 +126,8 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
 	read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 1),
 			   strlen(split2));
 	if (memcmp(buf, split2, strlen(split2))) {
-		test_msg("Data in the buffer doesn't match what it should "
-			 "in the second split\n");
+		test_err(
+	"data in the buffer doesn't match what it should in the second split");
 		ret = -EINVAL;
 		goto out;
 	}
@@ -136,21 +136,21 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
 	/* Do it again so we test memmoving the other items in the leaf */
 	ret = btrfs_split_item(NULL, root, path, &key, 4);
 	if (ret) {
-		test_msg("Second split item failed %d\n", ret);
+		test_err("second split item failed %d", ret);
 		goto out;
 	}
 
 	btrfs_item_key_to_cpu(eb, &key, 0);
 	if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
 	    key.offset != 0) {
-		test_msg("Invalid key at slot 0\n");
+		test_err("invalid key at slot 0");
 		ret = -EINVAL;
 		goto out;
 	}
 
 	item = btrfs_item_nr(0);
 	if (btrfs_item_size(eb, item) != strlen(split3)) {
-		test_msg("Invalid len in the first split\n");
+		test_err("invalid len in the first split");
 		ret = -EINVAL;
 		goto out;
 	}
@@ -158,8 +158,8 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
 	read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 0),
 			   strlen(split3));
 	if (memcmp(buf, split3, strlen(split3))) {
-		test_msg("Data in the buffer doesn't match what it should "
-			 "in the third split");
+		test_err(
+	"data in the buffer doesn't match what it should in the third split");
 		ret = -EINVAL;
 		goto out;
 	}
@@ -167,14 +167,14 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
 	btrfs_item_key_to_cpu(eb, &key, 1);
 	if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
 	    key.offset != 1) {
-		test_msg("Invalid key at slot 1\n");
+		test_err("invalid key at slot 1");
 		ret = -EINVAL;
 		goto out;
 	}
 
 	item = btrfs_item_nr(1);
 	if (btrfs_item_size(eb, item) != strlen(split4)) {
-		test_msg("Invalid len in the second split\n");
+		test_err("invalid len in the second split");
 		ret = -EINVAL;
 		goto out;
 	}
@@ -182,8 +182,8 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
 	read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 1),
 			   strlen(split4));
 	if (memcmp(buf, split4, strlen(split4))) {
-		test_msg("Data in the buffer doesn't match what it should "
-			 "in the fourth split\n");
+		test_err(
+	"data in the buffer doesn't match what it should in the fourth split");
 		ret = -EINVAL;
 		goto out;
 	}
@@ -191,14 +191,14 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
 	btrfs_item_key_to_cpu(eb, &key, 2);
 	if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
 	    key.offset != 3) {
-		test_msg("Invalid key at slot 2\n");
+		test_err("invalid key at slot 2");
 		ret = -EINVAL;
 		goto out;
 	}
 
 	item = btrfs_item_nr(2);
 	if (btrfs_item_size(eb, item) != strlen(split2)) {
-		test_msg("Invalid len in the second split\n");
+		test_err("invalid len in the second split");
 		ret = -EINVAL;
 		goto out;
 	}
@@ -206,8 +206,8 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
 	read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 2),
 			   strlen(split2));
 	if (memcmp(buf, split2, strlen(split2))) {
-		test_msg("Data in the buffer doesn't match what it should "
-			 "in the last chunk\n");
+		test_err(
+	"data in the buffer doesn't match what it should in the last chunk");
 		ret = -EINVAL;
 		goto out;
 	}
@@ -220,6 +220,6 @@ out:
 
 int btrfs_test_extent_buffer_operations(u32 sectorsize, u32 nodesize)
 {
-	test_msg("Running extent buffer operation tests\n");
+	test_msg("running extent buffer operation tests");
 	return test_btrfs_split_item(sectorsize, nodesize);
 }
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index 76aa5a6..d9269a5 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -46,7 +46,9 @@ static noinline int process_page_range(struct inode *inode, u64 start, u64 end,
 		cond_resched();
 		loops++;
 		if (loops > 100000) {
-			printk(KERN_ERR "stuck in a loop, start %Lu, end %Lu, nr_pages %lu, ret %d\n", start, end, nr_pages, ret);
+			printk(KERN_ERR
+		"stuck in a loop, start %llu, end %llu, nr_pages %lu, ret %d\n",
+				start, end, nr_pages, ret);
 			break;
 		}
 	}
@@ -66,11 +68,11 @@ static int test_find_delalloc(u32 sectorsize)
 	u64 found;
 	int ret = -EINVAL;
 
-	test_msg("Running find delalloc tests\n");
+	test_msg("running find delalloc tests");
 
 	inode = btrfs_new_test_inode();
 	if (!inode) {
-		test_msg("Failed to allocate test inode\n");
+		test_err("failed to allocate test inode");
 		return -ENOMEM;
 	}
 
@@ -84,7 +86,7 @@ static int test_find_delalloc(u32 sectorsize)
 	for (index = 0; index < (total_dirty >> PAGE_SHIFT); index++) {
 		page = find_or_create_page(inode->i_mapping, index, GFP_KERNEL);
 		if (!page) {
-			test_msg("Failed to allocate test page\n");
+			test_err("failed to allocate test page");
 			ret = -ENOMEM;
 			goto out;
 		}
@@ -107,11 +109,11 @@ static int test_find_delalloc(u32 sectorsize)
 	found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
 					 &end, max_bytes);
 	if (!found) {
-		test_msg("Should have found at least one delalloc\n");
+		test_err("should have found at least one delalloc");
 		goto out_bits;
 	}
 	if (start != 0 || end != (sectorsize - 1)) {
-		test_msg("Expected start 0 end %u, got start %llu end %llu\n",
+		test_err("expected start 0 end %u, got start %llu end %llu",
 			sectorsize - 1, start, end);
 		goto out_bits;
 	}
@@ -129,7 +131,7 @@ static int test_find_delalloc(u32 sectorsize)
 	locked_page = find_lock_page(inode->i_mapping,
 				     test_start >> PAGE_SHIFT);
 	if (!locked_page) {
-		test_msg("Couldn't find the locked page\n");
+		test_err("couldn't find the locked page");
 		goto out_bits;
 	}
 	set_extent_delalloc(&tmp, sectorsize, max_bytes - 1, 0, NULL);
@@ -138,17 +140,17 @@ static int test_find_delalloc(u32 sectorsize)
 	found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
 					 &end, max_bytes);
 	if (!found) {
-		test_msg("Couldn't find delalloc in our range\n");
+		test_err("couldn't find delalloc in our range");
 		goto out_bits;
 	}
 	if (start != test_start || end != max_bytes - 1) {
-		test_msg("Expected start %Lu end %Lu, got start %Lu, end "
-			 "%Lu\n", test_start, max_bytes - 1, start, end);
+		test_err("expected start %llu end %llu, got start %llu, end %llu",
+				test_start, max_bytes - 1, start, end);
 		goto out_bits;
 	}
 	if (process_page_range(inode, start, end,
 			       PROCESS_TEST_LOCKED | PROCESS_UNLOCK)) {
-		test_msg("There were unlocked pages in the range\n");
+		test_err("there were unlocked pages in the range");
 		goto out_bits;
 	}
 	unlock_extent(&tmp, start, end);
@@ -164,7 +166,7 @@ static int test_find_delalloc(u32 sectorsize)
 	locked_page = find_lock_page(inode->i_mapping, test_start >>
 				     PAGE_SHIFT);
 	if (!locked_page) {
-		test_msg("Couldn't find the locked page\n");
+		test_err("couldn't find the locked page");
 		goto out_bits;
 	}
 	start = test_start;
@@ -172,11 +174,11 @@ static int test_find_delalloc(u32 sectorsize)
 	found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
 					 &end, max_bytes);
 	if (found) {
-		test_msg("Found range when we shouldn't have\n");
+		test_err("found range when we shouldn't have");
 		goto out_bits;
 	}
 	if (end != (u64)-1) {
-		test_msg("Did not return the proper end offset\n");
+		test_err("did not return the proper end offset");
 		goto out_bits;
 	}
 
@@ -193,17 +195,17 @@ static int test_find_delalloc(u32 sectorsize)
 	found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
 					 &end, max_bytes);
 	if (!found) {
-		test_msg("Didn't find our range\n");
+		test_err("didn't find our range");
 		goto out_bits;
 	}
 	if (start != test_start || end != total_dirty - 1) {
-		test_msg("Expected start %Lu end %Lu, got start %Lu end %Lu\n",
+		test_err("expected start %llu end %llu, got start %llu end %llu",
 			 test_start, total_dirty - 1, start, end);
 		goto out_bits;
 	}
 	if (process_page_range(inode, start, end,
 			       PROCESS_TEST_LOCKED | PROCESS_UNLOCK)) {
-		test_msg("Pages in range were not all locked\n");
+		test_err("pages in range were not all locked");
 		goto out_bits;
 	}
 	unlock_extent(&tmp, start, end);
@@ -215,7 +217,7 @@ static int test_find_delalloc(u32 sectorsize)
 	page = find_get_page(inode->i_mapping,
 			     (max_bytes + SZ_1M) >> PAGE_SHIFT);
 	if (!page) {
-		test_msg("Couldn't find our page\n");
+		test_err("couldn't find our page");
 		goto out_bits;
 	}
 	ClearPageDirty(page);
@@ -234,18 +236,17 @@ static int test_find_delalloc(u32 sectorsize)
 	found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
 					 &end, max_bytes);
 	if (!found) {
-		test_msg("Didn't find our range\n");
+		test_err("didn't find our range");
 		goto out_bits;
 	}
 	if (start != test_start && end != test_start + PAGE_SIZE - 1) {
-		test_msg("Expected start %Lu end %Lu, got start %Lu end %Lu\n",
-			 test_start, test_start + PAGE_SIZE - 1, start,
-			 end);
+		test_err("expected start %llu end %llu, got start %llu end %llu",
+			 test_start, test_start + PAGE_SIZE - 1, start, end);
 		goto out_bits;
 	}
 	if (process_page_range(inode, start, end, PROCESS_TEST_LOCKED |
 			       PROCESS_UNLOCK)) {
-		test_msg("Pages in range were not all locked\n");
+		test_err("pages in range were not all locked");
 		goto out_bits;
 	}
 	ret = 0;
@@ -271,14 +272,14 @@ static int check_eb_bitmap(unsigned long *bitmap, struct extent_buffer *eb,
 		bit = !!test_bit(i, bitmap);
 		bit1 = !!extent_buffer_test_bit(eb, 0, i);
 		if (bit1 != bit) {
-			test_msg("Bits do not match\n");
+			test_err("bits do not match");
 			return -EINVAL;
 		}
 
 		bit1 = !!extent_buffer_test_bit(eb, i / BITS_PER_BYTE,
 						i % BITS_PER_BYTE);
 		if (bit1 != bit) {
-			test_msg("Offset bits do not match\n");
+			test_err("offset bits do not match");
 			return -EINVAL;
 		}
 	}
@@ -295,7 +296,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb,
 	memset(bitmap, 0, len);
 	memzero_extent_buffer(eb, 0, len);
 	if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) {
-		test_msg("Bitmap was not zeroed\n");
+		test_err("bitmap was not zeroed");
 		return -EINVAL;
 	}
 
@@ -303,7 +304,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb,
 	extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE);
 	ret = check_eb_bitmap(bitmap, eb, len);
 	if (ret) {
-		test_msg("Setting all bits failed\n");
+		test_err("setting all bits failed");
 		return ret;
 	}
 
@@ -311,7 +312,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb,
 	extent_buffer_bitmap_clear(eb, 0, 0, len * BITS_PER_BYTE);
 	ret = check_eb_bitmap(bitmap, eb, len);
 	if (ret) {
-		test_msg("Clearing all bits failed\n");
+		test_err("clearing all bits failed");
 		return ret;
 	}
 
@@ -324,7 +325,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb,
 					sizeof(long) * BITS_PER_BYTE);
 		ret = check_eb_bitmap(bitmap, eb, len);
 		if (ret) {
-			test_msg("Setting straddling pages failed\n");
+			test_err("setting straddling pages failed");
 			return ret;
 		}
 
@@ -337,7 +338,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb,
 					sizeof(long) * BITS_PER_BYTE);
 		ret = check_eb_bitmap(bitmap, eb, len);
 		if (ret) {
-			test_msg("Clearing straddling pages failed\n");
+			test_err("clearing straddling pages failed");
 			return ret;
 		}
 	}
@@ -361,7 +362,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb,
 
 	ret = check_eb_bitmap(bitmap, eb, len);
 	if (ret) {
-		test_msg("Random bit pattern failed\n");
+		test_err("random bit pattern failed");
 		return ret;
 	}
 
@@ -376,7 +377,7 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize)
 	struct extent_buffer *eb;
 	int ret;
 
-	test_msg("Running extent buffer bitmap tests\n");
+	test_msg("running extent buffer bitmap tests");
 
 	/*
 	 * In ppc64, sectorsize can be 64K, thus 4 * 64K will be larger than
@@ -389,13 +390,13 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize)
 
 	bitmap = kmalloc(len, GFP_KERNEL);
 	if (!bitmap) {
-		test_msg("Couldn't allocate test bitmap\n");
+		test_err("couldn't allocate test bitmap");
 		return -ENOMEM;
 	}
 
 	eb = __alloc_dummy_extent_buffer(fs_info, 0, len);
 	if (!eb) {
-		test_msg("Couldn't allocate test extent buffer\n");
+		test_err("couldn't allocate test extent buffer");
 		kfree(bitmap);
 		return -ENOMEM;
 	}
@@ -408,7 +409,7 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize)
 	free_extent_buffer(eb);
 	eb = __alloc_dummy_extent_buffer(NULL, nodesize / 2, len);
 	if (!eb) {
-		test_msg("Couldn't allocate test extent buffer\n");
+		test_err("couldn't allocate test extent buffer");
 		kfree(bitmap);
 		return -ENOMEM;
 	}
@@ -424,7 +425,7 @@ int btrfs_test_extent_io(u32 sectorsize, u32 nodesize)
 {
 	int ret;
 
-	test_msg("Running extent I/O tests\n");
+	test_msg("running extent I/O tests");
 
 	ret = test_find_delalloc(sectorsize);
 	if (ret)
@@ -432,6 +433,6 @@ int btrfs_test_extent_io(u32 sectorsize, u32 nodesize)
 
 	ret = test_eb_bitmaps(sectorsize, nodesize);
 out:
-	test_msg("Extent I/O tests finished\n");
+	test_msg("extent I/O tests finished");
 	return ret;
 }
diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c
index 79e0a5f..385a531 100644
--- a/fs/btrfs/tests/extent-map-tests.c
+++ b/fs/btrfs/tests/extent-map-tests.c
@@ -19,8 +19,8 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree)
 
 #ifdef CONFIG_BTRFS_DEBUG
 		if (refcount_read(&em->refs) != 1) {
-			test_msg(
-"em leak: em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx) refs %d\n",
+			test_err(
+"em leak: em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx) refs %d",
 				 em->start, em->len, em->block_start,
 				 em->block_len, refcount_read(&em->refs));
 
@@ -47,7 +47,8 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree)
  *                                    ->add_extent_mapping(0, 16K)
  *                                    -> #handle -EEXIST
  */
-static void test_case_1(struct extent_map_tree *em_tree)
+static void test_case_1(struct btrfs_fs_info *fs_info,
+		struct extent_map_tree *em_tree)
 {
 	struct extent_map *em;
 	u64 start = 0;
@@ -90,14 +91,14 @@ static void test_case_1(struct extent_map_tree *em_tree)
 	em->len = len;
 	em->block_start = start;
 	em->block_len = len;
-	ret = btrfs_add_extent_mapping(em_tree, &em, em->start, em->len);
+	ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
 	if (ret)
-		test_msg("case1 [%llu %llu]: ret %d\n", start, start + len, ret);
+		test_err("case1 [%llu %llu]: ret %d", start, start + len, ret);
 	if (em &&
 	    (em->start != 0 || extent_map_end(em) != SZ_16K ||
 	     em->block_start != 0 || em->block_len != SZ_16K))
-		test_msg(
-"case1 [%llu %llu]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu\n",
+		test_err(
+"case1 [%llu %llu]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu",
 			 start, start + len, ret, em->start, em->len,
 			 em->block_start, em->block_len);
 	free_extent_map(em);
@@ -112,7 +113,8 @@ out:
  * Reading the inline ending up with EEXIST, ie. read an inline
  * extent and discard page cache and read it again.
  */
-static void test_case_2(struct extent_map_tree *em_tree)
+static void test_case_2(struct btrfs_fs_info *fs_info,
+		struct extent_map_tree *em_tree)
 {
 	struct extent_map *em;
 	int ret;
@@ -153,14 +155,14 @@ static void test_case_2(struct extent_map_tree *em_tree)
 	em->len = SZ_1K;
 	em->block_start = EXTENT_MAP_INLINE;
 	em->block_len = (u64)-1;
-	ret = btrfs_add_extent_mapping(em_tree, &em, em->start, em->len);
+	ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
 	if (ret)
-		test_msg("case2 [0 1K]: ret %d\n", ret);
+		test_err("case2 [0 1K]: ret %d", ret);
 	if (em &&
 	    (em->start != 0 || extent_map_end(em) != SZ_1K ||
 	     em->block_start != EXTENT_MAP_INLINE || em->block_len != (u64)-1))
-		test_msg(
-"case2 [0 1K]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu\n",
+		test_err(
+"case2 [0 1K]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu",
 			 ret, em->start, em->len, em->block_start,
 			 em->block_len);
 	free_extent_map(em);
@@ -169,7 +171,8 @@ out:
 	free_extent_map_tree(em_tree);
 }
 
-static void __test_case_3(struct extent_map_tree *em_tree, u64 start)
+static void __test_case_3(struct btrfs_fs_info *fs_info,
+		struct extent_map_tree *em_tree, u64 start)
 {
 	struct extent_map *em;
 	u64 len = SZ_4K;
@@ -198,9 +201,9 @@ static void __test_case_3(struct extent_map_tree *em_tree, u64 start)
 	em->len = SZ_16K;
 	em->block_start = 0;
 	em->block_len = SZ_16K;
-	ret = btrfs_add_extent_mapping(em_tree, &em, start, len);
+	ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
 	if (ret)
-		test_msg("case3 [0x%llx 0x%llx): ret %d\n",
+		test_err("case3 [0x%llx 0x%llx): ret %d",
 			 start, start + len, ret);
 	/*
 	 * Since bytes within em are contiguous, em->block_start is identical to
@@ -209,8 +212,8 @@ static void __test_case_3(struct extent_map_tree *em_tree, u64 start)
 	if (em &&
 	    (start < em->start || start + len > extent_map_end(em) ||
 	     em->start != em->block_start || em->len != em->block_len))
-		test_msg(
-"case3 [0x%llx 0x%llx): ret %d em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)\n",
+		test_err(
+"case3 [0x%llx 0x%llx): ret %d em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)",
 			 start, start + len, ret, em->start, em->len,
 			 em->block_start, em->block_len);
 	free_extent_map(em);
@@ -235,14 +238,16 @@ out:
  *   -> add_extent_mapping()
  *                            -> add_extent_mapping()
  */
-static void test_case_3(struct extent_map_tree *em_tree)
+static void test_case_3(struct btrfs_fs_info *fs_info,
+		struct extent_map_tree *em_tree)
 {
-	__test_case_3(em_tree, 0);
-	__test_case_3(em_tree, SZ_8K);
-	__test_case_3(em_tree, (12 * 1024ULL));
+	__test_case_3(fs_info, em_tree, 0);
+	__test_case_3(fs_info, em_tree, SZ_8K);
+	__test_case_3(fs_info, em_tree, (12 * 1024ULL));
 }
 
-static void __test_case_4(struct extent_map_tree *em_tree, u64 start)
+static void __test_case_4(struct btrfs_fs_info *fs_info,
+		struct extent_map_tree *em_tree, u64 start)
 {
 	struct extent_map *em;
 	u64 len = SZ_4K;
@@ -283,14 +288,14 @@ static void __test_case_4(struct extent_map_tree *em_tree, u64 start)
 	em->len = SZ_32K;
 	em->block_start = 0;
 	em->block_len = SZ_32K;
-	ret = btrfs_add_extent_mapping(em_tree, &em, start, len);
+	ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
 	if (ret)
-		test_msg("case4 [0x%llx 0x%llx): ret %d\n",
+		test_err("case4 [0x%llx 0x%llx): ret %d",
 			 start, len, ret);
 	if (em &&
 	    (start < em->start || start + len > extent_map_end(em)))
-		test_msg(
-"case4 [0x%llx 0x%llx): ret %d, added wrong em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)\n",
+		test_err(
+"case4 [0x%llx 0x%llx): ret %d, added wrong em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)",
 			 start, len, ret, em->start, em->len, em->block_start,
 			 em->block_len);
 	free_extent_map(em);
@@ -324,30 +329,45 @@ out:
  *                                             # handle -EEXIST when adding
  *                                             # [0, 32K)
  */
-static void test_case_4(struct extent_map_tree *em_tree)
+static void test_case_4(struct btrfs_fs_info *fs_info,
+		struct extent_map_tree *em_tree)
 {
-	__test_case_4(em_tree, 0);
-	__test_case_4(em_tree, SZ_4K);
+	__test_case_4(fs_info, em_tree, 0);
+	__test_case_4(fs_info, em_tree, SZ_4K);
 }
 
 int btrfs_test_extent_map(void)
 {
+	struct btrfs_fs_info *fs_info = NULL;
 	struct extent_map_tree *em_tree;
 
-	test_msg("Running extent_map tests\n");
+	test_msg("running extent_map tests");
+
+	/*
+	 * Note: the fs_info is not set up completely, we only need
+	 * fs_info::fsid for the tracepoint.
+	 */
+	fs_info = btrfs_alloc_dummy_fs_info(PAGE_SIZE, PAGE_SIZE);
+	if (!fs_info) {
+		test_msg("Couldn't allocate dummy fs info");
+		return -ENOMEM;
+	}
 
 	em_tree = kzalloc(sizeof(*em_tree), GFP_KERNEL);
 	if (!em_tree)
 		/* Skip the test on error. */
-		return 0;
+		goto out;
 
 	extent_map_tree_init(em_tree);
 
-	test_case_1(em_tree);
-	test_case_2(em_tree);
-	test_case_3(em_tree);
-	test_case_4(em_tree);
+	test_case_1(fs_info, em_tree);
+	test_case_2(fs_info, em_tree);
+	test_case_3(fs_info, em_tree);
+	test_case_4(fs_info, em_tree);
 
 	kfree(em_tree);
+out:
+	btrfs_free_dummy_fs_info(fs_info);
+
 	return 0;
 }
diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c
index d3c9f8a..5c2f77e 100644
--- a/fs/btrfs/tests/free-space-tests.c
+++ b/fs/btrfs/tests/free-space-tests.c
@@ -20,63 +20,63 @@ static int test_extents(struct btrfs_block_group_cache *cache)
 {
 	int ret = 0;
 
-	test_msg("Running extent only tests\n");
+	test_msg("running extent only tests");
 
 	/* First just make sure we can remove an entire entry */
 	ret = btrfs_add_free_space(cache, 0, SZ_4M);
 	if (ret) {
-		test_msg("Error adding initial extents %d\n", ret);
+		test_err("error adding initial extents %d", ret);
 		return ret;
 	}
 
 	ret = btrfs_remove_free_space(cache, 0, SZ_4M);
 	if (ret) {
-		test_msg("Error removing extent %d\n", ret);
+		test_err("error removing extent %d", ret);
 		return ret;
 	}
 
 	if (test_check_exists(cache, 0, SZ_4M)) {
-		test_msg("Full remove left some lingering space\n");
+		test_err("full remove left some lingering space");
 		return -1;
 	}
 
 	/* Ok edge and middle cases now */
 	ret = btrfs_add_free_space(cache, 0, SZ_4M);
 	if (ret) {
-		test_msg("Error adding half extent %d\n", ret);
+		test_err("error adding half extent %d", ret);
 		return ret;
 	}
 
 	ret = btrfs_remove_free_space(cache, 3 * SZ_1M, SZ_1M);
 	if (ret) {
-		test_msg("Error removing tail end %d\n", ret);
+		test_err("error removing tail end %d", ret);
 		return ret;
 	}
 
 	ret = btrfs_remove_free_space(cache, 0, SZ_1M);
 	if (ret) {
-		test_msg("Error removing front end %d\n", ret);
+		test_err("error removing front end %d", ret);
 		return ret;
 	}
 
 	ret = btrfs_remove_free_space(cache, SZ_2M, 4096);
 	if (ret) {
-		test_msg("Error removing middle piece %d\n", ret);
+		test_err("error removing middle piece %d", ret);
 		return ret;
 	}
 
 	if (test_check_exists(cache, 0, SZ_1M)) {
-		test_msg("Still have space at the front\n");
+		test_err("still have space at the front");
 		return -1;
 	}
 
 	if (test_check_exists(cache, SZ_2M, 4096)) {
-		test_msg("Still have space in the middle\n");
+		test_err("still have space in the middle");
 		return -1;
 	}
 
 	if (test_check_exists(cache, 3 * SZ_1M, SZ_1M)) {
-		test_msg("Still have space at the end\n");
+		test_err("still have space at the end");
 		return -1;
 	}
 
@@ -92,34 +92,34 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache,
 	u64 next_bitmap_offset;
 	int ret;
 
-	test_msg("Running bitmap only tests\n");
+	test_msg("running bitmap only tests");
 
 	ret = test_add_free_space_entry(cache, 0, SZ_4M, 1);
 	if (ret) {
-		test_msg("Couldn't create a bitmap entry %d\n", ret);
+		test_err("couldn't create a bitmap entry %d", ret);
 		return ret;
 	}
 
 	ret = btrfs_remove_free_space(cache, 0, SZ_4M);
 	if (ret) {
-		test_msg("Error removing bitmap full range %d\n", ret);
+		test_err("error removing bitmap full range %d", ret);
 		return ret;
 	}
 
 	if (test_check_exists(cache, 0, SZ_4M)) {
-		test_msg("Left some space in bitmap\n");
+		test_err("left some space in bitmap");
 		return -1;
 	}
 
 	ret = test_add_free_space_entry(cache, 0, SZ_4M, 1);
 	if (ret) {
-		test_msg("Couldn't add to our bitmap entry %d\n", ret);
+		test_err("couldn't add to our bitmap entry %d", ret);
 		return ret;
 	}
 
 	ret = btrfs_remove_free_space(cache, SZ_1M, SZ_2M);
 	if (ret) {
-		test_msg("Couldn't remove middle chunk %d\n", ret);
+		test_err("couldn't remove middle chunk %d", ret);
 		return ret;
 	}
 
@@ -133,19 +133,19 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache,
 	ret = test_add_free_space_entry(cache, next_bitmap_offset - SZ_2M,
 					SZ_4M, 1);
 	if (ret) {
-		test_msg("Couldn't add space that straddles two bitmaps %d\n",
+		test_err("couldn't add space that straddles two bitmaps %d",
 				ret);
 		return ret;
 	}
 
 	ret = btrfs_remove_free_space(cache, next_bitmap_offset - SZ_1M, SZ_2M);
 	if (ret) {
-		test_msg("Couldn't remove overlapping space %d\n", ret);
+		test_err("couldn't remove overlapping space %d", ret);
 		return ret;
 	}
 
 	if (test_check_exists(cache, next_bitmap_offset - SZ_1M, SZ_2M)) {
-		test_msg("Left some space when removing overlapping\n");
+		test_err("left some space when removing overlapping");
 		return -1;
 	}
 
@@ -161,7 +161,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache,
 	u64 bitmap_offset = (u64)(BITS_PER_BITMAP * sectorsize);
 	int ret;
 
-	test_msg("Running bitmap and extent tests\n");
+	test_msg("running bitmap and extent tests");
 
 	/*
 	 * First let's do something simple, an extent at the same offset as the
@@ -170,42 +170,42 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache,
 	 */
 	ret = test_add_free_space_entry(cache, SZ_4M, SZ_1M, 1);
 	if (ret) {
-		test_msg("Couldn't create bitmap entry %d\n", ret);
+		test_err("couldn't create bitmap entry %d", ret);
 		return ret;
 	}
 
 	ret = test_add_free_space_entry(cache, 0, SZ_1M, 0);
 	if (ret) {
-		test_msg("Couldn't add extent entry %d\n", ret);
+		test_err("couldn't add extent entry %d", ret);
 		return ret;
 	}
 
 	ret = btrfs_remove_free_space(cache, 0, SZ_1M);
 	if (ret) {
-		test_msg("Couldn't remove extent entry %d\n", ret);
+		test_err("couldn't remove extent entry %d", ret);
 		return ret;
 	}
 
 	if (test_check_exists(cache, 0, SZ_1M)) {
-		test_msg("Left remnants after our remove\n");
+		test_err("left remnants after our remove");
 		return -1;
 	}
 
 	/* Now to add back the extent entry and remove from the bitmap */
 	ret = test_add_free_space_entry(cache, 0, SZ_1M, 0);
 	if (ret) {
-		test_msg("Couldn't re-add extent entry %d\n", ret);
+		test_err("couldn't re-add extent entry %d", ret);
 		return ret;
 	}
 
 	ret = btrfs_remove_free_space(cache, SZ_4M, SZ_1M);
 	if (ret) {
-		test_msg("Couldn't remove from bitmap %d\n", ret);
+		test_err("couldn't remove from bitmap %d", ret);
 		return ret;
 	}
 
 	if (test_check_exists(cache, SZ_4M, SZ_1M)) {
-		test_msg("Left remnants in the bitmap\n");
+		test_err("left remnants in the bitmap");
 		return -1;
 	}
 
@@ -215,18 +215,18 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache,
 	 */
 	ret = test_add_free_space_entry(cache, SZ_1M, SZ_4M, 1);
 	if (ret) {
-		test_msg("Couldn't add to a bitmap %d\n", ret);
+		test_err("couldn't add to a bitmap %d", ret);
 		return ret;
 	}
 
 	ret = btrfs_remove_free_space(cache, SZ_512K, 3 * SZ_1M);
 	if (ret) {
-		test_msg("Couldn't remove overlapping space %d\n", ret);
+		test_err("couldn't remove overlapping space %d", ret);
 		return ret;
 	}
 
 	if (test_check_exists(cache, SZ_512K, 3 * SZ_1M)) {
-		test_msg("Left over pieces after removing overlapping\n");
+		test_err("left over pieces after removing overlapping");
 		return -1;
 	}
 
@@ -235,24 +235,24 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache,
 	/* Now with the extent entry offset into the bitmap */
 	ret = test_add_free_space_entry(cache, SZ_4M, SZ_4M, 1);
 	if (ret) {
-		test_msg("Couldn't add space to the bitmap %d\n", ret);
+		test_err("couldn't add space to the bitmap %d", ret);
 		return ret;
 	}
 
 	ret = test_add_free_space_entry(cache, SZ_2M, SZ_2M, 0);
 	if (ret) {
-		test_msg("Couldn't add extent to the cache %d\n", ret);
+		test_err("couldn't add extent to the cache %d", ret);
 		return ret;
 	}
 
 	ret = btrfs_remove_free_space(cache, 3 * SZ_1M, SZ_4M);
 	if (ret) {
-		test_msg("Problem removing overlapping space %d\n", ret);
+		test_err("problem removing overlapping space %d", ret);
 		return ret;
 	}
 
 	if (test_check_exists(cache, 3 * SZ_1M, SZ_4M)) {
-		test_msg("Left something behind when removing space");
+		test_err("left something behind when removing space");
 		return -1;
 	}
 
@@ -269,25 +269,25 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache,
 	__btrfs_remove_free_space_cache(cache->free_space_ctl);
 	ret = test_add_free_space_entry(cache, bitmap_offset + SZ_4M, SZ_4M, 1);
 	if (ret) {
-		test_msg("Couldn't add bitmap %d\n", ret);
+		test_err("couldn't add bitmap %d", ret);
 		return ret;
 	}
 
 	ret = test_add_free_space_entry(cache, bitmap_offset - SZ_1M,
 					5 * SZ_1M, 0);
 	if (ret) {
-		test_msg("Couldn't add extent entry %d\n", ret);
+		test_err("couldn't add extent entry %d", ret);
 		return ret;
 	}
 
 	ret = btrfs_remove_free_space(cache, bitmap_offset + SZ_1M, 5 * SZ_1M);
 	if (ret) {
-		test_msg("Failed to free our space %d\n", ret);
+		test_err("failed to free our space %d", ret);
 		return ret;
 	}
 
 	if (test_check_exists(cache, bitmap_offset + SZ_1M, 5 * SZ_1M)) {
-		test_msg("Left stuff over\n");
+		test_err("left stuff over");
 		return -1;
 	}
 
@@ -301,19 +301,19 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache,
 	 */
 	ret = test_add_free_space_entry(cache, SZ_1M, SZ_2M, 1);
 	if (ret) {
-		test_msg("Couldn't add bitmap entry %d\n", ret);
+		test_err("couldn't add bitmap entry %d", ret);
 		return ret;
 	}
 
 	ret = test_add_free_space_entry(cache, 3 * SZ_1M, SZ_1M, 0);
 	if (ret) {
-		test_msg("Couldn't add extent entry %d\n", ret);
+		test_err("couldn't add extent entry %d", ret);
 		return ret;
 	}
 
 	ret = btrfs_remove_free_space(cache, SZ_1M, 3 * SZ_1M);
 	if (ret) {
-		test_msg("Error removing bitmap and extent overlapping %d\n", ret);
+		test_err("error removing bitmap and extent overlapping %d", ret);
 		return ret;
 	}
 
@@ -335,12 +335,14 @@ check_num_extents_and_bitmaps(const struct btrfs_block_group_cache *cache,
 			      const int num_bitmaps)
 {
 	if (cache->free_space_ctl->free_extents != num_extents) {
-		test_msg("Incorrect # of extent entries in the cache: %d, expected %d\n",
+		test_err(
+		"incorrect # of extent entries in the cache: %d, expected %d",
 			 cache->free_space_ctl->free_extents, num_extents);
 		return -EINVAL;
 	}
 	if (cache->free_space_ctl->total_bitmaps != num_bitmaps) {
-		test_msg("Incorrect # of extent entries in the cache: %d, expected %d\n",
+		test_err(
+		"incorrect # of extent entries in the cache: %d, expected %d",
 			 cache->free_space_ctl->total_bitmaps, num_bitmaps);
 		return -EINVAL;
 	}
@@ -358,7 +360,7 @@ static int check_cache_empty(struct btrfs_block_group_cache *cache)
 	 * allocate.
 	 */
 	if (cache->free_space_ctl->free_space != 0) {
-		test_msg("Cache free space is not 0\n");
+		test_err("cache free space is not 0");
 		return -EINVAL;
 	}
 
@@ -366,7 +368,7 @@ static int check_cache_empty(struct btrfs_block_group_cache *cache)
 	offset = btrfs_find_space_for_alloc(cache, 0, 4096, 0,
 					    &max_extent_size);
 	if (offset != 0) {
-		test_msg("Space allocation did not fail, returned offset: %llu",
+		test_err("space allocation did not fail, returned offset: %llu",
 			 offset);
 		return -EINVAL;
 	}
@@ -402,7 +404,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
 	};
 	const struct btrfs_free_space_op *orig_free_space_ops;
 
-	test_msg("Running space stealing from bitmap to extent\n");
+	test_msg("running space stealing from bitmap to extent");
 
 	/*
 	 * For this test, we want to ensure we end up with an extent entry
@@ -430,7 +432,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
 	 */
 	ret = test_add_free_space_entry(cache, SZ_128M - SZ_256K, SZ_128K, 0);
 	if (ret) {
-		test_msg("Couldn't add extent entry %d\n", ret);
+		test_err("couldn't add extent entry %d", ret);
 		return ret;
 	}
 
@@ -438,7 +440,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
 	ret = test_add_free_space_entry(cache, SZ_128M + SZ_512K,
 					SZ_128M - SZ_512K, 1);
 	if (ret) {
-		test_msg("Couldn't add bitmap entry %d\n", ret);
+		test_err("couldn't add bitmap entry %d", ret);
 		return ret;
 	}
 
@@ -457,17 +459,17 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
 				      SZ_128M + 768 * SZ_1K,
 				      SZ_128M - 768 * SZ_1K);
 	if (ret) {
-		test_msg("Failed to free part of bitmap space %d\n", ret);
+		test_err("failed to free part of bitmap space %d", ret);
 		return ret;
 	}
 
 	/* Confirm that only those 2 ranges are marked as free. */
 	if (!test_check_exists(cache, SZ_128M - SZ_256K, SZ_128K)) {
-		test_msg("Free space range missing\n");
+		test_err("free space range missing");
 		return -ENOENT;
 	}
 	if (!test_check_exists(cache, SZ_128M + SZ_512K, SZ_256K)) {
-		test_msg("Free space range missing\n");
+		test_err("free space range missing");
 		return -ENOENT;
 	}
 
@@ -477,7 +479,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
 	 */
 	if (test_check_exists(cache, SZ_128M + 768 * SZ_1K,
 			      SZ_128M - 768 * SZ_1K)) {
-		test_msg("Bitmap region not removed from space cache\n");
+		test_err("bitmap region not removed from space cache");
 		return -EINVAL;
 	}
 
@@ -486,7 +488,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
 	 * covered by the bitmap, isn't marked as free.
 	 */
 	if (test_check_exists(cache, SZ_128M + SZ_256K, SZ_256K)) {
-		test_msg("Invalid bitmap region marked as free\n");
+		test_err("invalid bitmap region marked as free");
 		return -EINVAL;
 	}
 
@@ -495,7 +497,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
 	 * by the bitmap too, isn't marked as free either.
 	 */
 	if (test_check_exists(cache, SZ_128M, SZ_256K)) {
-		test_msg("Invalid bitmap region marked as free\n");
+		test_err("invalid bitmap region marked as free");
 		return -EINVAL;
 	}
 
@@ -506,12 +508,12 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
 	 */
 	ret = btrfs_add_free_space(cache, SZ_128M, SZ_512K);
 	if (ret) {
-		test_msg("Error adding free space: %d\n", ret);
+		test_err("error adding free space: %d", ret);
 		return ret;
 	}
 	/* Confirm the region is marked as free. */
 	if (!test_check_exists(cache, SZ_128M, SZ_512K)) {
-		test_msg("Bitmap region not marked as free\n");
+		test_err("bitmap region not marked as free");
 		return -ENOENT;
 	}
 
@@ -531,7 +533,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
 	 */
 	ret = btrfs_add_free_space(cache, SZ_128M + SZ_16M, sectorsize);
 	if (ret) {
-		test_msg("Error adding free space: %d\n", ret);
+		test_err("error adding free space: %d", ret);
 		return ret;
 	}
 
@@ -550,12 +552,12 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
 	 */
 	ret = btrfs_add_free_space(cache, SZ_128M - SZ_128K, SZ_128K);
 	if (ret) {
-		test_msg("Error adding free space: %d\n", ret);
+		test_err("error adding free space: %d", ret);
 		return ret;
 	}
 	/* Confirm the region is marked as free. */
 	if (!test_check_exists(cache, SZ_128M - SZ_128K, SZ_128K)) {
-		test_msg("Extent region not marked as free\n");
+		test_err("extent region not marked as free");
 		return -ENOENT;
 	}
 
@@ -583,12 +585,12 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
 	 * allocate the whole free space at once.
 	 */
 	if (!test_check_exists(cache, SZ_128M - SZ_256K, SZ_1M)) {
-		test_msg("Expected region not marked as free\n");
+		test_err("expected region not marked as free");
 		return -ENOENT;
 	}
 
 	if (cache->free_space_ctl->free_space != (SZ_1M + sectorsize)) {
-		test_msg("Cache free space is not 1Mb + %u\n", sectorsize);
+		test_err("cache free space is not 1Mb + %u", sectorsize);
 		return -EINVAL;
 	}
 
@@ -596,7 +598,8 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
 					    0, SZ_1M, 0,
 					    &max_extent_size);
 	if (offset != (SZ_128M - SZ_256K)) {
-		test_msg("Failed to allocate 1Mb from space cache, returned offset is: %llu\n",
+		test_err(
+	"failed to allocate 1Mb from space cache, returned offset is: %llu",
 			 offset);
 		return -EINVAL;
 	}
@@ -610,7 +613,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
 		return ret;
 
 	if (cache->free_space_ctl->free_space != sectorsize) {
-		test_msg("Cache free space is not %u\n", sectorsize);
+		test_err("cache free space is not %u", sectorsize);
 		return -EINVAL;
 	}
 
@@ -618,7 +621,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
 					    0, sectorsize, 0,
 					    &max_extent_size);
 	if (offset != (SZ_128M + SZ_16M)) {
-		test_msg("Failed to allocate %u, returned offset : %llu\n",
+		test_err("failed to allocate %u, returned offset : %llu",
 			 sectorsize, offset);
 		return -EINVAL;
 	}
@@ -640,14 +643,14 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
 	 */
 	ret = test_add_free_space_entry(cache, SZ_128M + SZ_128K, SZ_128K, 0);
 	if (ret) {
-		test_msg("Couldn't add extent entry %d\n", ret);
+		test_err("couldn't add extent entry %d", ret);
 		return ret;
 	}
 
 	/* Bitmap entry covering free space range [0, 128Mb - 512Kb[ */
 	ret = test_add_free_space_entry(cache, 0, SZ_128M - SZ_512K, 1);
 	if (ret) {
-		test_msg("Couldn't add bitmap entry %d\n", ret);
+		test_err("couldn't add bitmap entry %d", ret);
 		return ret;
 	}
 
@@ -664,17 +667,17 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
 	 */
 	ret = btrfs_remove_free_space(cache, 0, SZ_128M - 768 * SZ_1K);
 	if (ret) {
-		test_msg("Failed to free part of bitmap space %d\n", ret);
+		test_err("failed to free part of bitmap space %d", ret);
 		return ret;
 	}
 
 	/* Confirm that only those 2 ranges are marked as free. */
 	if (!test_check_exists(cache, SZ_128M + SZ_128K, SZ_128K)) {
-		test_msg("Free space range missing\n");
+		test_err("free space range missing");
 		return -ENOENT;
 	}
 	if (!test_check_exists(cache, SZ_128M - 768 * SZ_1K, SZ_256K)) {
-		test_msg("Free space range missing\n");
+		test_err("free space range missing");
 		return -ENOENT;
 	}
 
@@ -683,7 +686,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
 	 * as free anymore.
 	 */
 	if (test_check_exists(cache, 0, SZ_128M - 768 * SZ_1K)) {
-		test_msg("Bitmap region not removed from space cache\n");
+		test_err("bitmap region not removed from space cache");
 		return -EINVAL;
 	}
 
@@ -692,7 +695,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
 	 * covered by the bitmap, isn't marked as free.
 	 */
 	if (test_check_exists(cache, SZ_128M - SZ_512K, SZ_512K)) {
-		test_msg("Invalid bitmap region marked as free\n");
+		test_err("invalid bitmap region marked as free");
 		return -EINVAL;
 	}
 
@@ -703,12 +706,12 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
 	 */
 	ret = btrfs_add_free_space(cache, SZ_128M - SZ_512K, SZ_512K);
 	if (ret) {
-		test_msg("Error adding free space: %d\n", ret);
+		test_err("error adding free space: %d", ret);
 		return ret;
 	}
 	/* Confirm the region is marked as free. */
 	if (!test_check_exists(cache, SZ_128M - SZ_512K, SZ_512K)) {
-		test_msg("Bitmap region not marked as free\n");
+		test_err("bitmap region not marked as free");
 		return -ENOENT;
 	}
 
@@ -728,7 +731,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
 	 */
 	ret = btrfs_add_free_space(cache, SZ_32M, 2 * sectorsize);
 	if (ret) {
-		test_msg("Error adding free space: %d\n", ret);
+		test_err("error adding free space: %d", ret);
 		return ret;
 	}
 
@@ -739,12 +742,12 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
 	 */
 	ret = btrfs_add_free_space(cache, SZ_128M, SZ_128K);
 	if (ret) {
-		test_msg("Error adding free space: %d\n", ret);
+		test_err("error adding free space: %d", ret);
 		return ret;
 	}
 	/* Confirm the region is marked as free. */
 	if (!test_check_exists(cache, SZ_128M, SZ_128K)) {
-		test_msg("Extent region not marked as free\n");
+		test_err("extent region not marked as free");
 		return -ENOENT;
 	}
 
@@ -772,19 +775,20 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
 	 * allocate the whole free space at once.
 	 */
 	if (!test_check_exists(cache, SZ_128M - 768 * SZ_1K, SZ_1M)) {
-		test_msg("Expected region not marked as free\n");
+		test_err("expected region not marked as free");
 		return -ENOENT;
 	}
 
 	if (cache->free_space_ctl->free_space != (SZ_1M + 2 * sectorsize)) {
-		test_msg("Cache free space is not 1Mb + %u\n", 2 * sectorsize);
+		test_err("cache free space is not 1Mb + %u", 2 * sectorsize);
 		return -EINVAL;
 	}
 
 	offset = btrfs_find_space_for_alloc(cache, 0, SZ_1M, 0,
 					    &max_extent_size);
 	if (offset != (SZ_128M - 768 * SZ_1K)) {
-		test_msg("Failed to allocate 1Mb from space cache, returned offset is: %llu\n",
+		test_err(
+	"failed to allocate 1Mb from space cache, returned offset is: %llu",
 			 offset);
 		return -EINVAL;
 	}
@@ -798,7 +802,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
 		return ret;
 
 	if (cache->free_space_ctl->free_space != 2 * sectorsize) {
-		test_msg("Cache free space is not %u\n", 2 * sectorsize);
+		test_err("cache free space is not %u", 2 * sectorsize);
 		return -EINVAL;
 	}
 
@@ -806,9 +810,8 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
 					    0, 2 * sectorsize, 0,
 					    &max_extent_size);
 	if (offset != SZ_32M) {
-		test_msg("Failed to allocate %u, offset: %llu\n",
-			 2 * sectorsize,
-			 offset);
+		test_err("failed to allocate %u, offset: %llu",
+			 2 * sectorsize, offset);
 		return -EINVAL;
 	}
 
@@ -829,7 +832,7 @@ int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize)
 	struct btrfs_root *root = NULL;
 	int ret = -ENOMEM;
 
-	test_msg("Running btrfs free space cache tests\n");
+	test_msg("running btrfs free space cache tests");
 	fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
 	if (!fs_info)
 		return -ENOMEM;
@@ -843,7 +846,7 @@ int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize)
 	cache = btrfs_alloc_dummy_block_group(fs_info,
 				      BITS_PER_BITMAP * sectorsize + PAGE_SIZE);
 	if (!cache) {
-		test_msg("Couldn't run the tests\n");
+		test_err("couldn't run the tests");
 		btrfs_free_dummy_fs_info(fs_info);
 		return 0;
 	}
@@ -871,6 +874,6 @@ out:
 	btrfs_free_dummy_block_group(cache);
 	btrfs_free_dummy_root(root);
 	btrfs_free_dummy_fs_info(fs_info);
-	test_msg("Free space cache tests finished\n");
+	test_msg("free space cache tests finished");
 	return ret;
 }
diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c
index e1f9666..89346da 100644
--- a/fs/btrfs/tests/free-space-tree-tests.c
+++ b/fs/btrfs/tests/free-space-tree-tests.c
@@ -32,7 +32,7 @@ static int __check_free_space_extents(struct btrfs_trans_handle *trans,
 
 	info = search_free_space_info(trans, fs_info, cache, path, 0);
 	if (IS_ERR(info)) {
-		test_msg("Could not find free space info\n");
+		test_err("could not find free space info");
 		ret = PTR_ERR(info);
 		goto out;
 	}
@@ -40,7 +40,7 @@ static int __check_free_space_extents(struct btrfs_trans_handle *trans,
 	extent_count = btrfs_free_space_extent_count(path->nodes[0], info);
 
 	if (extent_count != num_extents) {
-		test_msg("Extent count is wrong\n");
+		test_err("extent count is wrong");
 		ret = -EINVAL;
 		goto out;
 	}
@@ -99,7 +99,7 @@ out:
 	btrfs_release_path(path);
 	return ret;
 invalid:
-	test_msg("Free space tree is invalid\n");
+	test_err("free space tree is invalid");
 	ret = -EINVAL;
 	goto out;
 }
@@ -117,7 +117,7 @@ static int check_free_space_extents(struct btrfs_trans_handle *trans,
 
 	info = search_free_space_info(trans, fs_info, cache, path, 0);
 	if (IS_ERR(info)) {
-		test_msg("Could not find free space info\n");
+		test_err("could not find free space info");
 		btrfs_release_path(path);
 		return PTR_ERR(info);
 	}
@@ -131,15 +131,15 @@ static int check_free_space_extents(struct btrfs_trans_handle *trans,
 
 	/* Flip it to the other format and check that for good measure. */
 	if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) {
-		ret = convert_free_space_to_extents(trans, fs_info, cache, path);
+		ret = convert_free_space_to_extents(trans, cache, path);
 		if (ret) {
-			test_msg("Could not convert to extents\n");
+			test_err("could not convert to extents");
 			return ret;
 		}
 	} else {
-		ret = convert_free_space_to_bitmaps(trans, fs_info, cache, path);
+		ret = convert_free_space_to_bitmaps(trans, cache, path);
 		if (ret) {
-			test_msg("Could not convert to bitmaps\n");
+			test_err("could not convert to bitmaps");
 			return ret;
 		}
 	}
@@ -170,11 +170,11 @@ static int test_remove_all(struct btrfs_trans_handle *trans,
 	const struct free_space_extent extents[] = {};
 	int ret;
 
-	ret = __remove_from_free_space_tree(trans, fs_info, cache, path,
+	ret = __remove_from_free_space_tree(trans, cache, path,
 					    cache->key.objectid,
 					    cache->key.offset);
 	if (ret) {
-		test_msg("Could not remove free space\n");
+		test_err("could not remove free space");
 		return ret;
 	}
 
@@ -194,10 +194,10 @@ static int test_remove_beginning(struct btrfs_trans_handle *trans,
 	};
 	int ret;
 
-	ret = __remove_from_free_space_tree(trans, fs_info, cache, path,
+	ret = __remove_from_free_space_tree(trans, cache, path,
 					    cache->key.objectid, alignment);
 	if (ret) {
-		test_msg("Could not remove free space\n");
+		test_err("could not remove free space");
 		return ret;
 	}
 
@@ -217,12 +217,12 @@ static int test_remove_end(struct btrfs_trans_handle *trans,
 	};
 	int ret;
 
-	ret = __remove_from_free_space_tree(trans, fs_info, cache, path,
+	ret = __remove_from_free_space_tree(trans, cache, path,
 					    cache->key.objectid +
 					    cache->key.offset - alignment,
 					    alignment);
 	if (ret) {
-		test_msg("Could not remove free space\n");
+		test_err("could not remove free space");
 		return ret;
 	}
 
@@ -243,11 +243,11 @@ static int test_remove_middle(struct btrfs_trans_handle *trans,
 	};
 	int ret;
 
-	ret = __remove_from_free_space_tree(trans, fs_info, cache, path,
+	ret = __remove_from_free_space_tree(trans, cache, path,
 					    cache->key.objectid + alignment,
 					    alignment);
 	if (ret) {
-		test_msg("Could not remove free space\n");
+		test_err("could not remove free space");
 		return ret;
 	}
 
@@ -266,26 +266,26 @@ static int test_merge_left(struct btrfs_trans_handle *trans,
 	};
 	int ret;
 
-	ret = __remove_from_free_space_tree(trans, fs_info, cache, path,
+	ret = __remove_from_free_space_tree(trans, cache, path,
 					    cache->key.objectid,
 					    cache->key.offset);
 	if (ret) {
-		test_msg("Could not remove free space\n");
+		test_err("could not remove free space");
 		return ret;
 	}
 
-	ret = __add_to_free_space_tree(trans, fs_info, cache, path,
-				       cache->key.objectid, alignment);
+	ret = __add_to_free_space_tree(trans, cache, path, cache->key.objectid,
+				       alignment);
 	if (ret) {
-		test_msg("Could not add free space\n");
+		test_err("could not add free space");
 		return ret;
 	}
 
-	ret = __add_to_free_space_tree(trans, fs_info, cache, path,
+	ret = __add_to_free_space_tree(trans, cache, path,
 				       cache->key.objectid + alignment,
 				       alignment);
 	if (ret) {
-		test_msg("Could not add free space\n");
+		test_err("could not add free space");
 		return ret;
 	}
 
@@ -304,27 +304,27 @@ static int test_merge_right(struct btrfs_trans_handle *trans,
 	};
 	int ret;
 
-	ret = __remove_from_free_space_tree(trans, fs_info, cache, path,
+	ret = __remove_from_free_space_tree(trans, cache, path,
 					    cache->key.objectid,
 					    cache->key.offset);
 	if (ret) {
-		test_msg("Could not remove free space\n");
+		test_err("could not remove free space");
 		return ret;
 	}
 
-	ret = __add_to_free_space_tree(trans, fs_info, cache, path,
+	ret = __add_to_free_space_tree(trans, cache, path,
 				       cache->key.objectid + 2 * alignment,
 				       alignment);
 	if (ret) {
-		test_msg("Could not add free space\n");
+		test_err("could not add free space");
 		return ret;
 	}
 
-	ret = __add_to_free_space_tree(trans, fs_info, cache, path,
+	ret = __add_to_free_space_tree(trans, cache, path,
 				       cache->key.objectid + alignment,
 				       alignment);
 	if (ret) {
-		test_msg("Could not add free space\n");
+		test_err("could not add free space");
 		return ret;
 	}
 
@@ -343,34 +343,34 @@ static int test_merge_both(struct btrfs_trans_handle *trans,
 	};
 	int ret;
 
-	ret = __remove_from_free_space_tree(trans, fs_info, cache, path,
+	ret = __remove_from_free_space_tree(trans, cache, path,
 					    cache->key.objectid,
 					    cache->key.offset);
 	if (ret) {
-		test_msg("Could not remove free space\n");
+		test_err("could not remove free space");
 		return ret;
 	}
 
-	ret = __add_to_free_space_tree(trans, fs_info, cache, path,
-				       cache->key.objectid, alignment);
+	ret = __add_to_free_space_tree(trans, cache, path, cache->key.objectid,
+				       alignment);
 	if (ret) {
-		test_msg("Could not add free space\n");
+		test_err("could not add free space");
 		return ret;
 	}
 
-	ret = __add_to_free_space_tree(trans, fs_info, cache, path,
+	ret = __add_to_free_space_tree(trans, cache, path,
 				       cache->key.objectid + 2 * alignment,
 				       alignment);
 	if (ret) {
-		test_msg("Could not add free space\n");
+		test_err("could not add free space");
 		return ret;
 	}
 
-	ret = __add_to_free_space_tree(trans, fs_info, cache, path,
+	ret = __add_to_free_space_tree(trans, cache, path,
 				       cache->key.objectid + alignment,
 				       alignment);
 	if (ret) {
-		test_msg("Could not add free space\n");
+		test_err("could not add free space");
 		return ret;
 	}
 
@@ -391,34 +391,34 @@ static int test_merge_none(struct btrfs_trans_handle *trans,
 	};
 	int ret;
 
-	ret = __remove_from_free_space_tree(trans, fs_info, cache, path,
+	ret = __remove_from_free_space_tree(trans, cache, path,
 					    cache->key.objectid,
 					    cache->key.offset);
 	if (ret) {
-		test_msg("Could not remove free space\n");
+		test_err("could not remove free space");
 		return ret;
 	}
 
-	ret = __add_to_free_space_tree(trans, fs_info, cache, path,
-				       cache->key.objectid, alignment);
+	ret = __add_to_free_space_tree(trans, cache, path, cache->key.objectid,
+				       alignment);
 	if (ret) {
-		test_msg("Could not add free space\n");
+		test_err("could not add free space");
 		return ret;
 	}
 
-	ret = __add_to_free_space_tree(trans, fs_info, cache, path,
+	ret = __add_to_free_space_tree(trans, cache, path,
 				       cache->key.objectid + 4 * alignment,
 				       alignment);
 	if (ret) {
-		test_msg("Could not add free space\n");
+		test_err("could not add free space");
 		return ret;
 	}
 
-	ret = __add_to_free_space_tree(trans, fs_info, cache, path,
+	ret = __add_to_free_space_tree(trans, cache, path,
 				       cache->key.objectid + 2 * alignment,
 				       alignment);
 	if (ret) {
-		test_msg("Could not add free space\n");
+		test_err("could not add free space");
 		return ret;
 	}
 
@@ -444,14 +444,14 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize,
 
 	fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
 	if (!fs_info) {
-		test_msg("Couldn't allocate dummy fs info\n");
+		test_err("couldn't allocate dummy fs info");
 		ret = -ENOMEM;
 		goto out;
 	}
 
 	root = btrfs_alloc_dummy_root(fs_info);
 	if (IS_ERR(root)) {
-		test_msg("Couldn't allocate dummy root\n");
+		test_err("couldn't allocate dummy root");
 		ret = PTR_ERR(root);
 		goto out;
 	}
@@ -463,7 +463,7 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize,
 
 	root->node = alloc_test_extent_buffer(root->fs_info, nodesize);
 	if (!root->node) {
-		test_msg("Couldn't allocate dummy buffer\n");
+		test_err("couldn't allocate dummy buffer");
 		ret = -ENOMEM;
 		goto out;
 	}
@@ -473,7 +473,7 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize,
 
 	cache = btrfs_alloc_dummy_block_group(fs_info, 8 * alignment);
 	if (!cache) {
-		test_msg("Couldn't allocate dummy block group cache\n");
+		test_err("couldn't allocate dummy block group cache");
 		ret = -ENOMEM;
 		goto out;
 	}
@@ -482,26 +482,25 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize,
 	cache->needs_free_space = 1;
 	cache->fs_info = root->fs_info;
 
-	btrfs_init_dummy_trans(&trans);
+	btrfs_init_dummy_trans(&trans, root->fs_info);
 
 	path = btrfs_alloc_path();
 	if (!path) {
-		test_msg("Couldn't allocate path\n");
+		test_err("couldn't allocate path");
 		ret = -ENOMEM;
 		goto out;
 	}
 
-	ret = add_block_group_free_space(&trans, root->fs_info, cache);
+	ret = add_block_group_free_space(&trans, cache);
 	if (ret) {
-		test_msg("Could not add block group free space\n");
+		test_err("could not add block group free space");
 		goto out;
 	}
 
 	if (bitmaps) {
-		ret = convert_free_space_to_bitmaps(&trans, root->fs_info,
-						    cache, path);
+		ret = convert_free_space_to_bitmaps(&trans, cache, path);
 		if (ret) {
-			test_msg("Could not convert block group to bitmaps\n");
+			test_err("could not convert block group to bitmaps");
 			goto out;
 		}
 	}
@@ -510,14 +509,14 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize,
 	if (ret)
 		goto out;
 
-	ret = remove_block_group_free_space(&trans, root->fs_info, cache);
+	ret = remove_block_group_free_space(&trans, cache);
 	if (ret) {
-		test_msg("Could not remove block group free space\n");
+		test_err("could not remove block group free space");
 		goto out;
 	}
 
 	if (btrfs_header_nritems(root->node) != 0) {
-		test_msg("Free space tree has leftover items\n");
+		test_err("free space tree has leftover items");
 		ret = -EINVAL;
 		goto out;
 	}
@@ -539,14 +538,16 @@ static int run_test_both_formats(test_func_t test_func, u32 sectorsize,
 
 	ret = run_test(test_func, 0, sectorsize, nodesize, alignment);
 	if (ret) {
-		test_msg("%pf failed with extents, sectorsize=%u, nodesize=%u, alignment=%u\n",
+		test_err(
+	"%pf failed with extents, sectorsize=%u, nodesize=%u, alignment=%u",
 			 test_func, sectorsize, nodesize, alignment);
 		test_ret = ret;
 	}
 
 	ret = run_test(test_func, 1, sectorsize, nodesize, alignment);
 	if (ret) {
-		test_msg("%pf failed with bitmaps, sectorsize=%u, nodesize=%u, alignment=%u\n",
+		test_err(
+	"%pf failed with bitmaps, sectorsize=%u, nodesize=%u, alignment=%u",
 			 test_func, sectorsize, nodesize, alignment);
 		test_ret = ret;
 	}
@@ -577,7 +578,7 @@ int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize)
 	 */
 	bitmap_alignment = BTRFS_FREE_SPACE_BITMAP_BITS * PAGE_SIZE;
 
-	test_msg("Running free space tree tests\n");
+	test_msg("running free space tree tests");
 	for (i = 0; i < ARRAY_SIZE(tests); i++) {
 		int ret;
 
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index e0ba799..64043f0 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -228,7 +228,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 
 	inode = btrfs_new_test_inode();
 	if (!inode) {
-		test_msg("Couldn't allocate inode\n");
+		test_err("couldn't allocate inode");
 		return ret;
 	}
 
@@ -238,19 +238,19 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 
 	fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
 	if (!fs_info) {
-		test_msg("Couldn't allocate dummy fs info\n");
+		test_err("couldn't allocate dummy fs info");
 		goto out;
 	}
 
 	root = btrfs_alloc_dummy_root(fs_info);
 	if (IS_ERR(root)) {
-		test_msg("Couldn't allocate root\n");
+		test_err("couldn't allocate root");
 		goto out;
 	}
 
 	root->node = alloc_dummy_extent_buffer(fs_info, nodesize);
 	if (!root->node) {
-		test_msg("Couldn't allocate dummy buffer\n");
+		test_err("couldn't allocate dummy buffer");
 		goto out;
 	}
 
@@ -268,11 +268,11 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, sectorsize, 0);
 	if (IS_ERR(em)) {
 		em = NULL;
-		test_msg("Got an error when we shouldn't have\n");
+		test_err("got an error when we shouldn't have");
 		goto out;
 	}
 	if (em->block_start != EXTENT_MAP_HOLE) {
-		test_msg("Expected a hole, got %llu\n", em->block_start);
+		test_err("expected a hole, got %llu", em->block_start);
 		goto out;
 	}
 	free_extent_map(em);
@@ -287,20 +287,21 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 
 	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, (u64)-1, 0);
 	if (IS_ERR(em)) {
-		test_msg("Got an error when we shouldn't have\n");
+		test_err("got an error when we shouldn't have");
 		goto out;
 	}
 	if (em->block_start != EXTENT_MAP_HOLE) {
-		test_msg("Expected a hole, got %llu\n", em->block_start);
+		test_err("expected a hole, got %llu", em->block_start);
 		goto out;
 	}
 	if (em->start != 0 || em->len != 5) {
-		test_msg("Unexpected extent wanted start 0 len 5, got start "
-			 "%llu len %llu\n", em->start, em->len);
+		test_err(
+		"unexpected extent wanted start 0 len 5, got start %llu len %llu",
+			em->start, em->len);
 		goto out;
 	}
 	if (em->flags != 0) {
-		test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+		test_err("unexpected flags set, want 0 have %lu", em->flags);
 		goto out;
 	}
 	offset = em->start + em->len;
@@ -308,21 +309,22 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 
 	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
 	if (IS_ERR(em)) {
-		test_msg("Got an error when we shouldn't have\n");
+		test_err("got an error when we shouldn't have");
 		goto out;
 	}
 	if (em->block_start != EXTENT_MAP_INLINE) {
-		test_msg("Expected an inline, got %llu\n", em->block_start);
+		test_err("expected an inline, got %llu", em->block_start);
 		goto out;
 	}
 
 	if (em->start != offset || em->len != (sectorsize - 5)) {
-		test_msg("Unexpected extent wanted start %llu len 1, got start "
-			 "%llu len %llu\n", offset, em->start, em->len);
+		test_err(
+	"unexpected extent wanted start %llu len 1, got start %llu len %llu",
+			offset, em->start, em->len);
 		goto out;
 	}
 	if (em->flags != 0) {
-		test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+		test_err("unexpected flags set, want 0 have %lu", em->flags);
 		goto out;
 	}
 	/*
@@ -335,20 +337,21 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 
 	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
 	if (IS_ERR(em)) {
-		test_msg("Got an error when we shouldn't have\n");
+		test_err("got an error when we shouldn't have");
 		goto out;
 	}
 	if (em->block_start != EXTENT_MAP_HOLE) {
-		test_msg("Expected a hole, got %llu\n", em->block_start);
+		test_err("expected a hole, got %llu", em->block_start);
 		goto out;
 	}
 	if (em->start != offset || em->len != 4) {
-		test_msg("Unexpected extent wanted start %llu len 4, got start "
-			 "%llu len %llu\n", offset, em->start, em->len);
+		test_err(
+	"unexpected extent wanted start %llu len 4, got start %llu len %llu",
+			offset, em->start, em->len);
 		goto out;
 	}
 	if (em->flags != 0) {
-		test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+		test_err("unexpected flags set, want 0 have %lu", em->flags);
 		goto out;
 	}
 	offset = em->start + em->len;
@@ -357,24 +360,25 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 	/* Regular extent */
 	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
 	if (IS_ERR(em)) {
-		test_msg("Got an error when we shouldn't have\n");
+		test_err("got an error when we shouldn't have");
 		goto out;
 	}
 	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
-		test_msg("Expected a real extent, got %llu\n", em->block_start);
+		test_err("expected a real extent, got %llu", em->block_start);
 		goto out;
 	}
 	if (em->start != offset || em->len != sectorsize - 1) {
-		test_msg("Unexpected extent wanted start %llu len 4095, got "
-			 "start %llu len %llu\n", offset, em->start, em->len);
+		test_err(
+	"unexpected extent wanted start %llu len 4095, got start %llu len %llu",
+			offset, em->start, em->len);
 		goto out;
 	}
 	if (em->flags != 0) {
-		test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+		test_err("unexpected flags set, want 0 have %lu", em->flags);
 		goto out;
 	}
 	if (em->orig_start != em->start) {
-		test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+		test_err("wrong orig offset, want %llu, have %llu", em->start,
 			 em->orig_start);
 		goto out;
 	}
@@ -384,25 +388,25 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 	/* The next 3 are split extents */
 	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
 	if (IS_ERR(em)) {
-		test_msg("Got an error when we shouldn't have\n");
+		test_err("got an error when we shouldn't have");
 		goto out;
 	}
 	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
-		test_msg("Expected a real extent, got %llu\n", em->block_start);
+		test_err("expected a real extent, got %llu", em->block_start);
 		goto out;
 	}
 	if (em->start != offset || em->len != sectorsize) {
-		test_msg("Unexpected extent start %llu len %u, "
-			"got start %llu len %llu\n",
+		test_err(
+		"unexpected extent start %llu len %u, got start %llu len %llu",
 			offset, sectorsize, em->start, em->len);
 		goto out;
 	}
 	if (em->flags != 0) {
-		test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+		test_err("unexpected flags set, want 0 have %lu", em->flags);
 		goto out;
 	}
 	if (em->orig_start != em->start) {
-		test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+		test_err("wrong orig offset, want %llu, have %llu", em->start,
 			 em->orig_start);
 		goto out;
 	}
@@ -413,21 +417,21 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 
 	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
 	if (IS_ERR(em)) {
-		test_msg("Got an error when we shouldn't have\n");
+		test_err("got an error when we shouldn't have");
 		goto out;
 	}
 	if (em->block_start != EXTENT_MAP_HOLE) {
-		test_msg("Expected a hole, got %llu\n", em->block_start);
+		test_err("expected a hole, got %llu", em->block_start);
 		goto out;
 	}
 	if (em->start != offset || em->len != sectorsize) {
-		test_msg("Unexpected extent wanted start %llu len %u, "
-			"got start %llu len %llu\n",
+		test_err(
+	"unexpected extent wanted start %llu len %u, got start %llu len %llu",
 			offset, sectorsize, em->start, em->len);
 		goto out;
 	}
 	if (em->flags != 0) {
-		test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+		test_err("unexpected flags set, want 0 have %lu", em->flags);
 		goto out;
 	}
 	offset = em->start + em->len;
@@ -435,31 +439,31 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 
 	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
 	if (IS_ERR(em)) {
-		test_msg("Got an error when we shouldn't have\n");
+		test_err("got an error when we shouldn't have");
 		goto out;
 	}
 	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
-		test_msg("Expected a real extent, got %llu\n", em->block_start);
+		test_err("expected a real extent, got %llu", em->block_start);
 		goto out;
 	}
 	if (em->start != offset || em->len != 2 * sectorsize) {
-		test_msg("Unexpected extent wanted start %llu len %u, "
-			"got start %llu len %llu\n",
+		test_err(
+	"unexpected extent wanted start %llu len %u, got start %llu len %llu",
 			offset, 2 * sectorsize, em->start, em->len);
 		goto out;
 	}
 	if (em->flags != 0) {
-		test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+		test_err("unexpected flags set, want 0 have %lu", em->flags);
 		goto out;
 	}
 	if (em->orig_start != orig_start) {
-		test_msg("Wrong orig offset, want %llu, have %llu\n",
+		test_err("wrong orig offset, want %llu, have %llu",
 			 orig_start, em->orig_start);
 		goto out;
 	}
 	disk_bytenr += (em->start - orig_start);
 	if (em->block_start != disk_bytenr) {
-		test_msg("Wrong block start, want %llu, have %llu\n",
+		test_err("wrong block start, want %llu, have %llu",
 			 disk_bytenr, em->block_start);
 		goto out;
 	}
@@ -469,26 +473,26 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 	/* Prealloc extent */
 	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
 	if (IS_ERR(em)) {
-		test_msg("Got an error when we shouldn't have\n");
+		test_err("got an error when we shouldn't have");
 		goto out;
 	}
 	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
-		test_msg("Expected a real extent, got %llu\n", em->block_start);
+		test_err("expected a real extent, got %llu", em->block_start);
 		goto out;
 	}
 	if (em->start != offset || em->len != sectorsize) {
-		test_msg("Unexpected extent wanted start %llu len %u, "
-			"got start %llu len %llu\n",
+		test_err(
+	"unexpected extent wanted start %llu len %u, got start %llu len %llu",
 			offset, sectorsize, em->start, em->len);
 		goto out;
 	}
 	if (em->flags != prealloc_only) {
-		test_msg("Unexpected flags set, want %lu have %lu\n",
+		test_err("unexpected flags set, want %lu have %lu",
 			 prealloc_only, em->flags);
 		goto out;
 	}
 	if (em->orig_start != em->start) {
-		test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+		test_err("wrong orig offset, want %llu, have %llu", em->start,
 			 em->orig_start);
 		goto out;
 	}
@@ -498,26 +502,26 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 	/* The next 3 are a half written prealloc extent */
 	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
 	if (IS_ERR(em)) {
-		test_msg("Got an error when we shouldn't have\n");
+		test_err("got an error when we shouldn't have");
 		goto out;
 	}
 	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
-		test_msg("Expected a real extent, got %llu\n", em->block_start);
+		test_err("expected a real extent, got %llu", em->block_start);
 		goto out;
 	}
 	if (em->start != offset || em->len != sectorsize) {
-		test_msg("Unexpected extent wanted start %llu len %u, "
-			"got start %llu len %llu\n",
+		test_err(
+	"unexpected extent wanted start %llu len %u, got start %llu len %llu",
 			offset, sectorsize, em->start, em->len);
 		goto out;
 	}
 	if (em->flags != prealloc_only) {
-		test_msg("Unexpected flags set, want %lu have %lu\n",
+		test_err("unexpected flags set, want %lu have %lu",
 			 prealloc_only, em->flags);
 		goto out;
 	}
 	if (em->orig_start != em->start) {
-		test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+		test_err("wrong orig offset, want %llu, have %llu", em->start,
 			 em->orig_start);
 		goto out;
 	}
@@ -528,30 +532,30 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 
 	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
 	if (IS_ERR(em)) {
-		test_msg("Got an error when we shouldn't have\n");
+		test_err("got an error when we shouldn't have");
 		goto out;
 	}
 	if (em->block_start >= EXTENT_MAP_HOLE) {
-		test_msg("Expected a real extent, got %llu\n", em->block_start);
+		test_err("expected a real extent, got %llu", em->block_start);
 		goto out;
 	}
 	if (em->start != offset || em->len != sectorsize) {
-		test_msg("Unexpected extent wanted start %llu len %u, "
-			"got start %llu len %llu\n",
+		test_err(
+	"unexpected extent wanted start %llu len %u, got start %llu len %llu",
 			offset, sectorsize, em->start, em->len);
 		goto out;
 	}
 	if (em->flags != 0) {
-		test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+		test_err("unexpected flags set, want 0 have %lu", em->flags);
 		goto out;
 	}
 	if (em->orig_start != orig_start) {
-		test_msg("Unexpected orig offset, wanted %llu, have %llu\n",
+		test_err("unexpected orig offset, wanted %llu, have %llu",
 			 orig_start, em->orig_start);
 		goto out;
 	}
 	if (em->block_start != (disk_bytenr + (em->start - em->orig_start))) {
-		test_msg("Unexpected block start, wanted %llu, have %llu\n",
+		test_err("unexpected block start, wanted %llu, have %llu",
 			 disk_bytenr + (em->start - em->orig_start),
 			 em->block_start);
 		goto out;
@@ -561,31 +565,31 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 
 	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
 	if (IS_ERR(em)) {
-		test_msg("Got an error when we shouldn't have\n");
+		test_err("got an error when we shouldn't have");
 		goto out;
 	}
 	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
-		test_msg("Expected a real extent, got %llu\n", em->block_start);
+		test_err("expected a real extent, got %llu", em->block_start);
 		goto out;
 	}
 	if (em->start != offset || em->len != 2 * sectorsize) {
-		test_msg("Unexpected extent wanted start %llu len %u, "
-			"got start %llu len %llu\n",
+		test_err(
+	"unexpected extent wanted start %llu len %u, got start %llu len %llu",
 			offset, 2 * sectorsize, em->start, em->len);
 		goto out;
 	}
 	if (em->flags != prealloc_only) {
-		test_msg("Unexpected flags set, want %lu have %lu\n",
+		test_err("unexpected flags set, want %lu have %lu",
 			 prealloc_only, em->flags);
 		goto out;
 	}
 	if (em->orig_start != orig_start) {
-		test_msg("Wrong orig offset, want %llu, have %llu\n", orig_start,
+		test_err("wrong orig offset, want %llu, have %llu", orig_start,
 			 em->orig_start);
 		goto out;
 	}
 	if (em->block_start != (disk_bytenr + (em->start - em->orig_start))) {
-		test_msg("Unexpected block start, wanted %llu, have %llu\n",
+		test_err("unexpected block start, wanted %llu, have %llu",
 			 disk_bytenr + (em->start - em->orig_start),
 			 em->block_start);
 		goto out;
@@ -596,31 +600,31 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 	/* Now for the compressed extent */
 	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
 	if (IS_ERR(em)) {
-		test_msg("Got an error when we shouldn't have\n");
+		test_err("got an error when we shouldn't have");
 		goto out;
 	}
 	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
-		test_msg("Expected a real extent, got %llu\n", em->block_start);
+		test_err("expected a real extent, got %llu", em->block_start);
 		goto out;
 	}
 	if (em->start != offset || em->len != 2 * sectorsize) {
-		test_msg("Unexpected extent wanted start %llu len %u,"
-			"got start %llu len %llu\n",
+		test_err(
+	"unexpected extent wanted start %llu len %u, got start %llu len %llu",
 			offset, 2 * sectorsize, em->start, em->len);
 		goto out;
 	}
 	if (em->flags != compressed_only) {
-		test_msg("Unexpected flags set, want %lu have %lu\n",
+		test_err("unexpected flags set, want %lu have %lu",
 			 compressed_only, em->flags);
 		goto out;
 	}
 	if (em->orig_start != em->start) {
-		test_msg("Wrong orig offset, want %llu, have %llu\n",
+		test_err("wrong orig offset, want %llu, have %llu",
 			 em->start, em->orig_start);
 		goto out;
 	}
 	if (em->compress_type != BTRFS_COMPRESS_ZLIB) {
-		test_msg("Unexpected compress type, wanted %d, got %d\n",
+		test_err("unexpected compress type, wanted %d, got %d",
 			 BTRFS_COMPRESS_ZLIB, em->compress_type);
 		goto out;
 	}
@@ -630,31 +634,31 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 	/* Split compressed extent */
 	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
 	if (IS_ERR(em)) {
-		test_msg("Got an error when we shouldn't have\n");
+		test_err("got an error when we shouldn't have");
 		goto out;
 	}
 	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
-		test_msg("Expected a real extent, got %llu\n", em->block_start);
+		test_err("expected a real extent, got %llu", em->block_start);
 		goto out;
 	}
 	if (em->start != offset || em->len != sectorsize) {
-		test_msg("Unexpected extent wanted start %llu len %u,"
-			"got start %llu len %llu\n",
+		test_err(
+	"unexpected extent wanted start %llu len %u, got start %llu len %llu",
 			offset, sectorsize, em->start, em->len);
 		goto out;
 	}
 	if (em->flags != compressed_only) {
-		test_msg("Unexpected flags set, want %lu have %lu\n",
+		test_err("unexpected flags set, want %lu have %lu",
 			 compressed_only, em->flags);
 		goto out;
 	}
 	if (em->orig_start != em->start) {
-		test_msg("Wrong orig offset, want %llu, have %llu\n",
+		test_err("wrong orig offset, want %llu, have %llu",
 			 em->start, em->orig_start);
 		goto out;
 	}
 	if (em->compress_type != BTRFS_COMPRESS_ZLIB) {
-		test_msg("Unexpected compress type, wanted %d, got %d\n",
+		test_err("unexpected compress type, wanted %d, got %d",
 			 BTRFS_COMPRESS_ZLIB, em->compress_type);
 		goto out;
 	}
@@ -665,25 +669,25 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 
 	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
 	if (IS_ERR(em)) {
-		test_msg("Got an error when we shouldn't have\n");
+		test_err("got an error when we shouldn't have");
 		goto out;
 	}
 	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
-		test_msg("Expected a real extent, got %llu\n", em->block_start);
+		test_err("expected a real extent, got %llu", em->block_start);
 		goto out;
 	}
 	if (em->start != offset || em->len != sectorsize) {
-		test_msg("Unexpected extent wanted start %llu len %u, "
-			"got start %llu len %llu\n",
+		test_err(
+	"unexpected extent wanted start %llu len %u, got start %llu len %llu",
 			offset, sectorsize, em->start, em->len);
 		goto out;
 	}
 	if (em->flags != 0) {
-		test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+		test_err("unexpected flags set, want 0 have %lu", em->flags);
 		goto out;
 	}
 	if (em->orig_start != em->start) {
-		test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+		test_err("wrong orig offset, want %llu, have %llu", em->start,
 			 em->orig_start);
 		goto out;
 	}
@@ -692,32 +696,32 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 
 	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
 	if (IS_ERR(em)) {
-		test_msg("Got an error when we shouldn't have\n");
+		test_err("got an error when we shouldn't have");
 		goto out;
 	}
 	if (em->block_start != disk_bytenr) {
-		test_msg("Block start does not match, want %llu got %llu\n",
+		test_err("block start does not match, want %llu got %llu",
 			 disk_bytenr, em->block_start);
 		goto out;
 	}
 	if (em->start != offset || em->len != 2 * sectorsize) {
-		test_msg("Unexpected extent wanted start %llu len %u, "
-			"got start %llu len %llu\n",
+		test_err(
+	"unexpected extent wanted start %llu len %u, got start %llu len %llu",
 			offset, 2 * sectorsize, em->start, em->len);
 		goto out;
 	}
 	if (em->flags != compressed_only) {
-		test_msg("Unexpected flags set, want %lu have %lu\n",
+		test_err("unexpected flags set, want %lu have %lu",
 			 compressed_only, em->flags);
 		goto out;
 	}
 	if (em->orig_start != orig_start) {
-		test_msg("Wrong orig offset, want %llu, have %llu\n",
+		test_err("wrong orig offset, want %llu, have %llu",
 			 em->start, orig_start);
 		goto out;
 	}
 	if (em->compress_type != BTRFS_COMPRESS_ZLIB) {
-		test_msg("Unexpected compress type, wanted %d, got %d\n",
+		test_err("unexpected compress type, wanted %d, got %d",
 			 BTRFS_COMPRESS_ZLIB, em->compress_type);
 		goto out;
 	}
@@ -728,25 +732,25 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset + 6,
 			sectorsize, 0);
 	if (IS_ERR(em)) {
-		test_msg("Got an error when we shouldn't have\n");
+		test_err("got an error when we shouldn't have");
 		goto out;
 	}
 	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
-		test_msg("Expected a real extent, got %llu\n", em->block_start);
+		test_err("expected a real extent, got %llu", em->block_start);
 		goto out;
 	}
 	if (em->start != offset || em->len != sectorsize) {
-		test_msg("Unexpected extent wanted start %llu len %u, "
-			"got start %llu len %llu\n",
+		test_err(
+	"unexpected extent wanted start %llu len %u, got start %llu len %llu",
 			offset, sectorsize, em->start, em->len);
 		goto out;
 	}
 	if (em->flags != 0) {
-		test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+		test_err("unexpected flags set, want 0 have %lu", em->flags);
 		goto out;
 	}
 	if (em->orig_start != em->start) {
-		test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+		test_err("wrong orig offset, want %llu, have %llu", em->start,
 			 em->orig_start);
 		goto out;
 	}
@@ -755,11 +759,11 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 
 	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, SZ_4M, 0);
 	if (IS_ERR(em)) {
-		test_msg("Got an error when we shouldn't have\n");
+		test_err("got an error when we shouldn't have");
 		goto out;
 	}
 	if (em->block_start != EXTENT_MAP_HOLE) {
-		test_msg("Expected a hole extent, got %llu\n", em->block_start);
+		test_err("expected a hole extent, got %llu", em->block_start);
 		goto out;
 	}
 	/*
@@ -768,18 +772,18 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 	 * test.
 	 */
 	if (em->start != offset || em->len != 3 * sectorsize) {
-		test_msg("Unexpected extent wanted start %llu len %u, "
-			"got start %llu len %llu\n",
+		test_err(
+	"unexpected extent wanted start %llu len %u, got start %llu len %llu",
 			offset, 3 * sectorsize, em->start, em->len);
 		goto out;
 	}
 	if (em->flags != vacancy_only) {
-		test_msg("Unexpected flags set, want %lu have %lu\n",
+		test_err("unexpected flags set, want %lu have %lu",
 			 vacancy_only, em->flags);
 		goto out;
 	}
 	if (em->orig_start != em->start) {
-		test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+		test_err("wrong orig offset, want %llu, have %llu", em->start,
 			 em->orig_start);
 		goto out;
 	}
@@ -788,25 +792,25 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 
 	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
 	if (IS_ERR(em)) {
-		test_msg("Got an error when we shouldn't have\n");
+		test_err("got an error when we shouldn't have");
 		goto out;
 	}
 	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
-		test_msg("Expected a real extent, got %llu\n", em->block_start);
+		test_err("expected a real extent, got %llu", em->block_start);
 		goto out;
 	}
 	if (em->start != offset || em->len != sectorsize) {
-		test_msg("Unexpected extent wanted start %llu len %u,"
-			"got start %llu len %llu\n",
+		test_err(
+	"unexpected extent wanted start %llu len %u, got start %llu len %llu",
 			offset, sectorsize, em->start, em->len);
 		goto out;
 	}
 	if (em->flags != 0) {
-		test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+		test_err("unexpected flags set, want 0 have %lu", em->flags);
 		goto out;
 	}
 	if (em->orig_start != em->start) {
-		test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+		test_err("wrong orig offset, want %llu, have %llu", em->start,
 			 em->orig_start);
 		goto out;
 	}
@@ -830,7 +834,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
 
 	inode = btrfs_new_test_inode();
 	if (!inode) {
-		test_msg("Couldn't allocate inode\n");
+		test_err("couldn't allocate inode");
 		return ret;
 	}
 
@@ -840,19 +844,19 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
 
 	fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
 	if (!fs_info) {
-		test_msg("Couldn't allocate dummy fs info\n");
+		test_err("couldn't allocate dummy fs info");
 		goto out;
 	}
 
 	root = btrfs_alloc_dummy_root(fs_info);
 	if (IS_ERR(root)) {
-		test_msg("Couldn't allocate root\n");
+		test_err("couldn't allocate root");
 		goto out;
 	}
 
 	root->node = alloc_dummy_extent_buffer(fs_info, nodesize);
 	if (!root->node) {
-		test_msg("Couldn't allocate dummy buffer\n");
+		test_err("couldn't allocate dummy buffer");
 		goto out;
 	}
 
@@ -871,21 +875,21 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
 		      sectorsize, BTRFS_FILE_EXTENT_REG, 0, 1);
 	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, 2 * sectorsize, 0);
 	if (IS_ERR(em)) {
-		test_msg("Got an error when we shouldn't have\n");
+		test_err("got an error when we shouldn't have");
 		goto out;
 	}
 	if (em->block_start != EXTENT_MAP_HOLE) {
-		test_msg("Expected a hole, got %llu\n", em->block_start);
+		test_err("expected a hole, got %llu", em->block_start);
 		goto out;
 	}
 	if (em->start != 0 || em->len != sectorsize) {
-		test_msg("Unexpected extent wanted start 0 len %u, "
-			"got start %llu len %llu\n",
+		test_err(
+	"unexpected extent wanted start 0 len %u, got start %llu len %llu",
 			sectorsize, em->start, em->len);
 		goto out;
 	}
 	if (em->flags != vacancy_only) {
-		test_msg("Wrong flags, wanted %lu, have %lu\n", vacancy_only,
+		test_err("wrong flags, wanted %lu, have %lu", vacancy_only,
 			 em->flags);
 		goto out;
 	}
@@ -894,21 +898,21 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
 	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, sectorsize,
 			2 * sectorsize, 0);
 	if (IS_ERR(em)) {
-		test_msg("Got an error when we shouldn't have\n");
+		test_err("got an error when we shouldn't have");
 		goto out;
 	}
 	if (em->block_start != sectorsize) {
-		test_msg("Expected a real extent, got %llu\n", em->block_start);
+		test_err("expected a real extent, got %llu", em->block_start);
 		goto out;
 	}
 	if (em->start != sectorsize || em->len != sectorsize) {
-		test_msg("Unexpected extent wanted start %u len %u, "
-			"got start %llu len %llu\n",
+		test_err(
+	"unexpected extent wanted start %u len %u, got start %llu len %llu",
 			sectorsize, sectorsize, em->start, em->len);
 		goto out;
 	}
 	if (em->flags != 0) {
-		test_msg("Unexpected flags set, wanted 0 got %lu\n",
+		test_err("unexpected flags set, wanted 0 got %lu",
 			 em->flags);
 		goto out;
 	}
@@ -931,19 +935,19 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
 
 	inode = btrfs_new_test_inode();
 	if (!inode) {
-		test_msg("Couldn't allocate inode\n");
+		test_err("couldn't allocate inode");
 		return ret;
 	}
 
 	fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
 	if (!fs_info) {
-		test_msg("Couldn't allocate dummy fs info\n");
+		test_err("couldn't allocate dummy fs info");
 		goto out;
 	}
 
 	root = btrfs_alloc_dummy_root(fs_info);
 	if (IS_ERR(root)) {
-		test_msg("Couldn't allocate root\n");
+		test_err("couldn't allocate root");
 		goto out;
 	}
 
@@ -954,12 +958,12 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
 	ret = btrfs_set_extent_delalloc(inode, 0, BTRFS_MAX_EXTENT_SIZE - 1, 0,
 					NULL, 0);
 	if (ret) {
-		test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+		test_err("btrfs_set_extent_delalloc returned %d", ret);
 		goto out;
 	}
 	if (BTRFS_I(inode)->outstanding_extents != 1) {
 		ret = -EINVAL;
-		test_msg("Miscount, wanted 1, got %u\n",
+		test_err("miscount, wanted 1, got %u",
 			 BTRFS_I(inode)->outstanding_extents);
 		goto out;
 	}
@@ -969,12 +973,12 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
 					BTRFS_MAX_EXTENT_SIZE + sectorsize - 1,
 					0, NULL, 0);
 	if (ret) {
-		test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+		test_err("btrfs_set_extent_delalloc returned %d", ret);
 		goto out;
 	}
 	if (BTRFS_I(inode)->outstanding_extents != 2) {
 		ret = -EINVAL;
-		test_msg("Miscount, wanted 2, got %u\n",
+		test_err("miscount, wanted 2, got %u",
 			 BTRFS_I(inode)->outstanding_extents);
 		goto out;
 	}
@@ -986,12 +990,12 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
 			       EXTENT_DELALLOC | EXTENT_DIRTY |
 			       EXTENT_UPTODATE, 0, 0, NULL);
 	if (ret) {
-		test_msg("clear_extent_bit returned %d\n", ret);
+		test_err("clear_extent_bit returned %d", ret);
 		goto out;
 	}
 	if (BTRFS_I(inode)->outstanding_extents != 2) {
 		ret = -EINVAL;
-		test_msg("Miscount, wanted 2, got %u\n",
+		test_err("miscount, wanted 2, got %u",
 			 BTRFS_I(inode)->outstanding_extents);
 		goto out;
 	}
@@ -1002,12 +1006,12 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
 					+ sectorsize - 1,
 					0, NULL, 0);
 	if (ret) {
-		test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+		test_err("btrfs_set_extent_delalloc returned %d", ret);
 		goto out;
 	}
 	if (BTRFS_I(inode)->outstanding_extents != 2) {
 		ret = -EINVAL;
-		test_msg("Miscount, wanted 2, got %u\n",
+		test_err("miscount, wanted 2, got %u",
 			 BTRFS_I(inode)->outstanding_extents);
 		goto out;
 	}
@@ -1020,12 +1024,12 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
 			(BTRFS_MAX_EXTENT_SIZE << 1) + 3 * sectorsize - 1,
 			0, NULL, 0);
 	if (ret) {
-		test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+		test_err("btrfs_set_extent_delalloc returned %d", ret);
 		goto out;
 	}
 	if (BTRFS_I(inode)->outstanding_extents != 4) {
 		ret = -EINVAL;
-		test_msg("Miscount, wanted 4, got %u\n",
+		test_err("miscount, wanted 4, got %u",
 			 BTRFS_I(inode)->outstanding_extents);
 		goto out;
 	}
@@ -1037,12 +1041,12 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
 			BTRFS_MAX_EXTENT_SIZE + sectorsize,
 			BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL, 0);
 	if (ret) {
-		test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+		test_err("btrfs_set_extent_delalloc returned %d", ret);
 		goto out;
 	}
 	if (BTRFS_I(inode)->outstanding_extents != 3) {
 		ret = -EINVAL;
-		test_msg("Miscount, wanted 3, got %u\n",
+		test_err("miscount, wanted 3, got %u",
 			 BTRFS_I(inode)->outstanding_extents);
 		goto out;
 	}
@@ -1054,12 +1058,12 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
 			       EXTENT_DIRTY | EXTENT_DELALLOC |
 			       EXTENT_UPTODATE, 0, 0, NULL);
 	if (ret) {
-		test_msg("clear_extent_bit returned %d\n", ret);
+		test_err("clear_extent_bit returned %d", ret);
 		goto out;
 	}
 	if (BTRFS_I(inode)->outstanding_extents != 4) {
 		ret = -EINVAL;
-		test_msg("Miscount, wanted 4, got %u\n",
+		test_err("miscount, wanted 4, got %u",
 			 BTRFS_I(inode)->outstanding_extents);
 		goto out;
 	}
@@ -1072,12 +1076,12 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
 			BTRFS_MAX_EXTENT_SIZE + sectorsize,
 			BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL, 0);
 	if (ret) {
-		test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+		test_err("btrfs_set_extent_delalloc returned %d", ret);
 		goto out;
 	}
 	if (BTRFS_I(inode)->outstanding_extents != 3) {
 		ret = -EINVAL;
-		test_msg("Miscount, wanted 3, got %u\n",
+		test_err("miscount, wanted 3, got %u",
 			 BTRFS_I(inode)->outstanding_extents);
 		goto out;
 	}
@@ -1087,12 +1091,12 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
 			       EXTENT_DIRTY | EXTENT_DELALLOC |
 			       EXTENT_UPTODATE, 0, 0, NULL);
 	if (ret) {
-		test_msg("clear_extent_bit returned %d\n", ret);
+		test_err("clear_extent_bit returned %d", ret);
 		goto out;
 	}
 	if (BTRFS_I(inode)->outstanding_extents) {
 		ret = -EINVAL;
-		test_msg("Miscount, wanted 0, got %u\n",
+		test_err("miscount, wanted 0, got %u",
 			 BTRFS_I(inode)->outstanding_extents);
 		goto out;
 	}
@@ -1115,14 +1119,14 @@ int btrfs_test_inodes(u32 sectorsize, u32 nodesize)
 	set_bit(EXTENT_FLAG_COMPRESSED, &compressed_only);
 	set_bit(EXTENT_FLAG_PREALLOC, &prealloc_only);
 
-	test_msg("Running btrfs_get_extent tests\n");
+	test_msg("running btrfs_get_extent tests");
 	ret = test_btrfs_get_extent(sectorsize, nodesize);
 	if (ret)
 		return ret;
-	test_msg("Running hole first btrfs_get_extent test\n");
+	test_msg("running hole first btrfs_get_extent test");
 	ret = test_hole_first(sectorsize, nodesize);
 	if (ret)
 		return ret;
-	test_msg("Running outstanding_extents tests\n");
+	test_msg("running outstanding_extents tests");
 	return test_extent_accounting(sectorsize, nodesize);
 }
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index 39b9578..ace94db 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -24,7 +24,7 @@ static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr,
 	u32 size = sizeof(*item) + sizeof(*iref) + sizeof(*block_info);
 	int ret;
 
-	btrfs_init_dummy_trans(&trans);
+	btrfs_init_dummy_trans(&trans, NULL);
 
 	ins.objectid = bytenr;
 	ins.type = BTRFS_EXTENT_ITEM_KEY;
@@ -32,14 +32,14 @@ static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr,
 
 	path = btrfs_alloc_path();
 	if (!path) {
-		test_msg("Couldn't allocate path\n");
+		test_err("couldn't allocate path");
 		return -ENOMEM;
 	}
 
 	path->leave_spinning = 1;
 	ret = btrfs_insert_empty_item(&trans, root, path, &ins, size);
 	if (ret) {
-		test_msg("Couldn't insert ref %d\n", ret);
+		test_err("couldn't insert ref %d", ret);
 		btrfs_free_path(path);
 		return ret;
 	}
@@ -74,7 +74,7 @@ static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
 	u64 refs;
 	int ret;
 
-	btrfs_init_dummy_trans(&trans);
+	btrfs_init_dummy_trans(&trans, NULL);
 
 	key.objectid = bytenr;
 	key.type = BTRFS_EXTENT_ITEM_KEY;
@@ -82,14 +82,14 @@ static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
 
 	path = btrfs_alloc_path();
 	if (!path) {
-		test_msg("Couldn't allocate path\n");
+		test_err("couldn't allocate path");
 		return -ENOMEM;
 	}
 
 	path->leave_spinning = 1;
 	ret = btrfs_search_slot(&trans, root, &key, path, 0, 1);
 	if (ret) {
-		test_msg("Couldn't find extent ref\n");
+		test_err("couldn't find extent ref");
 		btrfs_free_path(path);
 		return ret;
 	}
@@ -111,7 +111,7 @@ static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
 
 	ret = btrfs_insert_empty_item(&trans, root, path, &key, 0);
 	if (ret)
-		test_msg("Failed to insert backref\n");
+		test_err("failed to insert backref");
 	btrfs_free_path(path);
 	return ret;
 }
@@ -124,7 +124,7 @@ static int remove_extent_item(struct btrfs_root *root, u64 bytenr,
 	struct btrfs_path *path;
 	int ret;
 
-	btrfs_init_dummy_trans(&trans);
+	btrfs_init_dummy_trans(&trans, NULL);
 
 	key.objectid = bytenr;
 	key.type = BTRFS_EXTENT_ITEM_KEY;
@@ -132,14 +132,14 @@ static int remove_extent_item(struct btrfs_root *root, u64 bytenr,
 
 	path = btrfs_alloc_path();
 	if (!path) {
-		test_msg("Couldn't allocate path\n");
+		test_err("couldn't allocate path");
 		return -ENOMEM;
 	}
 	path->leave_spinning = 1;
 
 	ret = btrfs_search_slot(&trans, root, &key, path, -1, 1);
 	if (ret) {
-		test_msg("Didn't find our key %d\n", ret);
+		test_err("didn't find our key %d", ret);
 		btrfs_free_path(path);
 		return ret;
 	}
@@ -158,7 +158,7 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr,
 	u64 refs;
 	int ret;
 
-	btrfs_init_dummy_trans(&trans);
+	btrfs_init_dummy_trans(&trans, NULL);
 
 	key.objectid = bytenr;
 	key.type = BTRFS_EXTENT_ITEM_KEY;
@@ -166,14 +166,14 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr,
 
 	path = btrfs_alloc_path();
 	if (!path) {
-		test_msg("Couldn't allocate path\n");
+		test_err("couldn't allocate path");
 		return -ENOMEM;
 	}
 
 	path->leave_spinning = 1;
 	ret = btrfs_search_slot(&trans, root, &key, path, 0, 1);
 	if (ret) {
-		test_msg("Couldn't find extent ref\n");
+		test_err("couldn't find extent ref");
 		btrfs_free_path(path);
 		return ret;
 	}
@@ -195,7 +195,7 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr,
 
 	ret = btrfs_search_slot(&trans, root, &key, path, -1, 1);
 	if (ret) {
-		test_msg("Couldn't find backref %d\n", ret);
+		test_err("couldn't find backref %d", ret);
 		btrfs_free_path(path);
 		return ret;
 	}
@@ -213,12 +213,12 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
 	struct ulist *new_roots = NULL;
 	int ret;
 
-	btrfs_init_dummy_trans(&trans);
+	btrfs_init_dummy_trans(&trans, fs_info);
 
-	test_msg("Qgroup basic add\n");
+	test_msg("qgroup basic add");
 	ret = btrfs_create_qgroup(NULL, fs_info, BTRFS_FS_TREE_OBJECTID);
 	if (ret) {
-		test_msg("Couldn't create a qgroup %d\n", ret);
+		test_err("couldn't create a qgroup %d", ret);
 		return ret;
 	}
 
@@ -231,7 +231,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
 			false);
 	if (ret) {
 		ulist_free(old_roots);
-		test_msg("Couldn't find old roots: %d\n", ret);
+		test_err("couldn't find old roots: %d", ret);
 		return ret;
 	}
 
@@ -245,20 +245,20 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
 	if (ret) {
 		ulist_free(old_roots);
 		ulist_free(new_roots);
-		test_msg("Couldn't find old roots: %d\n", ret);
+		test_err("couldn't find old roots: %d", ret);
 		return ret;
 	}
 
 	ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize,
 					  nodesize, old_roots, new_roots);
 	if (ret) {
-		test_msg("Couldn't account space for a qgroup %d\n", ret);
+		test_err("couldn't account space for a qgroup %d", ret);
 		return ret;
 	}
 
 	if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID,
 				nodesize, nodesize)) {
-		test_msg("Qgroup counts didn't match expected values\n");
+		test_err("qgroup counts didn't match expected values");
 		return -EINVAL;
 	}
 	old_roots = NULL;
@@ -268,7 +268,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
 			false);
 	if (ret) {
 		ulist_free(old_roots);
-		test_msg("Couldn't find old roots: %d\n", ret);
+		test_err("couldn't find old roots: %d", ret);
 		return ret;
 	}
 
@@ -281,19 +281,19 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
 	if (ret) {
 		ulist_free(old_roots);
 		ulist_free(new_roots);
-		test_msg("Couldn't find old roots: %d\n", ret);
+		test_err("couldn't find old roots: %d", ret);
 		return ret;
 	}
 
 	ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize,
 					  nodesize, old_roots, new_roots);
 	if (ret) {
-		test_msg("Couldn't account space for a qgroup %d\n", ret);
+		test_err("couldn't account space for a qgroup %d", ret);
 		return -EINVAL;
 	}
 
 	if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID, 0, 0)) {
-		test_msg("Qgroup counts didn't match expected values\n");
+		test_err("qgroup counts didn't match expected values");
 		return -EINVAL;
 	}
 
@@ -314,9 +314,9 @@ static int test_multiple_refs(struct btrfs_root *root,
 	struct ulist *new_roots = NULL;
 	int ret;
 
-	btrfs_init_dummy_trans(&trans);
+	btrfs_init_dummy_trans(&trans, fs_info);
 
-	test_msg("Qgroup multiple refs test\n");
+	test_msg("qgroup multiple refs test");
 
 	/*
 	 * We have BTRFS_FS_TREE_OBJECTID created already from the
@@ -324,7 +324,7 @@ static int test_multiple_refs(struct btrfs_root *root,
 	 */
 	ret = btrfs_create_qgroup(NULL, fs_info, BTRFS_FIRST_FREE_OBJECTID);
 	if (ret) {
-		test_msg("Couldn't create a qgroup %d\n", ret);
+		test_err("couldn't create a qgroup %d", ret);
 		return ret;
 	}
 
@@ -332,7 +332,7 @@ static int test_multiple_refs(struct btrfs_root *root,
 			false);
 	if (ret) {
 		ulist_free(old_roots);
-		test_msg("Couldn't find old roots: %d\n", ret);
+		test_err("couldn't find old roots: %d", ret);
 		return ret;
 	}
 
@@ -346,20 +346,20 @@ static int test_multiple_refs(struct btrfs_root *root,
 	if (ret) {
 		ulist_free(old_roots);
 		ulist_free(new_roots);
-		test_msg("Couldn't find old roots: %d\n", ret);
+		test_err("couldn't find old roots: %d", ret);
 		return ret;
 	}
 
 	ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize,
 					  nodesize, old_roots, new_roots);
 	if (ret) {
-		test_msg("Couldn't account space for a qgroup %d\n", ret);
+		test_err("couldn't account space for a qgroup %d", ret);
 		return ret;
 	}
 
 	if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID,
 				       nodesize, nodesize)) {
-		test_msg("Qgroup counts didn't match expected values\n");
+		test_err("qgroup counts didn't match expected values");
 		return -EINVAL;
 	}
 
@@ -367,7 +367,7 @@ static int test_multiple_refs(struct btrfs_root *root,
 			false);
 	if (ret) {
 		ulist_free(old_roots);
-		test_msg("Couldn't find old roots: %d\n", ret);
+		test_err("couldn't find old roots: %d", ret);
 		return ret;
 	}
 
@@ -381,26 +381,26 @@ static int test_multiple_refs(struct btrfs_root *root,
 	if (ret) {
 		ulist_free(old_roots);
 		ulist_free(new_roots);
-		test_msg("Couldn't find old roots: %d\n", ret);
+		test_err("couldn't find old roots: %d", ret);
 		return ret;
 	}
 
 	ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize,
 					  nodesize, old_roots, new_roots);
 	if (ret) {
-		test_msg("Couldn't account space for a qgroup %d\n", ret);
+		test_err("couldn't account space for a qgroup %d", ret);
 		return ret;
 	}
 
 	if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID,
 					nodesize, 0)) {
-		test_msg("Qgroup counts didn't match expected values\n");
+		test_err("qgroup counts didn't match expected values");
 		return -EINVAL;
 	}
 
 	if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FIRST_FREE_OBJECTID,
 					nodesize, 0)) {
-		test_msg("Qgroup counts didn't match expected values\n");
+		test_err("qgroup counts didn't match expected values");
 		return -EINVAL;
 	}
 
@@ -408,7 +408,7 @@ static int test_multiple_refs(struct btrfs_root *root,
 			false);
 	if (ret) {
 		ulist_free(old_roots);
-		test_msg("Couldn't find old roots: %d\n", ret);
+		test_err("couldn't find old roots: %d", ret);
 		return ret;
 	}
 
@@ -422,26 +422,26 @@ static int test_multiple_refs(struct btrfs_root *root,
 	if (ret) {
 		ulist_free(old_roots);
 		ulist_free(new_roots);
-		test_msg("Couldn't find old roots: %d\n", ret);
+		test_err("couldn't find old roots: %d", ret);
 		return ret;
 	}
 
 	ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize,
 					  nodesize, old_roots, new_roots);
 	if (ret) {
-		test_msg("Couldn't account space for a qgroup %d\n", ret);
+		test_err("couldn't account space for a qgroup %d", ret);
 		return ret;
 	}
 
 	if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FIRST_FREE_OBJECTID,
 					0, 0)) {
-		test_msg("Qgroup counts didn't match expected values\n");
+		test_err("qgroup counts didn't match expected values");
 		return -EINVAL;
 	}
 
 	if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID,
 					nodesize, nodesize)) {
-		test_msg("Qgroup counts didn't match expected values\n");
+		test_err("qgroup counts didn't match expected values");
 		return -EINVAL;
 	}
 
@@ -457,13 +457,13 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
 
 	fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
 	if (!fs_info) {
-		test_msg("Couldn't allocate dummy fs info\n");
+		test_err("couldn't allocate dummy fs info");
 		return -ENOMEM;
 	}
 
 	root = btrfs_alloc_dummy_root(fs_info);
 	if (IS_ERR(root)) {
-		test_msg("Couldn't allocate root\n");
+		test_err("couldn't allocate root");
 		ret = PTR_ERR(root);
 		goto out;
 	}
@@ -485,7 +485,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
 	 */
 	root->node = alloc_test_extent_buffer(root->fs_info, nodesize);
 	if (!root->node) {
-		test_msg("Couldn't allocate dummy buffer\n");
+		test_err("couldn't allocate dummy buffer");
 		ret = -ENOMEM;
 		goto out;
 	}
@@ -495,7 +495,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
 
 	tmp_root = btrfs_alloc_dummy_root(fs_info);
 	if (IS_ERR(tmp_root)) {
-		test_msg("Couldn't allocate a fs root\n");
+		test_err("couldn't allocate a fs root");
 		ret = PTR_ERR(tmp_root);
 		goto out;
 	}
@@ -504,13 +504,13 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
 	root->fs_info->fs_root = tmp_root;
 	ret = btrfs_insert_fs_root(root->fs_info, tmp_root);
 	if (ret) {
-		test_msg("Couldn't insert fs root %d\n", ret);
+		test_err("couldn't insert fs root %d", ret);
 		goto out;
 	}
 
 	tmp_root = btrfs_alloc_dummy_root(fs_info);
 	if (IS_ERR(tmp_root)) {
-		test_msg("Couldn't allocate a fs root\n");
+		test_err("couldn't allocate a fs root");
 		ret = PTR_ERR(tmp_root);
 		goto out;
 	}
@@ -518,11 +518,11 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
 	tmp_root->root_key.objectid = BTRFS_FIRST_FREE_OBJECTID;
 	ret = btrfs_insert_fs_root(root->fs_info, tmp_root);
 	if (ret) {
-		test_msg("Couldn't insert fs root %d\n", ret);
+		test_err("couldn't insert fs root %d", ret);
 		goto out;
 	}
 
-	test_msg("Running qgroup tests\n");
+	test_msg("running qgroup tests");
 	ret = test_no_shared_qgroup(root, sectorsize, nodesize);
 	if (ret)
 		goto out;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index c944b47..4485eae 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -877,12 +877,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 	atomic_dec(&cur_trans->num_writers);
 	extwriter_counter_dec(cur_trans, trans->type);
 
-	/*
-	 * Make sure counter is updated before we wake up waiters.
-	 */
-	smp_mb();
-	if (waitqueue_active(&cur_trans->writer_wait))
-		wake_up(&cur_trans->writer_wait);
+	cond_wake_up(&cur_trans->writer_wait);
 	btrfs_put_transaction(cur_trans);
 
 	if (current->journal_info == trans)
@@ -1250,7 +1245,6 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
 
 			btrfs_free_log(trans, root);
 			btrfs_update_reloc_root(trans, root);
-			btrfs_orphan_commit_root(trans, root);
 
 			btrfs_save_ino_cache(root, trans);
 
@@ -1640,15 +1634,14 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 		btrfs_abort_transaction(trans, ret);
 		goto fail;
 	}
-	ret = btrfs_uuid_tree_add(trans, fs_info, new_uuid.b,
-				  BTRFS_UUID_KEY_SUBVOL, objectid);
+	ret = btrfs_uuid_tree_add(trans, new_uuid.b, BTRFS_UUID_KEY_SUBVOL,
+				  objectid);
 	if (ret) {
 		btrfs_abort_transaction(trans, ret);
 		goto fail;
 	}
 	if (!btrfs_is_empty_uuid(new_root_item->received_uuid)) {
-		ret = btrfs_uuid_tree_add(trans, fs_info,
-					  new_root_item->received_uuid,
+		ret = btrfs_uuid_tree_add(trans, new_root_item->received_uuid,
 					  BTRFS_UUID_KEY_RECEIVED_SUBVOL,
 					  objectid);
 		if (ret && ret != -EEXIST) {
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index d8c0826..9443948 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -139,7 +139,6 @@ struct btrfs_pending_snapshot {
 	struct btrfs_path *path;
 	/* block reservation for the operation */
 	struct btrfs_block_rsv block_rsv;
-	u64 qgroup_reserved;
 	/* extra metadata reservation for relocation */
 	int error;
 	bool readonly;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 8f23a94..f8220ec 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -222,11 +222,8 @@ int btrfs_pin_log_trans(struct btrfs_root *root)
 void btrfs_end_log_trans(struct btrfs_root *root)
 {
 	if (atomic_dec_and_test(&root->log_writers)) {
-		/*
-		 * Implicit memory barrier after atomic_dec_and_test
-		 */
-		if (waitqueue_active(&root->log_writer_wait))
-			wake_up(&root->log_writer_wait);
+		/* atomic_dec_and_test implies a barrier */
+		cond_wake_up_nomb(&root->log_writer_wait);
 	}
 }
 
@@ -2988,11 +2985,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 
 	mutex_lock(&log_root_tree->log_mutex);
 	if (atomic_dec_and_test(&log_root_tree->log_writers)) {
-		/*
-		 * Implicit memory barrier after atomic_dec_and_test
-		 */
-		if (waitqueue_active(&log_root_tree->log_writer_wait))
-			wake_up(&log_root_tree->log_writer_wait);
+		/* atomic_dec_and_test implies a barrier */
+		cond_wake_up_nomb(&log_root_tree->log_writer_wait);
 	}
 
 	if (ret) {
@@ -3116,10 +3110,11 @@ out_wake_log_root:
 	mutex_unlock(&log_root_tree->log_mutex);
 
 	/*
-	 * The barrier before waitqueue_active is implied by mutex_unlock
+	 * The barrier before waitqueue_active (in cond_wake_up) is needed so
+	 * all the updates above are seen by the woken threads. It might not be
+	 * necessary, but proving that seems to be hard.
 	 */
-	if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
-		wake_up(&log_root_tree->log_commit_wait[index2]);
+	cond_wake_up(&log_root_tree->log_commit_wait[index2]);
 out:
 	mutex_lock(&root->log_mutex);
 	btrfs_remove_all_log_ctxs(root, index1, ret);
@@ -3128,10 +3123,11 @@ out:
 	mutex_unlock(&root->log_mutex);
 
 	/*
-	 * The barrier before waitqueue_active is implied by mutex_unlock
+	 * The barrier before waitqueue_active (in cond_wake_up) is needed so
+	 * all the updates above are seen by the woken threads. It might not be
+	 * necessary, but proving that seems to be hard.
 	 */
-	if (waitqueue_active(&root->log_commit_wait[index1]))
-		wake_up(&root->log_commit_wait[index1]);
+	cond_wake_up(&root->log_commit_wait[index1]);
 	return ret;
 }
 
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
index 1ba7ca2..3b2ae34 100644
--- a/fs/btrfs/uuid-tree.c
+++ b/fs/btrfs/uuid-tree.c
@@ -79,10 +79,10 @@ out:
 	return ret;
 }
 
-int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans,
-			struct btrfs_fs_info *fs_info, u8 *uuid, u8 type,
+int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
 			u64 subid_cpu)
 {
+	struct btrfs_fs_info *fs_info = trans->fs_info;
 	struct btrfs_root *uuid_root = fs_info->uuid_root;
 	int ret;
 	struct btrfs_path *path = NULL;
@@ -144,10 +144,10 @@ out:
 	return ret;
 }
 
-int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans,
-			struct btrfs_fs_info *fs_info, u8 *uuid, u8 type,
+int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
 			u64 subid)
 {
+	struct btrfs_fs_info *fs_info = trans->fs_info;
 	struct btrfs_root *uuid_root = fs_info->uuid_root;
 	int ret;
 	struct btrfs_path *path = NULL;
@@ -239,7 +239,7 @@ static int btrfs_uuid_iter_rem(struct btrfs_root *uuid_root, u8 *uuid, u8 type,
 		goto out;
 	}
 
-	ret = btrfs_uuid_tree_rem(trans, uuid_root->fs_info, uuid, type, subid);
+	ret = btrfs_uuid_tree_remove(trans, uuid, type, subid);
 	btrfs_end_transaction(trans);
 
 out:
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index be3fc70..e034ad9 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -40,6 +40,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
 		.tolerated_failures = 1,
 		.devs_increment	= 2,
 		.ncopies	= 2,
+		.raid_name	= "raid10",
+		.bg_flag	= BTRFS_BLOCK_GROUP_RAID10,
+		.mindev_error	= BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
 	},
 	[BTRFS_RAID_RAID1] = {
 		.sub_stripes	= 1,
@@ -49,6 +52,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
 		.tolerated_failures = 1,
 		.devs_increment	= 2,
 		.ncopies	= 2,
+		.raid_name	= "raid1",
+		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1,
+		.mindev_error	= BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
 	},
 	[BTRFS_RAID_DUP] = {
 		.sub_stripes	= 1,
@@ -58,6 +64,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
 		.tolerated_failures = 0,
 		.devs_increment	= 1,
 		.ncopies	= 2,
+		.raid_name	= "dup",
+		.bg_flag	= BTRFS_BLOCK_GROUP_DUP,
+		.mindev_error	= 0,
 	},
 	[BTRFS_RAID_RAID0] = {
 		.sub_stripes	= 1,
@@ -67,6 +76,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
 		.tolerated_failures = 0,
 		.devs_increment	= 1,
 		.ncopies	= 1,
+		.raid_name	= "raid0",
+		.bg_flag	= BTRFS_BLOCK_GROUP_RAID0,
+		.mindev_error	= 0,
 	},
 	[BTRFS_RAID_SINGLE] = {
 		.sub_stripes	= 1,
@@ -76,6 +88,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
 		.tolerated_failures = 0,
 		.devs_increment	= 1,
 		.ncopies	= 1,
+		.raid_name	= "single",
+		.bg_flag	= 0,
+		.mindev_error	= 0,
 	},
 	[BTRFS_RAID_RAID5] = {
 		.sub_stripes	= 1,
@@ -85,6 +100,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
 		.tolerated_failures = 1,
 		.devs_increment	= 1,
 		.ncopies	= 2,
+		.raid_name	= "raid5",
+		.bg_flag	= BTRFS_BLOCK_GROUP_RAID5,
+		.mindev_error	= BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
 	},
 	[BTRFS_RAID_RAID6] = {
 		.sub_stripes	= 1,
@@ -94,33 +112,19 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
 		.tolerated_failures = 2,
 		.devs_increment	= 1,
 		.ncopies	= 3,
+		.raid_name	= "raid6",
+		.bg_flag	= BTRFS_BLOCK_GROUP_RAID6,
+		.mindev_error	= BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
 	},
 };
 
-const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = {
-	[BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10,
-	[BTRFS_RAID_RAID1]  = BTRFS_BLOCK_GROUP_RAID1,
-	[BTRFS_RAID_DUP]    = BTRFS_BLOCK_GROUP_DUP,
-	[BTRFS_RAID_RAID0]  = BTRFS_BLOCK_GROUP_RAID0,
-	[BTRFS_RAID_SINGLE] = 0,
-	[BTRFS_RAID_RAID5]  = BTRFS_BLOCK_GROUP_RAID5,
-	[BTRFS_RAID_RAID6]  = BTRFS_BLOCK_GROUP_RAID6,
-};
+const char *get_raid_name(enum btrfs_raid_types type)
+{
+	if (type >= BTRFS_NR_RAID_TYPES)
+		return NULL;
 
-/*
- * Table to convert BTRFS_RAID_* to the error code if minimum number of devices
- * condition is not met. Zero means there's no corresponding
- * BTRFS_ERROR_DEV_*_NOT_MET value.
- */
-const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES] = {
-	[BTRFS_RAID_RAID10] = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
-	[BTRFS_RAID_RAID1]  = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
-	[BTRFS_RAID_DUP]    = 0,
-	[BTRFS_RAID_RAID0]  = 0,
-	[BTRFS_RAID_SINGLE] = 0,
-	[BTRFS_RAID_RAID5]  = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
-	[BTRFS_RAID_RAID6]  = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
-};
+	return btrfs_raid_array[type].raid_name;
+}
 
 static int init_first_rw_device(struct btrfs_trans_handle *trans,
 				struct btrfs_fs_info *fs_info);
@@ -167,12 +171,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
  * may be used to exclude some operations from running concurrently without any
  * modifications to the list (see write_all_supers)
  *
- * volume_mutex
- * ------------
- * coarse lock owned by a mounted filesystem; used to exclude some operations
- * that cannot run in parallel and affect the higher-level properties of the
- * filesystem like: device add/deleting/resize/replace, or balance
- *
  * balance_mutex
  * -------------
  * protects balance structures (status, state) and context accessed from
@@ -197,6 +195,41 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
  *     device_list_mutex
  *       chunk_mutex
  *     balance_mutex
+ *
+ *
+ * Exclusive operations, BTRFS_FS_EXCL_OP
+ * ======================================
+ *
+ * Maintains the exclusivity of the following operations that apply to the
+ * whole filesystem and cannot run in parallel.
+ *
+ * - Balance (*)
+ * - Device add
+ * - Device remove
+ * - Device replace (*)
+ * - Resize
+ *
+ * The device operations (as above) can be in one of the following states:
+ *
+ * - Running state
+ * - Paused state
+ * - Completed state
+ *
+ * Only device operations marked with (*) can go into the Paused state for the
+ * following reasons:
+ *
+ * - ioctl (only Balance can be Paused through ioctl)
+ * - filesystem remounted as read-only
+ * - filesystem unmounted and mounted as read-only
+ * - system power-cycle and filesystem mounted as read-only
+ * - filesystem or device errors leading to forced read-only
+ *
+ * BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations.
+ * During the course of Paused state, the BTRFS_FS_EXCL_OP remains set.
+ * A device operation in Paused or Running state can be canceled or resumed
+ * either by ioctl (Balance only) or when remounted as read-write.
+ * BTRFS_FS_EXCL_OP flag is cleared when the device operation is canceled or
+ * completed.
  */
 
 DEFINE_MUTEX(uuid_mutex);
@@ -227,14 +260,14 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
 	INIT_LIST_HEAD(&fs_devs->devices);
 	INIT_LIST_HEAD(&fs_devs->resized_devices);
 	INIT_LIST_HEAD(&fs_devs->alloc_list);
-	INIT_LIST_HEAD(&fs_devs->list);
+	INIT_LIST_HEAD(&fs_devs->fs_list);
 	if (fsid)
 		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
 
 	return fs_devs;
 }
 
-static void free_device(struct btrfs_device *device)
+void btrfs_free_device(struct btrfs_device *device)
 {
 	rcu_string_free(device->name);
 	bio_put(device->flush_bio);
@@ -249,7 +282,7 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
 		device = list_entry(fs_devices->devices.next,
 				    struct btrfs_device, dev_list);
 		list_del(&device->dev_list);
-		free_device(device);
+		btrfs_free_device(device);
 	}
 	kfree(fs_devices);
 }
@@ -273,8 +306,8 @@ void __exit btrfs_cleanup_fs_uuids(void)
 
 	while (!list_empty(&fs_uuids)) {
 		fs_devices = list_entry(fs_uuids.next,
-					struct btrfs_fs_devices, list);
-		list_del(&fs_devices->list);
+					struct btrfs_fs_devices, fs_list);
+		list_del(&fs_devices->fs_list);
 		free_fs_devices(fs_devices);
 	}
 }
@@ -282,7 +315,7 @@ void __exit btrfs_cleanup_fs_uuids(void)
 /*
  * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
  * Returned struct is not linked onto any lists and must be destroyed using
- * free_device.
+ * btrfs_free_device.
  */
 static struct btrfs_device *__alloc_device(void)
 {
@@ -327,10 +360,9 @@ static struct btrfs_device *__alloc_device(void)
 static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices,
 		u64 devid, const u8 *uuid)
 {
-	struct list_head *head = &fs_devices->devices;
 	struct btrfs_device *dev;
 
-	list_for_each_entry(dev, head, dev_list) {
+	list_for_each_entry(dev, &fs_devices->devices, dev_list) {
 		if (dev->devid == devid &&
 		    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
 			return dev;
@@ -343,7 +375,7 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
 {
 	struct btrfs_fs_devices *fs_devices;
 
-	list_for_each_entry(fs_devices, &fs_uuids, list) {
+	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
 		if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
 			return fs_devices;
 	}
@@ -607,7 +639,7 @@ static void btrfs_free_stale_devices(const char *path,
 	struct btrfs_fs_devices *fs_devs, *tmp_fs_devs;
 	struct btrfs_device *dev, *tmp_dev;
 
-	list_for_each_entry_safe(fs_devs, tmp_fs_devs, &fs_uuids, list) {
+	list_for_each_entry_safe(fs_devs, tmp_fs_devs, &fs_uuids, fs_list) {
 
 		if (fs_devs->opened)
 			continue;
@@ -632,13 +664,13 @@ static void btrfs_free_stale_devices(const char *path,
 			/* delete the stale device */
 			if (fs_devs->num_devices == 1) {
 				btrfs_sysfs_remove_fsid(fs_devs);
-				list_del(&fs_devs->list);
+				list_del(&fs_devs->fs_list);
 				free_fs_devices(fs_devs);
 				break;
 			} else {
 				fs_devs->num_devices--;
 				list_del(&dev->dev_list);
-				free_device(dev);
+				btrfs_free_device(dev);
 			}
 		}
 	}
@@ -732,7 +764,7 @@ static noinline struct btrfs_device *device_list_add(const char *path,
 		if (IS_ERR(fs_devices))
 			return ERR_CAST(fs_devices);
 
-		list_add(&fs_devices->list, &fs_uuids);
+		list_add(&fs_devices->fs_list, &fs_uuids);
 
 		device = NULL;
 	} else {
@@ -753,7 +785,7 @@ static noinline struct btrfs_device *device_list_add(const char *path,
 
 		name = rcu_string_strdup(path, GFP_NOFS);
 		if (!name) {
-			free_device(device);
+			btrfs_free_device(device);
 			return ERR_PTR(-ENOMEM);
 		}
 		rcu_assign_pointer(device->name, name);
@@ -866,7 +898,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
 			name = rcu_string_strdup(orig_dev->name->str,
 					GFP_KERNEL);
 			if (!name) {
-				free_device(device);
+				btrfs_free_device(device);
 				goto error;
 			}
 			rcu_assign_pointer(device->name, name);
@@ -938,7 +970,7 @@ again:
 		}
 		list_del_init(&device->dev_list);
 		fs_devices->num_devices--;
-		free_device(device);
+		btrfs_free_device(device);
 	}
 
 	if (fs_devices->seed) {
@@ -956,7 +988,7 @@ static void free_device_rcu(struct rcu_head *head)
 	struct btrfs_device *device;
 
 	device = container_of(head, struct btrfs_device, rcu);
-	free_device(device);
+	btrfs_free_device(device);
 }
 
 static void btrfs_close_bdev(struct btrfs_device *device)
@@ -1005,7 +1037,7 @@ static void btrfs_prepare_close_one_device(struct btrfs_device *device)
 	new_device->fs_devices = device->fs_devices;
 }
 
-static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
+static int close_fs_devices(struct btrfs_fs_devices *fs_devices)
 {
 	struct btrfs_device *device, *tmp;
 	struct list_head pending_put;
@@ -1050,7 +1082,7 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 	int ret;
 
 	mutex_lock(&uuid_mutex);
-	ret = __btrfs_close_devices(fs_devices);
+	ret = close_fs_devices(fs_devices);
 	if (!fs_devices->opened) {
 		seed_devices = fs_devices->seed;
 		fs_devices->seed = NULL;
@@ -1060,23 +1092,22 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 	while (seed_devices) {
 		fs_devices = seed_devices;
 		seed_devices = fs_devices->seed;
-		__btrfs_close_devices(fs_devices);
+		close_fs_devices(fs_devices);
 		free_fs_devices(fs_devices);
 	}
 	return ret;
 }
 
-static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
+static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
 				fmode_t flags, void *holder)
 {
-	struct list_head *head = &fs_devices->devices;
 	struct btrfs_device *device;
 	struct btrfs_device *latest_dev = NULL;
 	int ret = 0;
 
 	flags |= FMODE_EXCL;
 
-	list_for_each_entry(device, head, dev_list) {
+	list_for_each_entry(device, &fs_devices->devices, dev_list) {
 		/* Just open everything we can; ignore failures here */
 		if (btrfs_open_one_device(fs_devices, device, flags, holder))
 			continue;
@@ -1115,15 +1146,16 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 {
 	int ret;
 
-	mutex_lock(&uuid_mutex);
+	mutex_lock(&fs_devices->device_list_mutex);
 	if (fs_devices->opened) {
 		fs_devices->opened++;
 		ret = 0;
 	} else {
 		list_sort(NULL, &fs_devices->devices, devid_cmp);
-		ret = __btrfs_open_devices(fs_devices, flags, holder);
+		ret = open_fs_devices(fs_devices, flags, holder);
 	}
-	mutex_unlock(&uuid_mutex);
+	mutex_unlock(&fs_devices->device_list_mutex);
+
 	return ret;
 }
 
@@ -1201,31 +1233,29 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 	 */
 	bytenr = btrfs_sb_offset(0);
 	flags |= FMODE_EXCL;
-	mutex_lock(&uuid_mutex);
 
 	bdev = blkdev_get_by_path(path, flags, holder);
-	if (IS_ERR(bdev)) {
-		ret = PTR_ERR(bdev);
-		goto error;
-	}
+	if (IS_ERR(bdev))
+		return PTR_ERR(bdev);
 
 	if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) {
 		ret = -EINVAL;
 		goto error_bdev_put;
 	}
 
+	mutex_lock(&uuid_mutex);
 	device = device_list_add(path, disk_super);
 	if (IS_ERR(device))
 		ret = PTR_ERR(device);
 	else
 		*fs_devices_ret = device->fs_devices;
+	mutex_unlock(&uuid_mutex);
 
 	btrfs_release_disk_super(page);
 
 error_bdev_put:
 	blkdev_put(bdev, flags);
-error:
-	mutex_unlock(&uuid_mutex);
+
 	return ret;
 }
 
@@ -1857,11 +1887,11 @@ static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
 	} while (read_seqretry(&fs_info->profiles_lock, seq));
 
 	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
-		if (!(all_avail & btrfs_raid_group[i]))
+		if (!(all_avail & btrfs_raid_array[i].bg_flag))
 			continue;
 
 		if (num_devices < btrfs_raid_array[i].devs_min) {
-			int ret = btrfs_raid_mindev_error[i];
+			int ret = btrfs_raid_array[i].mindev_error;
 
 			if (ret)
 				return ret;
@@ -1917,13 +1947,13 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
 {
 	struct btrfs_device *device;
 	struct btrfs_fs_devices *cur_devices;
+	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
 	u64 num_devices;
 	int ret = 0;
 
-	mutex_lock(&fs_info->volume_mutex);
 	mutex_lock(&uuid_mutex);
 
-	num_devices = fs_info->fs_devices->num_devices;
+	num_devices = fs_devices->num_devices;
 	btrfs_dev_replace_read_lock(&fs_info->dev_replace);
 	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
 		WARN_ON(num_devices < 1);
@@ -1986,27 +2016,32 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
 	 * (super_copy) should hold the device list mutex.
 	 */
 
+	/*
+	 * In normal cases the cur_devices == fs_devices. But in case
+	 * of deleting a seed device, the cur_devices should point to
+	 * its own fs_devices listed under the fs_devices->seed.
+	 */
 	cur_devices = device->fs_devices;
-	mutex_lock(&fs_info->fs_devices->device_list_mutex);
+	mutex_lock(&fs_devices->device_list_mutex);
 	list_del_rcu(&device->dev_list);
 
-	device->fs_devices->num_devices--;
-	device->fs_devices->total_devices--;
+	cur_devices->num_devices--;
+	cur_devices->total_devices--;
 
 	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
-		device->fs_devices->missing_devices--;
+		cur_devices->missing_devices--;
 
 	btrfs_assign_next_active_device(fs_info, device, NULL);
 
 	if (device->bdev) {
-		device->fs_devices->open_devices--;
+		cur_devices->open_devices--;
 		/* remove sysfs entry */
-		btrfs_sysfs_rm_device_link(fs_info->fs_devices, device);
+		btrfs_sysfs_rm_device_link(fs_devices, device);
 	}
 
 	num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
 	btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
-	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+	mutex_unlock(&fs_devices->device_list_mutex);
 
 	/*
 	 * at this point, the device is zero sized and detached from
@@ -2020,8 +2055,6 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
 	call_rcu(&device->rcu, free_device_rcu);
 
 	if (cur_devices->open_devices == 0) {
-		struct btrfs_fs_devices *fs_devices;
-		fs_devices = fs_info->fs_devices;
 		while (fs_devices) {
 			if (fs_devices->seed == cur_devices) {
 				fs_devices->seed = cur_devices->seed;
@@ -2030,20 +2063,19 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
 			fs_devices = fs_devices->seed;
 		}
 		cur_devices->seed = NULL;
-		__btrfs_close_devices(cur_devices);
+		close_fs_devices(cur_devices);
 		free_fs_devices(cur_devices);
 	}
 
 out:
 	mutex_unlock(&uuid_mutex);
-	mutex_unlock(&fs_info->volume_mutex);
 	return ret;
 
 error_undo:
 	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
 		mutex_lock(&fs_info->chunk_mutex);
 		list_add(&device->dev_alloc_list,
-			 &fs_info->fs_devices->alloc_list);
+			 &fs_devices->alloc_list);
 		device->fs_devices->rw_devices++;
 		mutex_unlock(&fs_info->chunk_mutex);
 	}
@@ -2112,7 +2144,7 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
 			tmp_fs_devices = tmp_fs_devices->seed;
 		}
 		fs_devices->seed = NULL;
-		__btrfs_close_devices(fs_devices);
+		close_fs_devices(fs_devices);
 		free_fs_devices(fs_devices);
 	}
 }
@@ -2120,23 +2152,23 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
 				      struct btrfs_device *tgtdev)
 {
-	mutex_lock(&uuid_mutex);
+	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+
 	WARN_ON(!tgtdev);
-	mutex_lock(&fs_info->fs_devices->device_list_mutex);
+	mutex_lock(&fs_devices->device_list_mutex);
 
-	btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev);
+	btrfs_sysfs_rm_device_link(fs_devices, tgtdev);
 
 	if (tgtdev->bdev)
-		fs_info->fs_devices->open_devices--;
+		fs_devices->open_devices--;
 
-	fs_info->fs_devices->num_devices--;
+	fs_devices->num_devices--;
 
 	btrfs_assign_next_active_device(fs_info, tgtdev, NULL);
 
 	list_del_rcu(&tgtdev->dev_list);
 
-	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
-	mutex_unlock(&uuid_mutex);
+	mutex_unlock(&fs_devices->device_list_mutex);
 
 	/*
 	 * The update_dev_time() with in btrfs_scratch_superblocks()
@@ -2188,10 +2220,6 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,
 		struct btrfs_device *tmp;
 
 		devices = &fs_info->fs_devices->devices;
-		/*
-		 * It is safe to read the devices since the volume_mutex
-		 * is held by the caller.
-		 */
 		list_for_each_entry(tmp, devices, dev_list) {
 			if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
 					&tmp->dev_state) && !tmp->bdev) {
@@ -2259,7 +2287,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
 		return PTR_ERR(old_devices);
 	}
 
-	list_add(&old_devices->list, &fs_uuids);
+	list_add(&old_devices->fs_list, &fs_uuids);
 
 	memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
 	seed_devices->opened = 1;
@@ -2570,7 +2598,7 @@ error_trans:
 	if (trans)
 		btrfs_end_transaction(trans);
 error_free_device:
-	free_device(device);
+	btrfs_free_device(device);
 error:
 	blkdev_put(bdev, FMODE_EXCL);
 	if (seeding_dev && !unlocked) {
@@ -2580,99 +2608,6 @@ error:
 	return ret;
 }
 
-int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
-				  const char *device_path,
-				  struct btrfs_device *srcdev,
-				  struct btrfs_device **device_out)
-{
-	struct btrfs_device *device;
-	struct block_device *bdev;
-	struct list_head *devices;
-	struct rcu_string *name;
-	u64 devid = BTRFS_DEV_REPLACE_DEVID;
-	int ret = 0;
-
-	*device_out = NULL;
-	if (fs_info->fs_devices->seeding) {
-		btrfs_err(fs_info, "the filesystem is a seed filesystem!");
-		return -EINVAL;
-	}
-
-	bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
-				  fs_info->bdev_holder);
-	if (IS_ERR(bdev)) {
-		btrfs_err(fs_info, "target device %s is invalid!", device_path);
-		return PTR_ERR(bdev);
-	}
-
-	filemap_write_and_wait(bdev->bd_inode->i_mapping);
-
-	devices = &fs_info->fs_devices->devices;
-	list_for_each_entry(device, devices, dev_list) {
-		if (device->bdev == bdev) {
-			btrfs_err(fs_info,
-				  "target device is in the filesystem!");
-			ret = -EEXIST;
-			goto error;
-		}
-	}
-
-
-	if (i_size_read(bdev->bd_inode) <
-	    btrfs_device_get_total_bytes(srcdev)) {
-		btrfs_err(fs_info,
-			  "target device is smaller than source device!");
-		ret = -EINVAL;
-		goto error;
-	}
-
-
-	device = btrfs_alloc_device(NULL, &devid, NULL);
-	if (IS_ERR(device)) {
-		ret = PTR_ERR(device);
-		goto error;
-	}
-
-	name = rcu_string_strdup(device_path, GFP_KERNEL);
-	if (!name) {
-		free_device(device);
-		ret = -ENOMEM;
-		goto error;
-	}
-	rcu_assign_pointer(device->name, name);
-
-	mutex_lock(&fs_info->fs_devices->device_list_mutex);
-	set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
-	device->generation = 0;
-	device->io_width = fs_info->sectorsize;
-	device->io_align = fs_info->sectorsize;
-	device->sector_size = fs_info->sectorsize;
-	device->total_bytes = btrfs_device_get_total_bytes(srcdev);
-	device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
-	device->bytes_used = btrfs_device_get_bytes_used(srcdev);
-	device->commit_total_bytes = srcdev->commit_total_bytes;
-	device->commit_bytes_used = device->bytes_used;
-	device->fs_info = fs_info;
-	device->bdev = bdev;
-	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
-	set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
-	device->mode = FMODE_EXCL;
-	device->dev_stats_valid = 1;
-	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
-	device->fs_devices = fs_info->fs_devices;
-	list_add(&device->dev_list, &fs_info->fs_devices->devices);
-	fs_info->fs_devices->num_devices++;
-	fs_info->fs_devices->open_devices++;
-	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
-
-	*device_out = device;
-	return ret;
-
-error:
-	blkdev_put(bdev, FMODE_EXCL);
-	return ret;
-}
-
 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
 					struct btrfs_device *device)
 {
@@ -3273,24 +3208,12 @@ static void update_balance_args(struct btrfs_balance_control *bctl)
 }
 
 /*
- * Should be called with both balance and volume mutexes held to
- * serialize other volume operations (add_dev/rm_dev/resize) with
- * restriper.  Same goes for unset_balance_control.
+ * Clear the balance status in fs_info and delete the balance item from disk.
  */
-static void set_balance_control(struct btrfs_balance_control *bctl)
-{
-	struct btrfs_fs_info *fs_info = bctl->fs_info;
-
-	BUG_ON(fs_info->balance_ctl);
-
-	spin_lock(&fs_info->balance_lock);
-	fs_info->balance_ctl = bctl;
-	spin_unlock(&fs_info->balance_lock);
-}
-
-static void unset_balance_control(struct btrfs_fs_info *fs_info)
+static void reset_balance_state(struct btrfs_fs_info *fs_info)
 {
 	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+	int ret;
 
 	BUG_ON(!fs_info->balance_ctl);
 
@@ -3299,6 +3222,9 @@ static void unset_balance_control(struct btrfs_fs_info *fs_info)
 	spin_unlock(&fs_info->balance_lock);
 
 	kfree(bctl);
+	ret = del_balance_item(fs_info);
+	if (ret)
+		btrfs_handle_fs_error(fs_info, ret, NULL);
 }
 
 /*
@@ -3835,18 +3761,6 @@ static inline int balance_need_close(struct btrfs_fs_info *fs_info)
 		 atomic_read(&fs_info->balance_cancel_req) == 0);
 }
 
-static void __cancel_balance(struct btrfs_fs_info *fs_info)
-{
-	int ret;
-
-	unset_balance_control(fs_info);
-	ret = del_balance_item(fs_info);
-	if (ret)
-		btrfs_handle_fs_error(fs_info, ret, NULL);
-
-	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
-}
-
 /* Non-zero return value signifies invalidity */
 static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg,
 		u64 allowed)
@@ -3857,12 +3771,12 @@ static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg,
 }
 
 /*
- * Should be called with both balance and volume mutexes held
+ * Should be called with balance mutexe held
  */
-int btrfs_balance(struct btrfs_balance_control *bctl,
+int btrfs_balance(struct btrfs_fs_info *fs_info,
+		  struct btrfs_balance_control *bctl,
 		  struct btrfs_ioctl_balance_args *bargs)
 {
-	struct btrfs_fs_info *fs_info = bctl->fs_info;
 	u64 meta_target, data_target;
 	u64 allowed;
 	int mixed = 0;
@@ -3891,7 +3805,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 		    !(bctl->flags & BTRFS_BALANCE_METADATA) ||
 		    memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
 			btrfs_err(fs_info,
-				  "with mixed groups data and metadata balance options must be the same");
+	  "balance: mixed groups data and metadata options must be the same");
 			ret = -EINVAL;
 			goto out;
 		}
@@ -3913,23 +3827,29 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 		allowed |= (BTRFS_BLOCK_GROUP_RAID10 |
 			    BTRFS_BLOCK_GROUP_RAID6);
 	if (validate_convert_profile(&bctl->data, allowed)) {
+		int index = btrfs_bg_flags_to_raid_index(bctl->data.target);
+
 		btrfs_err(fs_info,
-			  "unable to start balance with target data profile %llu",
-			  bctl->data.target);
+			  "balance: invalid convert data profile %s",
+			  get_raid_name(index));
 		ret = -EINVAL;
 		goto out;
 	}
 	if (validate_convert_profile(&bctl->meta, allowed)) {
+		int index = btrfs_bg_flags_to_raid_index(bctl->meta.target);
+
 		btrfs_err(fs_info,
-			  "unable to start balance with target metadata profile %llu",
-			  bctl->meta.target);
+			  "balance: invalid convert metadata profile %s",
+			  get_raid_name(index));
 		ret = -EINVAL;
 		goto out;
 	}
 	if (validate_convert_profile(&bctl->sys, allowed)) {
+		int index = btrfs_bg_flags_to_raid_index(bctl->sys.target);
+
 		btrfs_err(fs_info,
-			  "unable to start balance with target system profile %llu",
-			  bctl->sys.target);
+			  "balance: invalid convert system profile %s",
+			  get_raid_name(index));
 		ret = -EINVAL;
 		goto out;
 	}
@@ -3950,10 +3870,10 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 		     !(bctl->meta.target & allowed))) {
 			if (bctl->flags & BTRFS_BALANCE_FORCE) {
 				btrfs_info(fs_info,
-					   "force reducing metadata integrity");
+				"balance: force reducing metadata integrity");
 			} else {
 				btrfs_err(fs_info,
-					  "balance will reduce metadata integrity, use force if you want this");
+	"balance: reduces metadata integrity, use --force if you want this");
 				ret = -EINVAL;
 				goto out;
 			}
@@ -3967,9 +3887,12 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 		bctl->data.target : fs_info->avail_data_alloc_bits;
 	if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
 		btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
+		int meta_index = btrfs_bg_flags_to_raid_index(meta_target);
+		int data_index = btrfs_bg_flags_to_raid_index(data_target);
+
 		btrfs_warn(fs_info,
-			   "metadata profile 0x%llx has lower redundancy than data profile 0x%llx",
-			   meta_target, data_target);
+	"balance: metadata profile %s has lower redundancy than data profile %s",
+			   get_raid_name(meta_index), get_raid_name(data_index));
 	}
 
 	ret = insert_balance_item(fs_info, bctl);
@@ -3978,7 +3901,10 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 
 	if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
 		BUG_ON(ret == -EEXIST);
-		set_balance_control(bctl);
+		BUG_ON(fs_info->balance_ctl);
+		spin_lock(&fs_info->balance_lock);
+		fs_info->balance_ctl = bctl;
+		spin_unlock(&fs_info->balance_lock);
 	} else {
 		BUG_ON(ret != -EEXIST);
 		spin_lock(&fs_info->balance_lock);
@@ -3986,22 +3912,24 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 		spin_unlock(&fs_info->balance_lock);
 	}
 
-	atomic_inc(&fs_info->balance_running);
+	ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
+	set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
 	mutex_unlock(&fs_info->balance_mutex);
 
 	ret = __btrfs_balance(fs_info);
 
 	mutex_lock(&fs_info->balance_mutex);
-	atomic_dec(&fs_info->balance_running);
+	clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
 
 	if (bargs) {
 		memset(bargs, 0, sizeof(*bargs));
-		update_ioctl_balance_args(fs_info, 0, bargs);
+		btrfs_update_ioctl_balance_args(fs_info, bargs);
 	}
 
 	if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
 	    balance_need_close(fs_info)) {
-		__cancel_balance(fs_info);
+		reset_balance_state(fs_info);
+		clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
 	}
 
 	wake_up(&fs_info->balance_wait_q);
@@ -4009,11 +3937,11 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 	return ret;
 out:
 	if (bctl->flags & BTRFS_BALANCE_RESUME)
-		__cancel_balance(fs_info);
-	else {
+		reset_balance_state(fs_info);
+	else
 		kfree(bctl);
-		clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
-	}
+	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+
 	return ret;
 }
 
@@ -4022,16 +3950,12 @@ static int balance_kthread(void *data)
 	struct btrfs_fs_info *fs_info = data;
 	int ret = 0;
 
-	mutex_lock(&fs_info->volume_mutex);
 	mutex_lock(&fs_info->balance_mutex);
-
 	if (fs_info->balance_ctl) {
-		btrfs_info(fs_info, "continuing balance");
-		ret = btrfs_balance(fs_info->balance_ctl, NULL);
+		btrfs_info(fs_info, "balance: resuming");
+		ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
 	}
-
 	mutex_unlock(&fs_info->balance_mutex);
-	mutex_unlock(&fs_info->volume_mutex);
 
 	return ret;
 }
@@ -4040,15 +3964,15 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
 {
 	struct task_struct *tsk;
 
-	spin_lock(&fs_info->balance_lock);
+	mutex_lock(&fs_info->balance_mutex);
 	if (!fs_info->balance_ctl) {
-		spin_unlock(&fs_info->balance_lock);
+		mutex_unlock(&fs_info->balance_mutex);
 		return 0;
 	}
-	spin_unlock(&fs_info->balance_lock);
+	mutex_unlock(&fs_info->balance_mutex);
 
 	if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
-		btrfs_info(fs_info, "force skipping balance");
+		btrfs_info(fs_info, "balance: resume skipped");
 		return 0;
 	}
 
@@ -4100,7 +4024,6 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
 	leaf = path->nodes[0];
 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
 
-	bctl->fs_info = fs_info;
 	bctl->flags = btrfs_balance_flags(leaf, item);
 	bctl->flags |= BTRFS_BALANCE_RESUME;
 
@@ -4111,15 +4034,26 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
 	btrfs_balance_sys(leaf, item, &disk_bargs);
 	btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
 
-	WARN_ON(test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags));
+	/*
+	 * This should never happen, as the paused balance state is recovered
+	 * during mount without any chance of other exclusive ops to collide.
+	 *
+	 * This gives the exclusive op status to balance and keeps in paused
+	 * state until user intervention (cancel or umount). If the ownership
+	 * cannot be assigned, show a message but do not fail. The balance
+	 * is in a paused state and must have fs_info::balance_ctl properly
+	 * set up.
+	 */
+	if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags))
+		btrfs_warn(fs_info,
+	"balance: cannot set exclusive op status, resume manually");
 
-	mutex_lock(&fs_info->volume_mutex);
 	mutex_lock(&fs_info->balance_mutex);
-
-	set_balance_control(bctl);
-
+	BUG_ON(fs_info->balance_ctl);
+	spin_lock(&fs_info->balance_lock);
+	fs_info->balance_ctl = bctl;
+	spin_unlock(&fs_info->balance_lock);
 	mutex_unlock(&fs_info->balance_mutex);
-	mutex_unlock(&fs_info->volume_mutex);
 out:
 	btrfs_free_path(path);
 	return ret;
@@ -4135,16 +4069,16 @@ int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
 		return -ENOTCONN;
 	}
 
-	if (atomic_read(&fs_info->balance_running)) {
+	if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
 		atomic_inc(&fs_info->balance_pause_req);
 		mutex_unlock(&fs_info->balance_mutex);
 
 		wait_event(fs_info->balance_wait_q,
-			   atomic_read(&fs_info->balance_running) == 0);
+			   !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
 
 		mutex_lock(&fs_info->balance_mutex);
 		/* we are good with balance_ctl ripped off from under us */
-		BUG_ON(atomic_read(&fs_info->balance_running));
+		BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
 		atomic_dec(&fs_info->balance_pause_req);
 	} else {
 		ret = -ENOTCONN;
@@ -4156,38 +4090,49 @@ int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
 
 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
 {
-	if (sb_rdonly(fs_info->sb))
-		return -EROFS;
-
 	mutex_lock(&fs_info->balance_mutex);
 	if (!fs_info->balance_ctl) {
 		mutex_unlock(&fs_info->balance_mutex);
 		return -ENOTCONN;
 	}
 
+	/*
+	 * A paused balance with the item stored on disk can be resumed at
+	 * mount time if the mount is read-write. Otherwise it's still paused
+	 * and we must not allow cancelling as it deletes the item.
+	 */
+	if (sb_rdonly(fs_info->sb)) {
+		mutex_unlock(&fs_info->balance_mutex);
+		return -EROFS;
+	}
+
 	atomic_inc(&fs_info->balance_cancel_req);
 	/*
 	 * if we are running just wait and return, balance item is
 	 * deleted in btrfs_balance in this case
 	 */
-	if (atomic_read(&fs_info->balance_running)) {
+	if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
 		mutex_unlock(&fs_info->balance_mutex);
 		wait_event(fs_info->balance_wait_q,
-			   atomic_read(&fs_info->balance_running) == 0);
+			   !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
 		mutex_lock(&fs_info->balance_mutex);
 	} else {
-		/* __cancel_balance needs volume_mutex */
 		mutex_unlock(&fs_info->balance_mutex);
-		mutex_lock(&fs_info->volume_mutex);
+		/*
+		 * Lock released to allow other waiters to continue, we'll
+		 * reexamine the status again.
+		 */
 		mutex_lock(&fs_info->balance_mutex);
 
-		if (fs_info->balance_ctl)
-			__cancel_balance(fs_info);
-
-		mutex_unlock(&fs_info->volume_mutex);
+		if (fs_info->balance_ctl) {
+			reset_balance_state(fs_info);
+			clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+			btrfs_info(fs_info, "balance: canceled");
+		}
 	}
 
-	BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running));
+	BUG_ON(fs_info->balance_ctl ||
+		test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
 	atomic_dec(&fs_info->balance_cancel_req);
 	mutex_unlock(&fs_info->balance_mutex);
 	return 0;
@@ -4264,8 +4209,7 @@ static int btrfs_uuid_scan_kthread(void *data)
 		}
 update_tree:
 		if (!btrfs_is_empty_uuid(root_item.uuid)) {
-			ret = btrfs_uuid_tree_add(trans, fs_info,
-						  root_item.uuid,
+			ret = btrfs_uuid_tree_add(trans, root_item.uuid,
 						  BTRFS_UUID_KEY_SUBVOL,
 						  key.objectid);
 			if (ret < 0) {
@@ -4276,7 +4220,7 @@ update_tree:
 		}
 
 		if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
-			ret = btrfs_uuid_tree_add(trans, fs_info,
+			ret = btrfs_uuid_tree_add(trans,
 						  root_item.received_uuid,
 						 BTRFS_UUID_KEY_RECEIVED_SUBVOL,
 						  key.objectid);
@@ -4482,7 +4426,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
 	if (!path)
 		return -ENOMEM;
 
-	path->reada = READA_FORWARD;
+	path->reada = READA_BACK;
 
 	mutex_lock(&fs_info->chunk_mutex);
 
@@ -6043,9 +5987,8 @@ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 	return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);
 }
 
-int btrfs_rmap_block(struct btrfs_fs_info *fs_info,
-		     u64 chunk_start, u64 physical, u64 devid,
-		     u64 **logical, int *naddrs, int *stripe_len)
+int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
+		     u64 physical, u64 **logical, int *naddrs, int *stripe_len)
 {
 	struct extent_map *em;
 	struct map_lookup *map;
@@ -6077,8 +6020,6 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info,
 	BUG_ON(!buf); /* -ENOMEM */
 
 	for (i = 0; i < map->num_stripes; i++) {
-		if (devid && map->stripes[i].dev->devid != devid)
-			continue;
 		if (map->stripes[i].physical > physical ||
 		    map->stripes[i].physical + length <= physical)
 			continue;
@@ -6410,7 +6351,7 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
  *
  * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
  * on error.  Returned struct is not linked onto any lists and must be
- * destroyed with free_device.
+ * destroyed with btrfs_free_device.
  */
 struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
 					const u64 *devid,
@@ -6433,7 +6374,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
 
 		ret = find_next_devid(fs_info, &tmp);
 		if (ret) {
-			free_device(dev);
+			btrfs_free_device(dev);
 			return ERR_PTR(ret);
 		}
 	}
@@ -6684,8 +6625,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
 	if (IS_ERR(fs_devices))
 		return fs_devices;
 
-	ret = __btrfs_open_devices(fs_devices, FMODE_READ,
-				   fs_info->bdev_holder);
+	ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder);
 	if (ret) {
 		free_fs_devices(fs_devices);
 		fs_devices = ERR_PTR(ret);
@@ -6693,7 +6633,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
 	}
 
 	if (!fs_devices->seeding) {
-		__btrfs_close_devices(fs_devices);
+		close_fs_devices(fs_devices);
 		free_fs_devices(fs_devices);
 		fs_devices = ERR_PTR(-EINVAL);
 		goto out;
@@ -7002,6 +6942,10 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
 	if (!path)
 		return -ENOMEM;
 
+	/*
+	 * uuid_mutex is needed only if we are mounting a sprout FS
+	 * otherwise we don't need it.
+	 */
 	mutex_lock(&uuid_mutex);
 	mutex_lock(&fs_info->chunk_mutex);
 
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 7909688..5139ec8 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -208,6 +208,7 @@ BTRFS_DEVICE_GETSET_FUNCS(bytes_used);
 
 struct btrfs_fs_devices {
 	u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
+	struct list_head fs_list;
 
 	u64 num_devices;
 	u64 open_devices;
@@ -229,7 +230,6 @@ struct btrfs_fs_devices {
 	struct list_head resized_devices;
 	/* devices not currently being allocated */
 	struct list_head alloc_list;
-	struct list_head list;
 
 	struct btrfs_fs_devices *seed;
 	int seeding;
@@ -329,11 +329,12 @@ struct btrfs_raid_attr {
 	int tolerated_failures; /* max tolerated fail devs */
 	int devs_increment;	/* ndevs has to be a multiple of this */
 	int ncopies;		/* how many copies to data has */
+	int mindev_error;	/* error code if min devs requisite is unmet */
+	const char raid_name[8]; /* name of the raid */
+	u64 bg_flag;		/* block group flag of the raid */
 };
 
 extern const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES];
-extern const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES];
-extern const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES];
 
 struct map_lookup {
 	u64 type;
@@ -351,8 +352,6 @@ struct map_lookup {
 struct btrfs_balance_args;
 struct btrfs_balance_progress;
 struct btrfs_balance_control {
-	struct btrfs_fs_info *fs_info;
-
 	struct btrfs_balance_args data;
 	struct btrfs_balance_args meta;
 	struct btrfs_balance_args sys;
@@ -393,9 +392,8 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 		     u64 logical, u64 *length,
 		     struct btrfs_bio **bbio_ret);
-int btrfs_rmap_block(struct btrfs_fs_info *fs_info,
-		     u64 chunk_start, u64 physical, u64 devid,
-		     u64 **logical, int *naddrs, int *stripe_len);
+int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
+		     u64 physical, u64 **logical, int *naddrs, int *stripe_len);
 int btrfs_read_sys_array(struct btrfs_fs_info *fs_info);
 int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
@@ -421,6 +419,7 @@ int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid,
 struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
 					const u64 *devid,
 					const u8 *uuid);
+void btrfs_free_device(struct btrfs_device *device);
 int btrfs_rm_device(struct btrfs_fs_info *fs_info,
 		    const char *device_path, u64 devid);
 void __exit btrfs_cleanup_fs_uuids(void);
@@ -431,11 +430,8 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
 				       u8 *uuid, u8 *fsid);
 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
 int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *path);
-int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
-				  const char *device_path,
-				  struct btrfs_device *srcdev,
-				  struct btrfs_device **device_out);
-int btrfs_balance(struct btrfs_balance_control *bctl,
+int btrfs_balance(struct btrfs_fs_info *fs_info,
+		  struct btrfs_balance_control *bctl,
 		  struct btrfs_ioctl_balance_args *bargs);
 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info);
 int btrfs_recover_balance(struct btrfs_fs_info *fs_info);
@@ -553,6 +549,8 @@ static inline enum btrfs_raid_types btrfs_bg_flags_to_raid_index(u64 flags)
 	return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
 }
 
+const char *get_raid_name(enum btrfs_raid_types type);
+
 void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info);
 void btrfs_update_commit_device_bytes_used(struct btrfs_transaction *trans);
 
diff --git a/fs/cachefiles/proc.c b/fs/cachefiles/proc.c
index 125b90f..0ce1aa5 100644
--- a/fs/cachefiles/proc.c
+++ b/fs/cachefiles/proc.c
@@ -85,21 +85,6 @@ static const struct seq_operations cachefiles_histogram_ops = {
 };
 
 /*
- * open "/proc/fs/cachefiles/XXX" which provide statistics summaries
- */
-static int cachefiles_histogram_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &cachefiles_histogram_ops);
-}
-
-static const struct file_operations cachefiles_histogram_fops = {
-	.open		= cachefiles_histogram_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
-/*
  * initialise the /proc/fs/cachefiles/ directory
  */
 int __init cachefiles_proc_init(void)
@@ -109,8 +94,8 @@ int __init cachefiles_proc_init(void)
 	if (!proc_mkdir("fs/cachefiles", NULL))
 		goto error_dir;
 
-	if (!proc_create("fs/cachefiles/histogram", S_IFREG | 0444, NULL,
-			 &cachefiles_histogram_fops))
+	if (!proc_create_seq("fs/cachefiles/histogram", S_IFREG | 0444, NULL,
+			 &cachefiles_histogram_ops))
 		goto error_histogram;
 
 	_leave(" = 0");
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index 7e4a1e2..8581799 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -1,11 +1,12 @@
 # SPDX-License-Identifier: GPL-2.0
 #
-# Makefile for Linux CIFS VFS client 
+# Makefile for Linux CIFS/SMB2/SMB3 VFS client
 #
+ccflags-y += -I$(src)		# needed for trace events
 obj-$(CONFIG_CIFS) += cifs.o
 
-cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \
-	  link.o misc.o netmisc.o smbencrypt.o transport.o asn1.o \
+cifs-y := trace.o cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o \
+	  inode.o link.o misc.o netmisc.o smbencrypt.o transport.o asn1.o \
 	  cifs_unicode.o nterr.o cifsencrypt.o \
 	  readdir.o ioctl.o sess.o export.o smb1ops.o winucase.o \
 	  smb2ops.o smb2maperror.o smb2transport.o \
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 9d69ea4..1161460 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -42,7 +42,7 @@ cifs_dump_mem(char *label, void *data, int length)
 		       data, length, true);
 }
 
-void cifs_dump_detail(void *buf)
+void cifs_dump_detail(void *buf, struct TCP_Server_Info *server)
 {
 #ifdef CONFIG_CIFS_DEBUG2
 	struct smb_hdr *smb = (struct smb_hdr *)buf;
@@ -50,7 +50,8 @@ void cifs_dump_detail(void *buf)
 	cifs_dbg(VFS, "Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d\n",
 		 smb->Command, smb->Status.CifsError,
 		 smb->Flags, smb->Flags2, smb->Mid, smb->Pid);
-	cifs_dbg(VFS, "smb buf %p len %u\n", smb, smbCalcSize(smb));
+	cifs_dbg(VFS, "smb buf %p len %u\n", smb,
+		 server->ops->calc_smb_size(smb, server));
 #endif /* CONFIG_CIFS_DEBUG2 */
 }
 
@@ -83,7 +84,7 @@ void cifs_dump_mids(struct TCP_Server_Info *server)
 		cifs_dbg(VFS, "IsMult: %d IsEnd: %d\n",
 			 mid_entry->multiRsp, mid_entry->multiEnd);
 		if (mid_entry->resp_buf) {
-			cifs_dump_detail(mid_entry->resp_buf);
+			cifs_dump_detail(mid_entry->resp_buf, server);
 			cifs_dump_mem("existing buf: ",
 				mid_entry->resp_buf, 62);
 		}
@@ -113,6 +114,8 @@ static void cifs_debug_tcon(struct seq_file *m, struct cifs_tcon *tcon)
 		seq_printf(m, " type: %d ", dev_type);
 	if (tcon->seal)
 		seq_printf(m, " Encrypted");
+	if (tcon->nocase)
+		seq_printf(m, " nocase");
 	if (tcon->unix_ext)
 		seq_printf(m, " POSIX Extensions");
 	if (tcon->ses->server->ops->dump_share_caps)
@@ -237,6 +240,10 @@ skip_rdma:
 			server->credits,  server->dialect);
 		if (server->sign)
 			seq_printf(m, " signed");
+#ifdef CONFIG_CIFS_SMB311
+		if (server->posix_ext_supported)
+			seq_printf(m, " posix");
+#endif /* 3.1.1 */
 		i++;
 		list_for_each(tmp2, &server->smb_ses_list) {
 			ses = list_entry(tmp2, struct cifs_ses,
@@ -314,18 +321,6 @@ skip_rdma:
 	return 0;
 }
 
-static int cifs_debug_data_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, cifs_debug_data_proc_show, NULL);
-}
-
-static const struct file_operations cifs_debug_data_proc_fops = {
-	.open		= cifs_debug_data_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 #ifdef CONFIG_CIFS_STATS
 static ssize_t cifs_stats_proc_write(struct file *file,
 		const char __user *buffer, size_t count, loff_t *ppos)
@@ -497,35 +492,36 @@ cifs_proc_init(void)
 	if (proc_fs_cifs == NULL)
 		return;
 
-	proc_create("DebugData", 0, proc_fs_cifs, &cifs_debug_data_proc_fops);
+	proc_create_single("DebugData", 0, proc_fs_cifs,
+			cifs_debug_data_proc_show);
 
 #ifdef CONFIG_CIFS_STATS
-	proc_create("Stats", 0, proc_fs_cifs, &cifs_stats_proc_fops);
+	proc_create("Stats", 0644, proc_fs_cifs, &cifs_stats_proc_fops);
 #endif /* STATS */
-	proc_create("cifsFYI", 0, proc_fs_cifs, &cifsFYI_proc_fops);
-	proc_create("traceSMB", 0, proc_fs_cifs, &traceSMB_proc_fops);
-	proc_create("LinuxExtensionsEnabled", 0, proc_fs_cifs,
+	proc_create("cifsFYI", 0644, proc_fs_cifs, &cifsFYI_proc_fops);
+	proc_create("traceSMB", 0644, proc_fs_cifs, &traceSMB_proc_fops);
+	proc_create("LinuxExtensionsEnabled", 0644, proc_fs_cifs,
 		    &cifs_linux_ext_proc_fops);
-	proc_create("SecurityFlags", 0, proc_fs_cifs,
+	proc_create("SecurityFlags", 0644, proc_fs_cifs,
 		    &cifs_security_flags_proc_fops);
-	proc_create("LookupCacheEnabled", 0, proc_fs_cifs,
+	proc_create("LookupCacheEnabled", 0644, proc_fs_cifs,
 		    &cifs_lookup_cache_proc_fops);
 #ifdef CONFIG_CIFS_SMB_DIRECT
-	proc_create("rdma_readwrite_threshold", 0, proc_fs_cifs,
+	proc_create("rdma_readwrite_threshold", 0644, proc_fs_cifs,
 		&cifs_rdma_readwrite_threshold_proc_fops);
-	proc_create("smbd_max_frmr_depth", 0, proc_fs_cifs,
+	proc_create("smbd_max_frmr_depth", 0644, proc_fs_cifs,
 		&cifs_smbd_max_frmr_depth_proc_fops);
-	proc_create("smbd_keep_alive_interval", 0, proc_fs_cifs,
+	proc_create("smbd_keep_alive_interval", 0644, proc_fs_cifs,
 		&cifs_smbd_keep_alive_interval_proc_fops);
-	proc_create("smbd_max_receive_size", 0, proc_fs_cifs,
+	proc_create("smbd_max_receive_size", 0644, proc_fs_cifs,
 		&cifs_smbd_max_receive_size_proc_fops);
-	proc_create("smbd_max_fragmented_recv_size", 0, proc_fs_cifs,
+	proc_create("smbd_max_fragmented_recv_size", 0644, proc_fs_cifs,
 		&cifs_smbd_max_fragmented_recv_size_proc_fops);
-	proc_create("smbd_max_send_size", 0, proc_fs_cifs,
+	proc_create("smbd_max_send_size", 0644, proc_fs_cifs,
 		&cifs_smbd_max_send_size_proc_fops);
-	proc_create("smbd_send_credit_target", 0, proc_fs_cifs,
+	proc_create("smbd_send_credit_target", 0644, proc_fs_cifs,
 		&cifs_smbd_send_credit_target_proc_fops);
-	proc_create("smbd_receive_credit_max", 0, proc_fs_cifs,
+	proc_create("smbd_receive_credit_max", 0644, proc_fs_cifs,
 		&cifs_smbd_receive_credit_max_proc_fops);
 #endif
 }
@@ -583,6 +579,8 @@ static ssize_t cifsFYI_proc_write(struct file *file, const char __user *buffer,
 		cifsFYI = bv;
 	else if ((c[0] > '1') && (c[0] <= '9'))
 		cifsFYI = (int) (c[0] - '0'); /* see cifs_debug.h for meanings */
+	else
+		return -EINVAL;
 
 	return count;
 }
diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h
index 0e74690..f4f3f08 100644
--- a/fs/cifs/cifs_debug.h
+++ b/fs/cifs/cifs_debug.h
@@ -23,7 +23,7 @@
 #define _H_CIFS_DEBUG
 
 void cifs_dump_mem(char *label, void *data, int length);
-void cifs_dump_detail(void *);
+void cifs_dump_detail(void *buf, struct TCP_Server_Info *ptcp_info);
 void cifs_dump_mids(struct TCP_Server_Info *);
 extern bool traceSMB;		/* flag which enables the function below */
 void dump_smb(void *, int);
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 350fa55..9731d0d 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -50,6 +50,7 @@
 					      * root mountable
 					      */
 #define CIFS_MOUNT_UID_FROM_ACL 0x2000000 /* try to get UID via special SID */
+#define CIFS_MOUNT_NO_HANDLE_CACHE 0x4000000 /* disable caching dir handles */
 
 struct cifs_sb_info {
 	struct rb_root tlink_tree;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 5a5a015..eb7b657 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -58,13 +58,15 @@ bool traceSMB;
 bool enable_oplocks = true;
 bool linuxExtEnabled = true;
 bool lookupCacheEnabled = true;
+bool disable_legacy_dialects; /* false by default */
 unsigned int global_secflags = CIFSSEC_DEF;
 /* unsigned int ntlmv2_support = 0; */
 unsigned int sign_CIFS_PDUs = 1;
 static const struct super_operations cifs_super_ops;
 unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE;
 module_param(CIFSMaxBufSize, uint, 0444);
-MODULE_PARM_DESC(CIFSMaxBufSize, "Network buffer size (not including header). "
+MODULE_PARM_DESC(CIFSMaxBufSize, "Network buffer size (not including header) "
+				 "for CIFS requests. "
 				 "Default: 16384 Range: 8192 to 130048");
 unsigned int cifs_min_rcv = CIFS_MIN_RCV_POOL;
 module_param(cifs_min_rcv, uint, 0444);
@@ -76,11 +78,21 @@ MODULE_PARM_DESC(cifs_min_small, "Small network buffers in pool. Default: 30 "
 				 "Range: 2 to 256");
 unsigned int cifs_max_pending = CIFS_MAX_REQ;
 module_param(cifs_max_pending, uint, 0444);
-MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. "
+MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server for "
+				   "CIFS/SMB1 dialect (N/A for SMB3) "
 				   "Default: 32767 Range: 2 to 32767.");
 module_param(enable_oplocks, bool, 0644);
 MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks. Default: y/Y/1");
 
+module_param(disable_legacy_dialects, bool, 0644);
+MODULE_PARM_DESC(disable_legacy_dialects, "To improve security it may be "
+				  "helpful to restrict the ability to "
+				  "override the default dialects (SMB2.1, "
+				  "SMB3 and SMB3.02) on mount with old "
+				  "dialects (CIFS/SMB1 and SMB2) since "
+				  "vers=1.0 (CIFS/SMB1) and vers=2.0 are weaker"
+				  " and less secure. Default: n/N/0");
+
 extern mempool_t *cifs_sm_req_poolp;
 extern mempool_t *cifs_req_poolp;
 extern mempool_t *cifs_mid_poolp;
@@ -469,10 +481,20 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
 		seq_puts(s, ",persistenthandles");
 	else if (tcon->use_resilient)
 		seq_puts(s, ",resilienthandles");
+
+#ifdef CONFIG_CIFS_SMB311
+	if (tcon->posix_extensions)
+		seq_puts(s, ",posix");
+	else if (tcon->unix_ext)
+		seq_puts(s, ",unix");
+	else
+		seq_puts(s, ",nounix");
+#else
 	if (tcon->unix_ext)
 		seq_puts(s, ",unix");
 	else
 		seq_puts(s, ",nounix");
+#endif /* SMB311 */
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)
 		seq_puts(s, ",posixpaths");
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)
@@ -495,6 +517,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
 		seq_puts(s, ",sfu");
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
 		seq_puts(s, ",nobrl");
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_HANDLE_CACHE)
+		seq_puts(s, ",nohandlecache");
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL)
 		seq_puts(s, ",cifsacl");
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)
@@ -897,6 +921,17 @@ struct file_system_type cifs_fs_type = {
 	/*  .fs_flags */
 };
 MODULE_ALIAS_FS("cifs");
+
+static struct file_system_type smb3_fs_type = {
+	.owner = THIS_MODULE,
+	.name = "smb3",
+	.mount = cifs_do_mount,
+	.kill_sb = cifs_kill_sb,
+	/*  .fs_flags */
+};
+MODULE_ALIAS_FS("smb3");
+MODULE_ALIAS("smb3");
+
 const struct inode_operations cifs_dir_inode_ops = {
 	.create = cifs_create,
 	.atomic_open = cifs_atomic_open,
@@ -1435,6 +1470,12 @@ init_cifs(void)
 	if (rc)
 		goto out_init_cifs_idmap;
 
+	rc = register_filesystem(&smb3_fs_type);
+	if (rc) {
+		unregister_filesystem(&cifs_fs_type);
+		goto out_init_cifs_idmap;
+	}
+
 	return 0;
 
 out_init_cifs_idmap:
@@ -1465,8 +1506,9 @@ out_clean_proc:
 static void __exit
 exit_cifs(void)
 {
-	cifs_dbg(NOISY, "exit_cifs\n");
+	cifs_dbg(NOISY, "exit_smb3\n");
 	unregister_filesystem(&cifs_fs_type);
+	unregister_filesystem(&smb3_fs_type);
 	cifs_dfs_release_automount_timer();
 #ifdef CONFIG_CIFS_ACL
 	exit_cifs_idmap();
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 013ba2a..5f02318 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -149,5 +149,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
 extern const struct export_operations cifs_export_ops;
 #endif /* CONFIG_CIFS_NFSD_EXPORT */
 
-#define CIFS_VERSION   "2.11"
+#define CIFS_VERSION   "2.12"
 #endif				/* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index cb950a5..08d1cdd 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -176,6 +176,7 @@ struct smb_rqst {
 	struct kvec	*rq_iov;	/* array of kvecs */
 	unsigned int	rq_nvec;	/* number of kvecs in array */
 	struct page	**rq_pages;	/* pointer to array of page ptrs */
+	unsigned int	rq_offset;	/* the offset to the 1st page */
 	unsigned int	rq_npages;	/* number pages in array */
 	unsigned int	rq_pagesz;	/* page size to use */
 	unsigned int	rq_tailsz;	/* length of last page */
@@ -244,7 +245,7 @@ struct smb_version_operations {
 	int (*map_error)(char *, bool);
 	/* find mid corresponding to the response message */
 	struct mid_q_entry * (*find_mid)(struct TCP_Server_Info *, char *);
-	void (*dump_detail)(void *);
+	void (*dump_detail)(void *buf, struct TCP_Server_Info *ptcp_info);
 	void (*clear_stats)(struct cifs_tcon *);
 	void (*print_stats)(struct seq_file *m, struct cifs_tcon *);
 	void (*dump_share_caps)(struct seq_file *, struct cifs_tcon *);
@@ -372,7 +373,7 @@ struct smb_version_operations {
 	int (*close_dir)(const unsigned int, struct cifs_tcon *,
 			 struct cifs_fid *);
 	/* calculate a size of SMB message */
-	unsigned int (*calc_smb_size)(void *);
+	unsigned int (*calc_smb_size)(void *buf, struct TCP_Server_Info *ptcpi);
 	/* check for STATUS_PENDING and process it in a positive case */
 	bool (*is_status_pending)(char *, struct TCP_Server_Info *, int);
 	/* check for STATUS_NETWORK_SESSION_EXPIRED */
@@ -417,7 +418,7 @@ struct smb_version_operations {
 	/* create lease context buffer for CREATE request */
 	char * (*create_lease_buf)(u8 *, u8);
 	/* parse lease context buffer and return oplock/epoch info */
-	__u8 (*parse_lease_buf)(void *, unsigned int *);
+	__u8 (*parse_lease_buf)(void *buf, unsigned int *epoch, char *lkey);
 	ssize_t (*copychunk_range)(const unsigned int,
 			struct cifsFileInfo *src_file,
 			struct cifsFileInfo *target_file,
@@ -457,7 +458,7 @@ struct smb_version_operations {
 				 struct mid_q_entry **);
 	enum securityEnum (*select_sectype)(struct TCP_Server_Info *,
 			    enum securityEnum);
-
+	int (*next_header)(char *);
 };
 
 struct smb_version_values {
@@ -521,10 +522,12 @@ struct smb_vol {
 	bool sfu_remap:1;  /* remap seven reserved chars ala SFU */
 	bool posix_paths:1; /* unset to not ask for posix pathnames. */
 	bool no_linux_ext:1;
+	bool linux_ext:1;
 	bool sfu_emul:1;
 	bool nullauth:1;   /* attempt to authenticate with null user */
 	bool nocase:1;     /* request case insensitive filenames */
 	bool nobrl:1;      /* disable sending byte range locks to srv */
+	bool nohandlecache:1; /* disable caching dir handles if srvr probs */
 	bool mand_lock:1;  /* send mandatory not posix byte range lock reqs */
 	bool seal:1;       /* request transport encryption on share */
 	bool nodfs:1;      /* Do not request DFS, even if available */
@@ -630,7 +633,7 @@ struct TCP_Server_Info {
 	bool oplocks:1; /* enable oplocks */
 	unsigned int maxReq;	/* Clients should submit no more */
 	/* than maxReq distinct unanswered SMBs to the server when using  */
-	/* multiplexed reads or writes */
+	/* multiplexed reads or writes (for SMB1/CIFS only, not SMB2/SMB3) */
 	unsigned int maxBuf;	/* maxBuf specifies the maximum */
 	/* message size the server can send or receive for non-raw SMBs */
 	/* maxBuf is returned by SMB NegotiateProtocol so maxBuf is only 0 */
@@ -681,6 +684,7 @@ struct TCP_Server_Info {
 	__le16	cipher_type;
 	 /* save initital negprot hash */
 	__u8	preauth_sha_hash[SMB2_PREAUTH_HASH_SIZE];
+	bool	posix_ext_supported;
 #endif /* 3.1.1 */
 	struct delayed_work reconnect; /* reconnect workqueue job */
 	struct mutex reconnect_mutex; /* prevent simultaneous reconnects */
@@ -953,9 +957,13 @@ struct cifs_tcon {
 	bool print:1; /* set if connection to printer share */
 	bool retry:1;
 	bool nocase:1;
+	bool nohandlecache:1; /* if strange server resource prob can turn off */
 	bool seal:1;      /* transport encryption for this mounted share */
 	bool unix_ext:1;  /* if false disable Linux extensions to CIFS protocol
 				for this mount even if server would support */
+#ifdef CONFIG_CIFS_SMB311
+	bool posix_extensions; /* if true SMB3.11 posix extensions enabled */
+#endif /* CIFS_311 */
 	bool local_lease:1; /* check leases (only) on local system not remote */
 	bool broken_posix_open; /* e.g. Samba server versions < 3.3.2, 3.2.9 */
 	bool broken_sparse_sup; /* if server or share does not support sparse */
@@ -979,6 +987,9 @@ struct cifs_tcon {
 	struct fscache_cookie *fscache;	/* cookie for share */
 #endif
 	struct list_head pending_opens;	/* list of incomplete opens */
+	bool valid_root_fid:1;	/* Do we have a useable root fid */
+	struct mutex prfid_mutex; /* prevents reopen race after dead ses*/
+	struct cifs_fid *prfid;	/* handle to the directory at top of share */
 	/* BB add field for back pointer to sb struct(s)? */
 };
 
@@ -1071,6 +1082,7 @@ struct cifs_open_parms {
 	int create_options;
 	const char *path;
 	struct cifs_fid *fid;
+	umode_t mode;
 	bool reconnect:1;
 };
 
@@ -1169,10 +1181,11 @@ struct cifs_readdata {
 	struct smbd_mr			*mr;
 #endif
 	unsigned int			pagesz;
+	unsigned int			page_offset;
 	unsigned int			tailsz;
 	unsigned int			credits;
 	unsigned int			nr_pages;
-	struct page			*pages[];
+	struct page			**pages;
 };
 
 struct cifs_writedata;
@@ -1194,10 +1207,11 @@ struct cifs_writedata {
 	struct smbd_mr			*mr;
 #endif
 	unsigned int			pagesz;
+	unsigned int			page_offset;
 	unsigned int			tailsz;
 	unsigned int			credits;
 	unsigned int			nr_pages;
-	struct page			*pages[];
+	struct page			**pages;
 };
 
 /*
@@ -1692,16 +1706,17 @@ GLOBAL_EXTERN atomic_t smBufAllocCount;
 GLOBAL_EXTERN atomic_t midCount;
 
 /* Misc globals */
-GLOBAL_EXTERN bool enable_oplocks; /* enable or disable oplocks */
-GLOBAL_EXTERN bool lookupCacheEnabled;
-GLOBAL_EXTERN unsigned int global_secflags;	/* if on, session setup sent
+extern bool enable_oplocks; /* enable or disable oplocks */
+extern bool lookupCacheEnabled;
+extern unsigned int global_secflags;	/* if on, session setup sent
 				with more secure ntlmssp2 challenge/resp */
-GLOBAL_EXTERN unsigned int sign_CIFS_PDUs;  /* enable smb packet signing */
-GLOBAL_EXTERN bool linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/
-GLOBAL_EXTERN unsigned int CIFSMaxBufSize;  /* max size not including hdr */
-GLOBAL_EXTERN unsigned int cifs_min_rcv;    /* min size of big ntwrk buf pool */
-GLOBAL_EXTERN unsigned int cifs_min_small;  /* min size of small buf pool */
-GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/
+extern unsigned int sign_CIFS_PDUs;  /* enable smb packet signing */
+extern bool linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/
+extern unsigned int CIFSMaxBufSize;  /* max size not including hdr */
+extern unsigned int cifs_min_rcv;    /* min size of big ntwrk buf pool */
+extern unsigned int cifs_min_small;  /* min size of small buf pool */
+extern unsigned int cifs_max_pending; /* MAX requests at once to server*/
+extern bool disable_legacy_dialects;  /* forbid vers=1.0 and vers=2.0 mounts */
 
 #ifdef CONFIG_CIFS_ACL
 GLOBAL_EXTERN struct rb_root uidtree;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 365a414..7933c5f 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -21,6 +21,7 @@
 #ifndef _CIFSPROTO_H
 #define _CIFSPROTO_H
 #include <linux/nls.h>
+#include "trace.h"
 
 struct statfs;
 struct smb_vol;
@@ -47,6 +48,7 @@ extern void _free_xid(unsigned int);
 	cifs_dbg(FYI, "CIFS VFS: in %s as Xid: %u with uid: %d\n",	\
 		 __func__, __xid,					\
 		 from_kuid(&init_user_ns, current_fsuid()));		\
+	trace_smb3_enter(__xid, __func__);			\
 	__xid;							\
 })
 
@@ -54,7 +56,11 @@ extern void _free_xid(unsigned int);
 do {								\
 	_free_xid(curr_xid);					\
 	cifs_dbg(FYI, "CIFS VFS: leaving %s (xid = %u) rc = %d\n",	\
-		 __func__, curr_xid, (int)rc);				\
+		 __func__, curr_xid, (int)rc);			\
+	if (rc)							\
+		trace_smb3_exit_err(curr_xid, __func__, (int)rc);	\
+	else							\
+		trace_smb3_exit_done(curr_xid, __func__);	\
 } while (0)
 extern int init_cifs_idmap(void);
 extern void exit_cifs_idmap(void);
@@ -124,7 +130,7 @@ extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
 			    unsigned int bytes_written);
 extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool);
 extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool);
-extern unsigned int smbCalcSize(void *buf);
+extern unsigned int smbCalcSize(void *buf, struct TCP_Server_Info *server);
 extern int decode_negTokenInit(unsigned char *security_blob, int length,
 			struct TCP_Server_Info *server);
 extern int cifs_convert_address(struct sockaddr *dst, const char *src, int len);
@@ -197,7 +203,9 @@ extern void dequeue_mid(struct mid_q_entry *mid, bool malformed);
 extern int cifs_read_from_socket(struct TCP_Server_Info *server, char *buf,
 			         unsigned int to_read);
 extern int cifs_read_page_from_socket(struct TCP_Server_Info *server,
-				      struct page *page, unsigned int to_read);
+					struct page *page,
+					unsigned int page_offset,
+					unsigned int to_read);
 extern int cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
 			       struct cifs_sb_info *cifs_sb);
 extern int cifs_match_super(struct super_block *, void *);
@@ -525,6 +533,8 @@ int cifs_async_writev(struct cifs_writedata *wdata,
 void cifs_writev_complete(struct work_struct *work);
 struct cifs_writedata *cifs_writedata_alloc(unsigned int nr_pages,
 						work_func_t complete);
+struct cifs_writedata *cifs_writedata_direct_alloc(struct page **pages,
+						work_func_t complete);
 void cifs_writedata_release(struct kref *refcount);
 int cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
 			  struct cifs_sb_info *cifs_sb,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 1529a08..5aca336 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -106,6 +106,12 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon)
 		open_file->oplock_break_cancelled = true;
 	}
 	spin_unlock(&tcon->open_file_lock);
+
+	mutex_lock(&tcon->prfid_mutex);
+	tcon->valid_root_fid = false;
+	memset(tcon->prfid, 0, sizeof(struct cifs_fid));
+	mutex_unlock(&tcon->prfid_mutex);
+
 	/*
 	 * BB Add call to invalidate_inodes(sb) for all superblocks mounted
 	 * to this tcon.
@@ -1946,6 +1952,7 @@ cifs_writedata_release(struct kref *refcount)
 	if (wdata->cfile)
 		cifsFileInfo_put(wdata->cfile);
 
+	kvfree(wdata->pages);
 	kfree(wdata);
 }
 
@@ -2069,12 +2076,22 @@ cifs_writev_complete(struct work_struct *work)
 struct cifs_writedata *
 cifs_writedata_alloc(unsigned int nr_pages, work_func_t complete)
 {
+	struct page **pages =
+		kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
+	if (pages)
+		return cifs_writedata_direct_alloc(pages, complete);
+
+	return NULL;
+}
+
+struct cifs_writedata *
+cifs_writedata_direct_alloc(struct page **pages, work_func_t complete)
+{
 	struct cifs_writedata *wdata;
 
-	/* writedata + number of page pointers */
-	wdata = kzalloc(sizeof(*wdata) +
-			sizeof(struct page *) * nr_pages, GFP_NOFS);
+	wdata = kzalloc(sizeof(*wdata), GFP_NOFS);
 	if (wdata != NULL) {
+		wdata->pages = pages;
 		kref_init(&wdata->refcount);
 		INIT_LIST_HEAD(&wdata->list);
 		init_completion(&wdata->done);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 7a10a5d..e5a2fe7 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -61,6 +61,7 @@
 #define RFC1001_PORT 139
 
 extern mempool_t *cifs_req_poolp;
+extern bool disable_legacy_dialects;
 
 /* FIXME: should these be tunable? */
 #define TLINK_ERROR_EXPIRE	(1 * HZ)
@@ -76,9 +77,10 @@ enum {
 	Opt_mapposix, Opt_nomapposix,
 	Opt_mapchars, Opt_nomapchars, Opt_sfu,
 	Opt_nosfu, Opt_nodfs, Opt_posixpaths,
-	Opt_noposixpaths, Opt_nounix,
+	Opt_noposixpaths, Opt_nounix, Opt_unix,
 	Opt_nocase,
 	Opt_brl, Opt_nobrl,
+	Opt_handlecache, Opt_nohandlecache,
 	Opt_forcemandatorylock, Opt_setuidfromacl, Opt_setuids,
 	Opt_nosetuids, Opt_dynperm, Opt_nodynperm,
 	Opt_nohard, Opt_nosoft,
@@ -144,10 +146,16 @@ static const match_table_t cifs_mount_option_tokens = {
 	{ Opt_noposixpaths, "noposixpaths" },
 	{ Opt_nounix, "nounix" },
 	{ Opt_nounix, "nolinux" },
+	{ Opt_nounix, "noposix" },
+	{ Opt_unix, "unix" },
+	{ Opt_unix, "linux" },
+	{ Opt_unix, "posix" },
 	{ Opt_nocase, "nocase" },
 	{ Opt_nocase, "ignorecase" },
 	{ Opt_brl, "brl" },
 	{ Opt_nobrl, "nobrl" },
+	{ Opt_handlecache, "handlecache" },
+	{ Opt_nohandlecache, "nohandlecache" },
 	{ Opt_nobrl, "nolock" },
 	{ Opt_forcemandatorylock, "forcemandatorylock" },
 	{ Opt_forcemandatorylock, "forcemand" },
@@ -591,10 +599,11 @@ cifs_read_from_socket(struct TCP_Server_Info *server, char *buf,
 
 int
 cifs_read_page_from_socket(struct TCP_Server_Info *server, struct page *page,
-		      unsigned int to_read)
+	unsigned int page_offset, unsigned int to_read)
 {
 	struct msghdr smb_msg;
-	struct bio_vec bv = {.bv_page = page, .bv_len = to_read};
+	struct bio_vec bv = {
+		.bv_page = page, .bv_len = to_read, .bv_offset = page_offset};
 	iov_iter_bvec(&smb_msg.msg_iter, READ | ITER_BVEC, &bv, 1, to_read);
 	return cifs_readv_from_socket(server, &smb_msg);
 }
@@ -848,6 +857,7 @@ cifs_demultiplex_thread(void *p)
 	int length;
 	struct TCP_Server_Info *server = p;
 	unsigned int pdu_length;
+	unsigned int next_offset;
 	char *buf = NULL;
 	struct task_struct *task_to_wake = NULL;
 	struct mid_q_entry *mid_entry;
@@ -874,24 +884,29 @@ cifs_demultiplex_thread(void *p)
 		length = cifs_read_from_socket(server, buf, pdu_length);
 		if (length < 0)
 			continue;
-		server->total_read = length;
+
+		if (server->vals->header_preamble_size == 0)
+			server->total_read = 0;
+		else
+			server->total_read = length;
 
 		/*
 		 * The right amount was read from socket - 4 bytes,
 		 * so we can now interpret the length field.
 		 */
 		pdu_length = get_rfc1002_length(buf);
-		server->pdu_size = pdu_length;
 
 		cifs_dbg(FYI, "RFC1002 header 0x%x\n", pdu_length);
 		if (!is_smb_response(server, buf[0]))
 			continue;
+next_pdu:
+		server->pdu_size = pdu_length;
 
 		/* make sure we have enough to get to the MID */
-		if (pdu_length < HEADER_SIZE(server) - 1 -
+		if (server->pdu_size < HEADER_SIZE(server) - 1 -
 		    server->vals->header_preamble_size) {
 			cifs_dbg(VFS, "SMB response too short (%u bytes)\n",
-				 pdu_length);
+				 server->pdu_size);
 			cifs_reconnect(server);
 			wake_up(&server->response_q);
 			continue;
@@ -906,6 +921,12 @@ cifs_demultiplex_thread(void *p)
 			continue;
 		server->total_read += length;
 
+		if (server->ops->next_header) {
+			next_offset = server->ops->next_header(buf);
+			if (next_offset)
+				server->pdu_size = next_offset;
+		}
+
 		if (server->ops->is_transform_hdr &&
 		    server->ops->receive_transform &&
 		    server->ops->is_transform_hdr(buf)) {
@@ -948,10 +969,18 @@ cifs_demultiplex_thread(void *p)
 				      HEADER_SIZE(server));
 #ifdef CONFIG_CIFS_DEBUG2
 			if (server->ops->dump_detail)
-				server->ops->dump_detail(buf);
+				server->ops->dump_detail(buf, server);
 			cifs_dump_mids(server);
 #endif /* CIFS_DEBUG2 */
-
+		}
+		if (pdu_length > server->pdu_size) {
+			if (!allocate_buffers(server))
+				continue;
+			pdu_length -= server->pdu_size;
+			server->total_read = 0;
+			server->large_buf = false;
+			buf = server->smallbuf;
+			goto next_pdu;
 		}
 	} /* end while !EXITING */
 
@@ -1143,10 +1172,18 @@ cifs_parse_smb_version(char *value, struct smb_vol *vol)
 
 	switch (match_token(value, cifs_smb_version_tokens, args)) {
 	case Smb_1:
+		if (disable_legacy_dialects) {
+			cifs_dbg(VFS, "mount with legacy dialect disabled\n");
+			return 1;
+		}
 		vol->ops = &smb1_operations;
 		vol->vals = &smb1_values;
 		break;
 	case Smb_20:
+		if (disable_legacy_dialects) {
+			cifs_dbg(VFS, "mount with legacy dialect disabled\n");
+			return 1;
+		}
 		vol->ops = &smb20_operations;
 		vol->vals = &smb20_values;
 		break;
@@ -1426,8 +1463,17 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
 			vol->posix_paths = 0;
 			break;
 		case Opt_nounix:
+			if (vol->linux_ext)
+				cifs_dbg(VFS,
+					"conflicting unix mount options\n");
 			vol->no_linux_ext = 1;
 			break;
+		case Opt_unix:
+			if (vol->no_linux_ext)
+				cifs_dbg(VFS,
+					"conflicting unix mount options\n");
+			vol->linux_ext = 1;
+			break;
 		case Opt_nocase:
 			vol->nocase = 1;
 			break;
@@ -1445,6 +1491,12 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
 				(S_IALLUGO & ~(S_ISUID | S_IXGRP)))
 				vol->file_mode = S_IALLUGO;
 			break;
+		case Opt_nohandlecache:
+			vol->nohandlecache = 1;
+			break;
+		case Opt_handlecache:
+			vol->nohandlecache = 0;
+			break;
 		case Opt_forcemandatorylock:
 			vol->mand_lock = 1;
 			break;
@@ -2967,6 +3019,13 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info)
 		}
 	}
 
+#ifdef CONFIG_CIFS_SMB311
+	if ((volume_info->linux_ext) && (ses->server->posix_ext_supported)) {
+		if (ses->server->vals->protocol_id == SMB311_PROT_ID)
+			tcon->posix_extensions = true;
+	}
+#endif /* 311 */
+
 	/*
 	 * BB Do we need to wrap session_mutex around this TCon call and Unix
 	 * SetFS as we do on SessSetup and reconnect?
@@ -3022,6 +3081,7 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info)
 	 */
 	tcon->retry = volume_info->retry;
 	tcon->nocase = volume_info->nocase;
+	tcon->nohandlecache = volume_info->nohandlecache;
 	tcon->local_lease = volume_info->local_lease;
 	INIT_LIST_HEAD(&tcon->pending_opens);
 
@@ -3580,6 +3640,8 @@ int cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
 		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_UNX_EMUL;
 	if (pvolume_info->nobrl)
 		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_BRL;
+	if (pvolume_info->nohandlecache)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_HANDLE_CACHE;
 	if (pvolume_info->nostrictsync)
 		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NOSSYNC;
 	if (pvolume_info->mand_lock)
@@ -3922,6 +3984,12 @@ try_mount_again:
 		goto remote_path_check;
 	}
 
+#ifdef CONFIG_CIFS_SMB311
+	/* if new SMB3.11 POSIX extensions are supported do not remap / and \ */
+	if (tcon->posix_extensions)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_POSIX_PATHS;
+#endif /* SMB3.11 */
+
 	/* tell server which Unix caps we support */
 	if (cap_unix(tcon->ses)) {
 		/* reset of caps checks mount to see if unix extensions
@@ -4353,6 +4421,7 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid)
 	vol_info->UNC = master_tcon->treeName;
 	vol_info->retry = master_tcon->retry;
 	vol_info->nocase = master_tcon->nocase;
+	vol_info->nohandlecache = master_tcon->nohandlecache;
 	vol_info->local_lease = master_tcon->local_lease;
 	vol_info->no_linux_ext = !master_tcon->unix_ext;
 	vol_info->sectype = master_tcon->ses->sectype;
@@ -4382,8 +4451,14 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid)
 		goto out;
 	}
 
+#ifdef CONFIG_CIFS_SMB311
+	/* if new SMB3.11 POSIX extensions are supported do not remap / and \ */
+	if (tcon->posix_extensions)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_POSIX_PATHS;
+#endif /* SMB3.11 */
 	if (cap_unix(ses))
 		reset_cifs_unix_caps(0, tcon, NULL, vol_info);
+
 out:
 	kfree(vol_info->username);
 	kzfree(vol_info->password);
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 9258443..ddae52b 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -369,7 +369,7 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
 	oparms.path = full_path;
 	oparms.fid = fid;
 	oparms.reconnect = false;
-
+	oparms.mode = mode;
 	rc = server->ops->open(xid, &oparms, oplock, buf);
 	if (rc) {
 		cifs_dbg(FYI, "cifs_create returned 0x%x\n", rc);
@@ -780,21 +780,25 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 	tlink = cifs_sb_tlink(cifs_sb);
 	if (IS_ERR(tlink)) {
 		free_xid(xid);
-		return (struct dentry *)tlink;
+		return ERR_CAST(tlink);
 	}
 	pTcon = tlink_tcon(tlink);
 
 	rc = check_name(direntry, pTcon);
-	if (rc)
-		goto lookup_out;
+	if (unlikely(rc)) {
+		cifs_put_tlink(tlink);
+		free_xid(xid);
+		return ERR_PTR(rc);
+	}
 
 	/* can not grab the rename sem here since it would
 	deadlock in the cases (beginning of sys_rename itself)
 	in which we already have the sb rename sem */
 	full_path = build_path_from_dentry(direntry);
 	if (full_path == NULL) {
-		rc = -ENOMEM;
-		goto lookup_out;
+		cifs_put_tlink(tlink);
+		free_xid(xid);
+		return ERR_PTR(-ENOMEM);
 	}
 
 	if (d_really_is_positive(direntry)) {
@@ -813,29 +817,25 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 				parent_dir_inode->i_sb, xid, NULL);
 	}
 
-	if ((rc == 0) && (newInode != NULL)) {
-		d_add(direntry, newInode);
+	if (rc == 0) {
 		/* since paths are not looked up by component - the parent
 		   directories are presumed to be good here */
 		renew_parental_timestamps(direntry);
-
 	} else if (rc == -ENOENT) {
-		rc = 0;
 		cifs_set_time(direntry, jiffies);
-		d_add(direntry, NULL);
-	/*	if it was once a directory (but how can we tell?) we could do
-		shrink_dcache_parent(direntry); */
-	} else if (rc != -EACCES) {
-		cifs_dbg(FYI, "Unexpected lookup error %d\n", rc);
-		/* We special case check for Access Denied - since that
-		is a common return code */
+		newInode = NULL;
+	} else {
+		if (rc != -EACCES) {
+			cifs_dbg(FYI, "Unexpected lookup error %d\n", rc);
+			/* We special case check for Access Denied - since that
+			is a common return code */
+		}
+		newInode = ERR_PTR(rc);
 	}
-
-lookup_out:
 	kfree(full_path);
 	cifs_put_tlink(tlink);
 	free_xid(xid);
-	return ERR_PTR(rc);
+	return d_splice_alias(newInode, direntry);
 }
 
 static int
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 23fd430..87eece6 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2880,13 +2880,13 @@ out:
 }
 
 static struct cifs_readdata *
-cifs_readdata_alloc(unsigned int nr_pages, work_func_t complete)
+cifs_readdata_direct_alloc(struct page **pages, work_func_t complete)
 {
 	struct cifs_readdata *rdata;
 
-	rdata = kzalloc(sizeof(*rdata) + (sizeof(struct page *) * nr_pages),
-			GFP_KERNEL);
+	rdata = kzalloc(sizeof(*rdata), GFP_KERNEL);
 	if (rdata != NULL) {
+		rdata->pages = pages;
 		kref_init(&rdata->refcount);
 		INIT_LIST_HEAD(&rdata->list);
 		init_completion(&rdata->done);
@@ -2896,6 +2896,22 @@ cifs_readdata_alloc(unsigned int nr_pages, work_func_t complete)
 	return rdata;
 }
 
+static struct cifs_readdata *
+cifs_readdata_alloc(unsigned int nr_pages, work_func_t complete)
+{
+	struct page **pages =
+		kzalloc(sizeof(struct page *) * nr_pages, GFP_KERNEL);
+	struct cifs_readdata *ret = NULL;
+
+	if (pages) {
+		ret = cifs_readdata_direct_alloc(pages, complete);
+		if (!ret)
+			kfree(pages);
+	}
+
+	return ret;
+}
+
 void
 cifs_readdata_release(struct kref *refcount)
 {
@@ -2910,6 +2926,7 @@ cifs_readdata_release(struct kref *refcount)
 	if (rdata->cfile)
 		cifsFileInfo_put(rdata->cfile);
 
+	kvfree(rdata->pages);
 	kfree(rdata);
 }
 
@@ -3009,12 +3026,20 @@ uncached_fill_pages(struct TCP_Server_Info *server,
 	int result = 0;
 	unsigned int i;
 	unsigned int nr_pages = rdata->nr_pages;
+	unsigned int page_offset = rdata->page_offset;
 
 	rdata->got_bytes = 0;
 	rdata->tailsz = PAGE_SIZE;
 	for (i = 0; i < nr_pages; i++) {
 		struct page *page = rdata->pages[i];
 		size_t n;
+		unsigned int segment_size = rdata->pagesz;
+
+		if (i == 0)
+			segment_size -= page_offset;
+		else
+			page_offset = 0;
+
 
 		if (len <= 0) {
 			/* no need to hold page hostage */
@@ -3023,24 +3048,25 @@ uncached_fill_pages(struct TCP_Server_Info *server,
 			put_page(page);
 			continue;
 		}
+
 		n = len;
-		if (len >= PAGE_SIZE) {
+		if (len >= segment_size)
 			/* enough data to fill the page */
-			n = PAGE_SIZE;
-			len -= n;
-		} else {
-			zero_user(page, len, PAGE_SIZE - len);
+			n = segment_size;
+		else
 			rdata->tailsz = len;
-			len = 0;
-		}
+		len -= n;
+
 		if (iter)
-			result = copy_page_from_iter(page, 0, n, iter);
+			result = copy_page_from_iter(
+					page, page_offset, n, iter);
 #ifdef CONFIG_CIFS_SMB_DIRECT
 		else if (rdata->mr)
 			result = n;
 #endif
 		else
-			result = cifs_read_page_from_socket(server, page, n);
+			result = cifs_read_page_from_socket(
+					server, page, page_offset, n);
 		if (result < 0)
 			break;
 
@@ -3113,6 +3139,7 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
 		rdata->bytes = cur_len;
 		rdata->pid = pid;
 		rdata->pagesz = PAGE_SIZE;
+		rdata->tailsz = PAGE_SIZE;
 		rdata->read_into_pages = cifs_uncached_read_into_pages;
 		rdata->copy_into_pages = cifs_uncached_copy_into_pages;
 		rdata->credits = credits;
@@ -3557,6 +3584,7 @@ readpages_fill_pages(struct TCP_Server_Info *server,
 	u64 eof;
 	pgoff_t eof_index;
 	unsigned int nr_pages = rdata->nr_pages;
+	unsigned int page_offset = rdata->page_offset;
 
 	/* determine the eof that the server (probably) has */
 	eof = CIFS_I(rdata->mapping->host)->server_eof;
@@ -3567,13 +3595,21 @@ readpages_fill_pages(struct TCP_Server_Info *server,
 	rdata->tailsz = PAGE_SIZE;
 	for (i = 0; i < nr_pages; i++) {
 		struct page *page = rdata->pages[i];
-		size_t n = PAGE_SIZE;
+		unsigned int to_read = rdata->pagesz;
+		size_t n;
+
+		if (i == 0)
+			to_read -= page_offset;
+		else
+			page_offset = 0;
+
+		n = to_read;
 
-		if (len >= PAGE_SIZE) {
-			len -= PAGE_SIZE;
+		if (len >= to_read) {
+			len -= to_read;
 		} else if (len > 0) {
 			/* enough for partial page, fill and zero the rest */
-			zero_user(page, len, PAGE_SIZE - len);
+			zero_user(page, len + page_offset, to_read - len);
 			n = rdata->tailsz = len;
 			len = 0;
 		} else if (page->index > eof_index) {
@@ -3605,13 +3641,15 @@ readpages_fill_pages(struct TCP_Server_Info *server,
 		}
 
 		if (iter)
-			result = copy_page_from_iter(page, 0, n, iter);
+			result = copy_page_from_iter(
+					page, page_offset, n, iter);
 #ifdef CONFIG_CIFS_SMB_DIRECT
 		else if (rdata->mr)
 			result = n;
 #endif
 		else
-			result = cifs_read_page_from_socket(server, page, n);
+			result = cifs_read_page_from_socket(
+					server, page, page_offset, n);
 		if (result < 0)
 			break;
 
@@ -3790,6 +3828,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 		rdata->bytes = bytes;
 		rdata->pid = pid;
 		rdata->pagesz = PAGE_SIZE;
+		rdata->tailsz = PAGE_SIZE;
 		rdata->read_into_pages = cifs_readpages_read_into_pages;
 		rdata->copy_into_pages = cifs_readpages_copy_into_pages;
 		rdata->credits = credits;
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 3c371f7..745fd7f 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -746,7 +746,8 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
 	cifs_dbg(FYI, "Getting info on %s\n", full_path);
 
 	if ((data == NULL) && (*inode != NULL)) {
-		if (CIFS_CACHE_READ(CIFS_I(*inode))) {
+		if (CIFS_CACHE_READ(CIFS_I(*inode)) &&
+		    CIFS_I(*inode)->time != 0) {
 			cifs_dbg(FYI, "No need to revalidate cached inode sizes\n");
 			goto cgii_exit;
 		}
@@ -1857,15 +1858,15 @@ cifs_inode_needs_reval(struct inode *inode)
 	struct cifsInodeInfo *cifs_i = CIFS_I(inode);
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 
+	if (cifs_i->time == 0)
+		return true;
+
 	if (CIFS_CACHE_READ(cifs_i))
 		return false;
 
 	if (!lookupCacheEnabled)
 		return true;
 
-	if (cifs_i->time == 0)
-		return true;
-
 	if (!cifs_sb->actimeo)
 		return true;
 
@@ -2104,10 +2105,14 @@ static int cifs_truncate_page(struct address_space *mapping, loff_t from)
 
 static void cifs_setsize(struct inode *inode, loff_t offset)
 {
+	struct cifsInodeInfo *cifs_i = CIFS_I(inode);
+
 	spin_lock(&inode->i_lock);
 	i_size_write(inode, offset);
 	spin_unlock(&inode->i_lock);
 
+	/* Cached inode must be refreshed on truncate */
+	cifs_i->time = 0;
 	truncate_pagecache(inode, offset);
 }
 
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 460084a..aba3fc3 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -117,6 +117,8 @@ tconInfoAlloc(void)
 		INIT_LIST_HEAD(&ret_buf->openFileList);
 		INIT_LIST_HEAD(&ret_buf->tcon_list);
 		spin_lock_init(&ret_buf->open_file_lock);
+		mutex_init(&ret_buf->prfid_mutex);
+		ret_buf->prfid = kzalloc(sizeof(struct cifs_fid), GFP_KERNEL);
 #ifdef CONFIG_CIFS_STATS
 		spin_lock_init(&ret_buf->stat_lock);
 #endif
@@ -134,6 +136,7 @@ tconInfoFree(struct cifs_tcon *buf_to_free)
 	atomic_dec(&tconInfoAllocCount);
 	kfree(buf_to_free->nativeFileSystem);
 	kzfree(buf_to_free->password);
+	kfree(buf_to_free->prfid);
 	kfree(buf_to_free);
 }
 
@@ -145,7 +148,7 @@ cifs_buf_get(void)
 	 * SMB2 header is bigger than CIFS one - no problems to clean some
 	 * more bytes for CIFS.
 	 */
-	size_t buf_size = sizeof(struct smb2_hdr);
+	size_t buf_size = sizeof(struct smb2_sync_hdr);
 
 	/*
 	 * We could use negotiated size instead of max_msgsize -
@@ -339,7 +342,7 @@ checkSMB(char *buf, unsigned int total_read, struct TCP_Server_Info *server)
 	/* otherwise, there is enough to get to the BCC */
 	if (check_smb_hdr(smb))
 		return -EIO;
-	clc_len = smbCalcSize(smb);
+	clc_len = smbCalcSize(smb, server);
 
 	if (4 + rfclen != total_read) {
 		cifs_dbg(VFS, "Length read does not match RFC1001 length %d\n",
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index cc88f4f..d7ad0df 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -903,7 +903,7 @@ map_smb_to_linux_error(char *buf, bool logErr)
  * portion, the number of word parameters and the data portion of the message
  */
 unsigned int
-smbCalcSize(void *buf)
+smbCalcSize(void *buf, struct TCP_Server_Info *server)
 {
 	struct smb_hdr *ptr = (struct smb_hdr *)buf;
 	return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) +
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index a27fc87..eeab81c 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -650,7 +650,8 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos,
 		char *cur_ent;
 		char *end_of_smb = cfile->srch_inf.ntwrk_buf_start +
 			server->ops->calc_smb_size(
-					cfile->srch_inf.ntwrk_buf_start);
+					cfile->srch_inf.ntwrk_buf_start,
+					server);
 
 		cur_ent = cfile->srch_inf.srch_entries_start;
 		first_entry_in_buffer = cfile->srch_inf.index_of_last_entry
@@ -831,7 +832,8 @@ int cifs_readdir(struct file *file, struct dir_context *ctx)
 	cifs_dbg(FYI, "loop through %d times filling dir for net buf %p\n",
 		 num_to_fill, cifsFile->srch_inf.ntwrk_buf_start);
 	max_len = tcon->ses->server->ops->calc_smb_size(
-			cifsFile->srch_inf.ntwrk_buf_start);
+			cifsFile->srch_inf.ntwrk_buf_start,
+			tcon->ses->server);
 	end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len;
 
 	tmp_buf = kmalloc(UNICODE_NAME_MAX, GFP_KERNEL);
diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h
index 401a5d8..0ffa180 100644
--- a/fs/cifs/smb2glob.h
+++ b/fs/cifs/smb2glob.h
@@ -61,9 +61,4 @@
 /* Maximum buffer size value we can send with 1 credit */
 #define SMB2_MAX_BUFFER_SIZE 65536
 
-static inline struct smb2_sync_hdr *get_sync_hdr(void *buf)
-{
-	return &(((struct smb2_hdr *)buf)->sync_hdr);
-}
-
 #endif	/* _SMB2_GLOB_H */
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index 1238cd3..a6e786e 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -44,26 +44,38 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon,
 		   __u32 create_options, void *data, int command)
 {
 	int rc, tmprc = 0;
-	__le16 *utf16_path;
+	__le16 *utf16_path = NULL;
 	__u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
 	struct cifs_open_parms oparms;
 	struct cifs_fid fid;
+	bool use_cached_root_handle = false;
 
-	utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb);
-	if (!utf16_path)
-		return -ENOMEM;
+	if ((strcmp(full_path, "") == 0) && (create_options == 0) &&
+	    (desired_access == FILE_READ_ATTRIBUTES) &&
+	    (create_disposition == FILE_OPEN) &&
+	    (tcon->nohandlecache == false)) {
+		rc = open_shroot(xid, tcon, &fid);
+		if (rc == 0)
+			use_cached_root_handle = true;
+	}
+
+	if (use_cached_root_handle == false) {
+		utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb);
+		if (!utf16_path)
+			return -ENOMEM;
 
-	oparms.tcon = tcon;
-	oparms.desired_access = desired_access;
-	oparms.disposition = create_disposition;
-	oparms.create_options = create_options;
-	oparms.fid = &fid;
-	oparms.reconnect = false;
+		oparms.tcon = tcon;
+		oparms.desired_access = desired_access;
+		oparms.disposition = create_disposition;
+		oparms.create_options = create_options;
+		oparms.fid = &fid;
+		oparms.reconnect = false;
 
-	rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL);
-	if (rc) {
-		kfree(utf16_path);
-		return rc;
+		rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL);
+		if (rc) {
+			kfree(utf16_path);
+			return rc;
+		}
 	}
 
 	switch (command) {
@@ -107,7 +119,8 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon,
 		break;
 	}
 
-	rc = SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
+	if (use_cached_root_handle == false)
+		rc = SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
 	if (tmprc)
 		rc = tmprc;
 	kfree(utf16_path);
diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c
index 3bfc9c9..20a2d30 100644
--- a/fs/cifs/smb2maperror.c
+++ b/fs/cifs/smb2maperror.c
@@ -27,6 +27,7 @@
 #include "smb2proto.h"
 #include "smb2status.h"
 #include "smb2glob.h"
+#include "trace.h"
 
 struct status_to_posix_error {
 	__le32 smb2_status;
@@ -2450,13 +2451,16 @@ smb2_print_status(__le32 status)
 int
 map_smb2_to_linux_error(char *buf, bool log_err)
 {
-	struct smb2_sync_hdr *shdr = get_sync_hdr(buf);
+	struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
 	unsigned int i;
 	int rc = -EIO;
 	__le32 smb2err = shdr->Status;
 
-	if (smb2err == 0)
+	if (smb2err == 0) {
+		trace_smb3_cmd_done(shdr->TreeId, shdr->SessionId,
+			le16_to_cpu(shdr->Command), le64_to_cpu(shdr->MessageId));
 		return 0;
+	}
 
 	/* mask facility */
 	if (log_err && (smb2err != STATUS_MORE_PROCESSING_REQUIRED) &&
@@ -2478,5 +2482,8 @@ map_smb2_to_linux_error(char *buf, bool log_err)
 	cifs_dbg(FYI, "Mapping SMB2 status code 0x%08x to POSIX err %d\n",
 		 __le32_to_cpu(smb2err), rc);
 
+	trace_smb3_cmd_err(shdr->TreeId, shdr->SessionId,
+			le16_to_cpu(shdr->Command),
+			le64_to_cpu(shdr->MessageId), le32_to_cpu(smb2err), rc);
 	return rc;
 }
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index 68ea849..cb5728e 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -94,8 +94,8 @@ static const __le16 smb2_rsp_struct_sizes[NUMBER_OF_SMB2_COMMANDS] = {
 };
 
 #ifdef CONFIG_CIFS_SMB311
-static __u32 get_neg_ctxt_len(struct smb2_hdr *hdr, __u32 len, __u32 non_ctxlen,
-				size_t hdr_preamble_size)
+static __u32 get_neg_ctxt_len(struct smb2_sync_hdr *hdr, __u32 len,
+			      __u32 non_ctxlen)
 {
 	__u16 neg_count;
 	__u32 nc_offset, size_of_pad_before_neg_ctxts;
@@ -109,12 +109,11 @@ static __u32 get_neg_ctxt_len(struct smb2_hdr *hdr, __u32 len, __u32 non_ctxlen,
 
 	/* Make sure that negotiate contexts start after gss security blob */
 	nc_offset = le32_to_cpu(pneg_rsp->NegotiateContextOffset);
-	if (nc_offset < non_ctxlen - hdr_preamble_size /* RFC1001 len */) {
+	if (nc_offset < non_ctxlen) {
 		printk_once(KERN_WARNING "invalid negotiate context offset\n");
 		return 0;
 	}
-	size_of_pad_before_neg_ctxts = nc_offset -
-					(non_ctxlen - hdr_preamble_size);
+	size_of_pad_before_neg_ctxts = nc_offset - non_ctxlen;
 
 	/* Verify that at least minimal negotiate contexts fit within frame */
 	if (len < nc_offset + (neg_count * sizeof(struct smb2_neg_context))) {
@@ -131,25 +130,20 @@ static __u32 get_neg_ctxt_len(struct smb2_hdr *hdr, __u32 len, __u32 non_ctxlen,
 #endif /* CIFS_SMB311 */
 
 int
-smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *srvr)
+smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr)
 {
-	struct smb2_pdu *pdu = (struct smb2_pdu *)buf;
-	struct smb2_hdr *hdr = &pdu->hdr;
-	struct smb2_sync_hdr *shdr = get_sync_hdr(buf);
+	struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
+	struct smb2_sync_pdu *pdu = (struct smb2_sync_pdu *)shdr;
 	__u64 mid;
-	__u32 len = get_rfc1002_length(buf);
 	__u32 clc_len;  /* calculated length */
 	int command;
-
-	/* BB disable following printk later */
-	cifs_dbg(FYI, "%s length: 0x%x, smb_buf_length: 0x%x\n",
-		 __func__, length, len);
+	int pdu_size = sizeof(struct smb2_sync_pdu);
+	int hdr_size = sizeof(struct smb2_sync_hdr);
 
 	/*
 	 * Add function to do table lookup of StructureSize by command
 	 * ie Validate the wct via smb2_struct_sizes table above
 	 */
-
 	if (shdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) {
 		struct smb2_transform_hdr *thdr =
 			(struct smb2_transform_hdr *)buf;
@@ -173,8 +167,8 @@ smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *srvr)
 	}
 
 	mid = le64_to_cpu(shdr->MessageId);
-	if (length < sizeof(struct smb2_pdu)) {
-		if ((length >= sizeof(struct smb2_hdr))
+	if (len < pdu_size) {
+		if ((len >= hdr_size)
 		    && (shdr->Status != 0)) {
 			pdu->StructureSize2 = 0;
 			/*
@@ -187,8 +181,7 @@ smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *srvr)
 		}
 		return 1;
 	}
-	if (len > CIFSMaxBufSize + MAX_SMB2_HDR_SIZE -
-	    srvr->vals->header_preamble_size) {
+	if (len > CIFSMaxBufSize + MAX_SMB2_HDR_SIZE) {
 		cifs_dbg(VFS, "SMB length greater than maximum, mid=%llu\n",
 			 mid);
 		return 1;
@@ -227,44 +220,38 @@ smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *srvr)
 		}
 	}
 
-	if (srvr->vals->header_preamble_size + len != length) {
-		cifs_dbg(VFS, "Total length %u RFC1002 length %zu mismatch mid %llu\n",
-			 length, srvr->vals->header_preamble_size + len, mid);
-		return 1;
-	}
-
-	clc_len = smb2_calc_size(hdr);
+	clc_len = smb2_calc_size(buf, srvr);
 
 #ifdef CONFIG_CIFS_SMB311
 	if (shdr->Command == SMB2_NEGOTIATE)
-		clc_len += get_neg_ctxt_len(hdr, len, clc_len,
-					srvr->vals->header_preamble_size);
+		clc_len += get_neg_ctxt_len(shdr, len, clc_len);
 #endif /* SMB311 */
-	if (srvr->vals->header_preamble_size + len != clc_len) {
-		cifs_dbg(FYI, "Calculated size %u length %zu mismatch mid %llu\n",
-			 clc_len, srvr->vals->header_preamble_size + len, mid);
+	if (len != clc_len) {
+		cifs_dbg(FYI, "Calculated size %u length %u mismatch mid %llu\n",
+			 clc_len, len, mid);
 		/* create failed on symlink */
 		if (command == SMB2_CREATE_HE &&
 		    shdr->Status == STATUS_STOPPED_ON_SYMLINK)
 			return 0;
 		/* Windows 7 server returns 24 bytes more */
-		if (clc_len + 24 - srvr->vals->header_preamble_size == len && command == SMB2_OPLOCK_BREAK_HE)
+		if (clc_len + 24 == len && command == SMB2_OPLOCK_BREAK_HE)
 			return 0;
 		/* server can return one byte more due to implied bcc[0] */
-		if (clc_len == srvr->vals->header_preamble_size + len + 1)
+		if (clc_len == len + 1)
 			return 0;
 
 		/*
 		 * MacOS server pads after SMB2.1 write response with 3 bytes
 		 * of junk. Other servers match RFC1001 len to actual
 		 * SMB2/SMB3 frame length (header + smb2 response specific data)
+		 * Some windows servers do too when compounding is used.
 		 * Log the server error (once), but allow it and continue
 		 * since the frame is parseable.
 		 */
-		if (clc_len < srvr->vals->header_preamble_size /* RFC1001 header size */ + len) {
+		if (clc_len < len) {
 			printk_once(KERN_WARNING
-				"SMB2 server sent bad RFC1001 len %d not %zu\n",
-				len, clc_len - srvr->vals->header_preamble_size);
+				"SMB2 server sent bad RFC1001 len %d not %d\n",
+				len, clc_len);
 			return 0;
 		}
 
@@ -305,15 +292,14 @@ static const bool has_smb2_data_area[NUMBER_OF_SMB2_COMMANDS] = {
  * area and the offset to it (from the beginning of the smb are also returned.
  */
 char *
-smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr)
+smb2_get_data_area_len(int *off, int *len, struct smb2_sync_hdr *shdr)
 {
-	struct smb2_sync_hdr *shdr = get_sync_hdr(hdr);
 	*off = 0;
 	*len = 0;
 
 	/* error responses do not have data area */
 	if (shdr->Status && shdr->Status != STATUS_MORE_PROCESSING_REQUIRED &&
-	    (((struct smb2_err_rsp *)hdr)->StructureSize) ==
+	    (((struct smb2_err_rsp *)shdr)->StructureSize) ==
 						SMB2_ERROR_STRUCTURE_SIZE2)
 		return NULL;
 
@@ -325,42 +311,44 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr)
 	switch (shdr->Command) {
 	case SMB2_NEGOTIATE:
 		*off = le16_to_cpu(
-		    ((struct smb2_negotiate_rsp *)hdr)->SecurityBufferOffset);
+		  ((struct smb2_negotiate_rsp *)shdr)->SecurityBufferOffset);
 		*len = le16_to_cpu(
-		    ((struct smb2_negotiate_rsp *)hdr)->SecurityBufferLength);
+		  ((struct smb2_negotiate_rsp *)shdr)->SecurityBufferLength);
 		break;
 	case SMB2_SESSION_SETUP:
 		*off = le16_to_cpu(
-		    ((struct smb2_sess_setup_rsp *)hdr)->SecurityBufferOffset);
+		  ((struct smb2_sess_setup_rsp *)shdr)->SecurityBufferOffset);
 		*len = le16_to_cpu(
-		    ((struct smb2_sess_setup_rsp *)hdr)->SecurityBufferLength);
+		  ((struct smb2_sess_setup_rsp *)shdr)->SecurityBufferLength);
 		break;
 	case SMB2_CREATE:
 		*off = le32_to_cpu(
-		    ((struct smb2_create_rsp *)hdr)->CreateContextsOffset);
+		    ((struct smb2_create_rsp *)shdr)->CreateContextsOffset);
 		*len = le32_to_cpu(
-		    ((struct smb2_create_rsp *)hdr)->CreateContextsLength);
+		    ((struct smb2_create_rsp *)shdr)->CreateContextsLength);
 		break;
 	case SMB2_QUERY_INFO:
 		*off = le16_to_cpu(
-		    ((struct smb2_query_info_rsp *)hdr)->OutputBufferOffset);
+		    ((struct smb2_query_info_rsp *)shdr)->OutputBufferOffset);
 		*len = le32_to_cpu(
-		    ((struct smb2_query_info_rsp *)hdr)->OutputBufferLength);
+		    ((struct smb2_query_info_rsp *)shdr)->OutputBufferLength);
 		break;
 	case SMB2_READ:
-		*off = ((struct smb2_read_rsp *)hdr)->DataOffset;
-		*len = le32_to_cpu(((struct smb2_read_rsp *)hdr)->DataLength);
+		/* TODO: is this a bug ? */
+		*off = ((struct smb2_read_rsp *)shdr)->DataOffset;
+		*len = le32_to_cpu(((struct smb2_read_rsp *)shdr)->DataLength);
 		break;
 	case SMB2_QUERY_DIRECTORY:
 		*off = le16_to_cpu(
-		  ((struct smb2_query_directory_rsp *)hdr)->OutputBufferOffset);
+		  ((struct smb2_query_directory_rsp *)shdr)->OutputBufferOffset);
 		*len = le32_to_cpu(
-		  ((struct smb2_query_directory_rsp *)hdr)->OutputBufferLength);
+		  ((struct smb2_query_directory_rsp *)shdr)->OutputBufferLength);
 		break;
 	case SMB2_IOCTL:
 		*off = le32_to_cpu(
-		  ((struct smb2_ioctl_rsp *)hdr)->OutputOffset);
-		*len = le32_to_cpu(((struct smb2_ioctl_rsp *)hdr)->OutputCount);
+		  ((struct smb2_ioctl_rsp *)shdr)->OutputOffset);
+		*len = le32_to_cpu(
+		  ((struct smb2_ioctl_rsp *)shdr)->OutputCount);
 		break;
 	case SMB2_CHANGE_NOTIFY:
 	default:
@@ -403,15 +391,14 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr)
  * portion, the number of word parameters and the data portion of the message.
  */
 unsigned int
-smb2_calc_size(void *buf)
+smb2_calc_size(void *buf, struct TCP_Server_Info *srvr)
 {
-	struct smb2_pdu *pdu = (struct smb2_pdu *)buf;
-	struct smb2_hdr *hdr = &pdu->hdr;
-	struct smb2_sync_hdr *shdr = get_sync_hdr(hdr);
+	struct smb2_sync_pdu *pdu = (struct smb2_sync_pdu *)buf;
+	struct smb2_sync_hdr *shdr = &pdu->sync_hdr;
 	int offset; /* the offset from the beginning of SMB to data area */
 	int data_length; /* the length of the variable length data area */
 	/* Structure Size has already been checked to make sure it is 64 */
-	int len = 4 + le16_to_cpu(shdr->StructureSize);
+	int len = le16_to_cpu(shdr->StructureSize);
 
 	/*
 	 * StructureSize2, ie length of fixed parameter area has already
@@ -422,7 +409,7 @@ smb2_calc_size(void *buf)
 	if (has_smb2_data_area[le16_to_cpu(shdr->Command)] == false)
 		goto calc_size_exit;
 
-	smb2_get_data_area_len(&offset, &data_length, hdr);
+	smb2_get_data_area_len(&offset, &data_length, shdr);
 	cifs_dbg(FYI, "SMB2 data length %d offset %d\n", data_length, offset);
 
 	if (data_length > 0) {
@@ -430,15 +417,14 @@ smb2_calc_size(void *buf)
 		 * Check to make sure that data area begins after fixed area,
 		 * Note that last byte of the fixed area is part of data area
 		 * for some commands, typically those with odd StructureSize,
-		 * so we must add one to the calculation (and 4 to account for
-		 * the size of the RFC1001 hdr.
+		 * so we must add one to the calculation.
 		 */
-		if (offset + 4 + 1 < len) {
+		if (offset + 1 < len) {
 			cifs_dbg(VFS, "data area offset %d overlaps SMB2 header %d\n",
-				 offset + 4 + 1, len);
+				 offset + 1, len);
 			data_length = 0;
 		} else {
-			len = 4 + offset + data_length;
+			len = offset + data_length;
 		}
 	}
 calc_size_exit:
@@ -465,8 +451,14 @@ cifs_convert_path_to_utf16(const char *from, struct cifs_sb_info *cifs_sb)
 	/* Windows doesn't allow paths beginning with \ */
 	if (from[0] == '\\')
 		start_of_path = from + 1;
+#ifdef CONFIG_CIFS_SMB311
+	/* SMB311 POSIX extensions paths do not include leading slash */
+	else if (cifs_sb_master_tcon(cifs_sb)->posix_extensions)
+		start_of_path = from + 1;
+#endif /* 311 */
 	else
 		start_of_path = from;
+
 	to = cifs_strndup_to_utf16(start_of_path, PATH_MAX, &len,
 				   cifs_sb->local_nls, map_type);
 	return to;
@@ -621,7 +613,7 @@ smb2_is_valid_lease_break(char *buffer)
 bool
 smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
 {
-	struct smb2_oplock_break_rsp *rsp = (struct smb2_oplock_break_rsp *)buffer;
+	struct smb2_oplock_break *rsp = (struct smb2_oplock_break *)buffer;
 	struct list_head *tmp, *tmp1, *tmp2;
 	struct cifs_ses *ses;
 	struct cifs_tcon *tcon;
@@ -630,7 +622,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
 
 	cifs_dbg(FYI, "Checking for oplock break\n");
 
-	if (rsp->hdr.sync_hdr.Command != SMB2_OPLOCK_BREAK)
+	if (rsp->sync_hdr.Command != SMB2_OPLOCK_BREAK)
 		return false;
 
 	if (rsp->StructureSize !=
@@ -721,7 +713,7 @@ smb2_cancelled_close_fid(struct work_struct *work)
 int
 smb2_handle_cancelled_mid(char *buffer, struct TCP_Server_Info *server)
 {
-	struct smb2_sync_hdr *sync_hdr = get_sync_hdr(buffer);
+	struct smb2_sync_hdr *sync_hdr = (struct smb2_sync_hdr *)buffer;
 	struct smb2_create_rsp *rsp = (struct smb2_create_rsp *)buffer;
 	struct cifs_tcon *tcon;
 	struct close_cancelled_open *cancelled;
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 9c6d95f..950d0ab 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -123,7 +123,7 @@ smb2_get_credits_field(struct TCP_Server_Info *server, const int optype)
 static unsigned int
 smb2_get_credits(struct mid_q_entry *mid)
 {
-	struct smb2_sync_hdr *shdr = get_sync_hdr(mid->resp_buf);
+	struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)mid->resp_buf;
 
 	return le16_to_cpu(shdr->CreditRequest);
 }
@@ -190,7 +190,7 @@ static struct mid_q_entry *
 smb2_find_mid(struct TCP_Server_Info *server, char *buf)
 {
 	struct mid_q_entry *mid;
-	struct smb2_sync_hdr *shdr = get_sync_hdr(buf);
+	struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
 	__u64 wire_mid = le64_to_cpu(shdr->MessageId);
 
 	if (shdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) {
@@ -212,15 +212,16 @@ smb2_find_mid(struct TCP_Server_Info *server, char *buf)
 }
 
 static void
-smb2_dump_detail(void *buf)
+smb2_dump_detail(void *buf, struct TCP_Server_Info *server)
 {
 #ifdef CONFIG_CIFS_DEBUG2
-	struct smb2_sync_hdr *shdr = get_sync_hdr(buf);
+	struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
 
 	cifs_dbg(VFS, "Cmd: %d Err: 0x%x Flags: 0x%x Mid: %llu Pid: %d\n",
 		 shdr->Command, shdr->Status, shdr->Flags, shdr->MessageId,
 		 shdr->ProcessId);
-	cifs_dbg(VFS, "smb buf %p len %u\n", buf, smb2_calc_size(buf));
+	cifs_dbg(VFS, "smb buf %p len %u\n", buf,
+		 server->ops->calc_smb_size(buf, server));
 #endif
 }
 
@@ -322,6 +323,40 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon)
 }
 #endif /* STATS2 */
 
+/*
+ * Open the directory at the root of a share
+ */
+int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid)
+{
+	struct cifs_open_parms oparams;
+	int rc;
+	__le16 srch_path = 0; /* Null - since an open of top of share */
+	u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
+
+	mutex_lock(&tcon->prfid_mutex);
+	if (tcon->valid_root_fid) {
+		cifs_dbg(FYI, "found a cached root file handle\n");
+		memcpy(pfid, tcon->prfid, sizeof(struct cifs_fid));
+		mutex_unlock(&tcon->prfid_mutex);
+		return 0;
+	}
+
+	oparams.tcon = tcon;
+	oparams.create_options = 0;
+	oparams.desired_access = FILE_READ_ATTRIBUTES;
+	oparams.disposition = FILE_OPEN;
+	oparams.fid = pfid;
+	oparams.reconnect = false;
+
+	rc = SMB2_open(xid, &oparams, &srch_path, &oplock, NULL, NULL);
+	if (rc == 0) {
+		memcpy(tcon->prfid, pfid, sizeof(struct cifs_fid));
+		tcon->valid_root_fid = true;
+	}
+	mutex_unlock(&tcon->prfid_mutex);
+	return rc;
+}
+
 static void
 smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon)
 {
@@ -330,6 +365,7 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon)
 	u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
 	struct cifs_open_parms oparms;
 	struct cifs_fid fid;
+	bool no_cached_open = tcon->nohandlecache;
 
 	oparms.tcon = tcon;
 	oparms.desired_access = FILE_READ_ATTRIBUTES;
@@ -338,7 +374,11 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon)
 	oparms.fid = &fid;
 	oparms.reconnect = false;
 
-	rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL);
+	if (no_cached_open)
+		rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL);
+	else
+		rc = open_shroot(xid, tcon, &fid);
+
 	if (rc)
 		return;
 
@@ -352,7 +392,8 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon)
 			FS_DEVICE_INFORMATION);
 	SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid,
 			FS_SECTOR_SIZE_INFORMATION); /* SMB3 specific */
-	SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
+	if (no_cached_open)
+		SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
 	return;
 }
 
@@ -394,6 +435,9 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon,
 	struct cifs_open_parms oparms;
 	struct cifs_fid fid;
 
+	if ((*full_path == 0) && tcon->valid_root_fid)
+		return 0;
+
 	utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb);
 	if (!utf16_path)
 		return -ENOMEM;
@@ -704,9 +748,11 @@ smb2_dump_share_caps(struct seq_file *m, struct cifs_tcon *tcon)
 		seq_puts(m, " TRIM-support,");
 
 	seq_printf(m, "\tShare Flags: 0x%x", tcon->share_flags);
+	seq_printf(m, "\n\ttid: 0x%x", tcon->tid);
 	if (tcon->perf_sector_size)
 		seq_printf(m, "\tOptimal sector size: 0x%x",
 			   tcon->perf_sector_size);
+	seq_printf(m, "\tMaximal Access: 0x%x", tcon->maximal_access);
 }
 
 static void
@@ -1257,7 +1303,7 @@ smb2_close_dir(const unsigned int xid, struct cifs_tcon *tcon,
 static bool
 smb2_is_status_pending(char *buf, struct TCP_Server_Info *server, int length)
 {
-	struct smb2_sync_hdr *shdr = get_sync_hdr(buf);
+	struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
 
 	if (shdr->Status != STATUS_PENDING)
 		return false;
@@ -1275,12 +1321,13 @@ smb2_is_status_pending(char *buf, struct TCP_Server_Info *server, int length)
 static bool
 smb2_is_session_expired(char *buf)
 {
-	struct smb2_sync_hdr *shdr = get_sync_hdr(buf);
+	struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
 
-	if (shdr->Status != STATUS_NETWORK_SESSION_EXPIRED)
+	if (shdr->Status != STATUS_NETWORK_SESSION_EXPIRED &&
+	    shdr->Status != STATUS_USER_SESSION_DELETED)
 		return false;
 
-	cifs_dbg(FYI, "Session expired\n");
+	cifs_dbg(FYI, "Session expired or deleted\n");
 	return true;
 }
 
@@ -1474,8 +1521,6 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
 	unsigned int sub_offset;
 	unsigned int print_len;
 	unsigned int print_offset;
-	struct cifs_ses *ses = tcon->ses;
-	struct TCP_Server_Info *server = ses->server;
 
 	cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path);
 
@@ -1499,7 +1544,7 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
 
 	err_buf = err_iov.iov_base;
 	if (le32_to_cpu(err_buf->ByteCount) < sizeof(struct smb2_symlink_err_rsp) ||
-	    err_iov.iov_len + server->vals->header_preamble_size < SMB2_SYMLINK_STRUCT_SIZE) {
+	    err_iov.iov_len < SMB2_SYMLINK_STRUCT_SIZE) {
 		kfree(utf16_path);
 		return -ENOENT;
 	}
@@ -1512,14 +1557,13 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
 	print_len = le16_to_cpu(symlink->PrintNameLength);
 	print_offset = le16_to_cpu(symlink->PrintNameOffset);
 
-	if (err_iov.iov_len + server->vals->header_preamble_size <
-			SMB2_SYMLINK_STRUCT_SIZE + sub_offset + sub_len) {
+	if (err_iov.iov_len < SMB2_SYMLINK_STRUCT_SIZE + sub_offset + sub_len) {
 		kfree(utf16_path);
 		return -ENOENT;
 	}
 
-	if (err_iov.iov_len + server->vals->header_preamble_size <
-			SMB2_SYMLINK_STRUCT_SIZE + print_offset + print_len) {
+	if (err_iov.iov_len <
+	    SMB2_SYMLINK_STRUCT_SIZE + print_offset + print_len) {
 		kfree(utf16_path);
 		return -ENOENT;
 	}
@@ -1593,8 +1637,11 @@ get_smb2_acl_by_path(struct cifs_sb_info *cifs_sb,
 		oparms.create_options = 0;
 
 	utf16_path = cifs_convert_path_to_utf16(path, cifs_sb);
-	if (!utf16_path)
-		return ERR_PTR(-ENOMEM);
+	if (!utf16_path) {
+		rc = -ENOMEM;
+		free_xid(xid);
+		return ERR_PTR(rc);
+	}
 
 	oparms.tcon = tcon;
 	oparms.desired_access = READ_CONTROL;
@@ -1652,8 +1699,11 @@ set_smb2_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
 		access_flags = WRITE_DAC;
 
 	utf16_path = cifs_convert_path_to_utf16(path, cifs_sb);
-	if (!utf16_path)
-		return -ENOMEM;
+	if (!utf16_path) {
+		rc = -ENOMEM;
+		free_xid(xid);
+		return rc;
+	}
 
 	oparms.tcon = tcon;
 	oparms.desired_access = access_flags;
@@ -1713,15 +1763,21 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
 
 	/* if file not oplocked can't be sure whether asking to extend size */
 	if (!CIFS_CACHE_READ(cifsi))
-		if (keep_size == false)
-			return -EOPNOTSUPP;
+		if (keep_size == false) {
+			rc = -EOPNOTSUPP;
+			free_xid(xid);
+			return rc;
+		}
 
 	/*
 	 * Must check if file sparse since fallocate -z (zero range) assumes
 	 * non-sparse allocation
 	 */
-	if (!(cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE))
-		return -EOPNOTSUPP;
+	if (!(cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE)) {
+		rc = -EOPNOTSUPP;
+		free_xid(xid);
+		return rc;
+	}
 
 	/*
 	 * need to make sure we are not asked to extend the file since the SMB3
@@ -1730,8 +1786,11 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
 	 * which for a non sparse file would zero the newly extended range
 	 */
 	if (keep_size == false)
-		if (i_size_read(inode) < offset + len)
-			return -EOPNOTSUPP;
+		if (i_size_read(inode) < offset + len) {
+			rc = -EOPNOTSUPP;
+			free_xid(xid);
+			return rc;
+		}
 
 	cifs_dbg(FYI, "offset %lld len %lld", offset, len);
 
@@ -1764,8 +1823,11 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon,
 
 	/* Need to make file sparse, if not already, before freeing range. */
 	/* Consider adding equivalent for compressed since it could also work */
-	if (!smb2_set_sparse(xid, tcon, cfile, inode, set_sparse))
-		return -EOPNOTSUPP;
+	if (!smb2_set_sparse(xid, tcon, cfile, inode, set_sparse)) {
+		rc = -EOPNOTSUPP;
+		free_xid(xid);
+		return rc;
+	}
 
 	cifs_dbg(FYI, "offset %lld len %lld", offset, len);
 
@@ -1796,8 +1858,10 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon,
 
 	/* if file not oplocked can't be sure whether asking to extend size */
 	if (!CIFS_CACHE_READ(cifsi))
-		if (keep_size == false)
-			return -EOPNOTSUPP;
+		if (keep_size == false) {
+			free_xid(xid);
+			return rc;
+		}
 
 	/*
 	 * Files are non-sparse by default so falloc may be a no-op
@@ -1806,14 +1870,16 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon,
 	 */
 	if ((cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE) == 0) {
 		if (keep_size == true)
-			return 0;
+			rc = 0;
 		/* check if extending file */
 		else if (i_size_read(inode) >= off + len)
 			/* not extending file and already not sparse */
-			return 0;
+			rc = 0;
 		/* BB: in future add else clause to extend file */
 		else
-			return -EOPNOTSUPP;
+			rc = -EOPNOTSUPP;
+		free_xid(xid);
+		return rc;
 	}
 
 	if ((keep_size == true) || (i_size_read(inode) >= off + len)) {
@@ -1825,8 +1891,11 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon,
 		 * ie potentially making a few extra pages at the beginning
 		 * or end of the file non-sparse via set_sparse is harmless.
 		 */
-		if ((off > 8192) || (off + len + 8192 < i_size_read(inode)))
-			return -EOPNOTSUPP;
+		if ((off > 8192) || (off + len + 8192 < i_size_read(inode))) {
+			rc = -EOPNOTSUPP;
+			free_xid(xid);
+			return rc;
+		}
 
 		rc = smb2_set_sparse(xid, tcon, cfile, inode, false);
 	}
@@ -2035,7 +2104,7 @@ smb3_create_lease_buf(u8 *lease_key, u8 oplock)
 }
 
 static __u8
-smb2_parse_lease_buf(void *buf, unsigned int *epoch)
+smb2_parse_lease_buf(void *buf, unsigned int *epoch, char *lease_key)
 {
 	struct create_lease *lc = (struct create_lease *)buf;
 
@@ -2046,13 +2115,16 @@ smb2_parse_lease_buf(void *buf, unsigned int *epoch)
 }
 
 static __u8
-smb3_parse_lease_buf(void *buf, unsigned int *epoch)
+smb3_parse_lease_buf(void *buf, unsigned int *epoch, char *lease_key)
 {
 	struct create_lease_v2 *lc = (struct create_lease_v2 *)buf;
 
 	*epoch = le16_to_cpu(lc->lcontext.Epoch);
 	if (lc->lcontext.LeaseFlags & SMB2_LEASE_FLAG_BREAK_IN_PROGRESS)
 		return SMB2_OPLOCK_LEVEL_NOCHANGE;
+	if (lease_key)
+		memcpy(lease_key, &lc->lcontext.LeaseKeyLow,
+		       SMB2_LEASE_KEY_SIZE);
 	return le32_to_cpu(lc->lcontext.LeaseState);
 }
 
@@ -2070,12 +2142,11 @@ smb2_dir_needs_close(struct cifsFileInfo *cfile)
 }
 
 static void
-fill_transform_hdr(struct TCP_Server_Info *server,
-		   struct smb2_transform_hdr *tr_hdr, struct smb_rqst *old_rq)
+fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, unsigned int orig_len,
+		   struct smb_rqst *old_rq)
 {
 	struct smb2_sync_hdr *shdr =
 			(struct smb2_sync_hdr *)old_rq->rq_iov[1].iov_base;
-	unsigned int orig_len = get_rfc1002_length(old_rq->rq_iov[0].iov_base);
 
 	memset(tr_hdr, 0, sizeof(struct smb2_transform_hdr));
 	tr_hdr->ProtocolId = SMB2_TRANSFORM_PROTO_NUM;
@@ -2083,8 +2154,6 @@ fill_transform_hdr(struct TCP_Server_Info *server,
 	tr_hdr->Flags = cpu_to_le16(0x01);
 	get_random_bytes(&tr_hdr->Nonce, SMB3_AES128CMM_NONCE);
 	memcpy(&tr_hdr->SessionId, &shdr->SessionId, 8);
-	inc_rfc1001_len(tr_hdr, sizeof(struct smb2_transform_hdr) - server->vals->header_preamble_size);
-	inc_rfc1001_len(tr_hdr, orig_len);
 }
 
 /* We can not use the normal sg_set_buf() as we will sometimes pass a
@@ -2096,11 +2165,16 @@ static inline void smb2_sg_set_buf(struct scatterlist *sg, const void *buf,
 	sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf));
 }
 
+/* Assumes:
+ * rqst->rq_iov[0]  is rfc1002 length
+ * rqst->rq_iov[1]  is tranform header
+ * rqst->rq_iov[2+] data to be encrypted/decrypted
+ */
 static struct scatterlist *
 init_sg(struct smb_rqst *rqst, u8 *sign)
 {
-	unsigned int sg_len = rqst->rq_nvec + rqst->rq_npages + 1;
-	unsigned int assoc_data_len = sizeof(struct smb2_transform_hdr) - 24;
+	unsigned int sg_len = rqst->rq_nvec + rqst->rq_npages;
+	unsigned int assoc_data_len = sizeof(struct smb2_transform_hdr) - 20;
 	struct scatterlist *sg;
 	unsigned int i;
 	unsigned int j;
@@ -2110,10 +2184,10 @@ init_sg(struct smb_rqst *rqst, u8 *sign)
 		return NULL;
 
 	sg_init_table(sg, sg_len);
-	smb2_sg_set_buf(&sg[0], rqst->rq_iov[0].iov_base + 24, assoc_data_len);
-	for (i = 1; i < rqst->rq_nvec; i++)
-		smb2_sg_set_buf(&sg[i], rqst->rq_iov[i].iov_base,
-						rqst->rq_iov[i].iov_len);
+	smb2_sg_set_buf(&sg[0], rqst->rq_iov[1].iov_base + 20, assoc_data_len);
+	for (i = 1; i < rqst->rq_nvec - 1; i++)
+		smb2_sg_set_buf(&sg[i], rqst->rq_iov[i+1].iov_base,
+						rqst->rq_iov[i+1].iov_len);
 	for (j = 0; i < sg_len - 1; i++, j++) {
 		unsigned int len = (j < rqst->rq_npages - 1) ? rqst->rq_pagesz
 							: rqst->rq_tailsz;
@@ -2145,9 +2219,10 @@ smb2_get_enc_key(struct TCP_Server_Info *server, __u64 ses_id, int enc, u8 *key)
 }
 /*
  * Encrypt or decrypt @rqst message. @rqst has the following format:
- * iov[0] - transform header (associate data),
- * iov[1-N] and pages - data to encrypt.
- * On success return encrypted data in iov[1-N] and pages, leave iov[0]
+ * iov[0] - rfc1002 length
+ * iov[1] - transform header (associate data),
+ * iov[2-N] and pages - data to encrypt.
+ * On success return encrypted data in iov[2-N] and pages, leave iov[0-1]
  * untouched.
  */
 static int
@@ -2155,7 +2230,7 @@ crypt_message(struct TCP_Server_Info *server, struct smb_rqst *rqst, int enc)
 {
 	struct smb2_transform_hdr *tr_hdr =
 			(struct smb2_transform_hdr *)rqst->rq_iov[0].iov_base;
-	unsigned int assoc_data_len = sizeof(struct smb2_transform_hdr) - 20 - server->vals->header_preamble_size;
+	unsigned int assoc_data_len = sizeof(struct smb2_transform_hdr) - 20;
 	int rc = 0;
 	struct scatterlist *sg;
 	u8 sign[SMB2_SIGNATURE_SIZE] = {};
@@ -2242,6 +2317,10 @@ free_req:
 	return rc;
 }
 
+/*
+ * This is called from smb_send_rqst. At this point we have the rfc1002
+ * header as the first element in the vector.
+ */
 static int
 smb3_init_transform_rq(struct TCP_Server_Info *server, struct smb_rqst *new_rq,
 		       struct smb_rqst *old_rq)
@@ -2250,6 +2329,7 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, struct smb_rqst *new_rq,
 	struct page **pages;
 	struct smb2_transform_hdr *tr_hdr;
 	unsigned int npages = old_rq->rq_npages;
+	unsigned int orig_len = get_rfc1002_length(old_rq->rq_iov[0].iov_base);
 	int i;
 	int rc = -ENOMEM;
 
@@ -2268,24 +2348,34 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, struct smb_rqst *new_rq,
 			goto err_free_pages;
 	}
 
-	iov = kmalloc_array(old_rq->rq_nvec, sizeof(struct kvec), GFP_KERNEL);
+	/* Make space for one extra iov to hold the transform header */
+	iov = kmalloc_array(old_rq->rq_nvec + 1, sizeof(struct kvec),
+			    GFP_KERNEL);
 	if (!iov)
 		goto err_free_pages;
 
 	/* copy all iovs from the old except the 1st one (rfc1002 length) */
-	memcpy(&iov[1], &old_rq->rq_iov[1],
+	memcpy(&iov[2], &old_rq->rq_iov[1],
 				sizeof(struct kvec) * (old_rq->rq_nvec - 1));
+	/* copy the rfc1002 iov */
+	iov[0].iov_base = old_rq->rq_iov[0].iov_base;
+	iov[0].iov_len  = old_rq->rq_iov[0].iov_len;
+
 	new_rq->rq_iov = iov;
-	new_rq->rq_nvec = old_rq->rq_nvec;
+	new_rq->rq_nvec = old_rq->rq_nvec + 1;
 
 	tr_hdr = kmalloc(sizeof(struct smb2_transform_hdr), GFP_KERNEL);
 	if (!tr_hdr)
 		goto err_free_iov;
 
-	/* fill the 1st iov with a transform header */
-	fill_transform_hdr(server, tr_hdr, old_rq);
-	new_rq->rq_iov[0].iov_base = tr_hdr;
-	new_rq->rq_iov[0].iov_len = sizeof(struct smb2_transform_hdr);
+	/* fill the 2nd iov with a transform header */
+	fill_transform_hdr(tr_hdr, orig_len, old_rq);
+	new_rq->rq_iov[1].iov_base = tr_hdr;
+	new_rq->rq_iov[1].iov_len = sizeof(struct smb2_transform_hdr);
+
+	/* Update rfc1002 header */
+	inc_rfc1001_len(new_rq->rq_iov[0].iov_base,
+			sizeof(struct smb2_transform_hdr));
 
 	/* copy pages form the old */
 	for (i = 0; i < npages; i++) {
@@ -2325,7 +2415,7 @@ smb3_free_transform_rq(struct smb_rqst *rqst)
 		put_page(rqst->rq_pages[i]);
 	kfree(rqst->rq_pages);
 	/* free transform header */
-	kfree(rqst->rq_iov[0].iov_base);
+	kfree(rqst->rq_iov[1].iov_base);
 	kfree(rqst->rq_iov);
 }
 
@@ -2342,18 +2432,19 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf,
 		 unsigned int buf_data_size, struct page **pages,
 		 unsigned int npages, unsigned int page_data_size)
 {
-	struct kvec iov[2];
+	struct kvec iov[3];
 	struct smb_rqst rqst = {NULL};
-	struct smb2_hdr *hdr;
 	int rc;
 
-	iov[0].iov_base = buf;
-	iov[0].iov_len = sizeof(struct smb2_transform_hdr);
-	iov[1].iov_base = buf + sizeof(struct smb2_transform_hdr);
-	iov[1].iov_len = buf_data_size;
+	iov[0].iov_base = NULL;
+	iov[0].iov_len = 0;
+	iov[1].iov_base = buf;
+	iov[1].iov_len = sizeof(struct smb2_transform_hdr);
+	iov[2].iov_base = buf + sizeof(struct smb2_transform_hdr);
+	iov[2].iov_len = buf_data_size;
 
 	rqst.rq_iov = iov;
-	rqst.rq_nvec = 2;
+	rqst.rq_nvec = 3;
 	rqst.rq_pages = pages;
 	rqst.rq_npages = npages;
 	rqst.rq_pagesz = PAGE_SIZE;
@@ -2365,10 +2456,9 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf,
 	if (rc)
 		return rc;
 
-	memmove(buf + server->vals->header_preamble_size, iov[1].iov_base, buf_data_size);
-	hdr = (struct smb2_hdr *)buf;
-	hdr->smb2_buf_length = cpu_to_be32(buf_data_size + page_data_size);
-	server->total_read = buf_data_size + page_data_size + server->vals->header_preamble_size;
+	memmove(buf, iov[2].iov_base, buf_data_size);
+
+	server->total_read = buf_data_size + page_data_size;
 
 	return rc;
 }
@@ -2393,7 +2483,7 @@ read_data_into_pages(struct TCP_Server_Info *server, struct page **pages,
 			zero_user(page, len, PAGE_SIZE - len);
 			len = 0;
 		}
-		length = cifs_read_page_from_socket(server, page, n);
+		length = cifs_read_page_from_socket(server, page, 0, n);
 		if (length < 0)
 			return length;
 		server->total_read += length;
@@ -2441,7 +2531,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
 	unsigned int cur_page_idx;
 	unsigned int pad_len;
 	struct cifs_readdata *rdata = mid->callback_data;
-	struct smb2_sync_hdr *shdr = get_sync_hdr(buf);
+	struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
 	struct bio_vec *bvec = NULL;
 	struct iov_iter iter;
 	struct kvec iov;
@@ -2472,7 +2562,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
 		return 0;
 	}
 
-	data_offset = server->ops->read_data_offset(buf) + server->vals->header_preamble_size;
+	data_offset = server->ops->read_data_offset(buf);
 #ifdef CONFIG_CIFS_SMB_DIRECT
 	use_rdma_mr = rdata->mr;
 #endif
@@ -2568,12 +2658,11 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid)
 	unsigned int npages;
 	struct page **pages;
 	unsigned int len;
-	unsigned int buflen = server->pdu_size + server->vals->header_preamble_size;
+	unsigned int buflen = server->pdu_size;
 	int rc;
 	int i = 0;
 
-	len = min_t(unsigned int, buflen, server->vals->read_rsp_size -
-		server->vals->header_preamble_size +
+	len = min_t(unsigned int, buflen, server->vals->read_rsp_size +
 		sizeof(struct smb2_transform_hdr)) - HEADER_SIZE(server) + 1;
 
 	rc = cifs_read_from_socket(server, buf + HEADER_SIZE(server) - 1, len);
@@ -2581,8 +2670,7 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid)
 		return rc;
 	server->total_read += rc;
 
-	len = le32_to_cpu(tr_hdr->OriginalMessageSize) +
-		server->vals->header_preamble_size -
+	len = le32_to_cpu(tr_hdr->OriginalMessageSize) -
 		server->vals->read_rsp_size;
 	npages = DIV_ROUND_UP(len, PAGE_SIZE);
 
@@ -2609,8 +2697,7 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid)
 	if (rc)
 		goto free_pages;
 
-	rc = decrypt_raw_data(server, buf, server->vals->read_rsp_size -
-			      server->vals->header_preamble_size,
+	rc = decrypt_raw_data(server, buf, server->vals->read_rsp_size,
 			      pages, npages, len);
 	if (rc)
 		goto free_pages;
@@ -2647,7 +2734,7 @@ receive_encrypted_standard(struct TCP_Server_Info *server,
 	struct mid_q_entry *mid_entry;
 
 	/* switch to large buffer if too big for a small one */
-	if (pdu_length + server->vals->header_preamble_size > MAX_CIFS_SMALL_BUFFER_SIZE) {
+	if (pdu_length > MAX_CIFS_SMALL_BUFFER_SIZE) {
 		server->large_buf = true;
 		memcpy(server->bigbuf, buf, server->total_read);
 		buf = server->bigbuf;
@@ -2655,13 +2742,12 @@ receive_encrypted_standard(struct TCP_Server_Info *server,
 
 	/* now read the rest */
 	length = cifs_read_from_socket(server, buf + HEADER_SIZE(server) - 1,
-				pdu_length - HEADER_SIZE(server) + 1 +
-				server->vals->header_preamble_size);
+				pdu_length - HEADER_SIZE(server) + 1);
 	if (length < 0)
 		return length;
 	server->total_read += length;
 
-	buf_size = pdu_length + server->vals->header_preamble_size - sizeof(struct smb2_transform_hdr);
+	buf_size = pdu_length - sizeof(struct smb2_transform_hdr);
 	length = decrypt_raw_data(server, buf, buf_size, NULL, 0, 0);
 	if (length)
 		return length;
@@ -2690,7 +2776,7 @@ smb3_receive_transform(struct TCP_Server_Info *server, struct mid_q_entry **mid)
 	struct smb2_transform_hdr *tr_hdr = (struct smb2_transform_hdr *)buf;
 	unsigned int orig_len = le32_to_cpu(tr_hdr->OriginalMessageSize);
 
-	if (pdu_length + server->vals->header_preamble_size < sizeof(struct smb2_transform_hdr) +
+	if (pdu_length < sizeof(struct smb2_transform_hdr) +
 						sizeof(struct smb2_sync_hdr)) {
 		cifs_dbg(VFS, "Transform message is too small (%u)\n",
 			 pdu_length);
@@ -2699,14 +2785,14 @@ smb3_receive_transform(struct TCP_Server_Info *server, struct mid_q_entry **mid)
 		return -ECONNABORTED;
 	}
 
-	if (pdu_length + server->vals->header_preamble_size < orig_len + sizeof(struct smb2_transform_hdr)) {
+	if (pdu_length < orig_len + sizeof(struct smb2_transform_hdr)) {
 		cifs_dbg(VFS, "Transform message is broken\n");
 		cifs_reconnect(server);
 		wake_up(&server->response_q);
 		return -ECONNABORTED;
 	}
 
-	if (pdu_length + server->vals->header_preamble_size > CIFSMaxBufSize + MAX_HEADER_SIZE(server))
+	if (pdu_length > CIFSMaxBufSize + MAX_HEADER_SIZE(server))
 		return receive_encrypted_read(server, mid);
 
 	return receive_encrypted_standard(server, mid);
@@ -2717,11 +2803,23 @@ smb3_handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid)
 {
 	char *buf = server->large_buf ? server->bigbuf : server->smallbuf;
 
-	return handle_read_data(server, mid, buf, server->pdu_size +
-				server->vals->header_preamble_size,
+	return handle_read_data(server, mid, buf, server->pdu_size,
 				NULL, 0, 0);
 }
 
+static int
+smb2_next_header(char *buf)
+{
+	struct smb2_sync_hdr *hdr = (struct smb2_sync_hdr *)buf;
+	struct smb2_transform_hdr *t_hdr = (struct smb2_transform_hdr *)buf;
+
+	if (hdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM)
+		return sizeof(struct smb2_transform_hdr) +
+		  le32_to_cpu(t_hdr->OriginalMessageSize);
+
+	return le32_to_cpu(hdr->NextCommand);
+}
+
 struct smb_version_operations smb20_operations = {
 	.compare_fids = smb2_compare_fids,
 	.setup_request = smb2_setup_request,
@@ -2813,6 +2911,7 @@ struct smb_version_operations smb20_operations = {
 	.get_acl_by_fid = get_smb2_acl_by_fid,
 	.set_acl = set_smb2_acl,
 #endif /* CIFS_ACL */
+	.next_header = smb2_next_header,
 };
 
 struct smb_version_operations smb21_operations = {
@@ -2907,6 +3006,7 @@ struct smb_version_operations smb21_operations = {
 	.get_acl_by_fid = get_smb2_acl_by_fid,
 	.set_acl = set_smb2_acl,
 #endif /* CIFS_ACL */
+	.next_header = smb2_next_header,
 };
 
 struct smb_version_operations smb30_operations = {
@@ -3011,6 +3111,7 @@ struct smb_version_operations smb30_operations = {
 	.get_acl_by_fid = get_smb2_acl_by_fid,
 	.set_acl = set_smb2_acl,
 #endif /* CIFS_ACL */
+	.next_header = smb2_next_header,
 };
 
 #ifdef CONFIG_CIFS_SMB311
@@ -3111,6 +3212,7 @@ struct smb_version_operations smb311_operations = {
 	.query_all_EAs = smb2_query_eas,
 	.set_EA = smb2_set_ea,
 #endif /* CIFS_XATTR */
+	.next_header = smb2_next_header,
 };
 #endif /* CIFS_SMB311 */
 
@@ -3122,8 +3224,8 @@ struct smb_version_values smb20_values = {
 	.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
 	.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
 	.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
-	.header_size = sizeof(struct smb2_hdr),
-	.header_preamble_size = 4,
+	.header_size = sizeof(struct smb2_sync_hdr),
+	.header_preamble_size = 0,
 	.max_header_size = MAX_SMB2_HDR_SIZE,
 	.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
 	.lock_cmd = SMB2_LOCK,
@@ -3143,8 +3245,8 @@ struct smb_version_values smb21_values = {
 	.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
 	.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
 	.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
-	.header_size = sizeof(struct smb2_hdr),
-	.header_preamble_size = 4,
+	.header_size = sizeof(struct smb2_sync_hdr),
+	.header_preamble_size = 0,
 	.max_header_size = MAX_SMB2_HDR_SIZE,
 	.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
 	.lock_cmd = SMB2_LOCK,
@@ -3164,8 +3266,8 @@ struct smb_version_values smb3any_values = {
 	.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
 	.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
 	.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
-	.header_size = sizeof(struct smb2_hdr),
-	.header_preamble_size = 4,
+	.header_size = sizeof(struct smb2_sync_hdr),
+	.header_preamble_size = 0,
 	.max_header_size = MAX_SMB2_HDR_SIZE,
 	.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
 	.lock_cmd = SMB2_LOCK,
@@ -3185,8 +3287,8 @@ struct smb_version_values smbdefault_values = {
 	.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
 	.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
 	.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
-	.header_size = sizeof(struct smb2_hdr),
-	.header_preamble_size = 4,
+	.header_size = sizeof(struct smb2_sync_hdr),
+	.header_preamble_size = 0,
 	.max_header_size = MAX_SMB2_HDR_SIZE,
 	.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
 	.lock_cmd = SMB2_LOCK,
@@ -3206,8 +3308,8 @@ struct smb_version_values smb30_values = {
 	.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
 	.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
 	.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
-	.header_size = sizeof(struct smb2_hdr),
-	.header_preamble_size = 4,
+	.header_size = sizeof(struct smb2_sync_hdr),
+	.header_preamble_size = 0,
 	.max_header_size = MAX_SMB2_HDR_SIZE,
 	.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
 	.lock_cmd = SMB2_LOCK,
@@ -3227,8 +3329,8 @@ struct smb_version_values smb302_values = {
 	.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
 	.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
 	.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
-	.header_size = sizeof(struct smb2_hdr),
-	.header_preamble_size = 4,
+	.header_size = sizeof(struct smb2_sync_hdr),
+	.header_preamble_size = 0,
 	.max_header_size = MAX_SMB2_HDR_SIZE,
 	.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
 	.lock_cmd = SMB2_LOCK,
@@ -3249,8 +3351,8 @@ struct smb_version_values smb311_values = {
 	.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
 	.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
 	.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
-	.header_size = sizeof(struct smb2_hdr),
-	.header_preamble_size = 4,
+	.header_size = sizeof(struct smb2_sync_hdr),
+	.header_preamble_size = 0,
 	.max_header_size = MAX_SMB2_HDR_SIZE,
 	.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
 	.lock_cmd = SMB2_LOCK,
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 0f48741..281fbc1 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -49,6 +49,7 @@
 #include "cifspdu.h"
 #include "cifs_spnego.h"
 #include "smbdirect.h"
+#include "trace.h"
 
 /*
  *  The following table defines the expected "StructureSize" of SMB2 requests
@@ -79,7 +80,7 @@ static const int smb2_req_struct_sizes[NUMBER_OF_SMB2_COMMANDS] = {
 	/* SMB2_OPLOCK_BREAK */ 24 /* BB this is 36 for LEASE_BREAK variant */
 };
 
-static int encryption_required(const struct cifs_tcon *tcon)
+static int smb3_encryption_required(const struct cifs_tcon *tcon)
 {
 	if (!tcon)
 		return 0;
@@ -145,7 +146,7 @@ smb2_hdr_assemble(struct smb2_sync_hdr *shdr, __le16 smb2_cmd,
 		shdr->Flags |= SMB2_FLAGS_DFS_OPERATIONS; */
 
 	if (tcon->ses && tcon->ses->server && tcon->ses->server->sign &&
-	    !encryption_required(tcon))
+	    !smb3_encryption_required(tcon))
 		shdr->Flags |= SMB2_FLAGS_SIGNED;
 out:
 	return;
@@ -367,6 +368,7 @@ smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon,
 
 #define SMB2_PREAUTH_INTEGRITY_CAPABILITIES	cpu_to_le16(1)
 #define SMB2_ENCRYPTION_CAPABILITIES		cpu_to_le16(2)
+#define SMB2_POSIX_EXTENSIONS_AVAILABLE		cpu_to_le16(0x100)
 
 static void
 build_preauth_ctxt(struct smb2_preauth_neg_context *pneg_ctxt)
@@ -390,21 +392,35 @@ build_encrypt_ctxt(struct smb2_encryption_neg_context *pneg_ctxt)
 }
 
 static void
+build_posix_ctxt(struct smb2_posix_neg_context *pneg_ctxt)
+{
+	pneg_ctxt->ContextType = SMB2_POSIX_EXTENSIONS_AVAILABLE;
+	pneg_ctxt->DataLength = cpu_to_le16(POSIX_CTXT_DATA_LEN);
+}
+
+static void
 assemble_neg_contexts(struct smb2_negotiate_req *req,
 		      unsigned int *total_len)
 {
 	char *pneg_ctxt = (char *)req + OFFSET_OF_NEG_CONTEXT;
+	unsigned int ctxt_len;
 
+	*total_len += 2; /* Add 2 due to round to 8 byte boundary for 1st ctxt */
 	build_preauth_ctxt((struct smb2_preauth_neg_context *)pneg_ctxt);
-	/* Add 2 to size to round to 8 byte boundary */
+	ctxt_len = DIV_ROUND_UP(sizeof(struct smb2_preauth_neg_context), 8) * 8;
+	*total_len += ctxt_len;
+	pneg_ctxt += ctxt_len;
 
-	pneg_ctxt += 2 + sizeof(struct smb2_preauth_neg_context);
 	build_encrypt_ctxt((struct smb2_encryption_neg_context *)pneg_ctxt);
-	req->NegotiateContextOffset = cpu_to_le32(OFFSET_OF_NEG_CONTEXT);
-	req->NegotiateContextCount = cpu_to_le16(2);
+	ctxt_len = DIV_ROUND_UP(sizeof(struct smb2_encryption_neg_context), 8) * 8;
+	*total_len += ctxt_len;
+	pneg_ctxt += ctxt_len;
+
+	build_posix_ctxt((struct smb2_posix_neg_context *)pneg_ctxt);
+	*total_len += sizeof(struct smb2_posix_neg_context);
 
-	*total_len += 4 + sizeof(struct smb2_preauth_neg_context)
-		+ sizeof(struct smb2_encryption_neg_context);
+	req->NegotiateContextOffset = cpu_to_le32(OFFSET_OF_NEG_CONTEXT);
+	req->NegotiateContextCount = cpu_to_le16(3);
 }
 
 static void decode_preauth_context(struct smb2_preauth_neg_context *ctxt)
@@ -449,12 +465,12 @@ static int decode_encrypt_ctx(struct TCP_Server_Info *server,
 }
 
 static int smb311_decode_neg_context(struct smb2_negotiate_rsp *rsp,
-				     struct TCP_Server_Info *server)
+				     struct TCP_Server_Info *server,
+				     unsigned int len_of_smb)
 {
 	struct smb2_neg_context *pctx;
 	unsigned int offset = le32_to_cpu(rsp->NegotiateContextOffset);
 	unsigned int ctxt_cnt = le16_to_cpu(rsp->NegotiateContextCount);
-	unsigned int len_of_smb = be32_to_cpu(rsp->hdr.smb2_buf_length);
 	unsigned int len_of_ctxts, i;
 	int rc = 0;
 
@@ -475,8 +491,7 @@ static int smb311_decode_neg_context(struct smb2_negotiate_rsp *rsp,
 		if (len_of_ctxts < sizeof(struct smb2_neg_context))
 			break;
 
-		pctx = (struct smb2_neg_context *)(offset +
-			server->vals->header_preamble_size + (char *)rsp);
+		pctx = (struct smb2_neg_context *)(offset + (char *)rsp);
 		clen = le16_to_cpu(pctx->DataLength);
 		if (clen > len_of_ctxts)
 			break;
@@ -487,6 +502,8 @@ static int smb311_decode_neg_context(struct smb2_negotiate_rsp *rsp,
 		else if (pctx->ContextType == SMB2_ENCRYPTION_CAPABILITIES)
 			rc = decode_encrypt_ctx(server,
 				(struct smb2_encryption_neg_context *)pctx);
+		else if (pctx->ContextType == SMB2_POSIX_EXTENSIONS_AVAILABLE)
+			server->posix_ext_supported = true;
 		else
 			cifs_dbg(VFS, "unknown negcontext of type %d ignored\n",
 				le16_to_cpu(pctx->ContextType));
@@ -501,6 +518,64 @@ static int smb311_decode_neg_context(struct smb2_negotiate_rsp *rsp,
 	return rc;
 }
 
+static struct create_posix *
+create_posix_buf(umode_t mode)
+{
+	struct create_posix *buf;
+
+	buf = kzalloc(sizeof(struct create_posix),
+			GFP_KERNEL);
+	if (!buf)
+		return NULL;
+
+	buf->ccontext.DataOffset =
+		cpu_to_le16(offsetof(struct create_posix, Mode));
+	buf->ccontext.DataLength = cpu_to_le32(4);
+	buf->ccontext.NameOffset =
+		cpu_to_le16(offsetof(struct create_posix, Name));
+	buf->ccontext.NameLength = cpu_to_le16(16);
+
+	/* SMB2_CREATE_TAG_POSIX is "0x93AD25509CB411E7B42383DE968BCD7C" */
+	buf->Name[0] = 0x93;
+	buf->Name[1] = 0xAD;
+	buf->Name[2] = 0x25;
+	buf->Name[3] = 0x50;
+	buf->Name[4] = 0x9C;
+	buf->Name[5] = 0xB4;
+	buf->Name[6] = 0x11;
+	buf->Name[7] = 0xE7;
+	buf->Name[8] = 0xB4;
+	buf->Name[9] = 0x23;
+	buf->Name[10] = 0x83;
+	buf->Name[11] = 0xDE;
+	buf->Name[12] = 0x96;
+	buf->Name[13] = 0x8B;
+	buf->Name[14] = 0xCD;
+	buf->Name[15] = 0x7C;
+	buf->Mode = cpu_to_le32(mode);
+	cifs_dbg(FYI, "mode on posix create 0%o", mode);
+	return buf;
+}
+
+static int
+add_posix_context(struct kvec *iov, unsigned int *num_iovec, umode_t mode)
+{
+	struct smb2_create_req *req = iov[0].iov_base;
+	unsigned int num = *num_iovec;
+
+	iov[num].iov_base = create_posix_buf(mode);
+	if (iov[num].iov_base == NULL)
+		return -ENOMEM;
+	iov[num].iov_len = sizeof(struct create_posix);
+	if (!req->CreateContextsOffset)
+		req->CreateContextsOffset = cpu_to_le32(
+				sizeof(struct smb2_create_req) +
+				iov[num - 1].iov_len);
+	le32_add_cpu(&req->CreateContextsLength, sizeof(struct create_posix));
+	*num_iovec = num + 1;
+	return 0;
+}
+
 #else
 static void assemble_neg_contexts(struct smb2_negotiate_req *req,
 				  unsigned int *total_len)
@@ -691,7 +766,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
 	server->capabilities |= SMB2_NT_FIND | SMB2_LARGE_FILES;
 
 	security_blob = smb2_get_data_area_len(&blob_offset, &blob_length,
-					       &rsp->hdr);
+					       (struct smb2_sync_hdr *)rsp);
 	/*
 	 * See MS-SMB2 section 2.2.4: if no blob, client picks default which
 	 * for us will be
@@ -718,7 +793,8 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
 #ifdef CONFIG_CIFS_SMB311
 	if (rsp->DialectRevision == cpu_to_le16(SMB311_PROT_ID)) {
 		if (rsp->NegotiateContextCount)
-			rc = smb311_decode_neg_context(rsp, server);
+			rc = smb311_decode_neg_context(rsp, server,
+						       rsp_iov.iov_len);
 		else
 			cifs_dbg(VFS, "Missing expected negotiate contexts\n");
 	}
@@ -1054,7 +1130,7 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data)
 		goto out_put_spnego_key;
 
 	rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base;
-	ses->Suid = rsp->hdr.sync_hdr.SessionId;
+	ses->Suid = rsp->sync_hdr.SessionId;
 
 	ses->session_flags = le16_to_cpu(rsp->SessionFlags);
 
@@ -1130,13 +1206,13 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data)
 
 	/* If true, rc here is expected and not an error */
 	if (sess_data->buf0_type != CIFS_NO_BUFFER &&
-		rsp->hdr.sync_hdr.Status == STATUS_MORE_PROCESSING_REQUIRED)
+		rsp->sync_hdr.Status == STATUS_MORE_PROCESSING_REQUIRED)
 		rc = 0;
 
 	if (rc)
 		goto out;
 
-	if (offsetof(struct smb2_sess_setup_rsp, Buffer) - ses->server->vals->header_preamble_size !=
+	if (offsetof(struct smb2_sess_setup_rsp, Buffer) !=
 			le16_to_cpu(rsp->SecurityBufferOffset)) {
 		cifs_dbg(VFS, "Invalid security buffer offset %d\n",
 			le16_to_cpu(rsp->SecurityBufferOffset));
@@ -1151,7 +1227,7 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data)
 	cifs_dbg(FYI, "rawntlmssp session setup challenge phase\n");
 
 
-	ses->Suid = rsp->hdr.sync_hdr.SessionId;
+	ses->Suid = rsp->sync_hdr.SessionId;
 	ses->session_flags = le16_to_cpu(rsp->SessionFlags);
 
 out:
@@ -1209,7 +1285,7 @@ SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data)
 
 	rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base;
 
-	ses->Suid = rsp->hdr.sync_hdr.SessionId;
+	ses->Suid = rsp->sync_hdr.SessionId;
 	ses->session_flags = le16_to_cpu(rsp->SessionFlags);
 
 	rc = SMB2_sess_establish_session(sess_data);
@@ -1276,6 +1352,7 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
 	sess_data->ses = ses;
 	sess_data->buf0_type = CIFS_NO_BUFFER;
 	sess_data->nls_cp = (struct nls_table *) nls_cp;
+	sess_data->previous_session = ses->Suid;
 
 #ifdef CONFIG_CIFS_SMB311
 	/*
@@ -1403,7 +1480,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
 		return rc;
 	}
 
-	if (encryption_required(tcon))
+	if (smb3_encryption_required(tcon))
 		flags |= CIFS_TRANSFORM_REQ;
 
 	iov[0].iov_base = (char *)req;
@@ -1419,7 +1496,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
 
 	/* 3.11 tcon req must be signed if not encrypted. See MS-SMB2 3.2.4.1.1 */
 	if ((ses->server->dialect == SMB311_PROT_ID) &&
-	    !encryption_required(tcon))
+	    !smb3_encryption_required(tcon))
 		req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED;
 
 	rc = smb2_send_recv(xid, ses, iov, 2, &resp_buftype, flags, &rsp_iov);
@@ -1457,7 +1534,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
 	tcon->maximal_access = le32_to_cpu(rsp->MaximalAccess);
 	tcon->tidStatus = CifsGood;
 	tcon->need_reconnect = false;
-	tcon->tid = rsp->hdr.sync_hdr.TreeId;
+	tcon->tid = rsp->sync_hdr.TreeId;
 	strlcpy(tcon->treeName, tree, sizeof(tcon->treeName));
 
 	if ((rsp->Capabilities & SMB2_SHARE_CAP_DFS) &&
@@ -1477,7 +1554,7 @@ tcon_exit:
 	return rc;
 
 tcon_error_exit:
-	if (rsp && rsp->hdr.sync_hdr.Status == STATUS_BAD_NETWORK_NAME) {
+	if (rsp && rsp->sync_hdr.Status == STATUS_BAD_NETWORK_NAME) {
 		cifs_dbg(VFS, "BAD_NETWORK_NAME: %s\n", tree);
 	}
 	goto tcon_exit;
@@ -1508,7 +1585,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon)
 	if (rc)
 		return rc;
 
-	if (encryption_required(tcon))
+	if (smb3_encryption_required(tcon))
 		flags |= CIFS_TRANSFORM_REQ;
 
 	flags |= CIFS_NO_RESP;
@@ -1575,7 +1652,7 @@ create_reconnect_durable_buf(struct cifs_fid *fid)
 
 static __u8
 parse_lease_state(struct TCP_Server_Info *server, struct smb2_create_rsp *rsp,
-		  unsigned int *epoch)
+		  unsigned int *epoch, char *lease_key)
 {
 	char *data_offset;
 	struct create_context *cc;
@@ -1583,14 +1660,15 @@ parse_lease_state(struct TCP_Server_Info *server, struct smb2_create_rsp *rsp,
 	unsigned int remaining;
 	char *name;
 
-	data_offset = (char *)rsp + server->vals->header_preamble_size + le32_to_cpu(rsp->CreateContextsOffset);
+	data_offset = (char *)rsp + le32_to_cpu(rsp->CreateContextsOffset);
 	remaining = le32_to_cpu(rsp->CreateContextsLength);
 	cc = (struct create_context *)data_offset;
 	while (remaining >= sizeof(struct create_context)) {
 		name = le16_to_cpu(cc->NameOffset) + (char *)cc;
 		if (le16_to_cpu(cc->NameLength) == 4 &&
 		    strncmp(name, "RqLs", 4) == 0)
-			return server->ops->parse_lease_buf(cc, epoch);
+			return server->ops->parse_lease_buf(cc, epoch,
+							    lease_key);
 
 		next = le32_to_cpu(cc->Next);
 		if (!next)
@@ -1818,7 +1896,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
 	struct TCP_Server_Info *server;
 	struct cifs_tcon *tcon = oparms->tcon;
 	struct cifs_ses *ses = tcon->ses;
-	struct kvec iov[4];
+	struct kvec iov[5]; /* make sure at least one for each open context */
 	struct kvec rsp_iov = {NULL, 0};
 	int resp_buftype;
 	int uni_path_len;
@@ -1827,7 +1905,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
 	int rc = 0;
 	unsigned int n_iov = 2;
 	__u32 file_attributes = 0;
-	char *dhc_buf = NULL, *lc_buf = NULL;
+	char *dhc_buf = NULL, *lc_buf = NULL, *pc_buf = NULL;
 	int flags = 0;
 	unsigned int total_len;
 
@@ -1843,7 +1921,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
 	if (rc)
 		return rc;
 
-	if (encryption_required(tcon))
+	if (smb3_encryption_required(tcon))
 		flags |= CIFS_TRANSFORM_REQ;
 
 	if (oparms->create_options & CREATE_OPTION_READONLY)
@@ -1944,6 +2022,27 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
 		dhc_buf = iov[n_iov-1].iov_base;
 	}
 
+#ifdef CONFIG_CIFS_SMB311
+	if (tcon->posix_extensions) {
+		if (n_iov > 2) {
+			struct create_context *ccontext =
+			    (struct create_context *)iov[n_iov-1].iov_base;
+			ccontext->Next =
+				cpu_to_le32(iov[n_iov-1].iov_len);
+		}
+
+		rc = add_posix_context(iov, &n_iov, oparms->mode);
+		if (rc) {
+			cifs_small_buf_release(req);
+			kfree(copy_path);
+			kfree(lc_buf);
+			kfree(dhc_buf);
+			return rc;
+		}
+		pc_buf = iov[n_iov-1].iov_base;
+	}
+#endif /* SMB311 */
+
 	rc = smb2_send_recv(xid, ses, iov, n_iov, &resp_buftype, flags,
 			    &rsp_iov);
 	cifs_small_buf_release(req);
@@ -1956,8 +2055,13 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
 			resp_buftype = CIFS_NO_BUFFER;
 			rsp = NULL;
 		}
+		trace_smb3_open_err(xid, tcon->tid, ses->Suid,
+				    oparms->create_options, oparms->desired_access, rc);
 		goto creat_exit;
-	}
+	} else
+		trace_smb3_open_done(xid, rsp->PersistentFileId, tcon->tid,
+				     ses->Suid, oparms->create_options,
+				     oparms->desired_access);
 
 	oparms->fid->persistent_fid = rsp->PersistentFileId;
 	oparms->fid->volatile_fid = rsp->VolatileFileId;
@@ -1972,13 +2076,15 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
 	}
 
 	if (rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE)
-		*oplock = parse_lease_state(server, rsp, &oparms->fid->epoch);
+		*oplock = parse_lease_state(server, rsp, &oparms->fid->epoch,
+					    oparms->fid->lease_key);
 	else
 		*oplock = rsp->OplockLevel;
 creat_exit:
 	kfree(copy_path);
 	kfree(lc_buf);
 	kfree(dhc_buf);
+	kfree(pc_buf);
 	free_rsp_buf(resp_buftype, rsp);
 	return rc;
 }
@@ -1994,7 +2100,6 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
 {
 	struct smb2_ioctl_req *req;
 	struct smb2_ioctl_rsp *rsp;
-	struct smb2_sync_hdr *shdr;
 	struct cifs_ses *ses;
 	struct kvec iov[2];
 	struct kvec rsp_iov;
@@ -2025,7 +2130,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
 	if (rc)
 		return rc;
 
-	if (encryption_required(tcon))
+	if (smb3_encryption_required(tcon))
 		flags |= CIFS_TRANSFORM_REQ;
 
 	req->CtlCode = cpu_to_le32(opcode);
@@ -2088,6 +2193,10 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
 	cifs_small_buf_release(req);
 	rsp = (struct smb2_ioctl_rsp *)rsp_iov.iov_base;
 
+	if (rc != 0)
+		trace_smb3_fsctl_err(xid, persistent_fid, tcon->tid,
+				ses->Suid, 0, opcode, rc);
+
 	if ((rc != 0) && (rc != -EINVAL)) {
 		cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE);
 		goto ioctl_exit;
@@ -2115,7 +2224,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
 		goto ioctl_exit;
 	}
 
-	if (get_rfc1002_length(rsp) < le32_to_cpu(rsp->OutputOffset) + *plen) {
+	if (rsp_iov.iov_len < le32_to_cpu(rsp->OutputOffset) + *plen) {
 		cifs_dbg(VFS, "Malformed ioctl resp: len %d offset %d\n", *plen,
 			le32_to_cpu(rsp->OutputOffset));
 		*plen = 0;
@@ -2129,8 +2238,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
 		goto ioctl_exit;
 	}
 
-	shdr = get_sync_hdr(rsp);
-	memcpy(*out_data, (char *)shdr + le32_to_cpu(rsp->OutputOffset), *plen);
+	memcpy(*out_data, (char *)rsp + le32_to_cpu(rsp->OutputOffset), *plen);
 ioctl_exit:
 	free_rsp_buf(resp_buftype, rsp);
 	return rc;
@@ -2162,8 +2270,8 @@ SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
 }
 
 int
-SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
-	   u64 persistent_fid, u64 volatile_fid)
+SMB2_close_flags(const unsigned int xid, struct cifs_tcon *tcon,
+		 u64 persistent_fid, u64 volatile_fid, int flags)
 {
 	struct smb2_close_req *req;
 	struct smb2_close_rsp *rsp;
@@ -2172,7 +2280,6 @@ SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
 	struct kvec rsp_iov;
 	int resp_buftype;
 	int rc = 0;
-	int flags = 0;
 	unsigned int total_len;
 
 	cifs_dbg(FYI, "Close\n");
@@ -2184,7 +2291,7 @@ SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
 	if (rc)
 		return rc;
 
-	if (encryption_required(tcon))
+	if (smb3_encryption_required(tcon))
 		flags |= CIFS_TRANSFORM_REQ;
 
 	req->PersistentFileId = persistent_fid;
@@ -2199,6 +2306,8 @@ SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
 
 	if (rc != 0) {
 		cifs_stats_fail_inc(tcon, SMB2_CLOSE_HE);
+		trace_smb3_close_err(xid, persistent_fid, tcon->tid, ses->Suid,
+				     rc);
 		goto close_exit;
 	}
 
@@ -2209,14 +2318,20 @@ close_exit:
 	return rc;
 }
 
+int
+SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
+	   u64 persistent_fid, u64 volatile_fid)
+{
+	return SMB2_close_flags(xid, tcon, persistent_fid, volatile_fid, 0);
+}
+
 static int
-validate_iov(struct TCP_Server_Info *server,
-	     unsigned int offset, unsigned int buffer_length,
+validate_iov(unsigned int offset, unsigned int buffer_length,
 	     struct kvec *iov, unsigned int min_buf_size)
 {
 	unsigned int smb_len = iov->iov_len;
-	char *end_of_smb = smb_len + server->vals->header_preamble_size + (char *)iov->iov_base;
-	char *begin_of_buf = server->vals->header_preamble_size + offset + (char *)iov->iov_base;
+	char *end_of_smb = smb_len + (char *)iov->iov_base;
+	char *begin_of_buf = offset + (char *)iov->iov_base;
 	char *end_of_buf = begin_of_buf + buffer_length;
 
 
@@ -2246,18 +2361,17 @@ validate_iov(struct TCP_Server_Info *server,
  * Caller must free buffer.
  */
 static int
-validate_and_copy_iov(struct TCP_Server_Info *server,
-		      unsigned int offset, unsigned int buffer_length,
+validate_and_copy_iov(unsigned int offset, unsigned int buffer_length,
 		      struct kvec *iov, unsigned int minbufsize,
 		      char *data)
 {
-	char *begin_of_buf = server->vals->header_preamble_size + offset + (char *)(iov->iov_base);
+	char *begin_of_buf = offset + (char *)iov->iov_base;
 	int rc;
 
 	if (!data)
 		return -EINVAL;
 
-	rc = validate_iov(server, offset, buffer_length, iov, minbufsize);
+	rc = validate_iov(offset, buffer_length, iov, minbufsize);
 	if (rc)
 		return rc;
 
@@ -2292,7 +2406,7 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon,
 	if (rc)
 		return rc;
 
-	if (encryption_required(tcon))
+	if (smb3_encryption_required(tcon))
 		flags |= CIFS_TRANSFORM_REQ;
 
 	req->InfoType = info_type;
@@ -2318,6 +2432,8 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon,
 
 	if (rc) {
 		cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE);
+		trace_smb3_query_info_err(xid, persistent_fid, tcon->tid,
+				ses->Suid, info_class, (__u32)info_type, rc);
 		goto qinf_exit;
 	}
 
@@ -2335,8 +2451,7 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon,
 		}
 	}
 
-	rc = validate_and_copy_iov(ses->server,
-				   le16_to_cpu(rsp->OutputBufferOffset),
+	rc = validate_and_copy_iov(le16_to_cpu(rsp->OutputBufferOffset),
 				   le32_to_cpu(rsp->OutputBufferLength),
 				   &rsp_iov, min_len, *data);
 
@@ -2407,7 +2522,7 @@ smb2_echo_callback(struct mid_q_entry *mid)
 	unsigned int credits_received = 1;
 
 	if (mid->mid_state == MID_RESPONSE_RECEIVED)
-		credits_received = le16_to_cpu(rsp->hdr.sync_hdr.CreditRequest);
+		credits_received = le16_to_cpu(rsp->sync_hdr.CreditRequest);
 
 	DeleteMidQEntry(mid);
 	add_credits(server, credits_received, CIFS_ECHO_OP);
@@ -2536,7 +2651,7 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
 	if (rc)
 		return rc;
 
-	if (encryption_required(tcon))
+	if (smb3_encryption_required(tcon))
 		flags |= CIFS_TRANSFORM_REQ;
 
 	req->PersistentFileId = persistent_fid;
@@ -2548,8 +2663,11 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
 	rc = smb2_send_recv(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov);
 	cifs_small_buf_release(req);
 
-	if (rc != 0)
+	if (rc != 0) {
 		cifs_stats_fail_inc(tcon, SMB2_FLUSH_HE);
+		trace_smb3_flush_err(xid, persistent_fid, tcon->tid, ses->Suid,
+				     rc);
+	}
 
 	free_rsp_buf(resp_buftype, rsp_iov.iov_base);
 	return rc;
@@ -2658,11 +2776,12 @@ smb2_readv_callback(struct mid_q_entry *mid)
 	struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink);
 	struct TCP_Server_Info *server = tcon->ses->server;
 	struct smb2_sync_hdr *shdr =
-				(struct smb2_sync_hdr *)rdata->iov[1].iov_base;
+				(struct smb2_sync_hdr *)rdata->iov[0].iov_base;
 	unsigned int credits_received = 1;
 	struct smb_rqst rqst = { .rq_iov = rdata->iov,
 				 .rq_nvec = 2,
 				 .rq_pages = rdata->pages,
+				 .rq_offset = rdata->page_offset,
 				 .rq_npages = rdata->nr_pages,
 				 .rq_pagesz = rdata->pagesz,
 				 .rq_tailsz = rdata->tailsz };
@@ -2760,7 +2879,7 @@ smb2_async_readv(struct cifs_readdata *rdata)
 		return rc;
 	}
 
-	if (encryption_required(io_parms.tcon))
+	if (smb3_encryption_required(io_parms.tcon))
 		flags |= CIFS_TRANSFORM_REQ;
 
 	req_len = cpu_to_be32(total_len);
@@ -2791,7 +2910,13 @@ smb2_async_readv(struct cifs_readdata *rdata)
 	if (rc) {
 		kref_put(&rdata->refcount, cifs_readdata_release);
 		cifs_stats_fail_inc(io_parms.tcon, SMB2_READ_HE);
-	}
+		trace_smb3_read_err(rc, 0 /* xid */, io_parms.persistent_fid,
+				   io_parms.tcon->tid, io_parms.tcon->ses->Suid,
+				   io_parms.offset, io_parms.length);
+	} else
+		trace_smb3_read_done(0 /* xid */, io_parms.persistent_fid,
+				   io_parms.tcon->tid, io_parms.tcon->ses->Suid,
+				   io_parms.offset, io_parms.length);
 
 	cifs_small_buf_release(buf);
 	return rc;
@@ -2804,7 +2929,6 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
 	int resp_buftype, rc = -EACCES;
 	struct smb2_read_plain_req *req = NULL;
 	struct smb2_read_rsp *rsp = NULL;
-	struct smb2_sync_hdr *shdr;
 	struct kvec iov[1];
 	struct kvec rsp_iov;
 	unsigned int total_len;
@@ -2816,7 +2940,7 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
 	if (rc)
 		return rc;
 
-	if (encryption_required(io_parms->tcon))
+	if (smb3_encryption_required(io_parms->tcon))
 		flags |= CIFS_TRANSFORM_REQ;
 
 	iov[0].iov_base = (char *)req;
@@ -2832,9 +2956,15 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
 			cifs_stats_fail_inc(io_parms->tcon, SMB2_READ_HE);
 			cifs_dbg(VFS, "Send error in read = %d\n", rc);
 		}
+		trace_smb3_read_err(rc, xid, req->PersistentFileId,
+				    io_parms->tcon->tid, ses->Suid,
+				    io_parms->offset, io_parms->length);
 		free_rsp_buf(resp_buftype, rsp_iov.iov_base);
 		return rc == -ENODATA ? 0 : rc;
-	}
+	} else
+		trace_smb3_read_done(xid, req->PersistentFileId,
+				    io_parms->tcon->tid, ses->Suid,
+				    io_parms->offset, io_parms->length);
 
 	*nbytes = le32_to_cpu(rsp->DataLength);
 	if ((*nbytes > CIFS_MAX_MSGSIZE) ||
@@ -2845,10 +2975,8 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
 		*nbytes = 0;
 	}
 
-	shdr = get_sync_hdr(rsp);
-
 	if (*buf) {
-		memcpy(*buf, (char *)shdr + rsp->DataOffset, *nbytes);
+		memcpy(*buf, (char *)rsp + rsp->DataOffset, *nbytes);
 		free_rsp_buf(resp_buftype, rsp_iov.iov_base);
 	} else if (resp_buftype != CIFS_NO_BUFFER) {
 		*buf = rsp_iov.iov_base;
@@ -2875,7 +3003,7 @@ smb2_writev_callback(struct mid_q_entry *mid)
 
 	switch (mid->mid_state) {
 	case MID_RESPONSE_RECEIVED:
-		credits_received = le16_to_cpu(rsp->hdr.sync_hdr.CreditRequest);
+		credits_received = le16_to_cpu(rsp->sync_hdr.CreditRequest);
 		wdata->result = smb2_check_receive(mid, tcon->ses->server, 0);
 		if (wdata->result != 0)
 			break;
@@ -2952,7 +3080,7 @@ smb2_async_writev(struct cifs_writedata *wdata,
 		goto async_writev_out;
 	}
 
-	if (encryption_required(tcon))
+	if (smb3_encryption_required(tcon))
 		flags |= CIFS_TRANSFORM_REQ;
 
 	shdr = (struct smb2_sync_hdr *)req;
@@ -3013,6 +3141,7 @@ smb2_async_writev(struct cifs_writedata *wdata,
 	rqst.rq_iov = iov;
 	rqst.rq_nvec = 2;
 	rqst.rq_pages = wdata->pages;
+	rqst.rq_offset = wdata->page_offset;
 	rqst.rq_npages = wdata->nr_pages;
 	rqst.rq_pagesz = wdata->pagesz;
 	rqst.rq_tailsz = wdata->tailsz;
@@ -3050,9 +3179,15 @@ smb2_async_writev(struct cifs_writedata *wdata,
 			     wdata, flags);
 
 	if (rc) {
+		trace_smb3_write_err(0 /* no xid */, req->PersistentFileId,
+				     tcon->tid, tcon->ses->Suid, wdata->offset,
+				     wdata->bytes, rc);
 		kref_put(&wdata->refcount, release);
 		cifs_stats_fail_inc(tcon, SMB2_WRITE_HE);
-	}
+	} else
+		trace_smb3_write_done(0 /* no xid */, req->PersistentFileId,
+				     tcon->tid, tcon->ses->Suid, wdata->offset,
+				     wdata->bytes);
 
 async_writev_out:
 	cifs_small_buf_release(req);
@@ -3090,7 +3225,7 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
 	if (io_parms->tcon->ses->server == NULL)
 		return -ECONNABORTED;
 
-	if (encryption_required(io_parms->tcon))
+	if (smb3_encryption_required(io_parms->tcon))
 		flags |= CIFS_TRANSFORM_REQ;
 
 	req->sync_hdr.ProcessId = cpu_to_le32(io_parms->pid);
@@ -3116,10 +3251,19 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
 	rsp = (struct smb2_write_rsp *)rsp_iov.iov_base;
 
 	if (rc) {
+		trace_smb3_write_err(xid, req->PersistentFileId,
+				     io_parms->tcon->tid,
+				     io_parms->tcon->ses->Suid,
+				     io_parms->offset, io_parms->length, rc);
 		cifs_stats_fail_inc(io_parms->tcon, SMB2_WRITE_HE);
 		cifs_dbg(VFS, "Send error in write = %d\n", rc);
-	} else
+	} else {
 		*nbytes = le32_to_cpu(rsp->DataLength);
+		trace_smb3_write_done(xid, req->PersistentFileId,
+				     io_parms->tcon->tid,
+				     io_parms->tcon->ses->Suid,
+				     io_parms->offset, *nbytes);
+	}
 
 	free_rsp_buf(resp_buftype, rsp);
 	return rc;
@@ -3200,7 +3344,7 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
 	if (rc)
 		return rc;
 
-	if (encryption_required(tcon))
+	if (smb3_encryption_required(tcon))
 		flags |= CIFS_TRANSFORM_REQ;
 
 	switch (srch_inf->info_level) {
@@ -3251,7 +3395,7 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
 
 	if (rc) {
 		if (rc == -ENODATA &&
-		    rsp->hdr.sync_hdr.Status == STATUS_NO_MORE_FILES) {
+		    rsp->sync_hdr.Status == STATUS_NO_MORE_FILES) {
 			srch_inf->endOfSearch = true;
 			rc = 0;
 		}
@@ -3259,8 +3403,7 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
 		goto qdir_exit;
 	}
 
-	rc = validate_iov(server,
-			  le16_to_cpu(rsp->OutputBufferOffset),
+	rc = validate_iov(le16_to_cpu(rsp->OutputBufferOffset),
 			  le32_to_cpu(rsp->OutputBufferLength), &rsp_iov,
 			  info_buf_size);
 	if (rc)
@@ -3275,10 +3418,9 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
 			cifs_buf_release(srch_inf->ntwrk_buf_start);
 	}
 	srch_inf->ntwrk_buf_start = (char *)rsp;
-	srch_inf->srch_entries_start = srch_inf->last_entry = 4 /* rfclen */ +
-		(char *)&rsp->hdr + le16_to_cpu(rsp->OutputBufferOffset);
-	/* 4 for rfc1002 length field */
-	end_of_smb = get_rfc1002_length(rsp) + 4 + (char *)&rsp->hdr;
+	srch_inf->srch_entries_start = srch_inf->last_entry =
+		(char *)rsp + le16_to_cpu(rsp->OutputBufferOffset);
+	end_of_smb = rsp_iov.iov_len + (char *)rsp;
 	srch_inf->entries_in_buffer =
 			num_entries(srch_inf->srch_entries_start, end_of_smb,
 				    &srch_inf->last_entry, info_buf_size);
@@ -3333,7 +3475,7 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon,
 		return rc;
 	}
 
-	if (encryption_required(tcon))
+	if (smb3_encryption_required(tcon))
 		flags |= CIFS_TRANSFORM_REQ;
 
 	req->sync_hdr.ProcessId = cpu_to_le32(pid);
@@ -3366,8 +3508,11 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon,
 	cifs_small_buf_release(req);
 	rsp = (struct smb2_set_info_rsp *)rsp_iov.iov_base;
 
-	if (rc != 0)
+	if (rc != 0) {
 		cifs_stats_fail_inc(tcon, SMB2_SET_INFO_HE);
+		trace_smb3_set_info_err(xid, persistent_fid, tcon->tid,
+				ses->Suid, info_class, (__u32)info_type, rc);
+	}
 
 	free_rsp_buf(resp_buftype, rsp);
 	kfree(iov);
@@ -3514,7 +3659,7 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon,
 		  __u8 oplock_level)
 {
 	int rc;
-	struct smb2_oplock_break_req *req = NULL;
+	struct smb2_oplock_break *req = NULL;
 	struct cifs_ses *ses = tcon->ses;
 	int flags = CIFS_OBREAK_OP;
 	unsigned int total_len;
@@ -3528,7 +3673,7 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon,
 	if (rc)
 		return rc;
 
-	if (encryption_required(tcon))
+	if (smb3_encryption_required(tcon))
 		flags |= CIFS_TRANSFORM_REQ;
 
 	req->VolatileFid = volatile_fid;
@@ -3593,7 +3738,7 @@ build_qfs_info_req(struct kvec *iov, struct cifs_tcon *tcon, int level,
 	req->InputBufferOffset =
 			cpu_to_le16(sizeof(struct smb2_query_info_req) - 1);
 	req->OutputBufferLength = cpu_to_le32(
-		outbuf_len + sizeof(struct smb2_query_info_rsp) - 1 - server->vals->header_preamble_size);
+		outbuf_len + sizeof(struct smb2_query_info_rsp) - 1);
 
 	iov->iov_base = (char *)req;
 	iov->iov_len = total_len;
@@ -3610,7 +3755,6 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon,
 	int rc = 0;
 	int resp_buftype;
 	struct cifs_ses *ses = tcon->ses;
-	struct TCP_Server_Info *server = ses->server;
 	struct smb2_fs_full_size_info *info = NULL;
 	int flags = 0;
 
@@ -3620,7 +3764,7 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon,
 	if (rc)
 		return rc;
 
-	if (encryption_required(tcon))
+	if (smb3_encryption_required(tcon))
 		flags |= CIFS_TRANSFORM_REQ;
 
 	rc = smb2_send_recv(xid, ses, &iov, 1, &resp_buftype, flags, &rsp_iov);
@@ -3631,10 +3775,9 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon,
 	}
 	rsp = (struct smb2_query_info_rsp *)rsp_iov.iov_base;
 
-	info = (struct smb2_fs_full_size_info *)(server->vals->header_preamble_size +
-		le16_to_cpu(rsp->OutputBufferOffset) + (char *)&rsp->hdr);
-	rc = validate_iov(server,
-			  le16_to_cpu(rsp->OutputBufferOffset),
+	info = (struct smb2_fs_full_size_info *)(
+		le16_to_cpu(rsp->OutputBufferOffset) + (char *)rsp);
+	rc = validate_iov(le16_to_cpu(rsp->OutputBufferOffset),
 			  le32_to_cpu(rsp->OutputBufferLength), &rsp_iov,
 			  sizeof(struct smb2_fs_full_size_info));
 	if (!rc)
@@ -3655,7 +3798,6 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon,
 	int rc = 0;
 	int resp_buftype, max_len, min_len;
 	struct cifs_ses *ses = tcon->ses;
-	struct TCP_Server_Info *server = ses->server;
 	unsigned int rsp_len, offset;
 	int flags = 0;
 
@@ -3678,7 +3820,7 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon,
 	if (rc)
 		return rc;
 
-	if (encryption_required(tcon))
+	if (smb3_encryption_required(tcon))
 		flags |= CIFS_TRANSFORM_REQ;
 
 	rc = smb2_send_recv(xid, ses, &iov, 1, &resp_buftype, flags, &rsp_iov);
@@ -3691,20 +3833,20 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon,
 
 	rsp_len = le32_to_cpu(rsp->OutputBufferLength);
 	offset = le16_to_cpu(rsp->OutputBufferOffset);
-	rc = validate_iov(server, offset, rsp_len, &rsp_iov, min_len);
+	rc = validate_iov(offset, rsp_len, &rsp_iov, min_len);
 	if (rc)
 		goto qfsattr_exit;
 
 	if (level == FS_ATTRIBUTE_INFORMATION)
-		memcpy(&tcon->fsAttrInfo, server->vals->header_preamble_size + offset
-			+ (char *)&rsp->hdr, min_t(unsigned int,
+		memcpy(&tcon->fsAttrInfo, offset
+			+ (char *)rsp, min_t(unsigned int,
 			rsp_len, max_len));
 	else if (level == FS_DEVICE_INFORMATION)
-		memcpy(&tcon->fsDevInfo, server->vals->header_preamble_size + offset
-			+ (char *)&rsp->hdr, sizeof(FILE_SYSTEM_DEVICE_INFO));
+		memcpy(&tcon->fsDevInfo, offset
+			+ (char *)rsp, sizeof(FILE_SYSTEM_DEVICE_INFO));
 	else if (level == FS_SECTOR_SIZE_INFORMATION) {
 		struct smb3_fs_ss_info *ss_info = (struct smb3_fs_ss_info *)
-			(server->vals->header_preamble_size + offset + (char *)&rsp->hdr);
+			(offset + (char *)rsp);
 		tcon->ss_flags = le32_to_cpu(ss_info->Flags);
 		tcon->perf_sector_size =
 			le32_to_cpu(ss_info->PhysicalBytesPerSectorForPerf);
@@ -3735,7 +3877,7 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon,
 	if (rc)
 		return rc;
 
-	if (encryption_required(tcon))
+	if (smb3_encryption_required(tcon))
 		flags |= CIFS_TRANSFORM_REQ;
 
 	req->sync_hdr.ProcessId = cpu_to_le32(pid);
@@ -3758,6 +3900,8 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon,
 	if (rc) {
 		cifs_dbg(FYI, "Send error in smb2_lockv = %d\n", rc);
 		cifs_stats_fail_inc(tcon, SMB2_LOCK_HE);
+		trace_smb3_lock_err(xid, persist_fid, tcon->tid,
+				    tcon->ses->Suid, rc);
 	}
 
 	return rc;
@@ -3799,7 +3943,7 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon,
 	if (rc)
 		return rc;
 
-	if (encryption_required(tcon))
+	if (smb3_encryption_required(tcon))
 		flags |= CIFS_TRANSFORM_REQ;
 
 	req->sync_hdr.CreditRequest = cpu_to_le16(1);
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index d28f358..a345560 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -122,25 +122,10 @@ struct smb2_sync_pdu {
 	__le16 StructureSize2; /* size of wct area (varies, request specific) */
 } __packed;
 
-struct smb2_hdr {
-	__be32 smb2_buf_length;	/* big endian on wire */
-				/* length is only two or three bytes - with */
-				/* one or two byte type preceding it that MBZ */
-	struct smb2_sync_hdr sync_hdr;
-} __packed;
-
-struct smb2_pdu {
-	struct smb2_hdr hdr;
-	__le16 StructureSize2; /* size of wct area (varies, request specific) */
-} __packed;
-
 #define SMB3_AES128CMM_NONCE 11
 #define SMB3_AES128GCM_NONCE 12
 
 struct smb2_transform_hdr {
-	__be32 smb2_buf_length;	/* big endian on wire */
-				/* length is only two or three bytes - with
-				 one or two byte type preceding it that MBZ */
 	__le32 ProtocolId;	/* 0xFD 'S' 'M' 'B' */
 	__u8   Signature[16];
 	__u8   Nonce[16];
@@ -171,7 +156,7 @@ struct smb2_transform_hdr {
 #define SMB2_ERROR_STRUCTURE_SIZE2 cpu_to_le16(9)
 
 struct smb2_err_rsp {
-	struct smb2_hdr hdr;
+	struct smb2_sync_hdr sync_hdr;
 	__le16 StructureSize;
 	__le16 Reserved; /* MBZ */
 	__le32 ByteCount;  /* even if zero, at least one byte follows */
@@ -300,8 +285,16 @@ struct smb2_encryption_neg_context {
 	__le16	Ciphers[1]; /* Ciphers[0] since only one used now */
 } __packed;
 
+#define POSIX_CTXT_DATA_LEN	8
+struct smb2_posix_neg_context {
+	__le16	ContextType; /* 0x100 */
+	__le16	DataLength;
+	__le32	Reserved;
+	__le64	Reserved1; /* In case needed for future (eg version or caps) */
+} __packed;
+
 struct smb2_negotiate_rsp {
-	struct smb2_hdr hdr;
+	struct smb2_sync_hdr sync_hdr;
 	__le16 StructureSize;	/* Must be 65 */
 	__le16 SecurityMode;
 	__le16 DialectRevision;
@@ -341,7 +334,7 @@ struct smb2_sess_setup_req {
 #define SMB2_SESSION_FLAG_IS_NULL	0x0002
 #define SMB2_SESSION_FLAG_ENCRYPT_DATA	0x0004
 struct smb2_sess_setup_rsp {
-	struct smb2_hdr hdr;
+	struct smb2_sync_hdr sync_hdr;
 	__le16 StructureSize; /* Must be 9 */
 	__le16 SessionFlags;
 	__le16 SecurityBufferOffset;
@@ -356,7 +349,7 @@ struct smb2_logoff_req {
 } __packed;
 
 struct smb2_logoff_rsp {
-	struct smb2_hdr hdr;
+	struct smb2_sync_hdr sync_hdr;
 	__le16 StructureSize;	/* Must be 4 */
 	__le16 Reserved;
 } __packed;
@@ -452,7 +445,7 @@ struct smb2_tree_connect_req_extension {
 } __packed;
 
 struct smb2_tree_connect_rsp {
-	struct smb2_hdr hdr;
+	struct smb2_sync_hdr sync_hdr;
 	__le16 StructureSize;	/* Must be 16 */
 	__u8   ShareType;  /* see below */
 	__u8   Reserved;
@@ -503,7 +496,7 @@ struct smb2_tree_disconnect_req {
 } __packed;
 
 struct smb2_tree_disconnect_rsp {
-	struct smb2_hdr hdr;
+	struct smb2_sync_hdr sync_hdr;
 	__le16 StructureSize;	/* Must be 4 */
 	__le16 Reserved;
 } __packed;
@@ -615,7 +608,9 @@ struct smb2_tree_disconnect_rsp {
 #define SMB2_CREATE_DURABLE_HANDLE_REQUEST_V2	"DH2Q"
 #define SMB2_CREATE_DURABLE_HANDLE_RECONNECT_V2	"DH2C"
 #define SMB2_CREATE_APP_INSTANCE_ID	0x45BCA66AEFA7F74A9008FA462E144D74
-#define SVHDX_OPEN_DEVICE_CONTEXT	0x83CE6F1AD851E0986E34401CC9BCFCE9
+#define SVHDX_OPEN_DEVICE_CONTEX	0x9CCBCF9E04C1E643980E158DA1F6EC83
+#define SMB2_CREATE_TAG_POSIX		0x93AD25509CB411E7B42383DE968BCD7C
+
 
 struct smb2_create_req {
 	struct smb2_sync_hdr sync_hdr;
@@ -638,7 +633,7 @@ struct smb2_create_req {
 } __packed;
 
 struct smb2_create_rsp {
-	struct smb2_hdr hdr;
+	struct smb2_sync_hdr sync_hdr;
 	__le16 StructureSize;	/* Must be 89 */
 	__u8   OplockLevel;
 	__u8   Reserved;
@@ -727,6 +722,13 @@ struct create_durable {
 	} Data;
 } __packed;
 
+struct create_posix {
+	struct create_context ccontext;
+	__u8	Name[16];
+	__le32  Mode;
+	__u32	Reserved;
+} __packed;
+
 /* See MS-SMB2 2.2.13.2.11 */
 /* Flags */
 #define SMB2_DHANDLE_FLAG_PERSISTENT	0x00000002
@@ -894,7 +896,7 @@ struct smb2_ioctl_req {
 } __packed;
 
 struct smb2_ioctl_rsp {
-	struct smb2_hdr hdr;
+	struct smb2_sync_hdr sync_hdr;
 	__le16 StructureSize;	/* Must be 57 */
 	__u16 Reserved;
 	__le32 CtlCode;
@@ -921,7 +923,7 @@ struct smb2_close_req {
 } __packed;
 
 struct smb2_close_rsp {
-	struct smb2_hdr hdr;
+	struct smb2_sync_hdr sync_hdr;
 	__le16 StructureSize; /* 60 */
 	__le16 Flags;
 	__le32 Reserved;
@@ -944,7 +946,7 @@ struct smb2_flush_req {
 } __packed;
 
 struct smb2_flush_rsp {
-	struct smb2_hdr hdr;
+	struct smb2_sync_hdr sync_hdr;
 	__le16 StructureSize;
 	__le16 Reserved;
 } __packed;
@@ -976,7 +978,7 @@ struct smb2_read_plain_req {
 } __packed;
 
 struct smb2_read_rsp {
-	struct smb2_hdr hdr;
+	struct smb2_sync_hdr sync_hdr;
 	__le16 StructureSize; /* Must be 17 */
 	__u8   DataOffset;
 	__u8   Reserved;
@@ -1007,7 +1009,7 @@ struct smb2_write_req {
 } __packed;
 
 struct smb2_write_rsp {
-	struct smb2_hdr hdr;
+	struct smb2_sync_hdr sync_hdr;
 	__le16 StructureSize; /* Must be 17 */
 	__u8   DataOffset;
 	__u8   Reserved;
@@ -1041,7 +1043,7 @@ struct smb2_lock_req {
 } __packed;
 
 struct smb2_lock_rsp {
-	struct smb2_hdr hdr;
+	struct smb2_sync_hdr sync_hdr;
 	__le16 StructureSize; /* Must be 4 */
 	__le16 Reserved;
 } __packed;
@@ -1053,7 +1055,7 @@ struct smb2_echo_req {
 } __packed;
 
 struct smb2_echo_rsp {
-	struct smb2_hdr hdr;
+	struct smb2_sync_hdr sync_hdr;
 	__le16 StructureSize;	/* Must be 4 */
 	__u16  Reserved;
 } __packed;
@@ -1079,7 +1081,7 @@ struct smb2_query_directory_req {
 } __packed;
 
 struct smb2_query_directory_rsp {
-	struct smb2_hdr hdr;
+	struct smb2_sync_hdr sync_hdr;
 	__le16 StructureSize; /* Must be 9 */
 	__le16 OutputBufferOffset;
 	__le32 OutputBufferLength;
@@ -1128,7 +1130,7 @@ struct smb2_query_info_req {
 } __packed;
 
 struct smb2_query_info_rsp {
-	struct smb2_hdr hdr;
+	struct smb2_sync_hdr sync_hdr;
 	__le16 StructureSize; /* Must be 9 */
 	__le16 OutputBufferOffset;
 	__le32 OutputBufferLength;
@@ -1150,12 +1152,11 @@ struct smb2_set_info_req {
 } __packed;
 
 struct smb2_set_info_rsp {
-	struct smb2_hdr hdr;
+	struct smb2_sync_hdr sync_hdr;
 	__le16 StructureSize; /* Must be 2 */
 } __packed;
 
-/* oplock break without an rfc1002 header */
-struct smb2_oplock_break_req {
+struct smb2_oplock_break {
 	struct smb2_sync_hdr sync_hdr;
 	__le16 StructureSize; /* Must be 24 */
 	__u8   OplockLevel;
@@ -1165,21 +1166,10 @@ struct smb2_oplock_break_req {
 	__u64  VolatileFid;
 } __packed;
 
-/* oplock break with an rfc1002 header */
-struct smb2_oplock_break_rsp {
-	struct smb2_hdr hdr;
-	__le16 StructureSize; /* Must be 24 */
-	__u8   OplockLevel;
-	__u8   Reserved;
-	__le32 Reserved2;
-	__u64  PersistentFid;
-	__u64  VolatileFid;
-} __packed;
-
 #define SMB2_NOTIFY_BREAK_LEASE_FLAG_ACK_REQUIRED cpu_to_le32(0x01)
 
 struct smb2_lease_break {
-	struct smb2_hdr hdr;
+	struct smb2_sync_hdr sync_hdr;
 	__le16 StructureSize; /* Must be 44 */
 	__le16 Reserved;
 	__le32 Flags;
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 8ba24a9..908555b 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -36,8 +36,9 @@ struct smb_rqst;
 extern int map_smb2_to_linux_error(char *buf, bool log_err);
 extern int smb2_check_message(char *buf, unsigned int length,
 			      struct TCP_Server_Info *server);
-extern unsigned int smb2_calc_size(void *buf);
-extern char *smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr);
+extern unsigned int smb2_calc_size(void *buf, struct TCP_Server_Info *server);
+extern char *smb2_get_data_area_len(int *off, int *len,
+				    struct smb2_sync_hdr *shdr);
 extern __le16 *cifs_convert_path_to_utf16(const char *from,
 					  struct cifs_sb_info *cifs_sb);
 
@@ -65,6 +66,8 @@ extern struct cifs_ses *smb2_find_smb_ses(struct TCP_Server_Info *server,
 extern int smb3_handle_read_data(struct TCP_Server_Info *server,
 				 struct mid_q_entry *mid);
 
+extern int open_shroot(unsigned int xid, struct cifs_tcon *tcon,
+			struct cifs_fid *pfid);
 extern void move_smb2_info_to_cifs(FILE_ALL_INFO *dst,
 				   struct smb2_file_all_info *src);
 extern int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
@@ -129,6 +132,8 @@ extern int SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon,
 		     char **out_data, u32 *plen /* returned data len */);
 extern int SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
 		      u64 persistent_file_id, u64 volatile_file_id);
+extern int SMB2_close_flags(const unsigned int xid, struct cifs_tcon *tcon,
+			    u64 persistent_fid, u64 volatile_fid, int flags);
 extern int SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon,
 		      u64 persistent_file_id, u64 volatile_file_id);
 extern int SMB2_query_eas(const unsigned int xid, struct cifs_tcon *tcon,
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 8806f3f..2c67112 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -480,7 +480,7 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
 	unsigned int rc;
 	char server_response_sig[16];
 	struct smb2_sync_hdr *shdr =
-			(struct smb2_sync_hdr *)rqst->rq_iov[1].iov_base;
+			(struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base;
 
 	if ((shdr->Command == SMB2_NEGOTIATE) ||
 	    (shdr->Command == SMB2_SESSION_SETUP) ||
@@ -605,14 +605,12 @@ smb2_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server,
 		   bool log_error)
 {
 	unsigned int len = mid->resp_buf_size;
-	struct kvec iov[2];
+	struct kvec iov[1];
 	struct smb_rqst rqst = { .rq_iov = iov,
-				 .rq_nvec = 2 };
+				 .rq_nvec = 1 };
 
 	iov[0].iov_base = (char *)mid->resp_buf;
-	iov[0].iov_len = 4;
-	iov[1].iov_base = (char *)mid->resp_buf + 4;
-	iov[1].iov_len = len;
+	iov[0].iov_len = len;
 
 	dump_smb(mid->resp_buf, min_t(u32, 80, len));
 	/* convert the length into a more usable form */
diff --git a/fs/cifs/trace.c b/fs/cifs/trace.c
new file mode 100644
index 0000000..bd4a546f
--- /dev/null
+++ b/fs/cifs/trace.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *   Copyright (C) 2018, Microsoft Corporation.
+ *
+ *   Author(s): Steve French <stfrench@microsoft.com>
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ */
+#define CREATE_TRACE_POINTS
+#include "trace.h"
diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h
new file mode 100644
index 0000000..61e74d4
--- /dev/null
+++ b/fs/cifs/trace.h
@@ -0,0 +1,429 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ *   Copyright (C) 2018, Microsoft Corporation.
+ *
+ *   Author(s): Steve French <stfrench@microsoft.com>
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM cifs
+
+#if !defined(_CIFS_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _CIFS_TRACE_H
+
+#include <linux/tracepoint.h>
+
+/* For logging errors in read or write */
+DECLARE_EVENT_CLASS(smb3_rw_err_class,
+	TP_PROTO(unsigned int xid,
+		__u64	fid,
+		__u32	tid,
+		__u64	sesid,
+		__u64	offset,
+		__u32	len,
+		int	rc),
+	TP_ARGS(xid, fid, tid, sesid, offset, len, rc),
+	TP_STRUCT__entry(
+		__field(unsigned int, xid)
+		__field(__u64, fid)
+		__field(__u32, tid)
+		__field(__u64, sesid)
+		__field(__u64, offset)
+		__field(__u32, len)
+		__field(int, rc)
+	),
+	TP_fast_assign(
+		__entry->xid = xid;
+		__entry->fid = fid;
+		__entry->tid = tid;
+		__entry->sesid = sesid;
+		__entry->offset = offset;
+		__entry->len = len;
+		__entry->rc = rc;
+	),
+	TP_printk("\txid=%u sid=0x%llx tid=0x%x fid=0x%llx offset=0x%llx len=0x%x rc=%d",
+		__entry->xid, __entry->sesid, __entry->tid, __entry->fid,
+		__entry->offset, __entry->len, __entry->rc)
+)
+
+#define DEFINE_SMB3_RW_ERR_EVENT(name)          \
+DEFINE_EVENT(smb3_rw_err_class, smb3_##name,    \
+	TP_PROTO(unsigned int xid,		\
+		__u64	fid,			\
+		__u32	tid,			\
+		__u64	sesid,			\
+		__u64	offset,			\
+		__u32	len,			\
+		int	rc),			\
+	TP_ARGS(xid, fid, tid, sesid, offset, len, rc))
+
+DEFINE_SMB3_RW_ERR_EVENT(write_err);
+DEFINE_SMB3_RW_ERR_EVENT(read_err);
+
+
+/* For logging successful read or write */
+DECLARE_EVENT_CLASS(smb3_rw_done_class,
+	TP_PROTO(unsigned int xid,
+		__u64	fid,
+		__u32	tid,
+		__u64	sesid,
+		__u64	offset,
+		__u32	len),
+	TP_ARGS(xid, fid, tid, sesid, offset, len),
+	TP_STRUCT__entry(
+		__field(unsigned int, xid)
+		__field(__u64, fid)
+		__field(__u32, tid)
+		__field(__u64, sesid)
+		__field(__u64, offset)
+		__field(__u32, len)
+	),
+	TP_fast_assign(
+		__entry->xid = xid;
+		__entry->fid = fid;
+		__entry->tid = tid;
+		__entry->sesid = sesid;
+		__entry->offset = offset;
+		__entry->len = len;
+	),
+	TP_printk("xid=%u sid=0x%llx tid=0x%x fid=0x%llx offset=0x%llx len=0x%x",
+		__entry->xid, __entry->sesid, __entry->tid, __entry->fid,
+		__entry->offset, __entry->len)
+)
+
+#define DEFINE_SMB3_RW_DONE_EVENT(name)         \
+DEFINE_EVENT(smb3_rw_done_class, smb3_##name,   \
+	TP_PROTO(unsigned int xid,		\
+		__u64	fid,			\
+		__u32	tid,			\
+		__u64	sesid,			\
+		__u64	offset,			\
+		__u32	len),			\
+	TP_ARGS(xid, fid, tid, sesid, offset, len))
+
+DEFINE_SMB3_RW_DONE_EVENT(write_done);
+DEFINE_SMB3_RW_DONE_EVENT(read_done);
+
+/*
+ * For handle based calls other than read and write, and get/set info
+ */
+DECLARE_EVENT_CLASS(smb3_fd_err_class,
+	TP_PROTO(unsigned int xid,
+		__u64	fid,
+		__u32	tid,
+		__u64	sesid,
+		int	rc),
+	TP_ARGS(xid, fid, tid, sesid, rc),
+	TP_STRUCT__entry(
+		__field(unsigned int, xid)
+		__field(__u64, fid)
+		__field(__u32, tid)
+		__field(__u64, sesid)
+		__field(int, rc)
+	),
+	TP_fast_assign(
+		__entry->xid = xid;
+		__entry->fid = fid;
+		__entry->tid = tid;
+		__entry->sesid = sesid;
+		__entry->rc = rc;
+	),
+	TP_printk("\txid=%u sid=0x%llx tid=0x%x fid=0x%llx rc=%d",
+		__entry->xid, __entry->sesid, __entry->tid, __entry->fid,
+		__entry->rc)
+)
+
+#define DEFINE_SMB3_FD_ERR_EVENT(name)          \
+DEFINE_EVENT(smb3_fd_err_class, smb3_##name,    \
+	TP_PROTO(unsigned int xid,		\
+		__u64	fid,			\
+		__u32	tid,			\
+		__u64	sesid,			\
+		int	rc),			\
+	TP_ARGS(xid, fid, tid, sesid, rc))
+
+DEFINE_SMB3_FD_ERR_EVENT(flush_err);
+DEFINE_SMB3_FD_ERR_EVENT(lock_err);
+DEFINE_SMB3_FD_ERR_EVENT(close_err);
+
+/*
+ * For handle based query/set info calls
+ */
+DECLARE_EVENT_CLASS(smb3_inf_err_class,
+	TP_PROTO(unsigned int xid,
+		__u64	fid,
+		__u32	tid,
+		__u64	sesid,
+		__u8	infclass,
+		__u32	type,
+		int	rc),
+	TP_ARGS(xid, fid, tid, sesid, infclass, type, rc),
+	TP_STRUCT__entry(
+		__field(unsigned int, xid)
+		__field(__u64, fid)
+		__field(__u32, tid)
+		__field(__u64, sesid)
+		__field(__u8, infclass)
+		__field(__u32, type)
+		__field(int, rc)
+	),
+	TP_fast_assign(
+		__entry->xid = xid;
+		__entry->fid = fid;
+		__entry->tid = tid;
+		__entry->sesid = sesid;
+		__entry->infclass = infclass;
+		__entry->type = type;
+		__entry->rc = rc;
+	),
+	TP_printk("xid=%u sid=0x%llx tid=0x%x fid=0x%llx class=%u type=0x%x rc=%d",
+		__entry->xid, __entry->sesid, __entry->tid, __entry->fid,
+		__entry->infclass, __entry->type, __entry->rc)
+)
+
+#define DEFINE_SMB3_INF_ERR_EVENT(name)          \
+DEFINE_EVENT(smb3_inf_err_class, smb3_##name,    \
+	TP_PROTO(unsigned int xid,		\
+		__u64	fid,			\
+		__u32	tid,			\
+		__u64	sesid,			\
+		__u8	infclass,		\
+		__u32	type,			\
+		int	rc),			\
+	TP_ARGS(xid, fid, tid, sesid, infclass, type, rc))
+
+DEFINE_SMB3_INF_ERR_EVENT(query_info_err);
+DEFINE_SMB3_INF_ERR_EVENT(set_info_err);
+DEFINE_SMB3_INF_ERR_EVENT(fsctl_err);
+
+/*
+ * For logging SMB3 Status code and Command for responses which return errors
+ */
+DECLARE_EVENT_CLASS(smb3_cmd_err_class,
+	TP_PROTO(__u32	tid,
+		__u64	sesid,
+		__u16	cmd,
+		__u64	mid,
+		__u32	status,
+		int	rc),
+	TP_ARGS(tid, sesid, cmd, mid, status, rc),
+	TP_STRUCT__entry(
+		__field(__u32, tid)
+		__field(__u64, sesid)
+		__field(__u16, cmd)
+		__field(__u64, mid)
+		__field(__u32, status)
+		__field(int, rc)
+	),
+	TP_fast_assign(
+		__entry->tid = tid;
+		__entry->sesid = sesid;
+		__entry->cmd = cmd;
+		__entry->mid = mid;
+		__entry->status = status;
+		__entry->rc = rc;
+	),
+	TP_printk("\tsid=0x%llx tid=0x%x cmd=%u mid=%llu status=0x%x rc=%d",
+		__entry->sesid, __entry->tid, __entry->cmd, __entry->mid,
+		__entry->status, __entry->rc)
+)
+
+#define DEFINE_SMB3_CMD_ERR_EVENT(name)          \
+DEFINE_EVENT(smb3_cmd_err_class, smb3_##name,    \
+	TP_PROTO(__u32	tid,			\
+		__u64	sesid,			\
+		__u16	cmd,			\
+		__u64	mid,			\
+		__u32	status,			\
+		int	rc),			\
+	TP_ARGS(tid, sesid, cmd, mid, status, rc))
+
+DEFINE_SMB3_CMD_ERR_EVENT(cmd_err);
+
+DECLARE_EVENT_CLASS(smb3_cmd_done_class,
+	TP_PROTO(__u32	tid,
+		__u64	sesid,
+		__u16	cmd,
+		__u64	mid),
+	TP_ARGS(tid, sesid, cmd, mid),
+	TP_STRUCT__entry(
+		__field(__u32, tid)
+		__field(__u64, sesid)
+		__field(__u16, cmd)
+		__field(__u64, mid)
+	),
+	TP_fast_assign(
+		__entry->tid = tid;
+		__entry->sesid = sesid;
+		__entry->cmd = cmd;
+		__entry->mid = mid;
+	),
+	TP_printk("\tsid=0x%llx tid=0x%x cmd=%u mid=%llu",
+		__entry->sesid, __entry->tid,
+		__entry->cmd, __entry->mid)
+)
+
+#define DEFINE_SMB3_CMD_DONE_EVENT(name)          \
+DEFINE_EVENT(smb3_cmd_done_class, smb3_##name,    \
+	TP_PROTO(__u32	tid,			\
+		__u64	sesid,			\
+		__u16	cmd,			\
+		__u64	mid),			\
+	TP_ARGS(tid, sesid, cmd, mid))
+
+DEFINE_SMB3_CMD_DONE_EVENT(cmd_done);
+
+DECLARE_EVENT_CLASS(smb3_exit_err_class,
+	TP_PROTO(unsigned int xid,
+		const char *func_name,
+		int	rc),
+	TP_ARGS(xid, func_name, rc),
+	TP_STRUCT__entry(
+		__field(unsigned int, xid)
+		__field(const char *, func_name)
+		__field(int, rc)
+	),
+	TP_fast_assign(
+		__entry->xid = xid;
+		__entry->func_name = func_name;
+		__entry->rc = rc;
+	),
+	TP_printk("\t%s: xid=%u rc=%d",
+		__entry->func_name, __entry->xid, __entry->rc)
+)
+
+#define DEFINE_SMB3_EXIT_ERR_EVENT(name)          \
+DEFINE_EVENT(smb3_exit_err_class, smb3_##name,    \
+	TP_PROTO(unsigned int xid,		\
+		const char *func_name,		\
+		int	rc),			\
+	TP_ARGS(xid, func_name, rc))
+
+DEFINE_SMB3_EXIT_ERR_EVENT(exit_err);
+
+DECLARE_EVENT_CLASS(smb3_enter_exit_class,
+	TP_PROTO(unsigned int xid,
+		const char *func_name),
+	TP_ARGS(xid, func_name),
+	TP_STRUCT__entry(
+		__field(unsigned int, xid)
+		__field(const char *, func_name)
+	),
+	TP_fast_assign(
+		__entry->xid = xid;
+		__entry->func_name = func_name;
+	),
+	TP_printk("\t%s: xid=%u",
+		__entry->func_name, __entry->xid)
+)
+
+#define DEFINE_SMB3_ENTER_EXIT_EVENT(name)        \
+DEFINE_EVENT(smb3_enter_exit_class, smb3_##name,  \
+	TP_PROTO(unsigned int xid,		\
+		const char *func_name),		\
+	TP_ARGS(xid, func_name))
+
+DEFINE_SMB3_ENTER_EXIT_EVENT(enter);
+DEFINE_SMB3_ENTER_EXIT_EVENT(exit_done);
+
+/*
+ * For smb2/smb3 open call
+ */
+DECLARE_EVENT_CLASS(smb3_open_err_class,
+	TP_PROTO(unsigned int xid,
+		__u32	tid,
+		__u64	sesid,
+		int	create_options,
+		int	desired_access,
+		int	rc),
+	TP_ARGS(xid, tid, sesid, create_options, desired_access, rc),
+	TP_STRUCT__entry(
+		__field(unsigned int, xid)
+		__field(__u32, tid)
+		__field(__u64, sesid)
+		__field(int,   create_options)
+		__field(int, desired_access)
+		__field(int, rc)
+	),
+	TP_fast_assign(
+		__entry->xid = xid;
+		__entry->tid = tid;
+		__entry->sesid = sesid;
+		__entry->create_options = create_options;
+		__entry->desired_access = desired_access;
+		__entry->rc = rc;
+	),
+	TP_printk("xid=%u sid=0x%llx tid=0x%x cr_opts=0x%x des_access=0x%x rc=%d",
+		__entry->xid, __entry->sesid, __entry->tid,
+		__entry->create_options, __entry->desired_access, __entry->rc)
+)
+
+#define DEFINE_SMB3_OPEN_ERR_EVENT(name)          \
+DEFINE_EVENT(smb3_open_err_class, smb3_##name,    \
+	TP_PROTO(unsigned int xid,		\
+		__u32	tid,			\
+		__u64	sesid,			\
+		int	create_options,		\
+		int	desired_access,		\
+		int	rc),			\
+	TP_ARGS(xid, tid, sesid, create_options, desired_access, rc))
+
+DEFINE_SMB3_OPEN_ERR_EVENT(open_err);
+
+
+DECLARE_EVENT_CLASS(smb3_open_done_class,
+	TP_PROTO(unsigned int xid,
+		__u64	fid,
+		__u32	tid,
+		__u64	sesid,
+		int	create_options,
+		int	desired_access),
+	TP_ARGS(xid, fid, tid, sesid, create_options, desired_access),
+	TP_STRUCT__entry(
+		__field(unsigned int, xid)
+		__field(__u64, fid)
+		__field(__u32, tid)
+		__field(__u64, sesid)
+		__field(int, create_options)
+		__field(int, desired_access)
+	),
+	TP_fast_assign(
+		__entry->xid = xid;
+		__entry->fid = fid;
+		__entry->tid = tid;
+		__entry->sesid = sesid;
+		__entry->create_options = create_options;
+		__entry->desired_access = desired_access;
+	),
+	TP_printk("xid=%u sid=0x%llx tid=0x%x fid=0x%llx cr_opts=0x%x des_access=0x%x",
+		__entry->xid, __entry->sesid, __entry->tid, __entry->fid,
+		__entry->create_options, __entry->desired_access)
+)
+
+#define DEFINE_SMB3_OPEN_DONE_EVENT(name)        \
+DEFINE_EVENT(smb3_open_done_class, smb3_##name,  \
+	TP_PROTO(unsigned int xid,		\
+		__u64	fid,			\
+		__u32	tid,			\
+		__u64	sesid,			\
+		int	create_options,		\
+		int	desired_access),	\
+	TP_ARGS(xid, fid, tid, sesid, create_options, desired_access))
+
+DEFINE_SMB3_OPEN_DONE_EVENT(open_done);
+
+#endif /* _CIFS_TRACE_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace
+#include <trace/define_trace.h>
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 927226a..e7254e3 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -800,8 +800,8 @@ cifs_send_recv(const unsigned int xid, struct cifs_ses *ses,
 #ifdef CONFIG_CIFS_SMB311
 	if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP)) {
 		struct kvec iov = {
-			.iov_base = buf + 4,
-			.iov_len = get_rfc1002_length(buf)
+			.iov_base = buf,
+			.iov_len = midQ->resp_buf_size
 		};
 		smb311_update_preauth_hash(ses, &iov, 1);
 	}
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 124b093..c4fb9ad 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -808,10 +808,7 @@ static struct dentry *cramfs_lookup(struct inode *dir, struct dentry *dentry, un
 	}
 out:
 	mutex_unlock(&read_mutex);
-	if (IS_ERR(inode))
-		return ERR_CAST(inode);
-	d_add(dentry, inode);
-	return NULL;
+	return d_splice_alias(inode, dentry);
 }
 
 static int cramfs_readpage(struct file *file, struct page *page)
diff --git a/fs/dax.c b/fs/dax.c
index aaec72de..aa86d9f 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -677,7 +677,7 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping,
 		 * downgrading page table protection not changing it to point
 		 * to a new page.
 		 *
-		 * See Documentation/vm/mmu_notifier.txt
+		 * See Documentation/vm/mmu_notifier.rst
 		 */
 		if (pmdp) {
 #ifdef CONFIG_FS_DAX_PMD
diff --git a/fs/dcache.c b/fs/dcache.c
index 2acfc69..0e8e5de 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -580,6 +580,7 @@ static void __dentry_kill(struct dentry *dentry)
 	spin_unlock(&dentry->d_lock);
 	if (likely(can_free))
 		dentry_free(dentry);
+	cond_resched();
 }
 
 static struct dentry *__lock_parent(struct dentry *dentry)
@@ -827,30 +828,24 @@ static inline bool fast_dput(struct dentry *dentry)
  */
 void dput(struct dentry *dentry)
 {
-	if (unlikely(!dentry))
-		return;
+	while (dentry) {
+		might_sleep();
 
-repeat:
-	might_sleep();
+		rcu_read_lock();
+		if (likely(fast_dput(dentry))) {
+			rcu_read_unlock();
+			return;
+		}
 
-	rcu_read_lock();
-	if (likely(fast_dput(dentry))) {
+		/* Slow case: now with the dentry lock held */
 		rcu_read_unlock();
-		return;
-	}
-
-	/* Slow case: now with the dentry lock held */
-	rcu_read_unlock();
 
-	if (likely(retain_dentry(dentry))) {
-		spin_unlock(&dentry->d_lock);
-		return;
-	}
+		if (likely(retain_dentry(dentry))) {
+			spin_unlock(&dentry->d_lock);
+			return;
+		}
 
-	dentry = dentry_kill(dentry);
-	if (dentry) {
-		cond_resched();
-		goto repeat;
+		dentry = dentry_kill(dentry);
 	}
 }
 EXPORT_SYMBOL(dput);
@@ -907,6 +902,35 @@ repeat:
 }
 EXPORT_SYMBOL(dget_parent);
 
+static struct dentry * __d_find_any_alias(struct inode *inode)
+{
+	struct dentry *alias;
+
+	if (hlist_empty(&inode->i_dentry))
+		return NULL;
+	alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
+	__dget(alias);
+	return alias;
+}
+
+/**
+ * d_find_any_alias - find any alias for a given inode
+ * @inode: inode to find an alias for
+ *
+ * If any aliases exist for the given inode, take and return a
+ * reference for one of them.  If no aliases exist, return %NULL.
+ */
+struct dentry *d_find_any_alias(struct inode *inode)
+{
+	struct dentry *de;
+
+	spin_lock(&inode->i_lock);
+	de = __d_find_any_alias(inode);
+	spin_unlock(&inode->i_lock);
+	return de;
+}
+EXPORT_SYMBOL(d_find_any_alias);
+
 /**
  * d_find_alias - grab a hashed alias of inode
  * @inode: inode in question
@@ -923,34 +947,19 @@ EXPORT_SYMBOL(dget_parent);
  */
 static struct dentry *__d_find_alias(struct inode *inode)
 {
-	struct dentry *alias, *discon_alias;
+	struct dentry *alias;
+
+	if (S_ISDIR(inode->i_mode))
+		return __d_find_any_alias(inode);
 
-again:
-	discon_alias = NULL;
 	hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
 		spin_lock(&alias->d_lock);
- 		if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) {
-			if (IS_ROOT(alias) &&
-			    (alias->d_flags & DCACHE_DISCONNECTED)) {
-				discon_alias = alias;
-			} else {
-				__dget_dlock(alias);
-				spin_unlock(&alias->d_lock);
-				return alias;
-			}
-		}
-		spin_unlock(&alias->d_lock);
-	}
-	if (discon_alias) {
-		alias = discon_alias;
-		spin_lock(&alias->d_lock);
-		if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) {
+ 		if (!d_unhashed(alias)) {
 			__dget_dlock(alias);
 			spin_unlock(&alias->d_lock);
 			return alias;
 		}
 		spin_unlock(&alias->d_lock);
-		goto again;
 	}
 	return NULL;
 }
@@ -1052,8 +1061,6 @@ static void shrink_dentry_list(struct list_head *list)
 	while (!list_empty(list)) {
 		struct dentry *dentry, *parent;
 
-		cond_resched();
-
 		dentry = list_entry(list->prev, struct dentry, d_lru);
 		spin_lock(&dentry->d_lock);
 		rcu_read_lock();
@@ -1230,13 +1237,11 @@ enum d_walk_ret {
  * @parent:	start of walk
  * @data:	data passed to @enter() and @finish()
  * @enter:	callback when first entering the dentry
- * @finish:	callback when successfully finished the walk
  *
- * The @enter() and @finish() callbacks are called with d_lock held.
+ * The @enter() callbacks are called with d_lock held.
  */
 static void d_walk(struct dentry *parent, void *data,
-		   enum d_walk_ret (*enter)(void *, struct dentry *),
-		   void (*finish)(void *))
+		   enum d_walk_ret (*enter)(void *, struct dentry *))
 {
 	struct dentry *this_parent;
 	struct list_head *next;
@@ -1325,8 +1330,6 @@ ascend:
 	if (need_seqretry(&rename_lock, seq))
 		goto rename_retry;
 	rcu_read_unlock();
-	if (finish)
-		finish(data);
 
 out_unlock:
 	spin_unlock(&this_parent->d_lock);
@@ -1375,7 +1378,7 @@ int path_has_submounts(const struct path *parent)
 	struct check_mount data = { .mnt = parent->mnt, .mounted = 0 };
 
 	read_seqlock_excl(&mount_lock);
-	d_walk(parent->dentry, &data, path_check_mount, NULL);
+	d_walk(parent->dentry, &data, path_check_mount);
 	read_sequnlock_excl(&mount_lock);
 
 	return data.mounted;
@@ -1483,11 +1486,16 @@ void shrink_dcache_parent(struct dentry *parent)
 		data.start = parent;
 		data.found = 0;
 
-		d_walk(parent, &data, select_collect, NULL);
+		d_walk(parent, &data, select_collect);
+
+		if (!list_empty(&data.dispose)) {
+			shrink_dentry_list(&data.dispose);
+			continue;
+		}
+
+		cond_resched();
 		if (!data.found)
 			break;
-
-		shrink_dentry_list(&data.dispose);
 	}
 }
 EXPORT_SYMBOL(shrink_dcache_parent);
@@ -1518,7 +1526,7 @@ static enum d_walk_ret umount_check(void *_data, struct dentry *dentry)
 static void do_one_tree(struct dentry *dentry)
 {
 	shrink_dcache_parent(dentry);
-	d_walk(dentry, dentry, umount_check, NULL);
+	d_walk(dentry, dentry, umount_check);
 	d_drop(dentry);
 	dput(dentry);
 }
@@ -1542,78 +1550,48 @@ void shrink_dcache_for_umount(struct super_block *sb)
 	}
 }
 
-struct detach_data {
-	struct select_data select;
-	struct dentry *mountpoint;
-};
-static enum d_walk_ret detach_and_collect(void *_data, struct dentry *dentry)
+static enum d_walk_ret find_submount(void *_data, struct dentry *dentry)
 {
-	struct detach_data *data = _data;
-
+	struct dentry **victim = _data;
 	if (d_mountpoint(dentry)) {
 		__dget_dlock(dentry);
-		data->mountpoint = dentry;
+		*victim = dentry;
 		return D_WALK_QUIT;
 	}
-
-	return select_collect(&data->select, dentry);
-}
-
-static void check_and_drop(void *_data)
-{
-	struct detach_data *data = _data;
-
-	if (!data->mountpoint && list_empty(&data->select.dispose))
-		__d_drop(data->select.start);
+	return D_WALK_CONTINUE;
 }
 
 /**
  * d_invalidate - detach submounts, prune dcache, and drop
  * @dentry: dentry to invalidate (aka detach, prune and drop)
- *
- * no dcache lock.
- *
- * The final d_drop is done as an atomic operation relative to
- * rename_lock ensuring there are no races with d_set_mounted.  This
- * ensures there are no unhashed dentries on the path to a mountpoint.
  */
 void d_invalidate(struct dentry *dentry)
 {
-	/*
-	 * If it's already been dropped, return OK.
-	 */
+	bool had_submounts = false;
 	spin_lock(&dentry->d_lock);
 	if (d_unhashed(dentry)) {
 		spin_unlock(&dentry->d_lock);
 		return;
 	}
+	__d_drop(dentry);
 	spin_unlock(&dentry->d_lock);
 
 	/* Negative dentries can be dropped without further checks */
-	if (!dentry->d_inode) {
-		d_drop(dentry);
+	if (!dentry->d_inode)
 		return;
-	}
 
+	shrink_dcache_parent(dentry);
 	for (;;) {
-		struct detach_data data;
-
-		data.mountpoint = NULL;
-		INIT_LIST_HEAD(&data.select.dispose);
-		data.select.start = dentry;
-		data.select.found = 0;
-
-		d_walk(dentry, &data, detach_and_collect, check_and_drop);
-
-		if (!list_empty(&data.select.dispose))
-			shrink_dentry_list(&data.select.dispose);
-		else if (!data.mountpoint)
+		struct dentry *victim = NULL;
+		d_walk(dentry, &victim, find_submount);
+		if (!victim) {
+			if (had_submounts)
+				shrink_dcache_parent(dentry);
 			return;
-
-		if (data.mountpoint) {
-			detach_mounts(data.mountpoint);
-			dput(data.mountpoint);
 		}
+		had_submounts = true;
+		detach_mounts(victim);
+		dput(victim);
 	}
 }
 EXPORT_SYMBOL(d_invalidate);
@@ -1963,35 +1941,6 @@ struct dentry *d_make_root(struct inode *root_inode)
 }
 EXPORT_SYMBOL(d_make_root);
 
-static struct dentry * __d_find_any_alias(struct inode *inode)
-{
-	struct dentry *alias;
-
-	if (hlist_empty(&inode->i_dentry))
-		return NULL;
-	alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
-	__dget(alias);
-	return alias;
-}
-
-/**
- * d_find_any_alias - find any alias for a given inode
- * @inode: inode to find an alias for
- *
- * If any aliases exist for the given inode, take and return a
- * reference for one of them.  If no aliases exist, return %NULL.
- */
-struct dentry *d_find_any_alias(struct inode *inode)
-{
-	struct dentry *de;
-
-	spin_lock(&inode->i_lock);
-	de = __d_find_any_alias(inode);
-	spin_unlock(&inode->i_lock);
-	return de;
-}
-EXPORT_SYMBOL(d_find_any_alias);
-
 static struct dentry *__d_instantiate_anon(struct dentry *dentry,
 					   struct inode *inode,
 					   bool disconnected)
@@ -3134,7 +3083,7 @@ static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry)
 
 void d_genocide(struct dentry *parent)
 {
-	d_walk(parent, parent, d_genocide_kill, NULL);
+	d_walk(parent, parent, d_genocide_kill);
 }
 
 EXPORT_SYMBOL(d_genocide);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 874607b..093fb54 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -432,8 +432,8 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
 	struct bio *bio;
 
 	/*
-	 * bio_alloc() is guaranteed to return a bio when called with
-	 * __GFP_RECLAIM and we request a valid number of vectors.
+	 * bio_alloc() is guaranteed to return a bio when allowed to sleep and
+	 * we request a valid number of vectors.
 	 */
 	bio = bio_alloc(GFP_KERNEL, nr_vecs);
 
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 5243989..a5e4a22 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1037,6 +1037,7 @@ static void sctp_connect_to_sock(struct connection *con)
 	int result;
 	int addr_len;
 	struct socket *sock;
+	struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };
 
 	if (con->nodeid == 0) {
 		log_print("attempt to connect sock 0 foiled");
@@ -1080,11 +1081,22 @@ static void sctp_connect_to_sock(struct connection *con)
 	log_print("connecting to %d", con->nodeid);
 
 	/* Turn off Nagle's algorithm */
-	kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one,
+	kernel_setsockopt(sock, SOL_SCTP, SCTP_NODELAY, (char *)&one,
 			  sizeof(one));
 
+	/*
+	 * Make sock->ops->connect() function return in specified time,
+	 * since O_NONBLOCK argument in connect() function does not work here,
+	 * then, we should restore the default value of this attribute.
+	 */
+	kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&tv,
+			  sizeof(tv));
 	result = sock->ops->connect(sock, (struct sockaddr *)&daddr, addr_len,
-				   O_NONBLOCK);
+				   0);
+	memset(&tv, 0, sizeof(tv));
+	kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&tv,
+			  sizeof(tv));
+
 	if (result == -EINPROGRESS)
 		result = 0;
 	if (result == 0)
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 08d3bd6..61c9514 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -101,14 +101,20 @@ static int eventfd_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
-static __poll_t eventfd_poll(struct file *file, poll_table *wait)
+static struct wait_queue_head *
+eventfd_get_poll_head(struct file *file, __poll_t events)
+{
+	struct eventfd_ctx *ctx = file->private_data;
+
+	return &ctx->wqh;
+}
+
+static __poll_t eventfd_poll_mask(struct file *file, __poll_t eventmask)
 {
 	struct eventfd_ctx *ctx = file->private_data;
 	__poll_t events = 0;
 	u64 count;
 
-	poll_wait(file, &ctx->wqh, wait);
-
 	/*
 	 * All writes to ctx->count occur within ctx->wqh.lock.  This read
 	 * can be done outside ctx->wqh.lock because we know that poll_wait
@@ -305,7 +311,8 @@ static const struct file_operations eventfd_fops = {
 	.show_fdinfo	= eventfd_show_fdinfo,
 #endif
 	.release	= eventfd_release,
-	.poll		= eventfd_poll,
+	.get_poll_head	= eventfd_get_poll_head,
+	.poll_mask	= eventfd_poll_mask,
 	.read		= eventfd_read,
 	.write		= eventfd_write,
 	.llseek		= noop_llseek,
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 602ca42..67db22f 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -884,8 +884,7 @@ static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
 
 	pt->_key = epi->event.events;
 	if (!is_file_epoll(epi->ffd.file))
-		return epi->ffd.file->f_op->poll(epi->ffd.file, pt) &
-		       epi->event.events;
+		return vfs_poll(epi->ffd.file, pt) & epi->event.events;
 
 	ep = epi->ffd.file->private_data;
 	poll_wait(epi->ffd.file, &ep->poll_wait, pt);
@@ -2025,7 +2024,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 
 	/* The target file descriptor must support poll */
 	error = -EPERM;
-	if (!tf.file->f_op->poll)
+	if (!file_can_poll(tf.file))
 		goto error_tgt_fput;
 
 	/* Check if EPOLLWAKEUP is allowed */
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index 3c6a9c1..ddbf872 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -790,7 +790,7 @@ int ore_create(struct ore_io_state *ios)
 	for (i = 0; i < ios->oc->numdevs; i++) {
 		struct osd_request *or;
 
-		or = osd_start_request(_ios_od(ios, i), GFP_KERNEL);
+		or = osd_start_request(_ios_od(ios, i));
 		if (unlikely(!or)) {
 			ORE_ERR("%s: osd_start_request failed\n", __func__);
 			ret = -ENOMEM;
@@ -815,7 +815,7 @@ int ore_remove(struct ore_io_state *ios)
 	for (i = 0; i < ios->oc->numdevs; i++) {
 		struct osd_request *or;
 
-		or = osd_start_request(_ios_od(ios, i), GFP_KERNEL);
+		or = osd_start_request(_ios_od(ios, i));
 		if (unlikely(!or)) {
 			ORE_ERR("%s: osd_start_request failed\n", __func__);
 			ret = -ENOMEM;
@@ -847,7 +847,7 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
 		struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp];
 		struct osd_request *or;
 
-		or = osd_start_request(_ios_od(ios, dev), GFP_KERNEL);
+		or = osd_start_request(_ios_od(ios, dev));
 		if (unlikely(!or)) {
 			ORE_ERR("%s: osd_start_request failed\n", __func__);
 			ret = -ENOMEM;
@@ -966,7 +966,7 @@ int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp)
 		return 0; /* Just an empty slot */
 
 	first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1;
-	or = osd_start_request(_ios_od(ios, first_dev), GFP_KERNEL);
+	or = osd_start_request(_ios_od(ios, first_dev));
 	if (unlikely(!or)) {
 		ORE_ERR("%s: osd_start_request failed\n", __func__);
 		return -ENOMEM;
@@ -1060,7 +1060,7 @@ static int _truncate_mirrors(struct ore_io_state *ios, unsigned cur_comp,
 		struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp];
 		struct osd_request *or;
 
-		or = osd_start_request(_ios_od(ios, cur_comp), GFP_KERNEL);
+		or = osd_start_request(_ios_od(ios, cur_comp));
 		if (unlikely(!or)) {
 			ORE_ERR("%s: osd_start_request failed\n", __func__);
 			return -ENOMEM;
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 179cd5c..719a315 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -229,7 +229,7 @@ void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj)
 static int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
 		    u64 offset, void *p, unsigned length)
 {
-	struct osd_request *or = osd_start_request(od, GFP_KERNEL);
+	struct osd_request *or = osd_start_request(od);
 /*	struct osd_sense_info osi = {.key = 0};*/
 	int ret;
 
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index a42e712..229ea4d 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2390,7 +2390,7 @@ extern int ext4_init_inode_table(struct super_block *sb,
 extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate);
 
 /* mballoc.c */
-extern const struct file_operations ext4_seq_mb_groups_fops;
+extern const struct seq_operations ext4_mb_seq_groups_ops;
 extern long ext4_mb_stats;
 extern long ext4_mb_max_to_scan;
 extern int ext4_mb_init(struct super_block *);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 769a627..6884e81 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2254,7 +2254,7 @@ out:
 
 static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
 {
-	struct super_block *sb = seq->private;
+	struct super_block *sb = PDE_DATA(file_inode(seq->file));
 	ext4_group_t group;
 
 	if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
@@ -2265,7 +2265,7 @@ static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
 
 static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
 {
-	struct super_block *sb = seq->private;
+	struct super_block *sb = PDE_DATA(file_inode(seq->file));
 	ext4_group_t group;
 
 	++*pos;
@@ -2277,7 +2277,7 @@ static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
 
 static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
 {
-	struct super_block *sb = seq->private;
+	struct super_block *sb = PDE_DATA(file_inode(seq->file));
 	ext4_group_t group = (ext4_group_t) ((unsigned long) v);
 	int i;
 	int err, buddy_loaded = 0;
@@ -2330,34 +2330,13 @@ static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v)
 {
 }
 
-static const struct seq_operations ext4_mb_seq_groups_ops = {
+const struct seq_operations ext4_mb_seq_groups_ops = {
 	.start  = ext4_mb_seq_groups_start,
 	.next   = ext4_mb_seq_groups_next,
 	.stop   = ext4_mb_seq_groups_stop,
 	.show   = ext4_mb_seq_groups_show,
 };
 
-static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
-{
-	struct super_block *sb = PDE_DATA(inode);
-	int rc;
-
-	rc = seq_open(file, &ext4_mb_seq_groups_ops);
-	if (rc == 0) {
-		struct seq_file *m = file->private_data;
-		m->private = sb;
-	}
-	return rc;
-
-}
-
-const struct file_operations ext4_seq_mb_groups_fops = {
-	.open		= ext4_mb_seq_groups_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
 static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
 {
 	int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 9ebd26c..f34da0b 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -346,39 +346,9 @@ static struct kobject *ext4_root;
 
 static struct kobject *ext4_feat;
 
-#define PROC_FILE_SHOW_DEFN(name) \
-static int name##_open(struct inode *inode, struct file *file) \
-{ \
-	return single_open(file, ext4_seq_##name##_show, PDE_DATA(inode)); \
-} \
-\
-static const struct file_operations ext4_seq_##name##_fops = { \
-	.open		= name##_open, \
-	.read		= seq_read, \
-	.llseek		= seq_lseek, \
-	.release	= single_release, \
-}
-
-#define PROC_FILE_LIST(name) \
-	{ __stringify(name), &ext4_seq_##name##_fops }
-
-PROC_FILE_SHOW_DEFN(es_shrinker_info);
-PROC_FILE_SHOW_DEFN(options);
-
-static const struct ext4_proc_files {
-	const char *name;
-	const struct file_operations *fops;
-} proc_files[] = {
-	PROC_FILE_LIST(options),
-	PROC_FILE_LIST(es_shrinker_info),
-	PROC_FILE_LIST(mb_groups),
-	{ NULL, NULL },
-};
-
 int ext4_register_sysfs(struct super_block *sb)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	const struct ext4_proc_files *p;
 	int err;
 
 	init_completion(&sbi->s_kobj_unregister);
@@ -392,11 +362,14 @@ int ext4_register_sysfs(struct super_block *sb)
 
 	if (ext4_proc_root)
 		sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
-
 	if (sbi->s_proc) {
-		for (p = proc_files; p->name; p++)
-			proc_create_data(p->name, S_IRUGO, sbi->s_proc,
-					 p->fops, sb);
+		proc_create_single_data("options", S_IRUGO, sbi->s_proc,
+				ext4_seq_options_show, sb);
+		proc_create_single_data("es_shrinker_info", S_IRUGO,
+				sbi->s_proc, ext4_seq_es_shrinker_info_show,
+				sb);
+		proc_create_seq_data("mb_groups", S_IRUGO, sbi->s_proc,
+				&ext4_mb_seq_groups_ops, sb);
 	}
 	return 0;
 }
@@ -404,13 +377,9 @@ int ext4_register_sysfs(struct super_block *sb)
 void ext4_unregister_sysfs(struct super_block *sb)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	const struct ext4_proc_files *p;
 
-	if (sbi->s_proc) {
-		for (p = proc_files; p->name; p++)
-			remove_proc_entry(p->name, sbi->s_proc);
-		remove_proc_entry(sb->s_id, ext4_proc_root);
-	}
+	if (sbi->s_proc)
+		remove_proc_subtree(sb->s_id, ext4_proc_root);
 	kobject_del(&sbi->s_kobj);
 }
 
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index f33a56d..4b47ca6 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -572,23 +572,6 @@ static int iostat_info_seq_show(struct seq_file *seq, void *offset)
 	return 0;
 }
 
-#define F2FS_PROC_FILE_DEF(_name)					\
-static int _name##_open_fs(struct inode *inode, struct file *file)	\
-{									\
-	return single_open(file, _name##_seq_show, PDE_DATA(inode));	\
-}									\
-									\
-static const struct file_operations f2fs_seq_##_name##_fops = {		\
-	.open = _name##_open_fs,					\
-	.read = seq_read,						\
-	.llseek = seq_lseek,						\
-	.release = single_release,					\
-};
-
-F2FS_PROC_FILE_DEF(segment_info);
-F2FS_PROC_FILE_DEF(segment_bits);
-F2FS_PROC_FILE_DEF(iostat_info);
-
 int __init f2fs_init_sysfs(void)
 {
 	int ret;
@@ -632,12 +615,12 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi)
 		sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root);
 
 	if (sbi->s_proc) {
-		proc_create_data("segment_info", S_IRUGO, sbi->s_proc,
-				 &f2fs_seq_segment_info_fops, sb);
-		proc_create_data("segment_bits", S_IRUGO, sbi->s_proc,
-				 &f2fs_seq_segment_bits_fops, sb);
-		proc_create_data("iostat_info", S_IRUGO, sbi->s_proc,
-				&f2fs_seq_iostat_info_fops, sb);
+		proc_create_single_data("segment_info", S_IRUGO, sbi->s_proc,
+				segment_info_seq_show, sb);
+		proc_create_single_data("segment_bits", S_IRUGO, sbi->s_proc,
+				segment_bits_seq_show, sb);
+		proc_create_single_data("iostat_info", S_IRUGO, sbi->s_proc,
+				iostat_info_seq_show, sb);
 	}
 	return 0;
 }
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index 582ca73..484ce67 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -314,10 +314,6 @@ static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
 	int err;
 
 	mutex_lock(&MSDOS_SB(sb)->s_lock);
-	/*
-	 * Check whether the directory is not in use, then check
-	 * whether it is empty.
-	 */
 	err = fat_dir_empty(inode);
 	if (err)
 		goto out;
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 2649759..4f4362d 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -697,15 +697,6 @@ static int vfat_find(struct inode *dir, const struct qstr *qname,
 	return fat_search_long(dir, qname->name, len, sinfo);
 }
 
-/*
- * (nfsd's) anonymous disconnected dentry?
- * NOTE: !IS_ROOT() is not anonymous (I.e. d_splice_alias() did the job).
- */
-static int vfat_d_anon_disconn(struct dentry *dentry)
-{
-	return IS_ROOT(dentry) && (dentry->d_flags & DCACHE_DISCONNECTED);
-}
-
 static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
 				  unsigned int flags)
 {
@@ -738,8 +729,7 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
 	 * Checking "alias->d_parent == dentry->d_parent" to make sure
 	 * FS is not corrupted (especially double linked dir).
 	 */
-	if (alias && alias->d_parent == dentry->d_parent &&
-	    !vfat_d_anon_disconn(alias)) {
+	if (alias && alias->d_parent == dentry->d_parent) {
 		/*
 		 * This inode has non anonymous-DCACHE_DISCONNECTED
 		 * dentry. This means, the user did ->lookup() by an
@@ -747,7 +737,6 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
 		 *
 		 * Switch to new one for reason of locality if possible.
 		 */
-		BUG_ON(d_unhashed(alias));
 		if (!S_ISDIR(inode->i_mode))
 			d_move(alias, dentry);
 		iput(inode);
diff --git a/fs/fcntl.c b/fs/fcntl.c
index d737ff0..c421694 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -871,9 +871,9 @@ int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
 		if (fa->fa_file != filp)
 			continue;
 
-		spin_lock_irq(&fa->fa_lock);
+		write_lock_irq(&fa->fa_lock);
 		fa->fa_file = NULL;
-		spin_unlock_irq(&fa->fa_lock);
+		write_unlock_irq(&fa->fa_lock);
 
 		*fp = fa->fa_next;
 		call_rcu(&fa->fa_rcu, fasync_free_rcu);
@@ -918,13 +918,13 @@ struct fasync_struct *fasync_insert_entry(int fd, struct file *filp, struct fasy
 		if (fa->fa_file != filp)
 			continue;
 
-		spin_lock_irq(&fa->fa_lock);
+		write_lock_irq(&fa->fa_lock);
 		fa->fa_fd = fd;
-		spin_unlock_irq(&fa->fa_lock);
+		write_unlock_irq(&fa->fa_lock);
 		goto out;
 	}
 
-	spin_lock_init(&new->fa_lock);
+	rwlock_init(&new->fa_lock);
 	new->magic = FASYNC_MAGIC;
 	new->fa_file = filp;
 	new->fa_fd = fd;
@@ -987,14 +987,13 @@ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
 {
 	while (fa) {
 		struct fown_struct *fown;
-		unsigned long flags;
 
 		if (fa->magic != FASYNC_MAGIC) {
 			printk(KERN_ERR "kill_fasync: bad magic number in "
 			       "fasync_struct!\n");
 			return;
 		}
-		spin_lock_irqsave(&fa->fa_lock, flags);
+		read_lock(&fa->fa_lock);
 		if (fa->fa_file) {
 			fown = &fa->fa_file->f_owner;
 			/* Don't send SIGURG to processes which have not set a
@@ -1003,7 +1002,7 @@ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
 			if (!(sig == SIGURG && fown->signum == 0))
 				send_sigio(fown, fa->fa_fd, band);
 		}
-		spin_unlock_irqrestore(&fa->fa_lock, flags);
+		read_unlock(&fa->fa_lock);
 		fa = rcu_dereference(fa->fa_next);
 	}
 }
diff --git a/fs/filesystems.c b/fs/filesystems.c
index f2728a4..b03f57b 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -238,21 +238,9 @@ static int filesystems_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int filesystems_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, filesystems_proc_show, NULL);
-}
-
-static const struct file_operations filesystems_proc_fops = {
-	.open		= filesystems_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int __init proc_filesystems_init(void)
 {
-	proc_create("filesystems", 0, NULL, &filesystems_proc_fops);
+	proc_create_single("filesystems", 0, NULL, filesystems_proc_show);
 	return 0;
 }
 module_init(proc_filesystems_init);
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index ce4785f..a514256 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -193,13 +193,9 @@ vxfs_lookup(struct inode *dip, struct dentry *dp, unsigned int flags)
 		return ERR_PTR(-ENAMETOOLONG);
 				 
 	ino = vxfs_inode_by_name(dip, dp);
-	if (ino) {
+	if (ino)
 		ip = vxfs_iget(dip->i_sb, ino);
-		if (IS_ERR(ip))
-			return ERR_CAST(ip);
-	}
-	d_add(dp, ip);
-	return NULL;
+	return d_splice_alias(ip, dp);
 }
 
 /**
diff --git a/fs/fscache/histogram.c b/fs/fscache/histogram.c
index 15a3d04..9a13e9e 100644
--- a/fs/fscache/histogram.c
+++ b/fs/fscache/histogram.c
@@ -83,24 +83,9 @@ static void fscache_histogram_stop(struct seq_file *m, void *v)
 {
 }
 
-static const struct seq_operations fscache_histogram_ops = {
+const struct seq_operations fscache_histogram_ops = {
 	.start		= fscache_histogram_start,
 	.stop		= fscache_histogram_stop,
 	.next		= fscache_histogram_next,
 	.show		= fscache_histogram_show,
 };
-
-/*
- * open "/proc/fs/fscache/histogram" to provide latency data
- */
-static int fscache_histogram_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &fscache_histogram_ops);
-}
-
-const struct file_operations fscache_histogram_fops = {
-	.open		= fscache_histogram_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index 500650f..f83328a 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -31,6 +31,7 @@
 #include <linux/fscache-cache.h>
 #include <trace/events/fscache.h>
 #include <linux/sched.h>
+#include <linux/seq_file.h>
 
 #define FSCACHE_MIN_THREADS	4
 #define FSCACHE_MAX_THREADS	32
@@ -84,7 +85,7 @@ static inline void fscache_hist(atomic_t histogram[], unsigned long start_jif)
 	atomic_inc(&histogram[jif]);
 }
 
-extern const struct file_operations fscache_histogram_fops;
+extern const struct seq_operations fscache_histogram_ops;
 
 #else
 #define fscache_hist(hist, start_jif) do {} while (0)
@@ -294,7 +295,7 @@ static inline void fscache_stat_d(atomic_t *stat)
 
 #define __fscache_stat(stat) (stat)
 
-extern const struct file_operations fscache_stats_fops;
+int fscache_stats_show(struct seq_file *m, void *v);
 #else
 
 #define __fscache_stat(stat) (NULL)
diff --git a/fs/fscache/proc.c b/fs/fscache/proc.c
index 1d9e495..49a8c90 100644
--- a/fs/fscache/proc.c
+++ b/fs/fscache/proc.c
@@ -26,14 +26,14 @@ int __init fscache_proc_init(void)
 		goto error_dir;
 
 #ifdef CONFIG_FSCACHE_STATS
-	if (!proc_create("fs/fscache/stats", S_IFREG | 0444, NULL,
-			 &fscache_stats_fops))
+	if (!proc_create_single("fs/fscache/stats", S_IFREG | 0444, NULL,
+			fscache_stats_show))
 		goto error_stats;
 #endif
 
 #ifdef CONFIG_FSCACHE_HISTOGRAM
-	if (!proc_create("fs/fscache/histogram", S_IFREG | 0444, NULL,
-			 &fscache_histogram_fops))
+	if (!proc_create_seq("fs/fscache/histogram", S_IFREG | 0444, NULL,
+			 &fscache_histogram_ops))
 		goto error_histogram;
 #endif
 
diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c
index fcc8c2f..00564a1 100644
--- a/fs/fscache/stats.c
+++ b/fs/fscache/stats.c
@@ -138,7 +138,7 @@ atomic_t fscache_n_cache_culled_objects;
 /*
  * display the general statistics
  */
-static int fscache_stats_show(struct seq_file *m, void *v)
+int fscache_stats_show(struct seq_file *m, void *v)
 {
 	seq_puts(m, "FS-Cache statistics\n");
 
@@ -284,18 +284,3 @@ static int fscache_stats_show(struct seq_file *m, void *v)
 		   atomic_read(&fscache_n_cache_culled_objects));
 	return 0;
 }
-
-/*
- * open "/proc/fs/fscache/stats" allowing provision of a statistical summary
- */
-static int fscache_stats_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, fscache_stats_show, NULL);
-}
-
-const struct file_operations fscache_stats_fops = {
-	.open		= fscache_stats_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release        = single_release,
-};
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index f587165..35f5ee2 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -54,8 +54,7 @@ static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
 			continue;
 		if (start >= to)
 			break;
-		if (gfs2_is_jdata(ip))
-			set_buffer_uptodate(bh);
+		set_buffer_uptodate(bh);
 		gfs2_trans_add_data(ip->i_gl, bh);
 	}
 }
@@ -747,18 +746,21 @@ out:
 	put_page(page);
 
 	gfs2_trans_end(sdp);
-	if (pos + len > ip->i_inode.i_size)
-		gfs2_trim_blocks(&ip->i_inode);
-	goto out_trans_fail;
+	if (alloc_required) {
+		gfs2_inplace_release(ip);
+		if (pos + len > ip->i_inode.i_size)
+			gfs2_trim_blocks(&ip->i_inode);
+	}
+	goto out_qunlock;
 
 out_endtrans:
 	gfs2_trans_end(sdp);
 out_trans_fail:
-	if (alloc_required) {
+	if (alloc_required)
 		gfs2_inplace_release(ip);
 out_qunlock:
+	if (alloc_required)
 		gfs2_quota_unlock(ip);
-	}
 out_unlock:
 	if (&ip->i_inode == sdp->sd_rindex) {
 		gfs2_glock_dq(&m_ip->i_gh);
@@ -814,7 +816,6 @@ out:
  * @inode: The inode
  * @dibh: The buffer_head containing the on-disk inode
  * @pos: The file position
- * @len: The length of the write
  * @copied: How much was actually copied by the VFS
  * @page: The page
  *
@@ -824,17 +825,15 @@ out:
  * Returns: errno
  */
 static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
-				  loff_t pos, unsigned len, unsigned copied,
+				  loff_t pos, unsigned copied,
 				  struct page *page)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
-	struct gfs2_sbd *sdp = GFS2_SB(inode);
-	struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
 	u64 to = pos + copied;
 	void *kaddr;
 	unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode);
 
-	BUG_ON(pos + len > gfs2_max_stuffed_size(ip));
+	BUG_ON(pos + copied > gfs2_max_stuffed_size(ip));
 
 	kaddr = kmap_atomic(page);
 	memcpy(buf + pos, kaddr + pos, copied);
@@ -850,20 +849,6 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
 			i_size_write(inode, to);
 		mark_inode_dirty(inode);
 	}
-
-	if (inode == sdp->sd_rindex) {
-		adjust_fs_space(inode);
-		sdp->sd_rindex_uptodate = 0;
-	}
-
-	brelse(dibh);
-	gfs2_trans_end(sdp);
-	if (inode == sdp->sd_rindex) {
-		gfs2_glock_dq(&m_ip->i_gh);
-		gfs2_holder_uninit(&m_ip->i_gh);
-	}
-	gfs2_glock_dq(&ip->i_gh);
-	gfs2_holder_uninit(&ip->i_gh);
 	return copied;
 }
 
@@ -877,9 +862,8 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
  * @page: The page that has been written
  * @fsdata: The fsdata (unused in GFS2)
  *
- * The main write_end function for GFS2. We have a separate one for
- * stuffed files as they are slightly different, otherwise we just
- * put our locking around the VFS provided functions.
+ * The main write_end function for GFS2. We just put our locking around the VFS
+ * provided functions.
  *
  * Returns: errno
  */
@@ -900,32 +884,39 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
 	BUG_ON(gfs2_glock_is_locked_by_me(ip->i_gl) == NULL);
 
 	ret = gfs2_meta_inode_buffer(ip, &dibh);
-	if (unlikely(ret)) {
-		unlock_page(page);
-		put_page(page);
-		goto failed;
-	}
+	if (unlikely(ret))
+		goto out;
 
-	if (gfs2_is_stuffed(ip))
-		return gfs2_stuffed_write_end(inode, dibh, pos, len, copied, page);
+	if (gfs2_is_stuffed(ip)) {
+		ret = gfs2_stuffed_write_end(inode, dibh, pos, copied, page);
+		page = NULL;
+		goto out2;
+	}
 
-	if (!gfs2_is_writeback(ip))
+	if (gfs2_is_jdata(ip))
 		gfs2_page_add_databufs(ip, page, pos & ~PAGE_MASK, len);
+	else
+		gfs2_ordered_add_inode(ip);
 
 	ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
+	page = NULL;
 	if (tr->tr_num_buf_new)
 		__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
 	else
 		gfs2_trans_add_meta(ip->i_gl, dibh);
 
-
+out2:
 	if (inode == sdp->sd_rindex) {
 		adjust_fs_space(inode);
 		sdp->sd_rindex_uptodate = 0;
 	}
 
 	brelse(dibh);
-failed:
+out:
+	if (page) {
+		unlock_page(page);
+		put_page(page);
+	}
 	gfs2_trans_end(sdp);
 	gfs2_inplace_release(ip);
 	if (ip->i_qadata && ip->i_qadata->qa_qd_num)
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 278ed08..a7b586e 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -89,10 +89,12 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
 		map_bh(bh, inode->i_sb, block);
 
 	set_buffer_uptodate(bh);
-	if (!gfs2_is_jdata(ip))
-		mark_buffer_dirty(bh);
-	if (!gfs2_is_writeback(ip))
+	if (gfs2_is_jdata(ip))
 		gfs2_trans_add_data(ip->i_gl, bh);
+	else {
+		mark_buffer_dirty(bh);
+		gfs2_ordered_add_inode(ip);
+	}
 
 	if (release) {
 		unlock_page(page);
@@ -176,8 +178,8 @@ out:
 /**
  * find_metapath - Find path through the metadata tree
  * @sdp: The superblock
- * @mp: The metapath to return the result in
  * @block: The disk block to look up
+ * @mp: The metapath to return the result in
  * @height: The pre-calculated height of the metadata tree
  *
  *   This routine returns a struct metapath structure that defines a path
@@ -188,8 +190,7 @@ out:
  *   filesystem with a blocksize of 4096.
  *
  *   find_metapath() would return a struct metapath structure set to:
- *   mp_offset = 101342453, mp_height = 3, mp_list[0] = 0, mp_list[1] = 48,
- *   and mp_list[2] = 165.
+ *   mp_fheight = 3, mp_list[0] = 0, mp_list[1] = 48, and mp_list[2] = 165.
  *
  *   That means that in order to get to the block containing the byte at
  *   offset 101342453, we would load the indirect block pointed to by pointer
@@ -279,6 +280,21 @@ static inline __be64 *metapointer(unsigned int height, const struct metapath *mp
 	return p + mp->mp_list[height];
 }
 
+static inline const __be64 *metaend(unsigned int height, const struct metapath *mp)
+{
+	const struct buffer_head *bh = mp->mp_bh[height];
+	return (const __be64 *)(bh->b_data + bh->b_size);
+}
+
+static void clone_metapath(struct metapath *clone, struct metapath *mp)
+{
+	unsigned int hgt;
+
+	*clone = *mp;
+	for (hgt = 0; hgt < mp->mp_aheight; hgt++)
+		get_bh(clone->mp_bh[hgt]);
+}
+
 static void gfs2_metapath_ra(struct gfs2_glock *gl, __be64 *start, __be64 *end)
 {
 	const __be64 *t;
@@ -420,20 +436,140 @@ static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __b
 	return (ptr - first);
 }
 
-static inline void bmap_lock(struct gfs2_inode *ip, int create)
+typedef const __be64 *(*gfs2_metadata_walker)(
+		struct metapath *mp,
+		const __be64 *start, const __be64 *end,
+		u64 factor, void *data);
+
+#define WALK_STOP ((__be64 *)0)
+#define WALK_NEXT ((__be64 *)1)
+
+static int gfs2_walk_metadata(struct inode *inode, sector_t lblock,
+		u64 len, struct metapath *mp, gfs2_metadata_walker walker,
+		void *data)
 {
-	if (create)
-		down_write(&ip->i_rw_mutex);
-	else
-		down_read(&ip->i_rw_mutex);
+	struct metapath clone;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	const __be64 *start, *end, *ptr;
+	u64 factor = 1;
+	unsigned int hgt;
+	int ret = 0;
+
+	for (hgt = ip->i_height - 1; hgt >= mp->mp_aheight; hgt--)
+		factor *= sdp->sd_inptrs;
+
+	for (;;) {
+		u64 step;
+
+		/* Walk indirect block. */
+		start = metapointer(hgt, mp);
+		end = metaend(hgt, mp);
+
+		step = (end - start) * factor;
+		if (step > len)
+			end = start + DIV_ROUND_UP_ULL(len, factor);
+
+		ptr = walker(mp, start, end, factor, data);
+		if (ptr == WALK_STOP)
+			break;
+		if (step >= len)
+			break;
+		len -= step;
+		if (ptr != WALK_NEXT) {
+			BUG_ON(!*ptr);
+			mp->mp_list[hgt] += ptr - start;
+			goto fill_up_metapath;
+		}
+
+lower_metapath:
+		/* Decrease height of metapath. */
+		if (mp != &clone) {
+			clone_metapath(&clone, mp);
+			mp = &clone;
+		}
+		brelse(mp->mp_bh[hgt]);
+		mp->mp_bh[hgt] = NULL;
+		if (!hgt)
+			break;
+		hgt--;
+		factor *= sdp->sd_inptrs;
+
+		/* Advance in metadata tree. */
+		(mp->mp_list[hgt])++;
+		start = metapointer(hgt, mp);
+		end = metaend(hgt, mp);
+		if (start >= end) {
+			mp->mp_list[hgt] = 0;
+			if (!hgt)
+				break;
+			goto lower_metapath;
+		}
+
+fill_up_metapath:
+		/* Increase height of metapath. */
+		if (mp != &clone) {
+			clone_metapath(&clone, mp);
+			mp = &clone;
+		}
+		ret = fillup_metapath(ip, mp, ip->i_height - 1);
+		if (ret < 0)
+			break;
+		hgt += ret;
+		for (; ret; ret--)
+			do_div(factor, sdp->sd_inptrs);
+		mp->mp_aheight = hgt + 1;
+	}
+	if (mp == &clone)
+		release_metapath(mp);
+	return ret;
 }
 
-static inline void bmap_unlock(struct gfs2_inode *ip, int create)
+struct gfs2_hole_walker_args {
+	u64 blocks;
+};
+
+static const __be64 *gfs2_hole_walker(struct metapath *mp,
+		const __be64 *start, const __be64 *end,
+		u64 factor, void *data)
 {
-	if (create)
-		up_write(&ip->i_rw_mutex);
-	else
-		up_read(&ip->i_rw_mutex);
+	struct gfs2_hole_walker_args *args = data;
+	const __be64 *ptr;
+
+	for (ptr = start; ptr < end; ptr++) {
+		if (*ptr) {
+			args->blocks += (ptr - start) * factor;
+			if (mp->mp_aheight == mp->mp_fheight)
+				return WALK_STOP;
+			return ptr;  /* increase height */
+		}
+	}
+	args->blocks += (end - start) * factor;
+	return WALK_NEXT;
+}
+
+/**
+ * gfs2_hole_size - figure out the size of a hole
+ * @inode: The inode
+ * @lblock: The logical starting block number
+ * @len: How far to look (in blocks)
+ * @mp: The metapath at lblock
+ * @iomap: The iomap to store the hole size in
+ *
+ * This function modifies @mp.
+ *
+ * Returns: errno on error
+ */
+static int gfs2_hole_size(struct inode *inode, sector_t lblock, u64 len,
+			  struct metapath *mp, struct iomap *iomap)
+{
+	struct gfs2_hole_walker_args args = { };
+	int ret = 0;
+
+	ret = gfs2_walk_metadata(inode, lblock, len, mp, gfs2_hole_walker, &args);
+	if (!ret)
+		iomap->length = args.blocks << inode->i_blkbits;
+	return ret;
 }
 
 static inline __be64 *gfs2_indirect_init(struct metapath *mp,
@@ -462,15 +598,11 @@ enum alloc_state {
 };
 
 /**
- * gfs2_bmap_alloc - Build a metadata tree of the requested height
+ * gfs2_iomap_alloc - Build a metadata tree of the requested height
  * @inode: The GFS2 inode
- * @lblock: The logical starting block of the extent
- * @bh_map: This is used to return the mapping details
- * @zero_new: True if newly allocated blocks should be zeroed
+ * @iomap: The iomap structure
+ * @flags: iomap flags
  * @mp: The metapath, with proper height information calculated
- * @maxlen: The max number of data blocks to alloc
- * @dblock: Pointer to return the resulting new block
- * @dblks: Pointer to return the number of blocks allocated
  *
  * In this routine we may have to alloc:
  *   i) Indirect blocks to grow the metadata tree height
@@ -483,6 +615,13 @@ enum alloc_state {
  * blocks are available, there will only be one request per bmap call)
  * and uses the state machine to initialise the blocks in order.
  *
+ * Right now, this function will allocate at most one indirect block
+ * worth of data -- with a default block size of 4K, that's slightly
+ * less than 2M.  If this limitation is ever removed to allow huge
+ * allocations, we would probably still want to limit the iomap size we
+ * return to avoid stalling other tasks during huge writes; the next
+ * iomap iteration would then find the blocks already allocated.
+ *
  * Returns: errno on error
  */
 
@@ -497,6 +636,7 @@ static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
 	unsigned dblks = 0;
 	unsigned ptrs_per_blk;
 	const unsigned end_of_metadata = mp->mp_fheight - 1;
+	int ret;
 	enum alloc_state state;
 	__be64 *ptr;
 	__be64 zero_bn = 0;
@@ -507,6 +647,8 @@ static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
 
 	gfs2_trans_add_meta(ip->i_gl, dibh);
 
+	down_write(&ip->i_rw_mutex);
+
 	if (mp->mp_fheight == mp->mp_aheight) {
 		struct buffer_head *bh;
 		int eob;
@@ -542,11 +684,10 @@ static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
 	blks = dblks + iblks;
 	i = mp->mp_aheight;
 	do {
-		int error;
 		n = blks - alloced;
-		error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL);
-		if (error)
-			return error;
+		ret = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL);
+		if (ret)
+			goto out;
 		alloced += n;
 		if (state != ALLOC_DATA || gfs2_is_jdata(ip))
 			gfs2_trans_add_unrevoke(sdp, bn, n);
@@ -602,7 +743,7 @@ static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
 			dblks = n;
 			ptr = metapointer(end_of_metadata, mp);
 			iomap->addr = bn << inode->i_blkbits;
-			iomap->flags |= IOMAP_F_NEW;
+			iomap->flags |= IOMAP_F_MERGED | IOMAP_F_NEW;
 			while (n-- > 0)
 				*ptr++ = cpu_to_be64(bn++);
 			break;
@@ -612,64 +753,10 @@ static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
 	iomap->length = (u64)dblks << inode->i_blkbits;
 	ip->i_height = mp->mp_fheight;
 	gfs2_add_inode_blocks(&ip->i_inode, alloced);
-	gfs2_dinode_out(ip, mp->mp_bh[0]->b_data);
-	return 0;
-}
-
-/**
- * hole_size - figure out the size of a hole
- * @inode: The inode
- * @lblock: The logical starting block number
- * @mp: The metapath
- *
- * Returns: The hole size in bytes
- *
- */
-static u64 hole_size(struct inode *inode, sector_t lblock, struct metapath *mp)
-{
-	struct gfs2_inode *ip = GFS2_I(inode);
-	struct gfs2_sbd *sdp = GFS2_SB(inode);
-	struct metapath mp_eof;
-	u64 factor = 1;
-	int hgt;
-	u64 holesz = 0;
-	const __be64 *first, *end, *ptr;
-	const struct buffer_head *bh;
-	u64 lblock_stop = (i_size_read(inode) - 1) >> inode->i_blkbits;
-	int zeroptrs;
-	bool done = false;
-
-	/* Get another metapath, to the very last byte */
-	find_metapath(sdp, lblock_stop, &mp_eof, ip->i_height);
-	for (hgt = ip->i_height - 1; hgt >= 0 && !done; hgt--) {
-		bh = mp->mp_bh[hgt];
-		if (bh) {
-			zeroptrs = 0;
-			first = metapointer(hgt, mp);
-			end = (const __be64 *)(bh->b_data + bh->b_size);
-
-			for (ptr = first; ptr < end; ptr++) {
-				if (*ptr) {
-					done = true;
-					break;
-				} else {
-					zeroptrs++;
-				}
-			}
-		} else {
-			zeroptrs = sdp->sd_inptrs;
-		}
-		if (factor * zeroptrs >= lblock_stop - lblock + 1) {
-			holesz = lblock_stop - lblock + 1;
-			break;
-		}
-		holesz += factor * zeroptrs;
-
-		factor *= sdp->sd_inptrs;
-		if (hgt && (mp->mp_list[hgt - 1] < mp_eof.mp_list[hgt - 1]))
-			(mp->mp_list[hgt - 1])++;
-	}
-	return holesz << inode->i_blkbits;
+	gfs2_dinode_out(ip, dibh->b_data);
+out:
+	up_write(&ip->i_rw_mutex);
+	return ret;
 }
 
 static void gfs2_stuffed_iomap(struct inode *inode, struct iomap *iomap)
@@ -685,121 +772,130 @@ static void gfs2_stuffed_iomap(struct inode *inode, struct iomap *iomap)
 }
 
 /**
- * gfs2_iomap_begin - Map blocks from an inode to disk blocks
+ * gfs2_iomap_get - Map blocks from an inode to disk blocks
  * @inode: The inode
  * @pos: Starting position in bytes
  * @length: Length to map, in bytes
  * @flags: iomap flags
  * @iomap: The iomap structure
+ * @mp: The metapath
  *
  * Returns: errno
  */
-int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
-		     unsigned flags, struct iomap *iomap)
+static int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length,
+			  unsigned flags, struct iomap *iomap,
+			  struct metapath *mp)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
-	struct metapath mp = { .mp_aheight = 1, };
-	unsigned int factor = sdp->sd_sb.sb_bsize;
-	const u64 *arr = sdp->sd_heightsize;
 	__be64 *ptr;
 	sector_t lblock;
-	sector_t lend;
-	int ret = 0;
+	sector_t lblock_stop;
+	int ret;
 	int eob;
-	unsigned int len;
+	u64 len;
 	struct buffer_head *bh;
 	u8 height;
 
-	trace_gfs2_iomap_start(ip, pos, length, flags);
-	if (!length) {
-		ret = -EINVAL;
-		goto out;
-	}
+	if (!length)
+		return -EINVAL;
 
 	if (gfs2_is_stuffed(ip)) {
 		if (flags & IOMAP_REPORT) {
+			if (pos >= i_size_read(inode))
+				return -ENOENT;
 			gfs2_stuffed_iomap(inode, iomap);
-			if (pos >= iomap->length)
-				ret = -ENOENT;
-			goto out;
+			return 0;
 		}
 		BUG_ON(!(flags & IOMAP_WRITE));
 	}
-
 	lblock = pos >> inode->i_blkbits;
-	lend = (pos + length + sdp->sd_sb.sb_bsize - 1) >> inode->i_blkbits;
-
 	iomap->offset = lblock << inode->i_blkbits;
-	iomap->addr = IOMAP_NULL_ADDR;
-	iomap->type = IOMAP_HOLE;
-	iomap->length = (u64)(lend - lblock) << inode->i_blkbits;
-	iomap->flags = IOMAP_F_MERGED;
-	bmap_lock(ip, flags & IOMAP_WRITE);
+	lblock_stop = (pos + length - 1) >> inode->i_blkbits;
+	len = lblock_stop - lblock + 1;
 
-	/*
-	 * Directory data blocks have a struct gfs2_meta_header header, so the
-	 * remaining size is smaller than the filesystem block size.  Logical
-	 * block numbers for directories are in units of this remaining size!
-	 */
-	if (gfs2_is_dir(ip)) {
-		factor = sdp->sd_jbsize;
-		arr = sdp->sd_jheightsize;
-	}
+	down_read(&ip->i_rw_mutex);
 
-	ret = gfs2_meta_inode_buffer(ip, &mp.mp_bh[0]);
+	ret = gfs2_meta_inode_buffer(ip, &mp->mp_bh[0]);
 	if (ret)
-		goto out_release;
+		goto unlock;
 
 	height = ip->i_height;
-	while ((lblock + 1) * factor > arr[height])
+	while ((lblock + 1) * sdp->sd_sb.sb_bsize > sdp->sd_heightsize[height])
 		height++;
-	find_metapath(sdp, lblock, &mp, height);
+	find_metapath(sdp, lblock, mp, height);
 	if (height > ip->i_height || gfs2_is_stuffed(ip))
 		goto do_alloc;
 
-	ret = lookup_metapath(ip, &mp);
+	ret = lookup_metapath(ip, mp);
 	if (ret)
-		goto out_release;
+		goto unlock;
 
-	if (mp.mp_aheight != ip->i_height)
+	if (mp->mp_aheight != ip->i_height)
 		goto do_alloc;
 
-	ptr = metapointer(ip->i_height - 1, &mp);
+	ptr = metapointer(ip->i_height - 1, mp);
 	if (*ptr == 0)
 		goto do_alloc;
 
-	iomap->type = IOMAP_MAPPED;
-	iomap->addr = be64_to_cpu(*ptr) << inode->i_blkbits;
+	bh = mp->mp_bh[ip->i_height - 1];
+	len = gfs2_extent_length(bh->b_data, bh->b_size, ptr, len, &eob);
 
-	bh = mp.mp_bh[ip->i_height - 1];
-	len = gfs2_extent_length(bh->b_data, bh->b_size, ptr, lend - lblock, &eob);
+	iomap->addr = be64_to_cpu(*ptr) << inode->i_blkbits;
+	iomap->length = len << inode->i_blkbits;
+	iomap->type = IOMAP_MAPPED;
+	iomap->flags = IOMAP_F_MERGED;
 	if (eob)
 		iomap->flags |= IOMAP_F_BOUNDARY;
-	iomap->length = (u64)len << inode->i_blkbits;
 
-out_release:
-	release_metapath(&mp);
-	bmap_unlock(ip, flags & IOMAP_WRITE);
 out:
-	trace_gfs2_iomap_end(ip, iomap, ret);
+	iomap->bdev = inode->i_sb->s_bdev;
+unlock:
+	up_read(&ip->i_rw_mutex);
 	return ret;
 
 do_alloc:
-	if (flags & IOMAP_WRITE) {
-		ret = gfs2_iomap_alloc(inode, iomap, flags, &mp);
-	} else if (flags & IOMAP_REPORT) {
+	iomap->addr = IOMAP_NULL_ADDR;
+	iomap->length = len << inode->i_blkbits;
+	iomap->type = IOMAP_HOLE;
+	iomap->flags = 0;
+	if (flags & IOMAP_REPORT) {
 		loff_t size = i_size_read(inode);
 		if (pos >= size)
 			ret = -ENOENT;
-		else if (height <= ip->i_height)
-			iomap->length = hole_size(inode, lblock, &mp);
+		else if (height == ip->i_height)
+			ret = gfs2_hole_size(inode, lblock, len, mp, iomap);
 		else
 			iomap->length = size - pos;
 	}
-	goto out_release;
+	goto out;
+}
+
+static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
+			    unsigned flags, struct iomap *iomap)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct metapath mp = { .mp_aheight = 1, };
+	int ret;
+
+	trace_gfs2_iomap_start(ip, pos, length, flags);
+	if (flags & IOMAP_WRITE) {
+		ret = gfs2_iomap_get(inode, pos, length, flags, iomap, &mp);
+		if (!ret && iomap->type == IOMAP_HOLE)
+			ret = gfs2_iomap_alloc(inode, iomap, flags, &mp);
+		release_metapath(&mp);
+	} else {
+		ret = gfs2_iomap_get(inode, pos, length, flags, iomap, &mp);
+		release_metapath(&mp);
+	}
+	trace_gfs2_iomap_end(ip, iomap, ret);
+	return ret;
 }
 
+const struct iomap_ops gfs2_iomap_ops = {
+	.iomap_begin = gfs2_iomap_begin,
+};
+
 /**
  * gfs2_block_map - Map one or more blocks of an inode to a disk block
  * @inode: The inode
@@ -825,25 +921,34 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
 		   struct buffer_head *bh_map, int create)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
-	struct iomap iomap;
-	int ret, flags = 0;
+	loff_t pos = (loff_t)lblock << inode->i_blkbits;
+	loff_t length = bh_map->b_size;
+	struct metapath mp = { .mp_aheight = 1, };
+	struct iomap iomap = { };
+	int ret;
 
 	clear_buffer_mapped(bh_map);
 	clear_buffer_new(bh_map);
 	clear_buffer_boundary(bh_map);
 	trace_gfs2_bmap(ip, bh_map, lblock, create, 1);
 
-	if (create)
-		flags |= IOMAP_WRITE;
-	ret = gfs2_iomap_begin(inode, (loff_t)lblock << inode->i_blkbits,
-			       bh_map->b_size, flags, &iomap);
-	if (ret) {
-		if (!create && ret == -ENOENT) {
-			/* Return unmapped buffer beyond the end of file.  */
+	if (create) {
+		ret = gfs2_iomap_get(inode, pos, length, IOMAP_WRITE, &iomap, &mp);
+		if (!ret && iomap.type == IOMAP_HOLE)
+			ret = gfs2_iomap_alloc(inode, &iomap, IOMAP_WRITE, &mp);
+		release_metapath(&mp);
+	} else {
+		ret = gfs2_iomap_get(inode, pos, length, 0, &iomap, &mp);
+		release_metapath(&mp);
+
+		/* Return unmapped buffer beyond the end of file. */
+		if (ret == -ENOENT) {
 			ret = 0;
+			goto out;
 		}
-		goto out;
 	}
+	if (ret)
+		goto out;
 
 	if (iomap.length > bh_map->b_size) {
 		iomap.length = bh_map->b_size;
@@ -945,8 +1050,10 @@ static int gfs2_block_zero_range(struct inode *inode, loff_t from,
 		err = 0;
 	}
 
-	if (!gfs2_is_writeback(ip))
+	if (gfs2_is_jdata(ip))
 		gfs2_trans_add_data(ip->i_gl, bh);
+	else
+		gfs2_ordered_add_inode(ip);
 
 	zero_user(page, offset, length);
 	mark_buffer_dirty(bh);
@@ -1056,6 +1163,19 @@ out:
 	return error;
 }
 
+int gfs2_iomap_get_alloc(struct inode *inode, loff_t pos, loff_t length,
+			 struct iomap *iomap)
+{
+	struct metapath mp = { .mp_aheight = 1, };
+	int ret;
+
+	ret = gfs2_iomap_get(inode, pos, length, IOMAP_WRITE, iomap, &mp);
+	if (!ret && iomap->type == IOMAP_HOLE)
+		ret = gfs2_iomap_alloc(inode, iomap, IOMAP_WRITE, &mp);
+	release_metapath(&mp);
+	return ret;
+}
+
 /**
  * sweep_bh_for_rgrps - find an rgrp in a meta buffer and free blocks therein
  * @ip: inode
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
index c3402fe..6b18fb3 100644
--- a/fs/gfs2/bmap.h
+++ b/fs/gfs2/bmap.h
@@ -46,11 +46,13 @@ static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip,
 	}
 }
 
+extern const struct iomap_ops gfs2_iomap_ops;
+
 extern int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page);
 extern int gfs2_block_map(struct inode *inode, sector_t lblock,
 			  struct buffer_head *bh, int create);
-extern int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
-			    unsigned flags, struct iomap *iomap);
+extern int gfs2_iomap_get_alloc(struct inode *inode, loff_t pos, loff_t length,
+				struct iomap *iomap);
 extern int gfs2_extent_map(struct inode *inode, u64 lblock, int *new,
 			   u64 *dblock, unsigned *extlen);
 extern int gfs2_setattr_size(struct inode *inode, u64 size);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 4b71f021..7137db7 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -733,7 +733,7 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
 	struct gfs2_inode *ip = GFS2_I(inode);
 	loff_t end = offset + len;
 	struct buffer_head *dibh;
-	struct iomap iomap;
+	struct iomap iomap = { };
 	int error;
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
@@ -749,8 +749,8 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
 	}
 
 	while (offset < end) {
-		error = gfs2_iomap_begin(inode, offset, end - offset,
-					 IOMAP_WRITE, &iomap);
+		error = gfs2_iomap_get_alloc(inode, offset, end - offset,
+					     &iomap);
 		if (error)
 			goto out;
 		offset = iomap.offset + iomap.length;
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 1b6b1e3..d2ad817 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -116,6 +116,7 @@ static inline struct gfs2_bitmap *rbm_bi(const struct gfs2_rbm *rbm)
 
 static inline u64 gfs2_rbm_to_block(const struct gfs2_rbm *rbm)
 {
+	BUG_ON(rbm->offset >= rbm->rgd->rd_data);
 	return rbm->rgd->rd_data0 + (rbm_bi(rbm)->bi_start * GFS2_NBBY) +
 		rbm->offset;
 }
@@ -696,8 +697,6 @@ struct gfs2_sbd {
 	u32 sd_max_dirres;	/* Max blocks needed to add a directory entry */
 	u32 sd_max_height;	/* Max height of a file's metadata tree */
 	u64 sd_heightsize[GFS2_MAX_META_HEIGHT + 1];
-	u32 sd_max_jheight; /* Max height of journaled file's meta tree */
-	u64 sd_jheightsize[GFS2_MAX_META_HEIGHT + 1];
 	u32 sd_max_dents_per_leaf; /* Max number of dirents in a leaf block */
 
 	struct gfs2_args sd_args;	/* Mount arguments */
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 8700eb8..feda55f 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -2006,10 +2006,6 @@ static int gfs2_getattr(const struct path *path, struct kstat *stat,
 	return 0;
 }
 
-const struct iomap_ops gfs2_iomap_ops = {
-	.iomap_begin = gfs2_iomap_begin,
-};
-
 static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		       u64 start, u64 len)
 {
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 1862e31..2024143 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -14,6 +14,7 @@
 #include <linux/spinlock.h>
 #include <linux/writeback.h>
 #include "incore.h"
+#include "inode.h"
 
 /**
  * gfs2_log_lock - acquire the right to mess with the log manager
@@ -50,8 +51,12 @@ static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp,
 
 static inline void gfs2_ordered_add_inode(struct gfs2_inode *ip)
 {
-	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_sbd *sdp;
 
+	if (!gfs2_is_ordered(ip))
+		return;
+
+	sdp = GFS2_SB(&ip->i_inode);
 	if (!test_bit(GIF_ORDERED, &ip->i_flags)) {
 		spin_lock(&sdp->sd_ordered_lock);
 		if (!test_and_set_bit(GIF_ORDERED, &ip->i_flags))
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 3ba3f16..c2469833b 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -335,25 +335,6 @@ static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent)
 	sdp->sd_heightsize[x] = ~0;
 	gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT);
 
-	sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize -
-				 sizeof(struct gfs2_dinode);
-	sdp->sd_jheightsize[1] = sdp->sd_jbsize * sdp->sd_diptrs;
-	for (x = 2;; x++) {
-		u64 space, d;
-		u32 m;
-
-		space = sdp->sd_jheightsize[x - 1] * sdp->sd_inptrs;
-		d = space;
-		m = do_div(d, sdp->sd_inptrs);
-
-		if (d != sdp->sd_jheightsize[x - 1] || m)
-			break;
-		sdp->sd_jheightsize[x] = space;
-	}
-	sdp->sd_max_jheight = x;
-	sdp->sd_jheightsize[x] = ~0;
-	gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT);
-
 	sdp->sd_max_dents_per_leaf = (sdp->sd_sb.sb_bsize -
 				      sizeof(struct gfs2_leaf)) /
 				     GFS2_MIN_DIRENT_SIZE;
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 7a98abd..e8585df 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -735,7 +735,10 @@ static int gfs2_write_buf_to_page(struct gfs2_inode *ip, unsigned long index,
 			if (!buffer_uptodate(bh))
 				goto unlock_out;
 		}
-		gfs2_trans_add_data(ip->i_gl, bh);
+		if (gfs2_is_jdata(ip))
+			gfs2_trans_add_data(ip->i_gl, bh);
+		else
+			gfs2_ordered_add_inode(ip);
 
 		/* If we need to write to the next block as well */
 		if (to_write > (bsize - boff)) {
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 8b68391..6bc5cfe 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -372,8 +372,8 @@ static u32 gfs2_free_extlen(const struct gfs2_rbm *rrbm, u32 len)
 		start = bi->bi_bh->b_data;
 		if (bi->bi_clone)
 			start = bi->bi_clone;
-		end = start + bi->bi_bh->b_size;
 		start += bi->bi_offset;
+		end = start + bi->bi_len;
 		BUG_ON(rbm.offset & 3);
 		start += (rbm.offset / GFS2_NBBY);
 		bytes = min_t(u32, len / GFS2_NBBY, (end - start));
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index c75caca..064c9a0 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -143,32 +143,21 @@ static struct gfs2_bufdata *gfs2_alloc_bufdata(struct gfs2_glock *gl,
  * @gl: The inode glock associated with the buffer
  * @bh: The buffer to add
  *
- * This is used in two distinct cases:
- * i) In ordered write mode
- *    We put the data buffer on a list so that we can ensure that it's
- *    synced to disk at the right time
- * ii) In journaled data mode
- *    We need to journal the data block in the same way as metadata in
- *    the functions above. The difference is that here we have a tag
- *    which is two __be64's being the block number (as per meta data)
- *    and a flag which says whether the data block needs escaping or
- *    not. This means we need a new log entry for each 251 or so data
- *    blocks, which isn't an enormous overhead but twice as much as
- *    for normal metadata blocks.
+ * This is used in journaled data mode.
+ * We need to journal the data block in the same way as metadata in
+ * the functions above. The difference is that here we have a tag
+ * which is two __be64's being the block number (as per meta data)
+ * and a flag which says whether the data block needs escaping or
+ * not. This means we need a new log entry for each 251 or so data
+ * blocks, which isn't an enormous overhead but twice as much as
+ * for normal metadata blocks.
  */
 void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh)
 {
 	struct gfs2_trans *tr = current->journal_info;
 	struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
-	struct address_space *mapping = bh->b_page->mapping;
-	struct gfs2_inode *ip = GFS2_I(mapping->host);
 	struct gfs2_bufdata *bd;
 
-	if (!gfs2_is_jdata(ip)) {
-		gfs2_ordered_add_inode(ip);
-		return;
-	}
-
 	lock_buffer(bh);
 	if (buffer_pinned(bh)) {
 		set_bit(TR_TOUCHED, &tr->tr_flags);
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index 75b2542..3bf2ae0 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -31,21 +31,15 @@ static struct dentry *hfs_lookup(struct inode *dir, struct dentry *dentry,
 	hfs_cat_build_key(dir->i_sb, fd.search_key, dir->i_ino, &dentry->d_name);
 	res = hfs_brec_read(&fd, &rec, sizeof(rec));
 	if (res) {
-		hfs_find_exit(&fd);
-		if (res == -ENOENT) {
-			/* No such entry */
-			inode = NULL;
-			goto done;
-		}
-		return ERR_PTR(res);
+		if (res != -ENOENT)
+			inode = ERR_PTR(res);
+	} else {
+		inode = hfs_iget(dir->i_sb, &fd.search_key->cat, &rec);
+		if (!inode)
+			inode = ERR_PTR(-EACCES);
 	}
-	inode = hfs_iget(dir->i_sb, &fd.search_key->cat, &rec);
 	hfs_find_exit(&fd);
-	if (!inode)
-		return ERR_PTR(-EACCES);
-done:
-	d_add(dentry, inode);
-	return NULL;
+	return d_splice_alias(inode, dentry);
 }
 
 /*
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 2538b49..b3309b8 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -543,9 +543,9 @@ static struct dentry *hfs_file_lookup(struct inode *dir, struct dentry *dentry,
 	igrab(dir);
 	hlist_add_fake(&inode->i_hash);
 	mark_inode_dirty(inode);
+	dont_mount(dentry);
 out:
-	d_add(dentry, inode);
-	return NULL;
+	return d_splice_alias(inode, dentry);
 }
 
 void hfs_evict_inode(struct inode *inode)
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 15e06fb..b525437 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -122,8 +122,7 @@ again:
 	if (S_ISREG(inode->i_mode))
 		HFSPLUS_I(inode)->linkid = linkid;
 out:
-	d_add(dentry, inode);
-	return NULL;
+	return d_splice_alias(inode, dentry);
 fail:
 	hfs_find_exit(&fd);
 	return ERR_PTR(err);
diff --git a/fs/internal.h b/fs/internal.h
index e08972d..980d005 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -125,6 +125,7 @@ int do_fchmodat(int dfd, const char __user *filename, umode_t mode);
 int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group,
 		int flag);
 
+extern int open_check_o_direct(struct file *f);
 extern int vfs_open(const struct path *, struct file *, const struct cred *);
 extern struct file *filp_clone_open(struct file *);
 
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 4823431..b445b13 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -549,7 +549,7 @@ static int ioctl_fsfreeze(struct file *filp)
 {
 	struct super_block *sb = file_inode(filp)->i_sb;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
 		return -EPERM;
 
 	/* If filesystem doesn't support freeze feature, return. */
@@ -566,7 +566,7 @@ static int ioctl_fsthaw(struct file *filp)
 {
 	struct super_block *sb = file_inode(filp)->i_sb;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
 		return -EPERM;
 
 	/* Thaw */
diff --git a/fs/jfs/jfs_debug.c b/fs/jfs/jfs_debug.c
index a709076..35a5b2a 100644
--- a/fs/jfs/jfs_debug.c
+++ b/fs/jfs/jfs_debug.c
@@ -29,7 +29,6 @@
 
 #ifdef PROC_FS_JFS /* see jfs_debug.h */
 
-static struct proc_dir_entry *base;
 #ifdef CONFIG_JFS_DEBUG
 static int jfs_loglevel_proc_show(struct seq_file *m, void *v)
 {
@@ -66,43 +65,29 @@ static const struct file_operations jfs_loglevel_proc_fops = {
 };
 #endif
 
-static struct {
-	const char	*name;
-	const struct file_operations *proc_fops;
-} Entries[] = {
-#ifdef CONFIG_JFS_STATISTICS
-	{ "lmstats",	&jfs_lmstats_proc_fops, },
-	{ "txstats",	&jfs_txstats_proc_fops, },
-	{ "xtstat",	&jfs_xtstat_proc_fops, },
-	{ "mpstat",	&jfs_mpstat_proc_fops, },
-#endif
-#ifdef CONFIG_JFS_DEBUG
-	{ "TxAnchor",	&jfs_txanchor_proc_fops, },
-	{ "loglevel",	&jfs_loglevel_proc_fops }
-#endif
-};
-#define NPROCENT	ARRAY_SIZE(Entries)
-
 void jfs_proc_init(void)
 {
-	int i;
+	struct proc_dir_entry *base;
 
-	if (!(base = proc_mkdir("fs/jfs", NULL)))
+	base = proc_mkdir("fs/jfs", NULL);
+	if (!base)
 		return;
 
-	for (i = 0; i < NPROCENT; i++)
-		proc_create(Entries[i].name, 0, base, Entries[i].proc_fops);
+#ifdef CONFIG_JFS_STATISTICS
+	proc_create_single("lmstats", 0, base, jfs_lmstats_proc_show);
+	proc_create_single("txstats", 0, base, jfs_txstats_proc_show);
+	proc_create_single("xtstat", 0, base, jfs_xtstat_proc_show);
+	proc_create_single("mpstat", 0, base, jfs_mpstat_proc_show);
+#endif
+#ifdef CONFIG_JFS_DEBUG
+	proc_create_single("TxAnchor", 0, base, jfs_txanchor_proc_show);
+	proc_create("loglevel", 0, base, &jfs_loglevel_proc_fops);
+#endif
 }
 
 void jfs_proc_clean(void)
 {
-	int i;
-
-	if (base) {
-		for (i = 0; i < NPROCENT; i++)
-			remove_proc_entry(Entries[i].name, base);
-		remove_proc_entry("fs/jfs", NULL);
-	}
+	remove_proc_subtree("fs/jfs", NULL);
 }
 
 #endif /* PROC_FS_JFS */
diff --git a/fs/jfs/jfs_debug.h b/fs/jfs/jfs_debug.h
index eafd130..0d9e35d 100644
--- a/fs/jfs/jfs_debug.h
+++ b/fs/jfs/jfs_debug.h
@@ -62,7 +62,7 @@ extern void jfs_proc_clean(void);
 
 extern int jfsloglevel;
 
-extern const struct file_operations jfs_txanchor_proc_fops;
+int jfs_txanchor_proc_show(struct seq_file *m, void *v);
 
 /* information message: e.g., configuration, major event */
 #define jfs_info(fmt, arg...) do {			\
@@ -105,10 +105,10 @@ extern const struct file_operations jfs_txanchor_proc_fops;
  *	----------
  */
 #ifdef	CONFIG_JFS_STATISTICS
-extern const struct file_operations jfs_lmstats_proc_fops;
-extern const struct file_operations jfs_txstats_proc_fops;
-extern const struct file_operations jfs_mpstat_proc_fops;
-extern const struct file_operations jfs_xtstat_proc_fops;
+int jfs_lmstats_proc_show(struct seq_file *m, void *v);
+int jfs_txstats_proc_show(struct seq_file *m, void *v);
+int jfs_mpstat_proc_show(struct seq_file *m, void *v);
+int jfs_xtstat_proc_show(struct seq_file *m, void *v);
 
 #define	INCREMENT(x)		((x)++)
 #define	DECREMENT(x)		((x)--)
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 0e5d412..6b68df3 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -2493,7 +2493,7 @@ exit:
 }
 
 #ifdef CONFIG_JFS_STATISTICS
-static int jfs_lmstats_proc_show(struct seq_file *m, void *v)
+int jfs_lmstats_proc_show(struct seq_file *m, void *v)
 {
 	seq_printf(m,
 		       "JFS Logmgr stats\n"
@@ -2510,16 +2510,4 @@ static int jfs_lmstats_proc_show(struct seq_file *m, void *v)
 		       lmStat.partial_page);
 	return 0;
 }
-
-static int jfs_lmstats_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, jfs_lmstats_proc_show, NULL);
-}
-
-const struct file_operations jfs_lmstats_proc_fops = {
-	.open		= jfs_lmstats_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
 #endif /* CONFIG_JFS_STATISTICS */
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 1a3b0cc..fa2c682 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -815,7 +815,7 @@ void __invalidate_metapages(struct inode *ip, s64 addr, int len)
 }
 
 #ifdef CONFIG_JFS_STATISTICS
-static int jfs_mpstat_proc_show(struct seq_file *m, void *v)
+int jfs_mpstat_proc_show(struct seq_file *m, void *v)
 {
 	seq_printf(m,
 		       "JFS Metapage statistics\n"
@@ -828,16 +828,4 @@ static int jfs_mpstat_proc_show(struct seq_file *m, void *v)
 		       mpStat.lockwait);
 	return 0;
 }
-
-static int jfs_mpstat_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, jfs_mpstat_proc_show, NULL);
-}
-
-const struct file_operations jfs_mpstat_proc_fops = {
-	.open		= jfs_mpstat_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
 #endif
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index 4d973524..a5663cb 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -2998,7 +2998,7 @@ int jfs_sync(void *arg)
 }
 
 #if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_DEBUG)
-static int jfs_txanchor_proc_show(struct seq_file *m, void *v)
+int jfs_txanchor_proc_show(struct seq_file *m, void *v)
 {
 	char *freewait;
 	char *freelockwait;
@@ -3032,22 +3032,10 @@ static int jfs_txanchor_proc_show(struct seq_file *m, void *v)
 		       list_empty(&TxAnchor.unlock_queue) ? "" : "not ");
 	return 0;
 }
-
-static int jfs_txanchor_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, jfs_txanchor_proc_show, NULL);
-}
-
-const struct file_operations jfs_txanchor_proc_fops = {
-	.open		= jfs_txanchor_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
 #endif
 
 #if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_STATISTICS)
-static int jfs_txstats_proc_show(struct seq_file *m, void *v)
+int jfs_txstats_proc_show(struct seq_file *m, void *v)
 {
 	seq_printf(m,
 		       "JFS TxStats\n"
@@ -3072,16 +3060,4 @@ static int jfs_txstats_proc_show(struct seq_file *m, void *v)
 		       TxStat.txLockAlloc_freelock);
 	return 0;
 }
-
-static int jfs_txstats_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, jfs_txstats_proc_show, NULL);
-}
-
-const struct file_operations jfs_txstats_proc_fops = {
-	.open		= jfs_txstats_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
 #endif
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
index 5cde6d2..2c200b5 100644
--- a/fs/jfs/jfs_xtree.c
+++ b/fs/jfs/jfs_xtree.c
@@ -3874,7 +3874,7 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size)
 }
 
 #ifdef CONFIG_JFS_STATISTICS
-static int jfs_xtstat_proc_show(struct seq_file *m, void *v)
+int jfs_xtstat_proc_show(struct seq_file *m, void *v)
 {
 	seq_printf(m,
 		       "JFS Xtree statistics\n"
@@ -3887,16 +3887,4 @@ static int jfs_xtstat_proc_show(struct seq_file *m, void *v)
 		       xtStat.split);
 	return 0;
 }
-
-static int jfs_xtstat_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, jfs_xtstat_proc_show, NULL);
-}
-
-const struct file_operations jfs_xtstat_proc_fops = {
-	.open		= jfs_xtstat_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
 #endif
diff --git a/fs/locks.c b/fs/locks.c
index 62bbe8b..05e211be 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2788,22 +2788,10 @@ static const struct seq_operations locks_seq_operations = {
 	.show	= locks_show,
 };
 
-static int locks_open(struct inode *inode, struct file *filp)
-{
-	return seq_open_private(filp, &locks_seq_operations,
-					sizeof(struct locks_iterator));
-}
-
-static const struct file_operations proc_locks_operations = {
-	.open		= locks_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release_private,
-};
-
 static int __init proc_locks_init(void)
 {
-	proc_create("locks", 0, NULL, &proc_locks_operations);
+	proc_create_seq_private("locks", 0, NULL, &locks_seq_operations,
+			sizeof(struct locks_iterator), NULL);
 	return 0;
 }
 fs_initcall(proc_locks_init);
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index ccf0f00..1a6084d 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -28,13 +28,9 @@ static struct dentry *minix_lookup(struct inode * dir, struct dentry *dentry, un
 		return ERR_PTR(-ENAMETOOLONG);
 
 	ino = minix_inode_by_name(dentry);
-	if (ino) {
+	if (ino)
 		inode = minix_iget(dir->i_sb, ino);
-		if (IS_ERR(inode))
-			return ERR_CAST(inode);
-	}
-	d_add(dentry, inode);
-	return NULL;
+	return d_splice_alias(inode, dentry);
 }
 
 static int minix_mknod(struct inode * dir, struct dentry *dentry, umode_t mode, dev_t rdev)
diff --git a/fs/namei.c b/fs/namei.c
index 186bd24..6df1f61 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -984,13 +984,15 @@ static bool safe_hardlink_source(struct inode *inode)
  */
 static int may_linkat(struct path *link)
 {
-	struct inode *inode;
+	struct inode *inode = link->dentry->d_inode;
+
+	/* Inode writeback is not safe when the uid or gid are invalid. */
+	if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid))
+		return -EOVERFLOW;
 
 	if (!sysctl_protected_hardlinks)
 		return 0;
 
-	inode = link->dentry->d_inode;
-
 	/* Source inode owner (or CAP_FOWNER) can hardlink all they like,
 	 * otherwise, it must be a safe source.
 	 */
@@ -1438,10 +1440,8 @@ static int path_parent_directory(struct path *path)
 static int follow_dotdot(struct nameidata *nd)
 {
 	while(1) {
-		if (nd->path.dentry == nd->root.dentry &&
-		    nd->path.mnt == nd->root.mnt) {
+		if (path_equal(&nd->path, &nd->root))
 			break;
-		}
 		if (nd->path.dentry != nd->path.mnt->mnt_root) {
 			int ret = path_parent_directory(&nd->path);
 			if (ret)
@@ -2749,6 +2749,11 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
 	BUG_ON(!inode);
 
 	BUG_ON(victim->d_parent->d_inode != dir);
+
+	/* Inode writeback is not safe when the uid or gid are invalid. */
+	if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid))
+		return -EOVERFLOW;
+
 	audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
 
 	error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
@@ -3367,7 +3372,9 @@ finish_open_created:
 		goto out;
 	*opened |= FILE_OPENED;
 opened:
-	error = ima_file_check(file, op->acc_mode, *opened);
+	error = open_check_o_direct(file);
+	if (!error)
+		error = ima_file_check(file, op->acc_mode, *opened);
 	if (!error && will_truncate)
 		error = handle_truncate(file);
 out:
@@ -3447,6 +3454,9 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags,
 	error = finish_open(file, child, NULL, opened);
 	if (error)
 		goto out2;
+	error = open_check_o_direct(file);
+	if (error)
+		fput(file);
 out2:
 	mnt_drop_write(path.mnt);
 out:
@@ -3672,7 +3682,8 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
 	if (error)
 		return error;
 
-	if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD))
+	if ((S_ISCHR(mode) || S_ISBLK(mode)) &&
+	    !ns_capable(dentry->d_sb->s_user_ns, CAP_MKNOD))
 		return -EPERM;
 
 	if (!dir->i_op->mknod)
@@ -3847,11 +3858,11 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
 	if (error)
 		goto out;
 
-	shrink_dcache_parent(dentry);
 	error = dir->i_op->rmdir(dir, dentry);
 	if (error)
 		goto out;
 
+	shrink_dcache_parent(dentry);
 	dentry->d_inode->i_flags |= S_DEAD;
 	dont_mount(dentry);
 	detach_mounts(dentry);
@@ -4434,8 +4445,6 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		    old_dir->i_nlink >= max_links)
 			goto out;
 	}
-	if (is_dir && !(flags & RENAME_EXCHANGE) && target)
-		shrink_dcache_parent(new_dentry);
 	if (!is_dir) {
 		error = try_break_deleg(source, delegated_inode);
 		if (error)
@@ -4452,8 +4461,10 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		goto out;
 
 	if (!(flags & RENAME_EXCHANGE) && target) {
-		if (is_dir)
+		if (is_dir) {
+			shrink_dcache_parent(new_dentry);
 			target->i_flags |= S_DEAD;
+		}
 		dont_mount(new_dentry);
 		detach_mounts(new_dentry);
 	}
diff --git a/fs/namespace.c b/fs/namespace.c
index 5f75969..8ddd148 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1590,7 +1590,7 @@ static int do_umount(struct mount *mnt, int flags)
 		 * Special case for "unmounting" root ...
 		 * we just try to remount it readonly.
 		 */
-		if (!capable(CAP_SYS_ADMIN))
+		if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
 			return -EPERM;
 		down_write(&sb->s_umount);
 		if (!sb_rdonly(sb))
@@ -2333,7 +2333,7 @@ static int do_remount(struct path *path, int ms_flags, int sb_flags,
 	down_write(&sb->s_umount);
 	if (ms_flags & MS_BIND)
 		err = change_mount_flags(path->mnt, ms_flags);
-	else if (!capable(CAP_SYS_ADMIN))
+	else if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
 		err = -EPERM;
 	else
 		err = do_remount_sb(sb, sb_flags, data, 0);
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index b9129e2..bbc91d7 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -1067,7 +1067,6 @@ void nfs_clients_init(struct net *net)
 }
 
 #ifdef CONFIG_PROC_FS
-static int nfs_server_list_open(struct inode *inode, struct file *file);
 static void *nfs_server_list_start(struct seq_file *p, loff_t *pos);
 static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos);
 static void nfs_server_list_stop(struct seq_file *p, void *v);
@@ -1080,14 +1079,6 @@ static const struct seq_operations nfs_server_list_ops = {
 	.show	= nfs_server_list_show,
 };
 
-static const struct file_operations nfs_server_list_fops = {
-	.open		= nfs_server_list_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release_net,
-};
-
-static int nfs_volume_list_open(struct inode *inode, struct file *file);
 static void *nfs_volume_list_start(struct seq_file *p, loff_t *pos);
 static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos);
 static void nfs_volume_list_stop(struct seq_file *p, void *v);
@@ -1100,23 +1091,6 @@ static const struct seq_operations nfs_volume_list_ops = {
 	.show	= nfs_volume_list_show,
 };
 
-static const struct file_operations nfs_volume_list_fops = {
-	.open		= nfs_volume_list_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release_net,
-};
-
-/*
- * open "/proc/fs/nfsfs/servers" which provides a summary of servers with which
- * we're dealing
- */
-static int nfs_server_list_open(struct inode *inode, struct file *file)
-{
-	return seq_open_net(inode, file, &nfs_server_list_ops,
-			   sizeof(struct seq_net_private));
-}
-
 /*
  * set up the iterator to start reading from the server list and return the first item
  */
@@ -1185,15 +1159,6 @@ static int nfs_server_list_show(struct seq_file *m, void *v)
 }
 
 /*
- * open "/proc/fs/nfsfs/volumes" which provides a summary of extant volumes
- */
-static int nfs_volume_list_open(struct inode *inode, struct file *file)
-{
-	return seq_open_net(inode, file, &nfs_volume_list_ops,
-			   sizeof(struct seq_net_private));
-}
-
-/*
  * set up the iterator to start reading from the volume list and return the first item
  */
 static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos)
@@ -1278,14 +1243,14 @@ int nfs_fs_proc_net_init(struct net *net)
 		goto error_0;
 
 	/* a file of servers with which we're dealing */
-	p = proc_create("servers", S_IFREG|S_IRUGO,
-			nn->proc_nfsfs, &nfs_server_list_fops);
+	p = proc_create_net("servers", S_IFREG|S_IRUGO, nn->proc_nfsfs,
+			&nfs_server_list_ops, sizeof(struct seq_net_private));
 	if (!p)
 		goto error_1;
 
 	/* a file of volumes that we have mounted */
-	p = proc_create("volumes", S_IFREG|S_IRUGO,
-			nn->proc_nfsfs, &nfs_volume_list_fops);
+	p = proc_create_net("volumes", S_IFREG|S_IRUGO, nn->proc_nfsfs,
+			&nfs_volume_list_ops, sizeof(struct seq_net_private));
 	if (!p)
 		goto error_1;
 	return 0;
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index 70b8bf7..a43dfed 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -227,7 +227,7 @@ static int nfsd4_scsi_identify_device(struct block_device *bdev,
 	if (!buf)
 		return -ENOMEM;
 
-	rq = blk_get_request(q, REQ_OP_SCSI_IN, GFP_KERNEL);
+	rq = blk_get_request(q, REQ_OP_SCSI_IN, 0);
 	if (IS_ERR(rq)) {
 		error = -ENOMEM;
 		goto out_free_buf;
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index b714652..4bee3a7 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -305,11 +305,10 @@ static struct dentry *omfs_lookup(struct inode *dir, struct dentry *dentry,
 		ino_t ino = be64_to_cpu(oi->i_head.h_self);
 		brelse(bh);
 		inode = omfs_iget(dir->i_sb, ino);
-		if (IS_ERR(inode))
-			return ERR_CAST(inode);
+	} else if (bh != ERR_PTR(-ENOENT)) {
+		inode = ERR_CAST(bh);
 	}
-	d_add(dentry, inode);
-	return NULL;
+	return d_splice_alias(inode, dentry);
 }
 
 /* sanity check block's self pointer */
diff --git a/fs/open.c b/fs/open.c
index c5ee7cd..d0e955b 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -724,6 +724,16 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
 	return ksys_fchown(fd, user, group);
 }
 
+int open_check_o_direct(struct file *f)
+{
+	/* NB: we're sure to have correct a_ops only after f_op->open */
+	if (f->f_flags & O_DIRECT) {
+		if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO)
+			return -EINVAL;
+	}
+	return 0;
+}
+
 static int do_dentry_open(struct file *f,
 			  struct inode *inode,
 			  int (*open)(struct inode *, struct file *),
@@ -745,7 +755,7 @@ static int do_dentry_open(struct file *f,
 	if (unlikely(f->f_flags & O_PATH)) {
 		f->f_mode = FMODE_PATH;
 		f->f_op = &empty_fops;
-		goto done;
+		return 0;
 	}
 
 	if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
@@ -798,12 +808,7 @@ static int do_dentry_open(struct file *f,
 	f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
 
 	file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
-done:
-	/* NB: we're sure to have correct a_ops only after f_op->open */
-	error = -EINVAL;
-	if ((f->f_flags & O_DIRECT) &&
-	    (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO))
-	    	goto out_fput;
+
 	return 0;
 
 cleanup_all:
@@ -818,9 +823,6 @@ cleanup_file:
 	f->f_path.dentry = NULL;
 	f->f_inode = NULL;
 	return error;
-out_fput:
-    	fput(f);
-	return error;
 }
 
 /**
@@ -918,14 +920,20 @@ struct file *dentry_open(const struct path *path, int flags,
 	BUG_ON(!path->mnt);
 
 	f = get_empty_filp();
-	if (IS_ERR(f))
-		return f;
-
-	f->f_flags = flags;
-	error = vfs_open(path, f, cred);
-	if (error) {
-		put_filp(f);
-		return ERR_PTR(error);
+	if (!IS_ERR(f)) {
+		f->f_flags = flags;
+		error = vfs_open(path, f, cred);
+		if (!error) {
+			/* from now on we need fput() to dispose of f */
+			error = open_check_o_direct(f);
+			if (error) {
+				fput(f);
+				f = ERR_PTR(error);
+			}
+		} else { 
+			put_filp(f);
+			f = ERR_PTR(error);
+		}
 	}
 	return f;
 }
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index 2200662..607092f 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -256,8 +256,7 @@ found:
 		break;
 	}
 
-	d_add(dentry, inode);
-	return NULL;
+	return d_splice_alias(inode, dentry);
 }
 
 static int openpromfs_readdir(struct file *file, struct dir_context *ctx)
diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c
index 1b5707c..365cd73 100644
--- a/fs/orangefs/namei.c
+++ b/fs/orangefs/namei.c
@@ -110,7 +110,6 @@ static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry,
 	struct orangefs_inode_s *parent = ORANGEFS_I(dir);
 	struct orangefs_kernel_op_s *new_op;
 	struct inode *inode;
-	struct dentry *res;
 	int ret = -EINVAL;
 
 	/*
@@ -158,65 +157,18 @@ static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry,
 		     new_op->downcall.resp.lookup.refn.fs_id,
 		     ret);
 
-	if (ret < 0) {
-		if (ret == -ENOENT) {
-			/*
-			 * if no inode was found, add a negative dentry to
-			 * dcache anyway; if we don't, we don't hold expected
-			 * lookup semantics and we most noticeably break
-			 * during directory renames.
-			 *
-			 * however, if the operation failed or exited, do not
-			 * add the dentry (e.g. in the case that a touch is
-			 * issued on a file that already exists that was
-			 * interrupted during this lookup -- no need to add
-			 * another negative dentry for an existing file)
-			 */
-
-			gossip_debug(GOSSIP_NAME_DEBUG,
-				     "orangefs_lookup: Adding *negative* dentry "
-				     "%p for %pd\n",
-				     dentry,
-				     dentry);
-
-			d_add(dentry, NULL);
-			res = NULL;
-			goto out;
-		}
-
+	if (ret >= 0) {
+		orangefs_set_timeout(dentry);
+		inode = orangefs_iget(dir->i_sb, &new_op->downcall.resp.lookup.refn);
+	} else if (ret == -ENOENT) {
+		inode = NULL;
+	} else {
 		/* must be a non-recoverable error */
-		res = ERR_PTR(ret);
-		goto out;
-	}
-
-	orangefs_set_timeout(dentry);
-
-	inode = orangefs_iget(dir->i_sb, &new_op->downcall.resp.lookup.refn);
-	if (IS_ERR(inode)) {
-		gossip_debug(GOSSIP_NAME_DEBUG,
-			"error %ld from iget\n", PTR_ERR(inode));
-		res = ERR_CAST(inode);
-		goto out;
+		inode = ERR_PTR(ret);
 	}
 
-	gossip_debug(GOSSIP_NAME_DEBUG,
-		     "%s:%s:%d "
-		     "Found good inode [%lu] with count [%d]\n",
-		     __FILE__,
-		     __func__,
-		     __LINE__,
-		     inode->i_ino,
-		     (int)atomic_read(&inode->i_count));
-
-	/* update dentry/inode pair into dcache */
-	res = d_splice_alias(inode, dentry);
-
-	gossip_debug(GOSSIP_NAME_DEBUG,
-		     "Lookup success (inode ct = %d)\n",
-		     (int)atomic_read(&inode->i_count));
-out:
 	op_release(new_op);
-	return res;
+	return d_splice_alias(inode, dentry);
 }
 
 /* return 0 on success; non-zero otherwise */
diff --git a/fs/pipe.c b/fs/pipe.c
index 39d6f43..bb0840e 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -509,19 +509,22 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 	}
 }
 
-/* No kernel lock held - fine */
-static __poll_t
-pipe_poll(struct file *filp, poll_table *wait)
+static struct wait_queue_head *
+pipe_get_poll_head(struct file *filp, __poll_t events)
 {
-	__poll_t mask;
 	struct pipe_inode_info *pipe = filp->private_data;
-	int nrbufs;
 
-	poll_wait(filp, &pipe->wait, wait);
+	return &pipe->wait;
+}
+
+/* No kernel lock held - fine */
+static __poll_t pipe_poll_mask(struct file *filp, __poll_t events)
+{
+	struct pipe_inode_info *pipe = filp->private_data;
+	int nrbufs = pipe->nrbufs;
+	__poll_t mask = 0;
 
 	/* Reading only -- no need for acquiring the semaphore.  */
-	nrbufs = pipe->nrbufs;
-	mask = 0;
 	if (filp->f_mode & FMODE_READ) {
 		mask = (nrbufs > 0) ? EPOLLIN | EPOLLRDNORM : 0;
 		if (!pipe->writers && filp->f_version != pipe->w_counter)
@@ -1020,7 +1023,8 @@ const struct file_operations pipefifo_fops = {
 	.llseek		= no_llseek,
 	.read_iter	= pipe_read,
 	.write_iter	= pipe_write,
-	.poll		= pipe_poll,
+	.get_poll_head	= pipe_get_poll_head,
+	.poll_mask	= pipe_poll_mask,
 	.unlocked_ioctl	= pipe_ioctl,
 	.release	= pipe_release,
 	.fasync		= pipe_fasync,
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 72391b3..e6d7f41 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -702,25 +702,22 @@ out:
 
 static int children_seq_show(struct seq_file *seq, void *v)
 {
-	struct inode *inode = seq->private;
-	pid_t pid;
-
-	pid = pid_nr_ns(v, inode->i_sb->s_fs_info);
-	seq_printf(seq, "%d ", pid);
+	struct inode *inode = file_inode(seq->file);
 
+	seq_printf(seq, "%d ", pid_nr_ns(v, proc_pid_ns(inode)));
 	return 0;
 }
 
 static void *children_seq_start(struct seq_file *seq, loff_t *pos)
 {
-	return get_children_pid(seq->private, NULL, *pos);
+	return get_children_pid(file_inode(seq->file), NULL, *pos);
 }
 
 static void *children_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
 	struct pid *pid;
 
-	pid = get_children_pid(seq->private, v, *pos + 1);
+	pid = get_children_pid(file_inode(seq->file), v, *pos + 1);
 	put_pid(v);
 
 	++*pos;
@@ -741,17 +738,7 @@ static const struct seq_operations children_seq_ops = {
 
 static int children_seq_open(struct inode *inode, struct file *file)
 {
-	struct seq_file *m;
-	int ret;
-
-	ret = seq_open(file, &children_seq_ops);
-	if (ret)
-		return ret;
-
-	m = file->private_data;
-	m->private = inode;
-
-	return ret;
+	return seq_open(file, &children_seq_ops);
 }
 
 const struct file_operations proc_tid_children_operations = {
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 1a76d75..33ed174 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -698,7 +698,7 @@ static bool has_pid_permissions(struct pid_namespace *pid,
 
 static int proc_pid_permission(struct inode *inode, int mask)
 {
-	struct pid_namespace *pid = inode->i_sb->s_fs_info;
+	struct pid_namespace *pid = proc_pid_ns(inode);
 	struct task_struct *task;
 	bool has_perms;
 
@@ -733,13 +733,11 @@ static const struct inode_operations proc_def_inode_operations = {
 static int proc_single_show(struct seq_file *m, void *v)
 {
 	struct inode *inode = m->private;
-	struct pid_namespace *ns;
-	struct pid *pid;
+	struct pid_namespace *ns = proc_pid_ns(inode);
+	struct pid *pid = proc_pid(inode);
 	struct task_struct *task;
 	int ret;
 
-	ns = inode->i_sb->s_fs_info;
-	pid = proc_pid(inode);
 	task = get_pid_task(pid, PIDTYPE_PID);
 	if (!task)
 		return -ESRCH;
@@ -1410,7 +1408,7 @@ static const struct file_operations proc_fail_nth_operations = {
 static int sched_show(struct seq_file *m, void *v)
 {
 	struct inode *inode = m->private;
-	struct pid_namespace *ns = inode->i_sb->s_fs_info;
+	struct pid_namespace *ns = proc_pid_ns(inode);
 	struct task_struct *p;
 
 	p = get_proc_task(inode);
@@ -1782,8 +1780,8 @@ int pid_getattr(const struct path *path, struct kstat *stat,
 		u32 request_mask, unsigned int query_flags)
 {
 	struct inode *inode = d_inode(path->dentry);
+	struct pid_namespace *pid = proc_pid_ns(inode);
 	struct task_struct *task;
-	struct pid_namespace *pid = path->dentry->d_sb->s_fs_info;
 
 	generic_fillattr(inode, stat);
 
@@ -1809,15 +1807,22 @@ int pid_getattr(const struct path *path, struct kstat *stat,
 /* dentry stuff */
 
 /*
- *	Exceptional case: normally we are not allowed to unhash a busy
- * directory. In this case, however, we can do it - no aliasing problems
- * due to the way we treat inodes.
- *
+ * Set <pid>/... inode ownership (can change due to setuid(), etc.)
+ */
+void pid_update_inode(struct task_struct *task, struct inode *inode)
+{
+	task_dump_owner(task, inode->i_mode, &inode->i_uid, &inode->i_gid);
+
+	inode->i_mode &= ~(S_ISUID | S_ISGID);
+	security_task_to_inode(task, inode);
+}
+
+/*
  * Rewrite the inode's ownerships here because the owning task may have
  * performed a setuid(), etc.
  *
  */
-int pid_revalidate(struct dentry *dentry, unsigned int flags)
+static int pid_revalidate(struct dentry *dentry, unsigned int flags)
 {
 	struct inode *inode;
 	struct task_struct *task;
@@ -1829,10 +1834,7 @@ int pid_revalidate(struct dentry *dentry, unsigned int flags)
 	task = get_proc_task(inode);
 
 	if (task) {
-		task_dump_owner(task, inode->i_mode, &inode->i_uid, &inode->i_gid);
-
-		inode->i_mode &= ~(S_ISUID | S_ISGID);
-		security_task_to_inode(task, inode);
+		pid_update_inode(task, inode);
 		put_task_struct(task);
 		return 1;
 	}
@@ -1880,8 +1882,8 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx,
 	struct dentry *child, *dir = file->f_path.dentry;
 	struct qstr qname = QSTR_INIT(name, len);
 	struct inode *inode;
-	unsigned type;
-	ino_t ino;
+	unsigned type = DT_UNKNOWN;
+	ino_t ino = 1;
 
 	child = d_hash_and_lookup(dir, &qname);
 	if (!child) {
@@ -1890,22 +1892,23 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx,
 		if (IS_ERR(child))
 			goto end_instantiate;
 		if (d_in_lookup(child)) {
-			int err = instantiate(d_inode(dir), child, task, ptr);
+			struct dentry *res;
+			res = instantiate(child, task, ptr);
 			d_lookup_done(child);
-			if (err < 0) {
-				dput(child);
+			if (IS_ERR(res))
 				goto end_instantiate;
+			if (unlikely(res)) {
+				dput(child);
+				child = res;
 			}
 		}
 	}
 	inode = d_inode(child);
 	ino = inode->i_ino;
 	type = inode->i_mode >> 12;
+end_instantiate:
 	dput(child);
 	return dir_emit(ctx, name, len, ino, type);
-
-end_instantiate:
-	return dir_emit(ctx, name, len, 1, DT_UNKNOWN);
 }
 
 /*
@@ -2067,19 +2070,19 @@ static const struct inode_operations proc_map_files_link_inode_operations = {
 	.setattr	= proc_setattr,
 };
 
-static int
-proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
+static struct dentry *
+proc_map_files_instantiate(struct dentry *dentry,
 			   struct task_struct *task, const void *ptr)
 {
 	fmode_t mode = (fmode_t)(unsigned long)ptr;
 	struct proc_inode *ei;
 	struct inode *inode;
 
-	inode = proc_pid_make_inode(dir->i_sb, task, S_IFLNK |
+	inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK |
 				    ((mode & FMODE_READ ) ? S_IRUSR : 0) |
 				    ((mode & FMODE_WRITE) ? S_IWUSR : 0));
 	if (!inode)
-		return -ENOENT;
+		return ERR_PTR(-ENOENT);
 
 	ei = PROC_I(inode);
 	ei->op.proc_get_link = map_files_get_link;
@@ -2088,9 +2091,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
 	inode->i_size = 64;
 
 	d_set_d_op(dentry, &tid_map_files_dentry_operations);
-	d_add(dentry, inode);
-
-	return 0;
+	return d_splice_alias(inode, dentry);
 }
 
 static struct dentry *proc_map_files_lookup(struct inode *dir,
@@ -2099,19 +2100,19 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
 	unsigned long vm_start, vm_end;
 	struct vm_area_struct *vma;
 	struct task_struct *task;
-	int result;
+	struct dentry *result;
 	struct mm_struct *mm;
 
-	result = -ENOENT;
+	result = ERR_PTR(-ENOENT);
 	task = get_proc_task(dir);
 	if (!task)
 		goto out;
 
-	result = -EACCES;
+	result = ERR_PTR(-EACCES);
 	if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
 		goto out_put_task;
 
-	result = -ENOENT;
+	result = ERR_PTR(-ENOENT);
 	if (dname_to_vma_addr(dentry, &vm_start, &vm_end))
 		goto out_put_task;
 
@@ -2125,7 +2126,7 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
 		goto out_no_vma;
 
 	if (vma->vm_file)
-		result = proc_map_files_instantiate(dir, dentry, task,
+		result = proc_map_files_instantiate(dentry, task,
 				(void *)(unsigned long)vma->vm_file->f_mode);
 
 out_no_vma:
@@ -2134,7 +2135,7 @@ out_no_vma:
 out_put_task:
 	put_task_struct(task);
 out:
-	return ERR_PTR(result);
+	return result;
 }
 
 static const struct inode_operations proc_map_files_inode_operations = {
@@ -2337,7 +2338,7 @@ static int proc_timers_open(struct inode *inode, struct file *file)
 		return -ENOMEM;
 
 	tp->pid = proc_pid(inode);
-	tp->ns = inode->i_sb->s_fs_info;
+	tp->ns = proc_pid_ns(inode);
 	return 0;
 }
 
@@ -2435,16 +2436,16 @@ static const struct file_operations proc_pid_set_timerslack_ns_operations = {
 	.release	= single_release,
 };
 
-static int proc_pident_instantiate(struct inode *dir,
-	struct dentry *dentry, struct task_struct *task, const void *ptr)
+static struct dentry *proc_pident_instantiate(struct dentry *dentry,
+	struct task_struct *task, const void *ptr)
 {
 	const struct pid_entry *p = ptr;
 	struct inode *inode;
 	struct proc_inode *ei;
 
-	inode = proc_pid_make_inode(dir->i_sb, task, p->mode);
+	inode = proc_pid_make_inode(dentry->d_sb, task, p->mode);
 	if (!inode)
-		goto out;
+		return ERR_PTR(-ENOENT);
 
 	ei = PROC_I(inode);
 	if (S_ISDIR(inode->i_mode))
@@ -2454,13 +2455,9 @@ static int proc_pident_instantiate(struct inode *dir,
 	if (p->fop)
 		inode->i_fop = p->fop;
 	ei->op = p->op;
+	pid_update_inode(task, inode);
 	d_set_d_op(dentry, &pid_dentry_operations);
-	d_add(dentry, inode);
-	/* Close the race of the process dying before we return the dentry */
-	if (pid_revalidate(dentry, 0))
-		return 0;
-out:
-	return -ENOENT;
+	return d_splice_alias(inode, dentry);
 }
 
 static struct dentry *proc_pident_lookup(struct inode *dir, 
@@ -2468,11 +2465,9 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
 					 const struct pid_entry *ents,
 					 unsigned int nents)
 {
-	int error;
 	struct task_struct *task = get_proc_task(dir);
 	const struct pid_entry *p, *last;
-
-	error = -ENOENT;
+	struct dentry *res = ERR_PTR(-ENOENT);
 
 	if (!task)
 		goto out_no_task;
@@ -2491,11 +2486,11 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
 	if (p >= last)
 		goto out;
 
-	error = proc_pident_instantiate(dir, dentry, task, p);
+	res = proc_pident_instantiate(dentry, task, p);
 out:
 	put_task_struct(task);
 out_no_task:
-	return ERR_PTR(error);
+	return res;
 }
 
 static int proc_pident_readdir(struct file *file, struct dir_context *ctx,
@@ -3138,38 +3133,32 @@ void proc_flush_task(struct task_struct *task)
 	}
 }
 
-static int proc_pid_instantiate(struct inode *dir,
-				   struct dentry * dentry,
+static struct dentry *proc_pid_instantiate(struct dentry * dentry,
 				   struct task_struct *task, const void *ptr)
 {
 	struct inode *inode;
 
-	inode = proc_pid_make_inode(dir->i_sb, task, S_IFDIR | S_IRUGO | S_IXUGO);
+	inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO);
 	if (!inode)
-		goto out;
+		return ERR_PTR(-ENOENT);
 
 	inode->i_op = &proc_tgid_base_inode_operations;
 	inode->i_fop = &proc_tgid_base_operations;
 	inode->i_flags|=S_IMMUTABLE;
 
 	set_nlink(inode, nlink_tgid);
+	pid_update_inode(task, inode);
 
 	d_set_d_op(dentry, &pid_dentry_operations);
-
-	d_add(dentry, inode);
-	/* Close the race of the process dying before we return the dentry */
-	if (pid_revalidate(dentry, 0))
-		return 0;
-out:
-	return -ENOENT;
+	return d_splice_alias(inode, dentry);
 }
 
 struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
 {
-	int result = -ENOENT;
 	struct task_struct *task;
 	unsigned tgid;
 	struct pid_namespace *ns;
+	struct dentry *result = ERR_PTR(-ENOENT);
 
 	tgid = name_to_int(&dentry->d_name);
 	if (tgid == ~0U)
@@ -3184,10 +3173,10 @@ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsign
 	if (!task)
 		goto out;
 
-	result = proc_pid_instantiate(dir, dentry, task, NULL);
+	result = proc_pid_instantiate(dentry, task, NULL);
 	put_task_struct(task);
 out:
-	return ERR_PTR(result);
+	return result;
 }
 
 /*
@@ -3239,7 +3228,7 @@ retry:
 int proc_pid_readdir(struct file *file, struct dir_context *ctx)
 {
 	struct tgid_iter iter;
-	struct pid_namespace *ns = file_inode(file)->i_sb->s_fs_info;
+	struct pid_namespace *ns = proc_pid_ns(file_inode(file));
 	loff_t pos = ctx->pos;
 
 	if (pos >= PID_MAX_LIMIT + TGID_OFFSET)
@@ -3435,37 +3424,32 @@ static const struct inode_operations proc_tid_base_inode_operations = {
 	.setattr	= proc_setattr,
 };
 
-static int proc_task_instantiate(struct inode *dir,
-	struct dentry *dentry, struct task_struct *task, const void *ptr)
+static struct dentry *proc_task_instantiate(struct dentry *dentry,
+	struct task_struct *task, const void *ptr)
 {
 	struct inode *inode;
-	inode = proc_pid_make_inode(dir->i_sb, task, S_IFDIR | S_IRUGO | S_IXUGO);
-
+	inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO);
 	if (!inode)
-		goto out;
+		return ERR_PTR(-ENOENT);
+
 	inode->i_op = &proc_tid_base_inode_operations;
 	inode->i_fop = &proc_tid_base_operations;
-	inode->i_flags|=S_IMMUTABLE;
+	inode->i_flags |= S_IMMUTABLE;
 
 	set_nlink(inode, nlink_tid);
+	pid_update_inode(task, inode);
 
 	d_set_d_op(dentry, &pid_dentry_operations);
-
-	d_add(dentry, inode);
-	/* Close the race of the process dying before we return the dentry */
-	if (pid_revalidate(dentry, 0))
-		return 0;
-out:
-	return -ENOENT;
+	return d_splice_alias(inode, dentry);
 }
 
 static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
 {
-	int result = -ENOENT;
 	struct task_struct *task;
 	struct task_struct *leader = get_proc_task(dir);
 	unsigned tid;
 	struct pid_namespace *ns;
+	struct dentry *result = ERR_PTR(-ENOENT);
 
 	if (!leader)
 		goto out_no_task;
@@ -3485,13 +3469,13 @@ static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry
 	if (!same_thread_group(leader, task))
 		goto out_drop_task;
 
-	result = proc_task_instantiate(dir, dentry, task, NULL);
+	result = proc_task_instantiate(dentry, task, NULL);
 out_drop_task:
 	put_task_struct(task);
 out:
 	put_task_struct(leader);
 out_no_task:
-	return ERR_PTR(result);
+	return result;
 }
 
 /*
@@ -3588,7 +3572,7 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx)
 	/* f_version caches the tgid value that the last readdir call couldn't
 	 * return. lseek aka telldir automagically resets f_version to 0.
 	 */
-	ns = inode->i_sb->s_fs_info;
+	ns = proc_pid_ns(inode);
 	tid = (int)file->f_version;
 	file->f_version = 0;
 	for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns);
diff --git a/fs/proc/cmdline.c b/fs/proc/cmdline.c
index 8233e7a..fa762c5 100644
--- a/fs/proc/cmdline.c
+++ b/fs/proc/cmdline.c
@@ -11,21 +11,9 @@ static int cmdline_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int cmdline_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, cmdline_proc_show, NULL);
-}
-
-static const struct file_operations cmdline_proc_fops = {
-	.open		= cmdline_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int __init proc_cmdline_init(void)
 {
-	proc_create("cmdline", 0, NULL, &cmdline_proc_fops);
+	proc_create_single("cmdline", 0, NULL, cmdline_proc_show);
 	return 0;
 }
 fs_initcall(proc_cmdline_init);
diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c
index a8ac48a..954caf0 100644
--- a/fs/proc/consoles.c
+++ b/fs/proc/consoles.c
@@ -91,21 +91,9 @@ static const struct seq_operations consoles_op = {
 	.show	= show_console_dev
 };
 
-static int consoles_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &consoles_op);
-}
-
-static const struct file_operations proc_consoles_operations = {
-	.open		= consoles_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
 static int __init proc_consoles_init(void)
 {
-	proc_create("consoles", 0, NULL, &proc_consoles_operations);
+	proc_create_seq("consoles", 0, NULL, &consoles_op);
 	return 0;
 }
 fs_initcall(proc_consoles_init);
diff --git a/fs/proc/devices.c b/fs/proc/devices.c
index 2c7f22b..37d3869 100644
--- a/fs/proc/devices.c
+++ b/fs/proc/devices.c
@@ -51,21 +51,9 @@ static const struct seq_operations devinfo_ops = {
 	.show  = devinfo_show
 };
 
-static int devinfo_open(struct inode *inode, struct file *filp)
-{
-	return seq_open(filp, &devinfo_ops);
-}
-
-static const struct file_operations proc_devinfo_operations = {
-	.open		= devinfo_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
 static int __init proc_devices_init(void)
 {
-	proc_create("devices", 0, NULL, &proc_devinfo_operations);
+	proc_create_seq("devices", 0, NULL, &devinfo_ops);
 	return 0;
 }
 fs_initcall(proc_devices_init);
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 6b80cd1..05b9893 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -81,9 +81,41 @@ static const struct file_operations proc_fdinfo_file_operations = {
 	.release	= single_release,
 };
 
+static bool tid_fd_mode(struct task_struct *task, unsigned fd, fmode_t *mode)
+{
+	struct files_struct *files = get_files_struct(task);
+	struct file *file;
+
+	if (!files)
+		return false;
+
+	rcu_read_lock();
+	file = fcheck_files(files, fd);
+	if (file)
+		*mode = file->f_mode;
+	rcu_read_unlock();
+	put_files_struct(files);
+	return !!file;
+}
+
+static void tid_fd_update_inode(struct task_struct *task, struct inode *inode,
+				fmode_t f_mode)
+{
+	task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
+
+	if (S_ISLNK(inode->i_mode)) {
+		unsigned i_mode = S_IFLNK;
+		if (f_mode & FMODE_READ)
+			i_mode |= S_IRUSR | S_IXUSR;
+		if (f_mode & FMODE_WRITE)
+			i_mode |= S_IWUSR | S_IXUSR;
+		inode->i_mode = i_mode;
+	}
+	security_task_to_inode(task, inode);
+}
+
 static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags)
 {
-	struct files_struct *files;
 	struct task_struct *task;
 	struct inode *inode;
 	unsigned int fd;
@@ -96,35 +128,11 @@ static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags)
 	fd = proc_fd(inode);
 
 	if (task) {
-		files = get_files_struct(task);
-		if (files) {
-			struct file *file;
-
-			rcu_read_lock();
-			file = fcheck_files(files, fd);
-			if (file) {
-				unsigned f_mode = file->f_mode;
-
-				rcu_read_unlock();
-				put_files_struct(files);
-
-				task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
-
-				if (S_ISLNK(inode->i_mode)) {
-					unsigned i_mode = S_IFLNK;
-					if (f_mode & FMODE_READ)
-						i_mode |= S_IRUSR | S_IXUSR;
-					if (f_mode & FMODE_WRITE)
-						i_mode |= S_IWUSR | S_IXUSR;
-					inode->i_mode = i_mode;
-				}
-
-				security_task_to_inode(task, inode);
-				put_task_struct(task);
-				return 1;
-			}
-			rcu_read_unlock();
-			put_files_struct(files);
+		fmode_t f_mode;
+		if (tid_fd_mode(task, fd, &f_mode)) {
+			tid_fd_update_inode(task, inode, f_mode);
+			put_task_struct(task);
+			return 1;
 		}
 		put_task_struct(task);
 	}
@@ -166,34 +174,33 @@ static int proc_fd_link(struct dentry *dentry, struct path *path)
 	return ret;
 }
 
-static int
-proc_fd_instantiate(struct inode *dir, struct dentry *dentry,
-		    struct task_struct *task, const void *ptr)
+struct fd_data {
+	fmode_t mode;
+	unsigned fd;
+};
+
+static struct dentry *proc_fd_instantiate(struct dentry *dentry,
+	struct task_struct *task, const void *ptr)
 {
-	unsigned fd = (unsigned long)ptr;
+	const struct fd_data *data = ptr;
 	struct proc_inode *ei;
 	struct inode *inode;
 
-	inode = proc_pid_make_inode(dir->i_sb, task, S_IFLNK);
+	inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK);
 	if (!inode)
-		goto out;
+		return ERR_PTR(-ENOENT);
 
 	ei = PROC_I(inode);
-	ei->fd = fd;
+	ei->fd = data->fd;
 
 	inode->i_op = &proc_pid_link_inode_operations;
 	inode->i_size = 64;
 
 	ei->op.proc_get_link = proc_fd_link;
+	tid_fd_update_inode(task, inode, data->mode);
 
 	d_set_d_op(dentry, &tid_fd_dentry_operations);
-	d_add(dentry, inode);
-
-	/* Close the race of the process dying before we return the dentry */
-	if (tid_fd_revalidate(dentry, 0))
-		return 0;
- out:
-	return -ENOENT;
+	return d_splice_alias(inode, dentry);
 }
 
 static struct dentry *proc_lookupfd_common(struct inode *dir,
@@ -201,19 +208,21 @@ static struct dentry *proc_lookupfd_common(struct inode *dir,
 					   instantiate_t instantiate)
 {
 	struct task_struct *task = get_proc_task(dir);
-	int result = -ENOENT;
-	unsigned fd = name_to_int(&dentry->d_name);
+	struct fd_data data = {.fd = name_to_int(&dentry->d_name)};
+	struct dentry *result = ERR_PTR(-ENOENT);
 
 	if (!task)
 		goto out_no_task;
-	if (fd == ~0U)
+	if (data.fd == ~0U)
+		goto out;
+	if (!tid_fd_mode(task, data.fd, &data.mode))
 		goto out;
 
-	result = instantiate(dir, dentry, task, (void *)(unsigned long)fd);
+	result = instantiate(dentry, task, &data);
 out:
 	put_task_struct(task);
 out_no_task:
-	return ERR_PTR(result);
+	return result;
 }
 
 static int proc_readfd_common(struct file *file, struct dir_context *ctx,
@@ -236,17 +245,22 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,
 	for (fd = ctx->pos - 2;
 	     fd < files_fdtable(files)->max_fds;
 	     fd++, ctx->pos++) {
+		struct file *f;
+		struct fd_data data;
 		char name[10 + 1];
 		int len;
 
-		if (!fcheck_files(files, fd))
+		f = fcheck_files(files, fd);
+		if (!f)
 			continue;
+		data.mode = f->f_mode;
 		rcu_read_unlock();
+		data.fd = fd;
 
 		len = snprintf(name, sizeof(name), "%u", fd);
 		if (!proc_fill_cache(file, ctx,
 				     name, len, instantiate, p,
-				     (void *)(unsigned long)fd))
+				     &data))
 			goto out_fd_loop;
 		cond_resched();
 		rcu_read_lock();
@@ -304,31 +318,25 @@ const struct inode_operations proc_fd_inode_operations = {
 	.setattr	= proc_setattr,
 };
 
-static int
-proc_fdinfo_instantiate(struct inode *dir, struct dentry *dentry,
-			struct task_struct *task, const void *ptr)
+static struct dentry *proc_fdinfo_instantiate(struct dentry *dentry,
+	struct task_struct *task, const void *ptr)
 {
-	unsigned fd = (unsigned long)ptr;
+	const struct fd_data *data = ptr;
 	struct proc_inode *ei;
 	struct inode *inode;
 
-	inode = proc_pid_make_inode(dir->i_sb, task, S_IFREG | S_IRUSR);
+	inode = proc_pid_make_inode(dentry->d_sb, task, S_IFREG | S_IRUSR);
 	if (!inode)
-		goto out;
+		return ERR_PTR(-ENOENT);
 
 	ei = PROC_I(inode);
-	ei->fd = fd;
+	ei->fd = data->fd;
 
 	inode->i_fop = &proc_fdinfo_file_operations;
+	tid_fd_update_inode(task, inode, 0);
 
 	d_set_d_op(dentry, &tid_fd_dentry_operations);
-	d_add(dentry, inode);
-
-	/* Close the race of the process dying before we return the dentry */
-	if (tid_fd_revalidate(dentry, 0))
-		return 0;
- out:
-	return -ENOENT;
+	return d_splice_alias(inode, dentry);
 }
 
 static struct dentry *
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 2078e70..7b4d971 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -25,6 +25,7 @@
 #include <linux/spinlock.h>
 #include <linux/completion.h>
 #include <linux/uaccess.h>
+#include <linux/seq_file.h>
 
 #include "internal.h"
 
@@ -256,8 +257,7 @@ struct dentry *proc_lookup_de(struct inode *dir, struct dentry *dentry,
 		if (!inode)
 			return ERR_PTR(-ENOMEM);
 		d_set_d_op(dentry, &proc_misc_dentry_ops);
-		d_add(dentry, inode);
-		return NULL;
+		return d_splice_alias(inode, dentry);
 	}
 	read_unlock(&proc_subdir_lock);
 	return ERR_PTR(-ENOENT);
@@ -346,13 +346,12 @@ static const struct inode_operations proc_dir_inode_operations = {
 	.setattr	= proc_notify_change,
 };
 
-static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp)
+/* returns the registered entry, or frees dp and returns NULL on failure */
+struct proc_dir_entry *proc_register(struct proc_dir_entry *dir,
+		struct proc_dir_entry *dp)
 {
-	int ret;
-
-	ret = proc_alloc_inum(&dp->low_ino);
-	if (ret)
-		return ret;
+	if (proc_alloc_inum(&dp->low_ino))
+		goto out_free_entry;
 
 	write_lock(&proc_subdir_lock);
 	dp->parent = dir;
@@ -360,12 +359,16 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
 		WARN(1, "proc_dir_entry '%s/%s' already registered\n",
 		     dir->name, dp->name);
 		write_unlock(&proc_subdir_lock);
-		proc_free_inum(dp->low_ino);
-		return -EEXIST;
+		goto out_free_inum;
 	}
 	write_unlock(&proc_subdir_lock);
 
-	return 0;
+	return dp;
+out_free_inum:
+	proc_free_inum(dp->low_ino);
+out_free_entry:
+	pde_free(dp);
+	return NULL;
 }
 
 static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
@@ -443,10 +446,7 @@ struct proc_dir_entry *proc_symlink(const char *name,
 		if (ent->data) {
 			strcpy((char*)ent->data,dest);
 			ent->proc_iops = &proc_link_inode_operations;
-			if (proc_register(parent, ent) < 0) {
-				pde_free(ent);
-				ent = NULL;
-			}
+			ent = proc_register(parent, ent);
 		} else {
 			pde_free(ent);
 			ent = NULL;
@@ -470,11 +470,9 @@ struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode,
 		ent->proc_fops = &proc_dir_operations;
 		ent->proc_iops = &proc_dir_inode_operations;
 		parent->nlink++;
-		if (proc_register(parent, ent) < 0) {
-			pde_free(ent);
+		ent = proc_register(parent, ent);
+		if (!ent)
 			parent->nlink--;
-			ent = NULL;
-		}
 	}
 	return ent;
 }
@@ -505,47 +503,47 @@ struct proc_dir_entry *proc_create_mount_point(const char *name)
 		ent->proc_fops = NULL;
 		ent->proc_iops = NULL;
 		parent->nlink++;
-		if (proc_register(parent, ent) < 0) {
-			pde_free(ent);
+		ent = proc_register(parent, ent);
+		if (!ent)
 			parent->nlink--;
-			ent = NULL;
-		}
 	}
 	return ent;
 }
 EXPORT_SYMBOL(proc_create_mount_point);
 
-struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
-					struct proc_dir_entry *parent,
-					const struct file_operations *proc_fops,
-					void *data)
+struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode,
+		struct proc_dir_entry **parent, void *data)
 {
-	struct proc_dir_entry *pde;
+	struct proc_dir_entry *p;
+
 	if ((mode & S_IFMT) == 0)
 		mode |= S_IFREG;
-
-	if (!S_ISREG(mode)) {
-		WARN_ON(1);	/* use proc_mkdir() */
+	if ((mode & S_IALLUGO) == 0)
+		mode |= S_IRUGO;
+	if (WARN_ON_ONCE(!S_ISREG(mode)))
 		return NULL;
+
+	p = __proc_create(parent, name, mode, 1);
+	if (p) {
+		p->proc_iops = &proc_file_inode_operations;
+		p->data = data;
 	}
+	return p;
+}
+
+struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
+		struct proc_dir_entry *parent,
+		const struct file_operations *proc_fops, void *data)
+{
+	struct proc_dir_entry *p;
 
 	BUG_ON(proc_fops == NULL);
 
-	if ((mode & S_IALLUGO) == 0)
-		mode |= S_IRUGO;
-	pde = __proc_create(&parent, name, mode, 1);
-	if (!pde)
-		goto out;
-	pde->proc_fops = proc_fops;
-	pde->data = data;
-	pde->proc_iops = &proc_file_inode_operations;
-	if (proc_register(parent, pde) < 0)
-		goto out_free;
-	return pde;
-out_free:
-	pde_free(pde);
-out:
-	return NULL;
+	p = proc_create_reg(name, mode, &parent, data);
+	if (!p)
+		return NULL;
+	p->proc_fops = proc_fops;
+	return proc_register(parent, p);
 }
 EXPORT_SYMBOL(proc_create_data);
  
@@ -557,6 +555,67 @@ struct proc_dir_entry *proc_create(const char *name, umode_t mode,
 }
 EXPORT_SYMBOL(proc_create);
 
+static int proc_seq_open(struct inode *inode, struct file *file)
+{
+	struct proc_dir_entry *de = PDE(inode);
+
+	if (de->state_size)
+		return seq_open_private(file, de->seq_ops, de->state_size);
+	return seq_open(file, de->seq_ops);
+}
+
+static const struct file_operations proc_seq_fops = {
+	.open		= proc_seq_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+struct proc_dir_entry *proc_create_seq_private(const char *name, umode_t mode,
+		struct proc_dir_entry *parent, const struct seq_operations *ops,
+		unsigned int state_size, void *data)
+{
+	struct proc_dir_entry *p;
+
+	p = proc_create_reg(name, mode, &parent, data);
+	if (!p)
+		return NULL;
+	p->proc_fops = &proc_seq_fops;
+	p->seq_ops = ops;
+	p->state_size = state_size;
+	return proc_register(parent, p);
+}
+EXPORT_SYMBOL(proc_create_seq_private);
+
+static int proc_single_open(struct inode *inode, struct file *file)
+{
+	struct proc_dir_entry *de = PDE(inode);
+
+	return single_open(file, de->single_show, de->data);
+}
+
+static const struct file_operations proc_single_fops = {
+	.open		= proc_single_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+struct proc_dir_entry *proc_create_single_data(const char *name, umode_t mode,
+		struct proc_dir_entry *parent,
+		int (*show)(struct seq_file *, void *), void *data)
+{
+	struct proc_dir_entry *p;
+
+	p = proc_create_reg(name, mode, &parent, data);
+	if (!p)
+		return NULL;
+	p->proc_fops = &proc_single_fops;
+	p->single_show = show;
+	return proc_register(parent, p);
+}
+EXPORT_SYMBOL(proc_create_single_data);
+
 void proc_set_size(struct proc_dir_entry *de, loff_t size)
 {
 	de->size = size;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 0f1692e..43c70c9 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -44,7 +44,12 @@ struct proc_dir_entry {
 	struct completion *pde_unload_completion;
 	const struct inode_operations *proc_iops;
 	const struct file_operations *proc_fops;
+	union {
+		const struct seq_operations *seq_ops;
+		int (*single_show)(struct seq_file *, void *);
+	};
 	void *data;
+	unsigned int state_size;
 	unsigned int low_ino;
 	nlink_t nlink;
 	kuid_t uid;
@@ -57,9 +62,9 @@ struct proc_dir_entry {
 	umode_t mode;
 	u8 namelen;
 #ifdef CONFIG_64BIT
-#define SIZEOF_PDE_INLINE_NAME	(192-139)
+#define SIZEOF_PDE_INLINE_NAME	(192-155)
 #else
-#define SIZEOF_PDE_INLINE_NAME	(128-87)
+#define SIZEOF_PDE_INLINE_NAME	(128-95)
 #endif
 	char inline_name[SIZEOF_PDE_INLINE_NAME];
 } __randomize_layout;
@@ -147,14 +152,14 @@ extern const struct dentry_operations pid_dentry_operations;
 extern int pid_getattr(const struct path *, struct kstat *, u32, unsigned int);
 extern int proc_setattr(struct dentry *, struct iattr *);
 extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t);
-extern int pid_revalidate(struct dentry *, unsigned int);
+extern void pid_update_inode(struct task_struct *, struct inode *);
 extern int pid_delete_dentry(const struct dentry *);
 extern int proc_pid_readdir(struct file *, struct dir_context *);
 extern struct dentry *proc_pid_lookup(struct inode *, struct dentry *, unsigned int);
 extern loff_t mem_lseek(struct file *, loff_t, int);
 
 /* Lookups */
-typedef int instantiate_t(struct inode *, struct dentry *,
+typedef struct dentry *instantiate_t(struct dentry *,
 				     struct task_struct *, const void *);
 extern bool proc_fill_cache(struct file *, struct dir_context *, const char *, int,
 			   instantiate_t, struct task_struct *, const void *);
@@ -162,6 +167,10 @@ extern bool proc_fill_cache(struct file *, struct dir_context *, const char *, i
 /*
  * generic.c
  */
+struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode,
+		struct proc_dir_entry **parent, void *data);
+struct proc_dir_entry *proc_register(struct proc_dir_entry *dir,
+		struct proc_dir_entry *dp);
 extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int);
 struct dentry *proc_lookup_de(struct inode *, struct dentry *, struct proc_dir_entry *);
 extern int proc_readdir(struct file *, struct dir_context *);
diff --git a/fs/proc/interrupts.c b/fs/proc/interrupts.c
index 6a6bee9..cb0edc7 100644
--- a/fs/proc/interrupts.c
+++ b/fs/proc/interrupts.c
@@ -34,21 +34,9 @@ static const struct seq_operations int_seq_ops = {
 	.show  = show_interrupts
 };
 
-static int interrupts_open(struct inode *inode, struct file *filp)
-{
-	return seq_open(filp, &int_seq_ops);
-}
-
-static const struct file_operations proc_interrupts_operations = {
-	.open		= interrupts_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
 static int __init proc_interrupts_init(void)
 {
-	proc_create("interrupts", 0, NULL, &proc_interrupts_operations);
+	proc_create_seq("interrupts", 0, NULL, &int_seq_ops);
 	return 0;
 }
 fs_initcall(proc_interrupts_init);
diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
index b572cc8..d066947 100644
--- a/fs/proc/loadavg.c
+++ b/fs/proc/loadavg.c
@@ -28,21 +28,9 @@ static int loadavg_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int loadavg_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, loadavg_proc_show, NULL);
-}
-
-static const struct file_operations loadavg_proc_fops = {
-	.open		= loadavg_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int __init proc_loadavg_init(void)
 {
-	proc_create("loadavg", 0, NULL, &loadavg_proc_fops);
+	proc_create_single("loadavg", 0, NULL, loadavg_proc_show);
 	return 0;
 }
 fs_initcall(proc_loadavg_init);
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 65a72ab..2fb0484 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -149,21 +149,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int meminfo_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, meminfo_proc_show, NULL);
-}
-
-static const struct file_operations meminfo_proc_fops = {
-	.open		= meminfo_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int __init proc_meminfo_init(void)
 {
-	proc_create("meminfo", 0, NULL, &meminfo_proc_fops);
+	proc_create_single("meminfo", 0, NULL, meminfo_proc_show);
 	return 0;
 }
 fs_initcall(proc_meminfo_init);
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 59b17e5..dd2b35f 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -87,28 +87,24 @@ static const struct inode_operations proc_ns_link_inode_operations = {
 	.setattr	= proc_setattr,
 };
 
-static int proc_ns_instantiate(struct inode *dir,
-	struct dentry *dentry, struct task_struct *task, const void *ptr)
+static struct dentry *proc_ns_instantiate(struct dentry *dentry,
+	struct task_struct *task, const void *ptr)
 {
 	const struct proc_ns_operations *ns_ops = ptr;
 	struct inode *inode;
 	struct proc_inode *ei;
 
-	inode = proc_pid_make_inode(dir->i_sb, task, S_IFLNK | S_IRWXUGO);
+	inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK | S_IRWXUGO);
 	if (!inode)
-		goto out;
+		return ERR_PTR(-ENOENT);
 
 	ei = PROC_I(inode);
 	inode->i_op = &proc_ns_link_inode_operations;
 	ei->ns_ops = ns_ops;
+	pid_update_inode(task, inode);
 
 	d_set_d_op(dentry, &pid_dentry_operations);
-	d_add(dentry, inode);
-	/* Close the race of the process dying before we return the dentry */
-	if (pid_revalidate(dentry, 0))
-		return 0;
-out:
-	return -ENOENT;
+	return d_splice_alias(inode, dentry);
 }
 
 static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx)
@@ -147,12 +143,10 @@ const struct file_operations proc_ns_dir_operations = {
 static struct dentry *proc_ns_dir_lookup(struct inode *dir,
 				struct dentry *dentry, unsigned int flags)
 {
-	int error;
 	struct task_struct *task = get_proc_task(dir);
 	const struct proc_ns_operations **entry, **last;
 	unsigned int len = dentry->d_name.len;
-
-	error = -ENOENT;
+	struct dentry *res = ERR_PTR(-ENOENT);
 
 	if (!task)
 		goto out_no_task;
@@ -167,11 +161,11 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir,
 	if (entry == last)
 		goto out;
 
-	error = proc_ns_instantiate(dir, dentry, task, *entry);
+	res = proc_ns_instantiate(dentry, task, *entry);
 out:
 	put_task_struct(task);
 out_no_task:
-	return ERR_PTR(error);
+	return res;
 }
 
 const struct inode_operations proc_ns_dir_inode_operations = {
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 7563437..3b63be6 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -113,21 +113,9 @@ static const struct seq_operations proc_nommu_region_list_seqop = {
 	.show	= nommu_region_list_show
 };
 
-static int proc_nommu_region_list_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &proc_nommu_region_list_seqop);
-}
-
-static const struct file_operations proc_nommu_region_list_operations = {
-	.open    = proc_nommu_region_list_open,
-	.read    = seq_read,
-	.llseek  = seq_lseek,
-	.release = seq_release,
-};
-
 static int __init proc_nommu_init(void)
 {
-	proc_create("maps", S_IRUGO, NULL, &proc_nommu_region_list_operations);
+	proc_create_seq("maps", S_IRUGO, NULL, &proc_nommu_region_list_seqop);
 	return 0;
 }
 
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 1763f37..7d94fa0 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -38,20 +38,20 @@ static struct net *get_proc_net(const struct inode *inode)
 	return maybe_get_net(PDE_NET(PDE(inode)));
 }
 
-int seq_open_net(struct inode *ino, struct file *f,
-		 const struct seq_operations *ops, int size)
+static int seq_open_net(struct inode *inode, struct file *file)
 {
-	struct net *net;
+	unsigned int state_size = PDE(inode)->state_size;
 	struct seq_net_private *p;
+	struct net *net;
 
-	BUG_ON(size < sizeof(*p));
+	WARN_ON_ONCE(state_size < sizeof(*p));
 
-	net = get_proc_net(ino);
-	if (net == NULL)
+	net = get_proc_net(inode);
+	if (!net)
 		return -ENXIO;
 
-	p = __seq_open_private(f, ops, size);
-	if (p == NULL) {
+	p = __seq_open_private(file, PDE(inode)->seq_ops, state_size);
+	if (!p) {
 		put_net(net);
 		return -ENOMEM;
 	}
@@ -60,51 +60,83 @@ int seq_open_net(struct inode *ino, struct file *f,
 #endif
 	return 0;
 }
-EXPORT_SYMBOL_GPL(seq_open_net);
 
-int single_open_net(struct inode *inode, struct file *file,
-		int (*show)(struct seq_file *, void *))
+static int seq_release_net(struct inode *ino, struct file *f)
 {
-	int err;
-	struct net *net;
-
-	err = -ENXIO;
-	net = get_proc_net(inode);
-	if (net == NULL)
-		goto err_net;
-
-	err = single_open(file, show, net);
-	if (err < 0)
-		goto err_open;
+	struct seq_file *seq = f->private_data;
 
+	put_net(seq_file_net(seq));
+	seq_release_private(ino, f);
 	return 0;
+}
 
-err_open:
-	put_net(net);
-err_net:
-	return err;
+static const struct file_operations proc_net_seq_fops = {
+	.open		= seq_open_net,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release_net,
+};
+
+struct proc_dir_entry *proc_create_net_data(const char *name, umode_t mode,
+		struct proc_dir_entry *parent, const struct seq_operations *ops,
+		unsigned int state_size, void *data)
+{
+	struct proc_dir_entry *p;
+
+	p = proc_create_reg(name, mode, &parent, data);
+	if (!p)
+		return NULL;
+	p->proc_fops = &proc_net_seq_fops;
+	p->seq_ops = ops;
+	p->state_size = state_size;
+	return proc_register(parent, p);
 }
-EXPORT_SYMBOL_GPL(single_open_net);
+EXPORT_SYMBOL_GPL(proc_create_net_data);
 
-int seq_release_net(struct inode *ino, struct file *f)
+static int single_open_net(struct inode *inode, struct file *file)
 {
-	struct seq_file *seq;
+	struct proc_dir_entry *de = PDE(inode);
+	struct net *net;
+	int err;
 
-	seq = f->private_data;
+	net = get_proc_net(inode);
+	if (!net)
+		return -ENXIO;
 
-	put_net(seq_file_net(seq));
-	seq_release_private(ino, f);
-	return 0;
+	err = single_open(file, de->single_show, net);
+	if (err)
+		put_net(net);
+	return err;
 }
-EXPORT_SYMBOL_GPL(seq_release_net);
 
-int single_release_net(struct inode *ino, struct file *f)
+static int single_release_net(struct inode *ino, struct file *f)
 {
 	struct seq_file *seq = f->private_data;
 	put_net(seq->private);
 	return single_release(ino, f);
 }
-EXPORT_SYMBOL_GPL(single_release_net);
+
+static const struct file_operations proc_net_single_fops = {
+	.open		= single_open_net,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release_net,
+};
+
+struct proc_dir_entry *proc_create_net_single(const char *name, umode_t mode,
+		struct proc_dir_entry *parent,
+		int (*show)(struct seq_file *, void *), void *data)
+{
+	struct proc_dir_entry *p;
+
+	p = proc_create_reg(name, mode, &parent, data);
+	if (!p)
+		return NULL;
+	p->proc_fops = &proc_net_single_fops;
+	p->single_show = show;
+	return proc_register(parent, p);
+}
+EXPORT_SYMBOL_GPL(proc_create_net_single);
 
 static struct net *get_proc_task_net(struct inode *dir)
 {
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 8989936..4d765e5 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -554,9 +554,8 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
 	if (!inode)
 		goto out;
 
-	err = NULL;
 	d_set_d_op(dentry, &proc_sys_dentry_operations);
-	d_add(dentry, inode);
+	err = d_splice_alias(inode, dentry);
 
 out:
 	if (h)
@@ -684,6 +683,7 @@ static bool proc_sys_fill_cache(struct file *file,
 		if (IS_ERR(child))
 			return false;
 		if (d_in_lookup(child)) {
+			struct dentry *res;
 			inode = proc_sys_make_inode(dir->d_sb, head, table);
 			if (!inode) {
 				d_lookup_done(child);
@@ -691,7 +691,16 @@ static bool proc_sys_fill_cache(struct file *file,
 				return false;
 			}
 			d_set_d_op(child, &proc_sys_dentry_operations);
-			d_add(child, inode);
+			res = d_splice_alias(inode, child);
+			d_lookup_done(child);
+			if (unlikely(res)) {
+				if (IS_ERR(res)) {
+					dput(child);
+					return false;
+				}
+				dput(child);
+				child = res;
+			}
 		}
 	}
 	inode = d_inode(child);
diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c
index d0cf1c5..c69ff19 100644
--- a/fs/proc/proc_tty.c
+++ b/fs/proc/proc_tty.c
@@ -126,18 +126,6 @@ static const struct seq_operations tty_drivers_op = {
 	.show	= show_tty_driver
 };
 
-static int tty_drivers_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &tty_drivers_op);
-}
-
-static const struct file_operations proc_tty_drivers_operations = {
-	.open		= tty_drivers_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
 /*
  * This function is called by tty_register_driver() to handle
  * registering the driver's /proc handler into /proc/tty/driver/<foo>
@@ -147,11 +135,11 @@ void proc_tty_register_driver(struct tty_driver *driver)
 	struct proc_dir_entry *ent;
 		
 	if (!driver->driver_name || driver->proc_entry ||
-	    !driver->ops->proc_fops)
+	    !driver->ops->proc_show)
 		return;
 
-	ent = proc_create_data(driver->driver_name, 0, proc_tty_driver,
-			       driver->ops->proc_fops, driver);
+	ent = proc_create_single_data(driver->driver_name, 0, proc_tty_driver,
+			       driver->ops->proc_show, driver);
 	driver->proc_entry = ent;
 }
 
@@ -186,6 +174,6 @@ void __init proc_tty_init(void)
 	 * entry.
 	 */
 	proc_tty_driver = proc_mkdir_mode("tty/driver", S_IRUSR|S_IXUSR, NULL);
-	proc_create("tty/ldiscs", 0, NULL, &tty_ldiscs_proc_fops);
-	proc_create("tty/drivers", 0, NULL, &proc_tty_drivers_operations);
+	proc_create_seq("tty/ldiscs", 0, NULL, &tty_ldiscs_seq_ops);
+	proc_create_seq("tty/drivers", 0, NULL, &tty_drivers_op);
 }
diff --git a/fs/proc/self.c b/fs/proc/self.c
index 4d7d061..127265e 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -12,7 +12,7 @@ static const char *proc_self_get_link(struct dentry *dentry,
 				      struct inode *inode,
 				      struct delayed_call *done)
 {
-	struct pid_namespace *ns = inode->i_sb->s_fs_info;
+	struct pid_namespace *ns = proc_pid_ns(inode);
 	pid_t tgid = task_tgid_nr_ns(current, ns);
 	char *name;
 
@@ -36,7 +36,7 @@ static unsigned self_inum __ro_after_init;
 int proc_setup_self(struct super_block *s)
 {
 	struct inode *root_inode = d_inode(s->s_root);
-	struct pid_namespace *ns = s->s_fs_info;
+	struct pid_namespace *ns = proc_pid_ns(root_inode);
 	struct dentry *self;
 	
 	inode_lock(root_inode);
diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c
index 24072cc..12901dc 100644
--- a/fs/proc/softirqs.c
+++ b/fs/proc/softirqs.c
@@ -25,21 +25,9 @@ static int show_softirqs(struct seq_file *p, void *v)
 	return 0;
 }
 
-static int softirqs_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, show_softirqs, NULL);
-}
-
-static const struct file_operations proc_softirqs_operations = {
-	.open		= softirqs_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int __init proc_softirqs_init(void)
 {
-	proc_create("softirqs", 0, NULL, &proc_softirqs_operations);
+	proc_create_single("softirqs", 0, NULL, show_softirqs);
 	return 0;
 }
 fs_initcall(proc_softirqs_init);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index c486ad4..a20c6e4 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -937,7 +937,7 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
 	/*
 	 * The soft-dirty tracker uses #PF-s to catch writes
 	 * to pages, so write-protect the pte as well. See the
-	 * Documentation/vm/soft-dirty.txt for full description
+	 * Documentation/admin-guide/mm/soft-dirty.rst for full description
 	 * of how soft-dirty works.
 	 */
 	pte_t ptent = *pte;
@@ -1421,7 +1421,7 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
  * Bits 0-54  page frame number (PFN) if present
  * Bits 0-4   swap type if swapped
  * Bits 5-54  swap offset if swapped
- * Bit  55    pte is soft-dirty (see Documentation/vm/soft-dirty.txt)
+ * Bit  55    pte is soft-dirty (see Documentation/admin-guide/mm/soft-dirty.rst)
  * Bit  56    page exclusively mapped
  * Bits 57-60 zero
  * Bit  61    page is file-page or shared-anon
diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c
index 9d2efac..b905010 100644
--- a/fs/proc/thread_self.c
+++ b/fs/proc/thread_self.c
@@ -12,7 +12,7 @@ static const char *proc_thread_self_get_link(struct dentry *dentry,
 					     struct inode *inode,
 					     struct delayed_call *done)
 {
-	struct pid_namespace *ns = inode->i_sb->s_fs_info;
+	struct pid_namespace *ns = proc_pid_ns(inode);
 	pid_t tgid = task_tgid_nr_ns(current, ns);
 	pid_t pid = task_pid_nr_ns(current, ns);
 	char *name;
@@ -36,7 +36,7 @@ static unsigned thread_self_inum __ro_after_init;
 int proc_setup_thread_self(struct super_block *s)
 {
 	struct inode *root_inode = d_inode(s->s_root);
-	struct pid_namespace *ns = s->s_fs_info;
+	struct pid_namespace *ns = proc_pid_ns(root_inode);
 	struct dentry *thread_self;
 
 	inode_lock(root_inode);
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index 95a708d..3bd12f9 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -30,21 +30,9 @@ static int uptime_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int uptime_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, uptime_proc_show, NULL);
-}
-
-static const struct file_operations uptime_proc_fops = {
-	.open		= uptime_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int __init proc_uptime_init(void)
 {
-	proc_create("uptime", 0, NULL, &uptime_proc_fops);
+	proc_create_single("uptime", 0, NULL, uptime_proc_show);
 	return 0;
 }
 fs_initcall(proc_uptime_init);
diff --git a/fs/proc/version.c b/fs/proc/version.c
index 94901e8..b449f18 100644
--- a/fs/proc/version.c
+++ b/fs/proc/version.c
@@ -15,21 +15,9 @@ static int version_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int version_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, version_proc_show, NULL);
-}
-
-static const struct file_operations version_proc_fops = {
-	.open		= version_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int __init proc_version_init(void)
 {
-	proc_create("version", 0, NULL, &version_proc_fops);
+	proc_create_single("version", 0, NULL, version_proc_show);
 	return 0;
 }
 fs_initcall(proc_version_init);
diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c
index eca2787..8d72221 100644
--- a/fs/qnx4/namei.c
+++ b/fs/qnx4/namei.c
@@ -114,13 +114,9 @@ struct dentry * qnx4_lookup(struct inode *dir, struct dentry *dentry, unsigned i
 	brelse(bh);
 
 	foundinode = qnx4_iget(dir->i_sb, ino);
-	if (IS_ERR(foundinode)) {
+	if (IS_ERR(foundinode))
 		QNX4DEBUG((KERN_ERR "qnx4: lookup->iget -> error %ld\n",
 			   PTR_ERR(foundinode)));
-		return ERR_CAST(foundinode);
-	}
 out:
-	d_add(dentry, foundinode);
-
-	return NULL;
+	return d_splice_alias(foundinode, dentry);
 }
diff --git a/fs/qnx6/namei.c b/fs/qnx6/namei.c
index 72c2770..e2e98e6 100644
--- a/fs/qnx6/namei.c
+++ b/fs/qnx6/namei.c
@@ -29,15 +29,11 @@ struct dentry *qnx6_lookup(struct inode *dir, struct dentry *dentry,
 	if (ino) {
 		foundinode = qnx6_iget(dir->i_sb, ino);
 		qnx6_put_page(page);
-		if (IS_ERR(foundinode)) {
+		if (IS_ERR(foundinode))
 			pr_debug("lookup->iget ->  error %ld\n",
 				 PTR_ERR(foundinode));
-			return ERR_CAST(foundinode);
-		}
 	} else {
 		pr_debug("%s(): not found %s\n", __func__, name);
-		return NULL;
 	}
-	d_add(dentry, foundinode);
-	return NULL;
+	return d_splice_alias(foundinode, dentry);
 }
diff --git a/fs/read_write.c b/fs/read_write.c
index c4eabbf..e83bd97 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -2023,7 +2023,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
 		ret = mnt_want_write_file(dst_file);
 		if (ret) {
 			info->status = ret;
-			goto next_loop;
+			goto next_fdput;
 		}
 
 		dst_off = info->dest_offset;
@@ -2058,9 +2058,9 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
 
 next_file:
 		mnt_drop_write_file(dst_file);
-next_loop:
+next_fdput:
 		fdput(dst_fd);
-
+next_loop:
 		if (fatal_signal_pending(current))
 			goto out;
 	}
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index fe99915..e39b391 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -389,27 +389,13 @@ static int show_journal(struct seq_file *m, void *unused)
 	return 0;
 }
 
-static int r_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, PDE_DATA(inode), 
-				proc_get_parent_data(inode));
-}
-
-static const struct file_operations r_file_operations = {
-	.open = r_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
-
 static struct proc_dir_entry *proc_info_root = NULL;
 static const char proc_info_root_name[] = "fs/reiserfs";
 
 static void add_file(struct super_block *sb, char *name,
 		     int (*func) (struct seq_file *, void *))
 {
-	proc_create_data(name, 0, REISERFS_SB(sb)->procdir,
-			 &r_file_operations, func);
+	proc_create_single_data(name, 0, REISERFS_SB(sb)->procdir, func, sb);
 }
 
 int reiserfs_proc_info_init(struct super_block *sb)
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 8f06fd1..6ccb519 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -213,7 +213,7 @@ static struct dentry *romfs_lookup(struct inode *dir, struct dentry *dentry,
 				   unsigned int flags)
 {
 	unsigned long offset, maxoff;
-	struct inode *inode;
+	struct inode *inode = NULL;
 	struct romfs_inode ri;
 	const char *name;		/* got from dentry */
 	int len, ret;
@@ -233,7 +233,7 @@ static struct dentry *romfs_lookup(struct inode *dir, struct dentry *dentry,
 
 	for (;;) {
 		if (!offset || offset >= maxoff)
-			goto out0;
+			break;
 
 		ret = romfs_dev_read(dir->i_sb, offset, &ri, sizeof(ri));
 		if (ret < 0)
@@ -244,37 +244,19 @@ static struct dentry *romfs_lookup(struct inode *dir, struct dentry *dentry,
 				       len);
 		if (ret < 0)
 			goto error;
-		if (ret == 1)
+		if (ret == 1) {
+			/* Hard link handling */
+			if ((be32_to_cpu(ri.next) & ROMFH_TYPE) == ROMFH_HRD)
+				offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
+			inode = romfs_iget(dir->i_sb, offset);
 			break;
+		}
 
 		/* next entry */
 		offset = be32_to_cpu(ri.next) & ROMFH_MASK;
 	}
 
-	/* Hard link handling */
-	if ((be32_to_cpu(ri.next) & ROMFH_TYPE) == ROMFH_HRD)
-		offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
-
-	inode = romfs_iget(dir->i_sb, offset);
-	if (IS_ERR(inode)) {
-		ret = PTR_ERR(inode);
-		goto error;
-	}
-	goto outi;
-
-	/*
-	 * it's a bit funky, _lookup needs to return an error code
-	 * (negative) or a NULL, both as a dentry.  ENOENT should not
-	 * be returned, instead we need to create a negative dentry by
-	 * d_add(dentry, NULL); and return 0 as no error.
-	 * (Although as I see, it only matters on writable file
-	 * systems).
-	 */
-out0:
-	inode = NULL;
-outi:
-	d_add(dentry, inode);
-	ret = 0;
+	return d_splice_alias(inode, dentry);
 error:
 	return ERR_PTR(ret);
 }
diff --git a/fs/select.c b/fs/select.c
index ba879c5..bc3cc0f 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -34,6 +34,29 @@
 
 #include <linux/uaccess.h>
 
+__poll_t vfs_poll(struct file *file, struct poll_table_struct *pt)
+{
+	if (file->f_op->poll) {
+		return file->f_op->poll(file, pt);
+	} else if (file_has_poll_mask(file)) {
+		unsigned int events = poll_requested_events(pt);
+		struct wait_queue_head *head;
+
+		if (pt && pt->_qproc) {
+			head = file->f_op->get_poll_head(file, events);
+			if (!head)
+				return DEFAULT_POLLMASK;
+			if (IS_ERR(head))
+				return EPOLLERR;
+			pt->_qproc(file, head, pt);
+		}
+
+		return file->f_op->poll_mask(file, events);
+	} else {
+		return DEFAULT_POLLMASK;
+	}
+}
+EXPORT_SYMBOL_GPL(vfs_poll);
 
 /*
  * Estimate expected accuracy in ns from a timeval.
@@ -233,7 +256,7 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
 	add_wait_queue(wait_address, &entry->wait);
 }
 
-int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
+static int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
 			  ktime_t *expires, unsigned long slack)
 {
 	int rc = -EINTR;
@@ -258,7 +281,6 @@ int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
 
 	return rc;
 }
-EXPORT_SYMBOL(poll_schedule_timeout);
 
 /**
  * poll_select_set_timeout - helper function to setup the timeout value
@@ -503,14 +525,10 @@ static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
 					continue;
 				f = fdget(i);
 				if (f.file) {
-					const struct file_operations *f_op;
-					f_op = f.file->f_op;
-					mask = DEFAULT_POLLMASK;
-					if (f_op->poll) {
-						wait_key_set(wait, in, out,
-							     bit, busy_flag);
-						mask = (*f_op->poll)(f.file, wait);
-					}
+					wait_key_set(wait, in, out, bit,
+						     busy_flag);
+					mask = vfs_poll(f.file, wait);
+
 					fdput(f);
 					if ((mask & POLLIN_SET) && (in & bit)) {
 						res_in |= bit;
@@ -813,34 +831,29 @@ static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait,
 				     bool *can_busy_poll,
 				     __poll_t busy_flag)
 {
-	__poll_t mask;
-	int fd;
-
-	mask = 0;
-	fd = pollfd->fd;
-	if (fd >= 0) {
-		struct fd f = fdget(fd);
-		mask = EPOLLNVAL;
-		if (f.file) {
-			/* userland u16 ->events contains POLL... bitmap */
-			__poll_t filter = demangle_poll(pollfd->events) |
-						EPOLLERR | EPOLLHUP;
-			mask = DEFAULT_POLLMASK;
-			if (f.file->f_op->poll) {
-				pwait->_key = filter;
-				pwait->_key |= busy_flag;
-				mask = f.file->f_op->poll(f.file, pwait);
-				if (mask & busy_flag)
-					*can_busy_poll = true;
-			}
-			/* Mask out unneeded events. */
-			mask &= filter;
-			fdput(f);
-		}
-	}
+	int fd = pollfd->fd;
+	__poll_t mask = 0, filter;
+	struct fd f;
+
+	if (fd < 0)
+		goto out;
+	mask = EPOLLNVAL;
+	f = fdget(fd);
+	if (!f.file)
+		goto out;
+
+	/* userland u16 ->events contains POLL... bitmap */
+	filter = demangle_poll(pollfd->events) | EPOLLERR | EPOLLHUP;
+	pwait->_key = filter | busy_flag;
+	mask = vfs_poll(f.file, pwait);
+	if (mask & busy_flag)
+		*can_busy_poll = true;
+	mask &= filter;		/* Mask out unneeded events. */
+	fdput(f);
+
+out:
 	/* ... and so does ->revents */
 	pollfd->revents = mangle_poll(mask);
-
 	return mask;
 }
 
diff --git a/fs/signalfd.c b/fs/signalfd.c
index d2187a8..cbb42f7 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -81,83 +81,86 @@ static __poll_t signalfd_poll(struct file *file, poll_table *wait)
 static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
 			     siginfo_t const *kinfo)
 {
-	long err;
+	struct signalfd_siginfo new;
 
 	BUILD_BUG_ON(sizeof(struct signalfd_siginfo) != 128);
 
 	/*
 	 * Unused members should be zero ...
 	 */
-	err = __clear_user(uinfo, sizeof(*uinfo));
+	memset(&new, 0, sizeof(new));
 
 	/*
 	 * If you change siginfo_t structure, please be sure
 	 * this code is fixed accordingly.
 	 */
-	err |= __put_user(kinfo->si_signo, &uinfo->ssi_signo);
-	err |= __put_user(kinfo->si_errno, &uinfo->ssi_errno);
-	err |= __put_user(kinfo->si_code, &uinfo->ssi_code);
+	new.ssi_signo = kinfo->si_signo;
+	new.ssi_errno = kinfo->si_errno;
+	new.ssi_code  = kinfo->si_code;
 	switch (siginfo_layout(kinfo->si_signo, kinfo->si_code)) {
 	case SIL_KILL:
-		err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid);
-		err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid);
+		new.ssi_pid = kinfo->si_pid;
+		new.ssi_uid = kinfo->si_uid;
 		break;
 	case SIL_TIMER:
-		 err |= __put_user(kinfo->si_tid, &uinfo->ssi_tid);
-		 err |= __put_user(kinfo->si_overrun, &uinfo->ssi_overrun);
-		 err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr);
-		 err |= __put_user(kinfo->si_int, &uinfo->ssi_int);
+		new.ssi_tid = kinfo->si_tid;
+		new.ssi_overrun = kinfo->si_overrun;
+		new.ssi_ptr = (long) kinfo->si_ptr;
+		new.ssi_int = kinfo->si_int;
 		break;
 	case SIL_POLL:
-		err |= __put_user(kinfo->si_band, &uinfo->ssi_band);
-		err |= __put_user(kinfo->si_fd, &uinfo->ssi_fd);
+		new.ssi_band = kinfo->si_band;
+		new.ssi_fd   = kinfo->si_fd;
 		break;
-	case SIL_FAULT:
-		err |= __put_user((long) kinfo->si_addr, &uinfo->ssi_addr);
-#ifdef __ARCH_SI_TRAPNO
-		err |= __put_user(kinfo->si_trapno, &uinfo->ssi_trapno);
-#endif
-#ifdef BUS_MCEERR_AO
+	case SIL_FAULT_BNDERR:
+	case SIL_FAULT_PKUERR:
 		/*
-		 * Other callers might not initialize the si_lsb field,
-		 * so check explicitly for the right codes here.
+		 * Fall through to the SIL_FAULT case.  Both SIL_FAULT_BNDERR
+		 * and SIL_FAULT_PKUERR are only generated by faults that
+		 * deliver them synchronously to userspace.  In case someone
+		 * injects one of these signals and signalfd catches it treat
+		 * it as SIL_FAULT.
 		 */
-		if (kinfo->si_signo == SIGBUS &&
-		     kinfo->si_code == BUS_MCEERR_AO)
-			err |= __put_user((short) kinfo->si_addr_lsb,
-					  &uinfo->ssi_addr_lsb);
+	case SIL_FAULT:
+		new.ssi_addr = (long) kinfo->si_addr;
+#ifdef __ARCH_SI_TRAPNO
+		new.ssi_trapno = kinfo->si_trapno;
 #endif
-#ifdef BUS_MCEERR_AR
-		/*
-		 * Other callers might not initialize the si_lsb field,
-		 * so check explicitly for the right codes here.
-		 */
-		if (kinfo->si_signo == SIGBUS &&
-		    kinfo->si_code == BUS_MCEERR_AR)
-			err |= __put_user((short) kinfo->si_addr_lsb,
-					  &uinfo->ssi_addr_lsb);
+		break;
+	case SIL_FAULT_MCEERR:
+		new.ssi_addr = (long) kinfo->si_addr;
+#ifdef __ARCH_SI_TRAPNO
+		new.ssi_trapno = kinfo->si_trapno;
 #endif
+		new.ssi_addr_lsb = (short) kinfo->si_addr_lsb;
 		break;
 	case SIL_CHLD:
-		err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid);
-		err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid);
-		err |= __put_user(kinfo->si_status, &uinfo->ssi_status);
-		err |= __put_user(kinfo->si_utime, &uinfo->ssi_utime);
-		err |= __put_user(kinfo->si_stime, &uinfo->ssi_stime);
+		new.ssi_pid    = kinfo->si_pid;
+		new.ssi_uid    = kinfo->si_uid;
+		new.ssi_status = kinfo->si_status;
+		new.ssi_utime  = kinfo->si_utime;
+		new.ssi_stime  = kinfo->si_stime;
 		break;
 	case SIL_RT:
-	default:
 		/*
 		 * This case catches also the signals queued by sigqueue().
 		 */
-		err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid);
-		err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid);
-		err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr);
-		err |= __put_user(kinfo->si_int, &uinfo->ssi_int);
+		new.ssi_pid = kinfo->si_pid;
+		new.ssi_uid = kinfo->si_uid;
+		new.ssi_ptr = (long) kinfo->si_ptr;
+		new.ssi_int = kinfo->si_int;
+		break;
+	case SIL_SYS:
+		new.ssi_call_addr = (long) kinfo->si_call_addr;
+		new.ssi_syscall   = kinfo->si_syscall;
+		new.ssi_arch      = kinfo->si_arch;
 		break;
 	}
 
-	return err ? -EFAULT: sizeof(*uinfo);
+	if (copy_to_user(uinfo, &new, sizeof(struct signalfd_siginfo)))
+		return -EFAULT;
+
+	return sizeof(*uinfo);
 }
 
 static ssize_t signalfd_dequeue(struct signalfd_ctx *ctx, siginfo_t *info,
diff --git a/fs/super.c b/fs/super.c
index 4b5b562..50728d9 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -947,7 +947,7 @@ void emergency_remount(void)
 static void do_thaw_all_callback(struct super_block *sb)
 {
 	down_write(&sb->s_umount);
-	if (sb->s_root && sb->s_flags & MS_BORN) {
+	if (sb->s_root && sb->s_flags & SB_BORN) {
 		emergency_thaw_bdev(sb);
 		thaw_super_locked(sb);
 	} else {
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index 250b075..4d5d204 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -51,14 +51,9 @@ static struct dentry *sysv_lookup(struct inode * dir, struct dentry * dentry, un
 	if (dentry->d_name.len > SYSV_NAMELEN)
 		return ERR_PTR(-ENAMETOOLONG);
 	ino = sysv_inode_by_name(dentry);
-
-	if (ino) {
+	if (ino)
 		inode = sysv_iget(dir->i_sb, ino);
-		if (IS_ERR(inode))
-			return ERR_CAST(inode);
-	}
-	d_add(dentry, inode);
-	return NULL;
+	return d_splice_alias(inode, dentry);
 }
 
 static int sysv_mknod(struct inode * dir, struct dentry * dentry, umode_t mode, dev_t rdev)
diff --git a/fs/timerfd.c b/fs/timerfd.c
index cdad49d..d84a2be 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -226,21 +226,20 @@ static int timerfd_release(struct inode *inode, struct file *file)
 	kfree_rcu(ctx, rcu);
 	return 0;
 }
-
-static __poll_t timerfd_poll(struct file *file, poll_table *wait)
+	
+static struct wait_queue_head *timerfd_get_poll_head(struct file *file,
+		__poll_t eventmask)
 {
 	struct timerfd_ctx *ctx = file->private_data;
-	__poll_t events = 0;
-	unsigned long flags;
 
-	poll_wait(file, &ctx->wqh, wait);
+	return &ctx->wqh;
+}
 
-	spin_lock_irqsave(&ctx->wqh.lock, flags);
-	if (ctx->ticks)
-		events |= EPOLLIN;
-	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
+static __poll_t timerfd_poll_mask(struct file *file, __poll_t eventmask)
+{
+	struct timerfd_ctx *ctx = file->private_data;
 
-	return events;
+	return ctx->ticks ? EPOLLIN : 0;
 }
 
 static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count,
@@ -364,7 +363,8 @@ static long timerfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg
 
 static const struct file_operations timerfd_fops = {
 	.release	= timerfd_release,
-	.poll		= timerfd_poll,
+	.get_poll_head	= timerfd_get_poll_head,
+	.poll_mask	= timerfd_poll_mask,
 	.read		= timerfd_read,
 	.llseek		= noop_llseek,
 	.show_fdinfo	= timerfd_show,
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 9d7fb88..4e267cc 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -214,7 +214,7 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
 	int err;
 	union ubifs_key key;
 	struct inode *inode = NULL;
-	struct ubifs_dent_node *dent;
+	struct ubifs_dent_node *dent = NULL;
 	struct ubifs_info *c = dir->i_sb->s_fs_info;
 	struct fscrypt_name nm;
 
@@ -229,14 +229,14 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
 		return ERR_PTR(err);
 
 	if (fname_len(&nm) > UBIFS_MAX_NLEN) {
-		err = -ENAMETOOLONG;
-		goto out_fname;
+		inode = ERR_PTR(-ENAMETOOLONG);
+		goto done;
 	}
 
 	dent = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS);
 	if (!dent) {
-		err = -ENOMEM;
-		goto out_fname;
+		inode = ERR_PTR(-ENOMEM);
+		goto done;
 	}
 
 	if (nm.hash) {
@@ -250,16 +250,16 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
 	}
 
 	if (err) {
-		if (err == -ENOENT) {
+		if (err == -ENOENT)
 			dbg_gen("not found");
-			goto done;
-		}
-		goto out_dent;
+		else
+			inode = ERR_PTR(err);
+		goto done;
 	}
 
 	if (dbg_check_name(c, dent, &nm)) {
-		err = -EINVAL;
-		goto out_dent;
+		inode = ERR_PTR(-EINVAL);
+		goto done;
 	}
 
 	inode = ubifs_iget(dir->i_sb, le64_to_cpu(dent->inum));
@@ -272,7 +272,7 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
 		ubifs_err(c, "dead directory entry '%pd', error %d",
 			  dentry, err);
 		ubifs_ro_mode(c, err);
-		goto out_dent;
+		goto done;
 	}
 
 	if (ubifs_crypt_is_encrypted(dir) &&
@@ -280,27 +280,14 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
 	    !fscrypt_has_permitted_context(dir, inode)) {
 		ubifs_warn(c, "Inconsistent encryption contexts: %lu/%lu",
 			   dir->i_ino, inode->i_ino);
-		err = -EPERM;
-		goto out_inode;
+		iput(inode);
+		inode = ERR_PTR(-EPERM);
 	}
 
 done:
 	kfree(dent);
 	fscrypt_free_filename(&nm);
-	/*
-	 * Note, d_splice_alias() would be required instead if we supported
-	 * NFS.
-	 */
-	d_add(dentry, inode);
-	return NULL;
-
-out_inode:
-	iput(inode);
-out_dent:
-	kfree(dent);
-out_fname:
-	fscrypt_free_filename(&nm);
-	return ERR_PTR(err);
+	return d_splice_alias(inode, dentry);
 }
 
 static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
diff --git a/fs/xattr.c b/fs/xattr.c
index 61cd28b..f9cb1db 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -229,7 +229,7 @@ out:
 }
 EXPORT_SYMBOL_GPL(vfs_setxattr);
 
-ssize_t
+static ssize_t
 xattr_getsecurity(struct inode *inode, const char *name, void *value,
 			size_t size)
 {
@@ -254,7 +254,6 @@ out:
 out_noalloc:
 	return len;
 }
-EXPORT_SYMBOL_GPL(xattr_getsecurity);
 
 /*
  * vfs_getxattr_alloc - allocate memory, if necessary, before calling getxattr
@@ -354,7 +353,6 @@ vfs_listxattr(struct dentry *dentry, char *list, size_t size)
 	if (error)
 		return error;
 	if (inode->i_op->listxattr && (inode->i_opflags & IOP_XATTR)) {
-		error = -EOPNOTSUPP;
 		error = inode->i_op->listxattr(dentry, list, size);
 	} else {
 		error = security_inode_listsecurity(inode, list, size);
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 0ab824f..1024635 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -594,7 +594,7 @@ xfs_alloc_ioend(
 	struct xfs_ioend	*ioend;
 	struct bio		*bio;
 
-	bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, xfs_ioend_bioset);
+	bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &xfs_ioend_bioset);
 	xfs_init_bio_from_bh(bio, bh);
 
 	ioend = container_of(bio, struct xfs_ioend, io_inline_bio);
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index 69346d4..694c85b 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -18,7 +18,7 @@
 #ifndef __XFS_AOPS_H__
 #define __XFS_AOPS_H__
 
-extern struct bio_set *xfs_ioend_bioset;
+extern struct bio_set xfs_ioend_bioset;
 
 /*
  * Types of I/O for bmap clustering and I/O completion tracking.
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index a3ed3c8..df42e4c 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -260,6 +260,7 @@ xfs_vn_lookup(
 	struct dentry	*dentry,
 	unsigned int flags)
 {
+	struct inode *inode;
 	struct xfs_inode *cip;
 	struct xfs_name	name;
 	int		error;
@@ -269,14 +270,13 @@ xfs_vn_lookup(
 
 	xfs_dentry_to_name(&name, dentry);
 	error = xfs_lookup(XFS_I(dir), &name, &cip, NULL);
-	if (unlikely(error)) {
-		if (unlikely(error != -ENOENT))
-			return ERR_PTR(error);
-		d_add(dentry, NULL);
-		return NULL;
-	}
-
-	return d_splice_alias(VFS_I(cip), dentry);
+	if (likely(!error))
+		inode = VFS_I(cip);
+	else if (likely(error == -ENOENT))
+		inode = NULL;
+	else
+		inode = ERR_PTR(error);
+	return d_splice_alias(inode, dentry);
 }
 
 STATIC struct dentry *
diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
index 056e12b..1cc7990 100644
--- a/fs/xfs/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
@@ -113,6 +113,7 @@ void xfs_stats_clearall(struct xfsstats __percpu *stats)
 	}
 }
 
+#ifdef CONFIG_PROC_FS
 /* legacy quota interfaces */
 #ifdef CONFIG_XFS_QUOTA
 static int xqm_proc_show(struct seq_file *m, void *v)
@@ -124,18 +125,6 @@ static int xqm_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int xqm_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, xqm_proc_show, NULL);
-}
-
-static const struct file_operations xqm_proc_fops = {
-	.open		= xqm_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 /* legacy quota stats interface no 2 */
 static int xqmstat_proc_show(struct seq_file *m, void *v)
 {
@@ -147,22 +136,8 @@ static int xqmstat_proc_show(struct seq_file *m, void *v)
 	seq_putc(m, '\n');
 	return 0;
 }
-
-static int xqmstat_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, xqmstat_proc_show, NULL);
-}
-
-static const struct file_operations xqmstat_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= xqmstat_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
 #endif /* CONFIG_XFS_QUOTA */
 
-#ifdef CONFIG_PROC_FS
 int
 xfs_init_procfs(void)
 {
@@ -174,11 +149,9 @@ xfs_init_procfs(void)
 		goto out;
 
 #ifdef CONFIG_XFS_QUOTA
-	if (!proc_create("fs/xfs/xqmstat", 0, NULL,
-			 &xqmstat_proc_fops))
+	if (!proc_create_single("fs/xfs/xqmstat", 0, NULL, xqmstat_proc_show))
 		goto out;
-	if (!proc_create("fs/xfs/xqm", 0, NULL,
-			 &xqm_proc_fops))
+	if (!proc_create_single("fs/xfs/xqm", 0, NULL, xqm_proc_show))
 		goto out;
 #endif
 	return 0;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index d714240..f643d76 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -63,7 +63,7 @@
 #include <linux/parser.h>
 
 static const struct super_operations xfs_super_operations;
-struct bio_set *xfs_ioend_bioset;
+struct bio_set xfs_ioend_bioset;
 
 static struct kset *xfs_kset;		/* top-level xfs sysfs dir */
 #ifdef DEBUG
@@ -1845,10 +1845,9 @@ MODULE_ALIAS_FS("xfs");
 STATIC int __init
 xfs_init_zones(void)
 {
-	xfs_ioend_bioset = bioset_create(4 * MAX_BUF_PER_PAGE,
+	if (bioset_init(&xfs_ioend_bioset, 4 * MAX_BUF_PER_PAGE,
 			offsetof(struct xfs_ioend, io_inline_bio),
-			BIOSET_NEED_BVECS);
-	if (!xfs_ioend_bioset)
+			BIOSET_NEED_BVECS))
 		goto out;
 
 	xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t),
@@ -1997,7 +1996,7 @@ xfs_init_zones(void)
  out_destroy_log_ticket_zone:
 	kmem_zone_destroy(xfs_log_ticket_zone);
  out_free_ioend_bioset:
-	bioset_free(xfs_ioend_bioset);
+	bioset_exit(&xfs_ioend_bioset);
  out:
 	return -ENOMEM;
 }
@@ -2029,7 +2028,7 @@ xfs_destroy_zones(void)
 	kmem_zone_destroy(xfs_btree_cur_zone);
 	kmem_zone_destroy(xfs_bmap_free_item_zone);
 	kmem_zone_destroy(xfs_log_ticket_zone);
-	bioset_free(xfs_ioend_bioset);
+	bioset_exit(&xfs_ioend_bioset);
 }
 
 STATIC int __init
diff --git a/include/asm-generic/atomic-long.h b/include/asm-generic/atomic-long.h
index 34a028a..87d14476 100644
--- a/include/asm-generic/atomic-long.h
+++ b/include/asm-generic/atomic-long.h
@@ -25,6 +25,7 @@ typedef atomic64_t atomic_long_t;
 
 #define ATOMIC_LONG_INIT(i)	ATOMIC64_INIT(i)
 #define ATOMIC_LONG_PFX(x)	atomic64 ## x
+#define ATOMIC_LONG_TYPE	s64
 
 #else
 
@@ -32,6 +33,7 @@ typedef atomic_t atomic_long_t;
 
 #define ATOMIC_LONG_INIT(i)	ATOMIC_INIT(i)
 #define ATOMIC_LONG_PFX(x)	atomic ## x
+#define ATOMIC_LONG_TYPE	int
 
 #endif
 
@@ -90,6 +92,21 @@ ATOMIC_LONG_ADD_SUB_OP(sub, _release)
 #define atomic_long_cmpxchg(l, old, new) \
 	(ATOMIC_LONG_PFX(_cmpxchg)((ATOMIC_LONG_PFX(_t) *)(l), (old), (new)))
 
+
+#define atomic_long_try_cmpxchg_relaxed(l, old, new) \
+	(ATOMIC_LONG_PFX(_try_cmpxchg_relaxed)((ATOMIC_LONG_PFX(_t) *)(l), \
+					   (ATOMIC_LONG_TYPE *)(old), (ATOMIC_LONG_TYPE)(new)))
+#define atomic_long_try_cmpxchg_acquire(l, old, new) \
+	(ATOMIC_LONG_PFX(_try_cmpxchg_acquire)((ATOMIC_LONG_PFX(_t) *)(l), \
+					   (ATOMIC_LONG_TYPE *)(old), (ATOMIC_LONG_TYPE)(new)))
+#define atomic_long_try_cmpxchg_release(l, old, new) \
+	(ATOMIC_LONG_PFX(_try_cmpxchg_release)((ATOMIC_LONG_PFX(_t) *)(l), \
+					   (ATOMIC_LONG_TYPE *)(old), (ATOMIC_LONG_TYPE)(new)))
+#define atomic_long_try_cmpxchg(l, old, new) \
+	(ATOMIC_LONG_PFX(_try_cmpxchg)((ATOMIC_LONG_PFX(_t) *)(l), \
+				       (ATOMIC_LONG_TYPE *)(old), (ATOMIC_LONG_TYPE)(new)))
+
+
 #define atomic_long_xchg_relaxed(v, new) \
 	(ATOMIC_LONG_PFX(_xchg_relaxed)((ATOMIC_LONG_PFX(_t) *)(v), (new)))
 #define atomic_long_xchg_acquire(v, new) \
@@ -244,6 +261,8 @@ static inline long atomic_long_add_unless(atomic_long_t *l, long a, long u)
 #define atomic_long_inc_not_zero(l) \
 	ATOMIC_LONG_PFX(_inc_not_zero)((ATOMIC_LONG_PFX(_t) *)(l))
 
+#define atomic_long_cond_read_relaxed(v, c) \
+	ATOMIC_LONG_PFX(_cond_read_relaxed)((ATOMIC_LONG_PFX(_t) *)(v), (c))
 #define atomic_long_cond_read_acquire(v, c) \
 	ATOMIC_LONG_PFX(_cond_read_acquire)((ATOMIC_LONG_PFX(_t) *)(v), (c))
 
diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h
index 29458bb..2cafdbb 100644
--- a/include/asm-generic/barrier.h
+++ b/include/asm-generic/barrier.h
@@ -221,18 +221,17 @@ do {									\
 #endif
 
 /**
- * smp_cond_load_acquire() - (Spin) wait for cond with ACQUIRE ordering
+ * smp_cond_load_relaxed() - (Spin) wait for cond with no ordering guarantees
  * @ptr: pointer to the variable to wait on
  * @cond: boolean expression to wait for
  *
- * Equivalent to using smp_load_acquire() on the condition variable but employs
- * the control dependency of the wait to reduce the barrier on many platforms.
+ * Equivalent to using READ_ONCE() on the condition variable.
  *
  * Due to C lacking lambda expressions we load the value of *ptr into a
  * pre-named variable @VAL to be used in @cond.
  */
-#ifndef smp_cond_load_acquire
-#define smp_cond_load_acquire(ptr, cond_expr) ({		\
+#ifndef smp_cond_load_relaxed
+#define smp_cond_load_relaxed(ptr, cond_expr) ({		\
 	typeof(ptr) __PTR = (ptr);				\
 	typeof(*ptr) VAL;					\
 	for (;;) {						\
@@ -241,10 +240,26 @@ do {									\
 			break;					\
 		cpu_relax();					\
 	}							\
-	smp_acquire__after_ctrl_dep();				\
 	VAL;							\
 })
 #endif
 
+/**
+ * smp_cond_load_acquire() - (Spin) wait for cond with ACQUIRE ordering
+ * @ptr: pointer to the variable to wait on
+ * @cond: boolean expression to wait for
+ *
+ * Equivalent to using smp_load_acquire() on the condition variable but employs
+ * the control dependency of the wait to reduce the barrier on many platforms.
+ */
+#ifndef smp_cond_load_acquire
+#define smp_cond_load_acquire(ptr, cond_expr) ({		\
+	typeof(*ptr) _val;					\
+	_val = smp_cond_load_relaxed(ptr, cond_expr);		\
+	smp_acquire__after_ctrl_dep();				\
+	_val;							\
+})
+#endif
+
 #endif /* !__ASSEMBLY__ */
 #endif /* __ASM_GENERIC_BARRIER_H */
diff --git a/include/asm-generic/dma-mapping.h b/include/asm-generic/dma-mapping.h
index 880a292..ad28682 100644
--- a/include/asm-generic/dma-mapping.h
+++ b/include/asm-generic/dma-mapping.h
@@ -4,7 +4,16 @@
 
 static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
+	/*
+	 * Use the non-coherent ops if available.  If an architecture wants a
+	 * more fine-grained selection of operations it will have to implement
+	 * get_arch_dma_ops itself or use the per-device dma_ops.
+	 */
+#ifdef CONFIG_DMA_NONCOHERENT_OPS
+	return &dma_noncoherent_ops;
+#else
 	return &dma_direct_ops;
+#endif
 }
 
 #endif /* _ASM_GENERIC_DMA_MAPPING_H */
diff --git a/include/asm-generic/pci.h b/include/asm-generic/pci.h
index 830d765..6bb3cd3 100644
--- a/include/asm-generic/pci.h
+++ b/include/asm-generic/pci.h
@@ -14,12 +14,4 @@ static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel)
 }
 #endif /* HAVE_ARCH_PCI_GET_LEGACY_IDE_IRQ */
 
-/*
- * By default, assume that no iommu is in use and that the PCI
- * space is mapped to address physical 0.
- */
-#ifndef PCI_DMA_BUS_IS_PHYS
-#define PCI_DMA_BUS_IS_PHYS	(1)
-#endif
-
 #endif /* _ASM_GENERIC_PCI_H */
diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h
index b37b4ad..9cc4575 100644
--- a/include/asm-generic/qspinlock.h
+++ b/include/asm-generic/qspinlock.h
@@ -26,7 +26,6 @@
  * @lock: Pointer to queued spinlock structure
  * Return: 1 if it is locked, 0 otherwise
  */
-#ifndef queued_spin_is_locked
 static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
 {
 	/*
@@ -35,7 +34,6 @@ static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
 	 */
 	return atomic_read(&lock->val);
 }
-#endif
 
 /**
  * queued_spin_value_unlocked - is the spinlock structure unlocked?
@@ -100,7 +98,7 @@ static __always_inline void queued_spin_unlock(struct qspinlock *lock)
 	/*
 	 * unlock() needs release semantics:
 	 */
-	(void)atomic_sub_return_release(_Q_LOCKED_VAL, &lock->val);
+	smp_store_release(&lock->locked, 0);
 }
 #endif
 
diff --git a/include/asm-generic/qspinlock_types.h b/include/asm-generic/qspinlock_types.h
index 034acd0..0763f06 100644
--- a/include/asm-generic/qspinlock_types.h
+++ b/include/asm-generic/qspinlock_types.h
@@ -29,13 +29,41 @@
 #endif
 
 typedef struct qspinlock {
-	atomic_t	val;
+	union {
+		atomic_t val;
+
+		/*
+		 * By using the whole 2nd least significant byte for the
+		 * pending bit, we can allow better optimization of the lock
+		 * acquisition for the pending bit holder.
+		 */
+#ifdef __LITTLE_ENDIAN
+		struct {
+			u8	locked;
+			u8	pending;
+		};
+		struct {
+			u16	locked_pending;
+			u16	tail;
+		};
+#else
+		struct {
+			u16	tail;
+			u16	locked_pending;
+		};
+		struct {
+			u8	reserved[2];
+			u8	pending;
+			u8	locked;
+		};
+#endif
+	};
 } arch_spinlock_t;
 
 /*
  * Initializier
  */
-#define	__ARCH_SPIN_LOCK_UNLOCKED	{ ATOMIC_INIT(0) }
+#define	__ARCH_SPIN_LOCK_UNLOCKED	{ .val = ATOMIC_INIT(0) }
 
 /*
  * Bitfields in the atomic value:
diff --git a/include/crypto/if_alg.h b/include/crypto/if_alg.h
index 482461d..cc414db 100644
--- a/include/crypto/if_alg.h
+++ b/include/crypto/if_alg.h
@@ -245,8 +245,7 @@ ssize_t af_alg_sendpage(struct socket *sock, struct page *page,
 			int offset, size_t size, int flags);
 void af_alg_free_resources(struct af_alg_async_req *areq);
 void af_alg_async_cb(struct crypto_async_request *_req, int err);
-__poll_t af_alg_poll(struct file *file, struct socket *sock,
-			 poll_table *wait);
+__poll_t af_alg_poll_mask(struct socket *sock, __poll_t events);
 struct af_alg_async_req *af_alg_alloc_areq(struct sock *sk,
 					   unsigned int areqlen);
 int af_alg_get_rsgl(struct sock *sk, struct msghdr *msg, int flags,
diff --git a/include/linux/aio.h b/include/linux/aio.h
index 9d8aabe..b83e68d 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -8,8 +8,6 @@ struct kioctx;
 struct kiocb;
 struct mm_struct;
 
-#define KIOCB_KEY		0
-
 typedef int (kiocb_cancel_fn)(struct kiocb *);
 
 /* prototypes */
diff --git a/include/linux/atalk.h b/include/linux/atalk.h
index 4037392..23f8055 100644
--- a/include/linux/atalk.h
+++ b/include/linux/atalk.h
@@ -145,7 +145,12 @@ extern rwlock_t atalk_interfaces_lock;
 
 extern struct atalk_route atrtr_default;
 
-extern const struct file_operations atalk_seq_arp_fops;
+struct aarp_iter_state {
+	int bucket;
+	struct aarp_entry **table;
+};
+
+extern const struct seq_operations aarp_seq_ops;
 
 extern int sysctl_aarp_expiry_time;
 extern int sysctl_aarp_tick_time;
diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index 8b276fd..01ce399 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -654,6 +654,7 @@ static inline int atomic_dec_if_positive(atomic_t *v)
 }
 #endif
 
+#define atomic_cond_read_relaxed(v, c)	smp_cond_load_relaxed(&(v)->counter, (c))
 #define atomic_cond_read_acquire(v, c)	smp_cond_load_acquire(&(v)->counter, (c))
 
 #ifdef CONFIG_GENERIC_ATOMIC64
@@ -1075,6 +1076,7 @@ static inline long long atomic64_fetch_andnot_release(long long i, atomic64_t *v
 }
 #endif
 
+#define atomic64_cond_read_relaxed(v, c)	smp_cond_load_relaxed(&(v)->counter, (c))
 #define atomic64_cond_read_acquire(v, c)	smp_cond_load_acquire(&(v)->counter, (c))
 
 #include <asm-generic/atomic-long.h>
diff --git a/include/linux/bio.h b/include/linux/bio.h
index ce547a2..810a8be 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -67,8 +67,12 @@
 
 #define bio_multiple_segments(bio)				\
 	((bio)->bi_iter.bi_size != bio_iovec(bio).bv_len)
-#define bio_sectors(bio)	((bio)->bi_iter.bi_size >> 9)
-#define bio_end_sector(bio)	((bio)->bi_iter.bi_sector + bio_sectors((bio)))
+
+#define bvec_iter_sectors(iter)	((iter).bi_size >> 9)
+#define bvec_iter_end_sector(iter) ((iter).bi_sector + bvec_iter_sectors((iter)))
+
+#define bio_sectors(bio)	bvec_iter_sectors((bio)->bi_iter)
+#define bio_end_sector(bio)	bvec_iter_end_sector((bio)->bi_iter)
 
 /*
  * Return the data direction, READ or WRITE.
@@ -406,13 +410,13 @@ static inline struct bio *bio_next_split(struct bio *bio, int sectors,
 	return bio_split(bio, sectors, gfp, bs);
 }
 
-extern struct bio_set *bioset_create(unsigned int, unsigned int, int flags);
 enum {
 	BIOSET_NEED_BVECS = BIT(0),
 	BIOSET_NEED_RESCUER = BIT(1),
 };
-extern void bioset_free(struct bio_set *);
-extern mempool_t *biovec_create_pool(int pool_entries);
+extern int bioset_init(struct bio_set *, unsigned int, unsigned int, int flags);
+extern void bioset_exit(struct bio_set *);
+extern int biovec_init_pool(mempool_t *pool, int pool_entries);
 
 extern struct bio *bio_alloc_bioset(gfp_t, unsigned int, struct bio_set *);
 extern void bio_put(struct bio *);
@@ -421,11 +425,11 @@ extern void __bio_clone_fast(struct bio *, struct bio *);
 extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *);
 extern struct bio *bio_clone_bioset(struct bio *, gfp_t, struct bio_set *bs);
 
-extern struct bio_set *fs_bio_set;
+extern struct bio_set fs_bio_set;
 
 static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
 {
-	return bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set);
+	return bio_alloc_bioset(gfp_mask, nr_iovecs, &fs_bio_set);
 }
 
 static inline struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs)
@@ -499,7 +503,10 @@ static inline void bio_flush_dcache_pages(struct bio *bi)
 }
 #endif
 
+extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
+			       struct bio *src, struct bvec_iter *src_iter);
 extern void bio_copy_data(struct bio *dst, struct bio *src);
+extern void bio_list_copy_data(struct bio *dst, struct bio *src);
 extern void bio_free_pages(struct bio *bio);
 
 extern struct bio *bio_copy_user_iov(struct request_queue *,
@@ -507,7 +514,13 @@ extern struct bio *bio_copy_user_iov(struct request_queue *,
 				     struct iov_iter *,
 				     gfp_t);
 extern int bio_uncopy_user(struct bio *);
-void zero_fill_bio(struct bio *bio);
+void zero_fill_bio_iter(struct bio *bio, struct bvec_iter iter);
+
+static inline void zero_fill_bio(struct bio *bio)
+{
+	zero_fill_bio_iter(bio, bio->bi_iter);
+}
+
 extern struct bio_vec *bvec_alloc(gfp_t, int, unsigned long *, mempool_t *);
 extern void bvec_free(mempool_t *, struct bio_vec *, unsigned int);
 extern unsigned int bvec_nr_vecs(unsigned short idx);
@@ -722,11 +735,11 @@ struct bio_set {
 	struct kmem_cache *bio_slab;
 	unsigned int front_pad;
 
-	mempool_t *bio_pool;
-	mempool_t *bvec_pool;
+	mempool_t bio_pool;
+	mempool_t bvec_pool;
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
-	mempool_t *bio_integrity_pool;
-	mempool_t *bvec_integrity_pool;
+	mempool_t bio_integrity_pool;
+	mempool_t bvec_integrity_pool;
 #endif
 
 	/*
@@ -745,6 +758,11 @@ struct biovec_slab {
 	struct kmem_cache *slab;
 };
 
+static inline bool bioset_initialized(struct bio_set *bs)
+{
+	return bs->bio_slab != NULL;
+}
+
 /*
  * a small number of entries is fine, not going to be performance critical.
  * basically we just need to survive
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index ebc34a5..fb35517 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -259,7 +259,8 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
 void blk_mq_kick_requeue_list(struct request_queue *q);
 void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs);
 void blk_mq_complete_request(struct request *rq);
-
+bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list,
+			   struct bio *bio);
 bool blk_mq_queue_stopped(struct request_queue *q);
 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 17b18b9..3c4f390 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -8,6 +8,7 @@
 
 #include <linux/types.h>
 #include <linux/bvec.h>
+#include <linux/ktime.h>
 
 struct bio_set;
 struct bio;
@@ -90,10 +91,52 @@ static inline bool blk_path_error(blk_status_t error)
 	return true;
 }
 
-struct blk_issue_stat {
-	u64 stat;
+/*
+ * From most significant bit:
+ * 1 bit: reserved for other usage, see below
+ * 12 bits: original size of bio
+ * 51 bits: issue time of bio
+ */
+#define BIO_ISSUE_RES_BITS      1
+#define BIO_ISSUE_SIZE_BITS     12
+#define BIO_ISSUE_RES_SHIFT     (64 - BIO_ISSUE_RES_BITS)
+#define BIO_ISSUE_SIZE_SHIFT    (BIO_ISSUE_RES_SHIFT - BIO_ISSUE_SIZE_BITS)
+#define BIO_ISSUE_TIME_MASK     ((1ULL << BIO_ISSUE_SIZE_SHIFT) - 1)
+#define BIO_ISSUE_SIZE_MASK     \
+	(((1ULL << BIO_ISSUE_SIZE_BITS) - 1) << BIO_ISSUE_SIZE_SHIFT)
+#define BIO_ISSUE_RES_MASK      (~((1ULL << BIO_ISSUE_RES_SHIFT) - 1))
+
+/* Reserved bit for blk-throtl */
+#define BIO_ISSUE_THROTL_SKIP_LATENCY (1ULL << 63)
+
+struct bio_issue {
+	u64 value;
 };
 
+static inline u64 __bio_issue_time(u64 time)
+{
+	return time & BIO_ISSUE_TIME_MASK;
+}
+
+static inline u64 bio_issue_time(struct bio_issue *issue)
+{
+	return __bio_issue_time(issue->value);
+}
+
+static inline sector_t bio_issue_size(struct bio_issue *issue)
+{
+	return ((issue->value & BIO_ISSUE_SIZE_MASK) >> BIO_ISSUE_SIZE_SHIFT);
+}
+
+static inline void bio_issue_init(struct bio_issue *issue,
+				       sector_t size)
+{
+	size &= (1ULL << BIO_ISSUE_SIZE_BITS) - 1;
+	issue->value = ((issue->value & BIO_ISSUE_RES_MASK) |
+			(ktime_get_ns() & BIO_ISSUE_TIME_MASK) |
+			((u64)size << BIO_ISSUE_SIZE_SHIFT));
+}
+
 /*
  * main unit of I/O for the block layer and lower layers (ie drivers and
  * stacking drivers)
@@ -138,7 +181,7 @@ struct bio {
 	struct cgroup_subsys_state *bi_css;
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
 	void			*bi_cg_private;
-	struct blk_issue_stat	bi_issue_stat;
+	struct bio_issue	bi_issue;
 #endif
 #endif
 	union {
@@ -186,6 +229,8 @@ struct bio {
 				 * throttling rules. Don't do it again. */
 #define BIO_TRACE_COMPLETION 10	/* bio_endio() should trace the final completion
 				 * of this bio. */
+#define BIO_QUEUE_ENTERED 11	/* can use blk_queue_enter_live() */
+
 /* See BVEC_POOL_OFFSET below before adding new flags */
 
 /*
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 5c4eee0..bca3a92 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -125,16 +125,23 @@ typedef __u32 __bitwise req_flags_t;
 #define RQF_SPECIAL_PAYLOAD	((__force req_flags_t)(1 << 18))
 /* The per-zone write lock is held for this request */
 #define RQF_ZONE_WRITE_LOCKED	((__force req_flags_t)(1 << 19))
-/* timeout is expired */
-#define RQF_MQ_TIMEOUT_EXPIRED	((__force req_flags_t)(1 << 20))
 /* already slept for hybrid poll */
-#define RQF_MQ_POLL_SLEPT	((__force req_flags_t)(1 << 21))
+#define RQF_MQ_POLL_SLEPT	((__force req_flags_t)(1 << 20))
 
 /* flags that prevent us from merging requests: */
 #define RQF_NOMERGE_FLAGS \
 	(RQF_STARTED | RQF_SOFTBARRIER | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD)
 
 /*
+ * Request state for blk-mq.
+ */
+enum mq_rq_state {
+	MQ_RQ_IDLE		= 0,
+	MQ_RQ_IN_FLIGHT		= 1,
+	MQ_RQ_COMPLETE		= 2,
+};
+
+/*
  * Try to put the fields that are referenced together in the same cacheline.
  *
  * If you modify this structure, make sure to update blk_rq_init() and
@@ -205,9 +212,20 @@ struct request {
 
 	struct gendisk *rq_disk;
 	struct hd_struct *part;
-	unsigned long start_time;
-	struct blk_issue_stat issue_stat;
-	/* Number of scatter-gather DMA addr+len pairs after
+	/* Time that I/O was submitted to the kernel. */
+	u64 start_time_ns;
+	/* Time that I/O was submitted to the device. */
+	u64 io_start_time_ns;
+
+#ifdef CONFIG_BLK_WBT
+	unsigned short wbt_flags;
+#endif
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+	unsigned short throtl_size;
+#endif
+
+	/*
+	 * Number of scatter-gather DMA addr+len pairs after
 	 * physical address coalescing is performed.
 	 */
 	unsigned short nr_phys_segments;
@@ -219,32 +237,14 @@ struct request {
 	unsigned short write_hint;
 	unsigned short ioprio;
 
-	unsigned int timeout;
-
 	void *special;		/* opaque pointer available for LLD use */
 
 	unsigned int extra_len;	/* length of alignment and padding */
 
-	/*
-	 * On blk-mq, the lower bits of ->gstate (generation number and
-	 * state) carry the MQ_RQ_* state value and the upper bits the
-	 * generation number which is monotonically incremented and used to
-	 * distinguish the reuse instances.
-	 *
-	 * ->gstate_seq allows updates to ->gstate and other fields
-	 * (currently ->deadline) during request start to be read
-	 * atomically from the timeout path, so that it can operate on a
-	 * coherent set of information.
-	 */
-	seqcount_t gstate_seq;
-	u64 gstate;
+	enum mq_rq_state state;
+	refcount_t ref;
 
-	/*
-	 * ->aborted_gstate is used by the timeout to claim a specific
-	 * recycle instance of this request.  See blk_mq_timeout_work().
-	 */
-	struct u64_stats_sync aborted_gstate_sync;
-	u64 aborted_gstate;
+	unsigned int timeout;
 
 	/* access through blk_rq_set_deadline, blk_rq_deadline */
 	unsigned long __deadline;
@@ -267,8 +267,6 @@ struct request {
 
 #ifdef CONFIG_BLK_CGROUP
 	struct request_list *rl;		/* rl this rq is alloced from */
-	unsigned long long start_time_ns;
-	unsigned long long io_start_time_ns;    /* when passed to hardware */
 #endif
 };
 
@@ -328,9 +326,8 @@ typedef int (init_rq_fn)(struct request_queue *, struct request *, gfp_t);
 typedef void (exit_rq_fn)(struct request_queue *, struct request *);
 
 enum blk_eh_timer_return {
-	BLK_EH_NOT_HANDLED,
-	BLK_EH_HANDLED,
-	BLK_EH_RESET_TIMER,
+	BLK_EH_DONE,		/* drivers has completed the command */
+	BLK_EH_RESET_TIMER,	/* reset timer and try again */
 };
 
 typedef enum blk_eh_timer_return (rq_timed_out_fn)(struct request *);
@@ -655,7 +652,7 @@ struct request_queue {
 
 	struct blk_mq_tag_set	*tag_set;
 	struct list_head	tag_set_list;
-	struct bio_set		*bio_split;
+	struct bio_set		bio_split;
 
 #ifdef CONFIG_BLK_DEBUG_FS
 	struct dentry		*debugfs_dir;
@@ -967,11 +964,8 @@ extern void blk_rq_init(struct request_queue *q, struct request *rq);
 extern void blk_init_request_from_bio(struct request *req, struct bio *bio);
 extern void blk_put_request(struct request *);
 extern void __blk_put_request(struct request_queue *, struct request *);
-extern struct request *blk_get_request_flags(struct request_queue *,
-					     unsigned int op,
-					     blk_mq_req_flags_t flags);
 extern struct request *blk_get_request(struct request_queue *, unsigned int op,
-				       gfp_t gfp_mask);
+				       blk_mq_req_flags_t flags);
 extern void blk_requeue_request(struct request_queue *, struct request *);
 extern int blk_lld_busy(struct request_queue *q);
 extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
@@ -1788,48 +1782,6 @@ int kblockd_schedule_work(struct work_struct *work);
 int kblockd_schedule_work_on(int cpu, struct work_struct *work);
 int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay);
 
-#ifdef CONFIG_BLK_CGROUP
-/*
- * This should not be using sched_clock(). A real patch is in progress
- * to fix this up, until that is in place we need to disable preemption
- * around sched_clock() in this function and set_io_start_time_ns().
- */
-static inline void set_start_time_ns(struct request *req)
-{
-	preempt_disable();
-	req->start_time_ns = sched_clock();
-	preempt_enable();
-}
-
-static inline void set_io_start_time_ns(struct request *req)
-{
-	preempt_disable();
-	req->io_start_time_ns = sched_clock();
-	preempt_enable();
-}
-
-static inline uint64_t rq_start_time_ns(struct request *req)
-{
-        return req->start_time_ns;
-}
-
-static inline uint64_t rq_io_start_time_ns(struct request *req)
-{
-        return req->io_start_time_ns;
-}
-#else
-static inline void set_start_time_ns(struct request *req) {}
-static inline void set_io_start_time_ns(struct request *req) {}
-static inline uint64_t rq_start_time_ns(struct request *req)
-{
-	return 0;
-}
-static inline uint64_t rq_io_start_time_ns(struct request *req)
-{
-	return 0;
-}
-#endif
-
 #define MODULE_ALIAS_BLOCKDEV(major,minor) \
 	MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor))
 #define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \
diff --git a/include/linux/bsg-lib.h b/include/linux/bsg-lib.h
index 28a7ccc..6aeaf64 100644
--- a/include/linux/bsg-lib.h
+++ b/include/linux/bsg-lib.h
@@ -72,8 +72,7 @@ struct bsg_job {
 void bsg_job_done(struct bsg_job *job, int result,
 		  unsigned int reply_payload_rcv_len);
 struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
-		bsg_job_fn *job_fn, int dd_job_size,
-		void (*release)(struct device *));
+		bsg_job_fn *job_fn, int dd_job_size);
 void bsg_job_put(struct bsg_job *job);
 int __must_check bsg_job_get(struct bsg_job *job);
 
diff --git a/include/linux/bsg.h b/include/linux/bsg.h
index 0c7dd9c..dac37b6 100644
--- a/include/linux/bsg.h
+++ b/include/linux/bsg.h
@@ -17,17 +17,13 @@ struct bsg_ops {
 
 struct bsg_class_device {
 	struct device *class_dev;
-	struct device *parent;
 	int minor;
 	struct request_queue *queue;
-	struct kref ref;
 	const struct bsg_ops *ops;
-	void (*release)(struct device *);
 };
 
 int bsg_register_queue(struct request_queue *q, struct device *parent,
-		const char *name, const struct bsg_ops *ops,
-		void (*release)(struct device *));
+		const char *name, const struct bsg_ops *ops);
 int bsg_scsi_register_queue(struct request_queue *q, struct device *parent);
 void bsg_unregister_queue(struct request_queue *q);
 #else
diff --git a/include/linux/compat.h b/include/linux/compat.h
index 081281a..ad19205 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -330,6 +330,7 @@ extern int put_compat_rusage(const struct rusage *,
 			     struct compat_rusage __user *);
 
 struct compat_siginfo;
+struct __compat_aio_sigset;
 
 struct compat_dirent {
 	u32		d_ino;
@@ -553,6 +554,12 @@ asmlinkage long compat_sys_io_getevents(compat_aio_context_t ctx_id,
 					compat_long_t nr,
 					struct io_event __user *events,
 					struct compat_timespec __user *timeout);
+asmlinkage long compat_sys_io_pgetevents(compat_aio_context_t ctx_id,
+					compat_long_t min_nr,
+					compat_long_t nr,
+					struct io_event __user *events,
+					struct compat_timespec __user *timeout,
+					const struct __compat_aio_sigset __user *usig);
 
 /* fs/cookies.c */
 asmlinkage long compat_sys_lookup_dcookie(u32, u32, char __user *, compat_size_t);
diff --git a/include/linux/cper.h b/include/linux/cper.h
index d14ef4e..9c703a0 100644
--- a/include/linux/cper.h
+++ b/include/linux/cper.h
@@ -381,7 +381,7 @@ struct cper_sec_proc_generic {
 /* IA32/X64 Processor Error Section */
 struct cper_sec_proc_ia {
 	__u64	validation_bits;
-	__u8	lapic_id;
+	__u64	lapic_id;
 	__u8	cpuid[48];
 };
 
@@ -551,5 +551,7 @@ const char *cper_mem_err_unpack(struct trace_seq *,
 				struct cper_mem_err_compact *);
 void cper_print_proc_arm(const char *pfx,
 			 const struct cper_sec_proc_arm *proc);
+void cper_print_proc_ia(const char *pfx,
+			const struct cper_sec_proc_ia *proc);
 
 #endif
diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h
index 5e335b6..e6c0448 100644
--- a/include/linux/delayacct.h
+++ b/include/linux/delayacct.h
@@ -29,7 +29,7 @@
 
 #ifdef CONFIG_TASK_DELAY_ACCT
 struct task_delay_info {
-	spinlock_t	lock;
+	raw_spinlock_t	lock;
 	unsigned int	flags;	/* Private per-task flags */
 
 	/* For each stat XXX, add following, aligned appropriately
diff --git a/include/linux/device.h b/include/linux/device.h
index 4779569..00b6c3b 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -88,6 +88,8 @@ extern void bus_remove_file(struct bus_type *, struct bus_attribute *);
  * @resume:	Called to bring a device on this bus out of sleep mode.
  * @num_vf:	Called to find out how many virtual functions a device on this
  *		bus supports.
+ * @dma_configure:	Called to setup DMA configuration on a device on
+			this bus.
  * @pm:		Power management operations of this bus, callback the specific
  *		device driver's pm-ops.
  * @iommu_ops:  IOMMU specific operations for this bus, used to attach IOMMU
@@ -96,8 +98,6 @@ extern void bus_remove_file(struct bus_type *, struct bus_attribute *);
  * @p:		The private data of the driver core, only the driver core can
  *		touch this.
  * @lock_key:	Lock class key for use by the lock validator
- * @force_dma:	Assume devices on this bus should be set up by dma_configure()
- * 		even if DMA capability is not explicitly described by firmware.
  *
  * A bus is a channel between the processor and one or more devices. For the
  * purposes of the device model, all devices are connected via a bus, even if
@@ -130,14 +130,14 @@ struct bus_type {
 
 	int (*num_vf)(struct device *dev);
 
+	int (*dma_configure)(struct device *dev);
+
 	const struct dev_pm_ops *pm;
 
 	const struct iommu_ops *iommu_ops;
 
 	struct subsys_private *p;
 	struct lock_class_key lock_key;
-
-	bool force_dma;
 };
 
 extern int __must_check bus_register(struct bus_type *bus);
@@ -904,6 +904,8 @@ struct dev_links_info {
  * @offline:	Set after successful invocation of bus type's .offline().
  * @of_node_reused: Set if the device-tree node is shared with an ancestor
  *              device.
+ * @dma_32bit_limit: bridge limited to 32bit DMA even if the device itself
+ *		indicates support for a higher limit in the dma_mask field.
  *
  * At the lowest level, every device in a Linux system is represented by an
  * instance of struct device. The device structure contains the information
@@ -992,6 +994,7 @@ struct device {
 	bool			offline_disabled:1;
 	bool			offline:1;
 	bool			of_node_reused:1;
+	bool			dma_32bit_limit:1;
 };
 
 static inline struct device *kobj_to_dev(struct kobject *kobj)
diff --git a/include/linux/dma-debug.h b/include/linux/dma-debug.h
index c7d844f..a785f25 100644
--- a/include/linux/dma-debug.h
+++ b/include/linux/dma-debug.h
@@ -30,8 +30,6 @@ struct bus_type;
 
 extern void dma_debug_add_bus(struct bus_type *bus);
 
-extern void dma_debug_init(u32 num_entries);
-
 extern int dma_debug_resize_entries(u32 num_entries);
 
 extern void debug_dma_map_page(struct device *dev, struct page *page,
@@ -100,10 +98,6 @@ static inline void dma_debug_add_bus(struct bus_type *bus)
 {
 }
 
-static inline void dma_debug_init(u32 num_entries)
-{
-}
-
 static inline int dma_debug_resize_entries(u32 num_entries)
 {
 	return 0;
diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
index 53ad6a4..8d9f33f 100644
--- a/include/linux/dma-direct.h
+++ b/include/linux/dma-direct.h
@@ -59,6 +59,11 @@ void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
 		gfp_t gfp, unsigned long attrs);
 void dma_direct_free(struct device *dev, size_t size, void *cpu_addr,
 		dma_addr_t dma_addr, unsigned long attrs);
+dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
+		unsigned long offset, size_t size, enum dma_data_direction dir,
+		unsigned long attrs);
+int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
+		enum dma_data_direction dir, unsigned long attrs);
 int dma_direct_supported(struct device *dev, u64 mask);
-
+int dma_direct_mapping_error(struct device *dev, dma_addr_t dma_addr);
 #endif /* _LINUX_DMA_DIRECT_H */
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index f8ab1c0..f9cc309 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -133,10 +133,10 @@ struct dma_map_ops {
 #ifdef ARCH_HAS_DMA_GET_REQUIRED_MASK
 	u64 (*get_required_mask)(struct device *dev);
 #endif
-	int is_phys;
 };
 
 extern const struct dma_map_ops dma_direct_ops;
+extern const struct dma_map_ops dma_noncoherent_ops;
 extern const struct dma_map_ops dma_virt_ops;
 
 #define DMA_BIT_MASK(n)	(((n) == 64) ? ~0ULL : ((1ULL<<(n))-1))
@@ -502,7 +502,7 @@ dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, void *cpu_addr,
 #define dma_get_sgtable(d, t, v, h, s) dma_get_sgtable_attrs(d, t, v, h, s, 0)
 
 #ifndef arch_dma_alloc_attrs
-#define arch_dma_alloc_attrs(dev, flag)	(true)
+#define arch_dma_alloc_attrs(dev)	(true)
 #endif
 
 static inline void *dma_alloc_attrs(struct device *dev, size_t size,
@@ -521,7 +521,7 @@ static inline void *dma_alloc_attrs(struct device *dev, size_t size,
 	/* let the implementation decide on the zone to allocate from: */
 	flag &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM);
 
-	if (!arch_dma_alloc_attrs(&dev, &flag))
+	if (!arch_dma_alloc_attrs(&dev))
 		return NULL;
 	if (!ops->alloc)
 		return NULL;
@@ -572,14 +572,6 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 	return 0;
 }
 
-/*
- * This is a hack for the legacy x86 forbid_dac and iommu_sac_force. Please
- * don't use this in new code.
- */
-#ifndef arch_dma_supported
-#define arch_dma_supported(dev, mask)	(1)
-#endif
-
 static inline void dma_check_mask(struct device *dev, u64 mask)
 {
 	if (sme_active() && (mask < (((u64)sme_get_me_mask() << 1) - 1)))
@@ -592,9 +584,6 @@ static inline int dma_supported(struct device *dev, u64 mask)
 
 	if (!ops)
 		return 0;
-	if (!arch_dma_supported(dev, mask))
-		return 0;
-
 	if (!ops->dma_supported)
 		return 1;
 	return ops->dma_supported(dev, mask);
@@ -839,7 +828,7 @@ static inline int dma_mmap_wc(struct device *dev,
 #define dma_mmap_writecombine dma_mmap_wc
 #endif
 
-#if defined(CONFIG_NEED_DMA_MAP_STATE) || defined(CONFIG_DMA_API_DEBUG)
+#ifdef CONFIG_NEED_DMA_MAP_STATE
 #define DEFINE_DMA_UNMAP_ADDR(ADDR_NAME)        dma_addr_t ADDR_NAME
 #define DEFINE_DMA_UNMAP_LEN(LEN_NAME)          __u32 LEN_NAME
 #define dma_unmap_addr(PTR, ADDR_NAME)           ((PTR)->ADDR_NAME)
diff --git a/include/linux/dma-noncoherent.h b/include/linux/dma-noncoherent.h
new file mode 100644
index 0000000..10b2654
--- /dev/null
+++ b/include/linux/dma-noncoherent.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_DMA_NONCOHERENT_H
+#define _LINUX_DMA_NONCOHERENT_H 1
+
+#include <linux/dma-mapping.h>
+
+void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
+		gfp_t gfp, unsigned long attrs);
+void arch_dma_free(struct device *dev, size_t size, void *cpu_addr,
+		dma_addr_t dma_addr, unsigned long attrs);
+
+#ifdef CONFIG_DMA_NONCOHERENT_MMAP
+int arch_dma_mmap(struct device *dev, struct vm_area_struct *vma,
+		void *cpu_addr, dma_addr_t dma_addr, size_t size,
+		unsigned long attrs);
+#else
+#define arch_dma_mmap NULL
+#endif /* CONFIG_DMA_NONCOHERENT_MMAP */
+
+#ifdef CONFIG_DMA_NONCOHERENT_CACHE_SYNC
+void arch_dma_cache_sync(struct device *dev, void *vaddr, size_t size,
+		enum dma_data_direction direction);
+#else
+#define arch_dma_cache_sync NULL
+#endif /* CONFIG_DMA_NONCOHERENT_CACHE_SYNC */
+
+#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE
+void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr,
+		size_t size, enum dma_data_direction dir);
+#else
+static inline void arch_sync_dma_for_device(struct device *dev,
+		phys_addr_t paddr, size_t size, enum dma_data_direction dir)
+{
+}
+#endif /* ARCH_HAS_SYNC_DMA_FOR_DEVICE */
+
+#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU
+void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr,
+		size_t size, enum dma_data_direction dir);
+#else
+static inline void arch_sync_dma_for_cpu(struct device *dev,
+		phys_addr_t paddr, size_t size, enum dma_data_direction dir)
+{
+}
+#endif /* ARCH_HAS_SYNC_DMA_FOR_CPU */
+
+#endif /* _LINUX_DMA_NONCOHERENT_H */
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 3016d8c..56add82 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -397,7 +397,7 @@ typedef struct {
 	u32 set_bar_attributes;
 	u64 romsize;
 	u32 romimage;
-} efi_pci_io_protocol_32;
+} efi_pci_io_protocol_32_t;
 
 typedef struct {
 	u64 poll_mem;
@@ -417,7 +417,7 @@ typedef struct {
 	u64 set_bar_attributes;
 	u64 romsize;
 	u64 romimage;
-} efi_pci_io_protocol_64;
+} efi_pci_io_protocol_64_t;
 
 typedef struct {
 	void *poll_mem;
@@ -437,7 +437,7 @@ typedef struct {
 	void *set_bar_attributes;
 	uint64_t romsize;
 	void *romimage;
-} efi_pci_io_protocol;
+} efi_pci_io_protocol_t;
 
 #define EFI_PCI_IO_ATTRIBUTE_ISA_MOTHERBOARD_IO 0x0001
 #define EFI_PCI_IO_ATTRIBUTE_ISA_IO 0x0002
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 6d9e230..a02deea 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -218,8 +218,6 @@ extern void elv_unregister(struct elevator_type *);
 extern ssize_t elv_iosched_show(struct request_queue *, char *);
 extern ssize_t elv_iosched_store(struct request_queue *, const char *, size_t);
 
-extern int elevator_init(struct request_queue *, char *);
-extern void elevator_exit(struct request_queue *, struct elevator_queue *);
 extern bool elv_bio_merge_ok(struct request *, struct bio *);
 extern struct elevator_queue *elevator_alloc(struct request_queue *,
 					struct elevator_type *);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 760d8da..d4c37d3 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -94,7 +94,7 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
 
 /*
  * flags in file.f_mode.  Note that FMODE_READ and FMODE_WRITE must correspond
- * to O_WRONLY and O_RDWR via the strange trick in __dentry_open()
+ * to O_WRONLY and O_RDWR via the strange trick in do_dentry_open()
  */
 
 /* file is open for reading */
@@ -1250,7 +1250,7 @@ static inline int locks_lock_file_wait(struct file *filp, struct file_lock *fl)
 }
 
 struct fasync_struct {
-	spinlock_t		fa_lock;
+	rwlock_t		fa_lock;
 	int			magic;
 	int			fa_fd;
 	struct fasync_struct	*fa_next; /* singly linked list */
@@ -1711,6 +1711,8 @@ struct file_operations {
 	int (*iterate) (struct file *, struct dir_context *);
 	int (*iterate_shared) (struct file *, struct dir_context *);
 	__poll_t (*poll) (struct file *, struct poll_table_struct *);
+	struct wait_queue_head * (*get_poll_head)(struct file *, __poll_t);
+	__poll_t (*poll_mask) (struct file *, __poll_t);
 	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
 	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
 	int (*mmap) (struct file *, struct vm_area_struct *);
@@ -2570,7 +2572,7 @@ extern bool is_bad_inode(struct inode *);
 
 #ifdef CONFIG_BLOCK
 extern void check_disk_size_change(struct gendisk *disk,
-				   struct block_device *bdev);
+		struct block_device *bdev, bool verbose);
 extern int revalidate_disk(struct gendisk *);
 extern int check_disk_change(struct block_device *);
 extern int __invalidate_device(struct block_device *, bool);
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 3998892..2f1327c 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -16,7 +16,7 @@
 /*
  * Heterogeneous Memory Management (HMM)
  *
- * See Documentation/vm/hmm.txt for reasons and overview of what HMM is and it
+ * See Documentation/vm/hmm.rst for reasons and overview of what HMM is and it
  * is for. Here we focus on the HMM API description, with some explanation of
  * the underlying implementation.
  *
diff --git a/include/linux/ide.h b/include/linux/ide.h
index ca9d34f..c74b032 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -961,7 +961,7 @@ __IDE_PROC_DEVSET(_name, _min, _max, NULL, NULL)
 typedef struct {
 	const char	*name;
 	umode_t		mode;
-	const struct file_operations *proc_fops;
+	int (*show)(struct seq_file *, void *);
 } ide_proc_entry_t;
 
 void proc_ide_create(void);
@@ -973,8 +973,8 @@ void ide_proc_unregister_port(ide_hwif_t *);
 void ide_proc_register_driver(ide_drive_t *, struct ide_driver *);
 void ide_proc_unregister_driver(ide_drive_t *, struct ide_driver *);
 
-extern const struct file_operations ide_capacity_proc_fops;
-extern const struct file_operations ide_geometry_proc_fops;
+int ide_capacity_proc_show(struct seq_file *m, void *v);
+int ide_geometry_proc_show(struct seq_file *m, void *v);
 #else
 static inline void proc_ide_create(void) { ; }
 static inline void proc_ide_destroy(void) { ; }
@@ -1508,8 +1508,6 @@ static inline void ide_set_hwifdata (ide_hwif_t * hwif, void *data)
 	hwif->hwif_data = data;
 }
 
-extern void ide_toggle_bounce(ide_drive_t *drive, int on);
-
 u64 ide_get_lba_addr(struct ide_cmd *, int);
 u8 ide_dump_status(ide_drive_t *, const char *, u8);
 
diff --git a/include/linux/iommu-helper.h b/include/linux/iommu-helper.h
index cb9a924..70d01ed 100644
--- a/include/linux/iommu-helper.h
+++ b/include/linux/iommu-helper.h
@@ -2,6 +2,7 @@
 #ifndef _LINUX_IOMMU_HELPER_H
 #define _LINUX_IOMMU_HELPER_H
 
+#include <linux/bug.h>
 #include <linux/kernel.h>
 
 static inline unsigned long iommu_device_max_index(unsigned long size,
@@ -14,9 +15,15 @@ static inline unsigned long iommu_device_max_index(unsigned long size,
 		return size;
 }
 
-extern int iommu_is_span_boundary(unsigned int index, unsigned int nr,
-				  unsigned long shift,
-				  unsigned long boundary_size);
+static inline int iommu_is_span_boundary(unsigned int index, unsigned int nr,
+		unsigned long shift, unsigned long boundary_size)
+{
+	BUG_ON(!is_power_of_2(boundary_size));
+
+	shift = (shift + index) & (boundary_size - 1);
+	return shift + nr > boundary_size;
+}
+
 extern unsigned long iommu_area_alloc(unsigned long *map, unsigned long size,
 				      unsigned long start, unsigned int nr,
 				      unsigned long shift,
diff --git a/include/linux/isdn/capilli.h b/include/linux/isdn/capilli.h
index 11b57c4..d75e1ad 100644
--- a/include/linux/isdn/capilli.h
+++ b/include/linux/isdn/capilli.h
@@ -50,7 +50,7 @@ struct capi_ctr {
 	u16  (*send_message)(struct capi_ctr *, struct sk_buff *skb);
 	
 	char *(*procinfo)(struct capi_ctr *);
-	const struct file_operations *proc_fops;
+	int (*proc_show)(struct seq_file *, void *);
 
 	/* filled in before calling ready callback */
 	u8 manu[CAPI_MANUFACTURER_LEN];		/* CAPI_GET_MANUFACTURER */
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 1795fec..1c11313 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1133,7 +1133,6 @@ extern int ata_sas_port_start(struct ata_port *ap);
 extern void ata_sas_port_stop(struct ata_port *ap);
 extern int ata_sas_slave_configure(struct scsi_device *, struct ata_port *);
 extern int ata_sas_queuecmd(struct scsi_cmnd *cmd, struct ata_port *ap);
-extern enum blk_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd);
 extern int sata_scr_valid(struct ata_link *link);
 extern int sata_scr_read(struct ata_link *link, int reg, u32 *val);
 extern int sata_scr_write(struct ata_link *link, int reg, u32 val);
@@ -1359,7 +1358,6 @@ extern struct device_attribute *ata_common_sdev_attrs[];
 	.proc_name		= drv_name,			\
 	.slave_configure	= ata_scsi_slave_config,	\
 	.slave_destroy		= ata_scsi_slave_destroy,	\
-	.eh_timed_out		= ata_scsi_timed_out,		\
 	.bios_param		= ata_std_bios_param,		\
 	.unlock_native_capacity	= ata_scsi_unlock_native_capacity, \
 	.sdev_attrs		= ata_common_sdev_attrs
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 6e0859b..e9e0d1c 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -489,7 +489,7 @@ typedef blk_qc_t (nvm_tgt_make_rq_fn)(struct request_queue *, struct bio *);
 typedef sector_t (nvm_tgt_capacity_fn)(void *);
 typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *,
 				int flags);
-typedef void (nvm_tgt_exit_fn)(void *);
+typedef void (nvm_tgt_exit_fn)(void *, bool);
 typedef int (nvm_tgt_sysfs_init_fn)(struct gendisk *);
 typedef void (nvm_tgt_sysfs_exit_fn)(struct gendisk *);
 
diff --git a/include/linux/mempool.h b/include/linux/mempool.h
index b51f5c4..0c964ac 100644
--- a/include/linux/mempool.h
+++ b/include/linux/mempool.h
@@ -25,6 +25,18 @@ typedef struct mempool_s {
 	wait_queue_head_t wait;
 } mempool_t;
 
+static inline bool mempool_initialized(mempool_t *pool)
+{
+	return pool->elements != NULL;
+}
+
+void mempool_exit(mempool_t *pool);
+int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
+		      mempool_free_t *free_fn, void *pool_data,
+		      gfp_t gfp_mask, int node_id);
+int mempool_init(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
+		 mempool_free_t *free_fn, void *pool_data);
+
 extern mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
 			mempool_free_t *free_fn, void *pool_data);
 extern mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
@@ -43,6 +55,14 @@ extern void mempool_free(void *element, mempool_t *pool);
  */
 void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data);
 void mempool_free_slab(void *element, void *pool_data);
+
+static inline int
+mempool_init_slab_pool(mempool_t *pool, int min_nr, struct kmem_cache *kc)
+{
+	return mempool_init(pool, min_nr, mempool_alloc_slab,
+			    mempool_free_slab, (void *) kc);
+}
+
 static inline mempool_t *
 mempool_create_slab_pool(int min_nr, struct kmem_cache *kc)
 {
@@ -56,6 +76,13 @@ mempool_create_slab_pool(int min_nr, struct kmem_cache *kc)
  */
 void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data);
 void mempool_kfree(void *element, void *pool_data);
+
+static inline int mempool_init_kmalloc_pool(mempool_t *pool, int min_nr, size_t size)
+{
+	return mempool_init(pool, min_nr, mempool_kmalloc,
+			    mempool_kfree, (void *) size);
+}
+
 static inline mempool_t *mempool_create_kmalloc_pool(int min_nr, size_t size)
 {
 	return mempool_create(min_nr, mempool_kmalloc, mempool_kfree,
@@ -68,6 +95,13 @@ static inline mempool_t *mempool_create_kmalloc_pool(int min_nr, size_t size)
  */
 void *mempool_alloc_pages(gfp_t gfp_mask, void *pool_data);
 void mempool_free_pages(void *element, void *pool_data);
+
+static inline int mempool_init_page_pool(mempool_t *pool, int min_nr, int order)
+{
+	return mempool_init(pool, min_nr, mempool_alloc_pages,
+			    mempool_free_pages, (void *)(long)order);
+}
+
 static inline mempool_t *mempool_create_page_pool(int min_nr, int order)
 {
 	return mempool_create(min_nr, mempool_alloc_pages, mempool_free_pages,
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 7b4899c..74ea5e2 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -45,7 +45,7 @@ struct vmem_altmap {
  * must be treated as an opaque object, rather than a "normal" struct page.
  *
  * A more complete discussion of unaddressable memory may be found in
- * include/linux/hmm.h and Documentation/vm/hmm.txt.
+ * include/linux/hmm.h and Documentation/vm/hmm.rst.
  *
  * MEMORY_DEVICE_PUBLIC:
  * Device memory that is cache coherent from device and CPU point of view. This
@@ -67,7 +67,7 @@ enum memory_type {
  *   page_free()
  *
  * Additional notes about MEMORY_DEVICE_PRIVATE may be found in
- * include/linux/hmm.h and Documentation/vm/hmm.txt. There is also a brief
+ * include/linux/hmm.h and Documentation/vm/hmm.rst. There is also a brief
  * explanation in include/linux/memory_hotplug.h.
  *
  * The page_fault() callback must migrate page back, from device memory to
diff --git a/include/linux/mfd/cros_ec.h b/include/linux/mfd/cros_ec.h
index 2d4e23c..f09e9cf 100644
--- a/include/linux/mfd/cros_ec.h
+++ b/include/linux/mfd/cros_ec.h
@@ -197,6 +197,8 @@ struct cros_ec_dev {
 	u32 features[2];
 };
 
+#define to_cros_ec_dev(dev)  container_of(dev, struct cros_ec_dev, class_dev)
+
 /**
  * cros_ec_suspend - Handle a suspend operation for the ChromeOS EC device
  *
diff --git a/include/linux/mfd/mc13xxx.h b/include/linux/mfd/mc13xxx.h
index 638222e..54a3cd8 100644
--- a/include/linux/mfd/mc13xxx.h
+++ b/include/linux/mfd/mc13xxx.h
@@ -243,6 +243,8 @@ struct mc13xxx_platform_data {
 #define MC13XXX_ADC0_LICELLCON		(1 << 0)
 #define MC13XXX_ADC0_CHRGICON		(1 << 1)
 #define MC13XXX_ADC0_BATICON		(1 << 2)
+#define MC13XXX_ADC0_ADIN7SEL_DIE	(1 << 4)
+#define MC13XXX_ADC0_ADIN7SEL_UID	(2 << 4)
 #define MC13XXX_ADC0_ADREFEN		(1 << 10)
 #define MC13XXX_ADC0_TSMOD0		(1 << 12)
 #define MC13XXX_ADC0_TSMOD1		(1 << 13)
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 2d07a1e..392e6af 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -174,7 +174,7 @@ struct mmu_notifier_ops {
 	 * invalidate_range_start()/end() notifiers, as
 	 * invalidate_range() alread catches the points in time when an
 	 * external TLB range needs to be flushed. For more in depth
-	 * discussion on this see Documentation/vm/mmu_notifier.txt
+	 * discussion on this see Documentation/vm/mmu_notifier.rst
 	 *
 	 * Note that this function might be called with just a sub-range
 	 * of what was passed to invalidate_range_start()/end(), if
diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 14bc0d5..3093dd1 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -146,9 +146,6 @@ extern void __mutex_init(struct mutex *lock, const char *name,
  */
 static inline bool mutex_is_locked(struct mutex *lock)
 {
-	/*
-	 * XXX think about spin_is_locked
-	 */
 	return __mutex_owner(lock) != NULL;
 }
 
diff --git a/include/linux/net.h b/include/linux/net.h
index 2248a05..3fd9d8c 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -147,6 +147,7 @@ struct proto_ops {
 	int		(*getname)   (struct socket *sock,
 				      struct sockaddr *addr,
 				      int peer);
+	__poll_t	(*poll_mask) (struct socket *sock, __poll_t events);
 	__poll_t	(*poll)	     (struct file *file, struct socket *sock,
 				      struct poll_table_struct *wait);
 	int		(*ioctl)     (struct socket *sock, unsigned int cmd,
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 4112e2b..2950ce9 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -436,10 +436,19 @@ enum {
 enum {
 	NVME_AER_ERROR			= 0,
 	NVME_AER_SMART			= 1,
+	NVME_AER_NOTICE			= 2,
 	NVME_AER_CSS			= 6,
 	NVME_AER_VS			= 7,
-	NVME_AER_NOTICE_NS_CHANGED	= 0x0002,
-	NVME_AER_NOTICE_FW_ACT_STARTING = 0x0102,
+};
+
+enum {
+	NVME_AER_NOTICE_NS_CHANGED	= 0x00,
+	NVME_AER_NOTICE_FW_ACT_STARTING = 0x01,
+};
+
+enum {
+	NVME_AEN_CFG_NS_ATTR		= 1 << 8,
+	NVME_AEN_CFG_FW_ACT		= 1 << 9,
 };
 
 struct nvme_lba_range_type {
@@ -747,6 +756,7 @@ enum {
 	NVME_LOG_ERROR		= 0x01,
 	NVME_LOG_SMART		= 0x02,
 	NVME_LOG_FW_SLOT	= 0x03,
+	NVME_LOG_CHANGED_NS	= 0x04,
 	NVME_LOG_CMD_EFFECTS	= 0x05,
 	NVME_LOG_DISC		= 0x70,
 	NVME_LOG_RESERVATION	= 0x80,
@@ -755,6 +765,8 @@ enum {
 	NVME_FWACT_ACTV		= (2 << 3),
 };
 
+#define NVME_MAX_CHANGED_NAMESPACES	1024
+
 struct nvme_identify {
 	__u8			opcode;
 	__u8			flags;
diff --git a/include/linux/of_device.h b/include/linux/of_device.h
index 8da5a1b..165fd30 100644
--- a/include/linux/of_device.h
+++ b/include/linux/of_device.h
@@ -55,7 +55,9 @@ static inline struct device_node *of_cpu_device_node_get(int cpu)
 	return of_node_get(cpu_dev->of_node);
 }
 
-int of_dma_configure(struct device *dev, struct device_node *np);
+int of_dma_configure(struct device *dev,
+		     struct device_node *np,
+		     bool force_dma);
 void of_dma_deconfigure(struct device *dev);
 #else /* CONFIG_OF */
 
@@ -105,7 +107,9 @@ static inline struct device_node *of_cpu_device_node_get(int cpu)
 	return NULL;
 }
 
-static inline int of_dma_configure(struct device *dev, struct device_node *np)
+static inline int of_dma_configure(struct device *dev,
+				   struct device_node *np,
+				   bool force_dma)
 {
 	return 0;
 }
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 73178a2..55371cb 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -670,7 +670,7 @@ int raw_pci_read(unsigned int domain, unsigned int bus, unsigned int devfn,
 int raw_pci_write(unsigned int domain, unsigned int bus, unsigned int devfn,
 		  int reg, int len, u32 val);
 
-#ifdef CONFIG_PCI_BUS_ADDR_T_64BIT
+#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
 typedef u64 pci_bus_addr_t;
 #else
 typedef u32 pci_bus_addr_t;
diff --git a/include/linux/pktcdvd.h b/include/linux/pktcdvd.h
index 93d142a..1746015 100644
--- a/include/linux/pktcdvd.h
+++ b/include/linux/pktcdvd.h
@@ -186,7 +186,7 @@ struct pktcdvd_device
 	sector_t		current_sector;	/* Keep track of where the elevator is */
 	atomic_t		scan_queue;	/* Set to non-zero when pkt_handle_queue */
 						/* needs to be run. */
-	mempool_t		*rb_pool;	/* mempool for pkt_rb_node allocations */
+	mempool_t		rb_pool;	/* mempool for pkt_rb_node allocations */
 
 	struct packet_iosched   iosched;
 	struct gendisk		*disk;
diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h
index 49f634d..3097c94 100644
--- a/include/linux/platform_device.h
+++ b/include/linux/platform_device.h
@@ -356,6 +356,8 @@ extern int platform_pm_restore(struct device *dev);
 #define platform_pm_restore		NULL
 #endif
 
+extern int platform_dma_configure(struct device *dev);
+
 #ifdef CONFIG_PM_SLEEP
 #define USE_PLATFORM_PM_SLEEP_OPS \
 	.suspend = platform_pm_suspend, \
diff --git a/include/linux/poll.h b/include/linux/poll.h
index f45ebd0..fdf86b4 100644
--- a/include/linux/poll.h
+++ b/include/linux/poll.h
@@ -74,6 +74,18 @@ static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
 	pt->_key   = ~(__poll_t)0; /* all events enabled */
 }
 
+static inline bool file_has_poll_mask(struct file *file)
+{
+	return file->f_op->get_poll_head && file->f_op->poll_mask;
+}
+
+static inline bool file_can_poll(struct file *file)
+{
+	return file->f_op->poll || file_has_poll_mask(file);
+}
+
+__poll_t vfs_poll(struct file *file, struct poll_table_struct *pt);
+
 struct poll_table_entry {
 	struct file *filp;
 	__poll_t key;
@@ -96,8 +108,6 @@ struct poll_wqueues {
 
 extern void poll_initwait(struct poll_wqueues *pwq);
 extern void poll_freewait(struct poll_wqueues *pwq);
-extern int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
-				 ktime_t *expires, unsigned long slack);
 extern u64 select_estimate_accuracy(struct timespec64 *tv);
 
 #define MAX_INT64_SECONDS (((s64)(~((u64)0)>>1)/HZ)-1)
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 928ef9e..e518352 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -9,6 +9,8 @@
 #include <linux/fs.h>
 
 struct proc_dir_entry;
+struct seq_file;
+struct seq_operations;
 
 #ifdef CONFIG_PROC_FS
 
@@ -23,6 +25,19 @@ extern struct proc_dir_entry *proc_mkdir_data(const char *, umode_t,
 extern struct proc_dir_entry *proc_mkdir_mode(const char *, umode_t,
 					      struct proc_dir_entry *);
 struct proc_dir_entry *proc_create_mount_point(const char *name);
+
+struct proc_dir_entry *proc_create_seq_private(const char *name, umode_t mode,
+		struct proc_dir_entry *parent, const struct seq_operations *ops,
+		unsigned int state_size, void *data);
+#define proc_create_seq_data(name, mode, parent, ops, data) \
+	proc_create_seq_private(name, mode, parent, ops, 0, data)
+#define proc_create_seq(name, mode, parent, ops) \
+	proc_create_seq_private(name, mode, parent, ops, 0, NULL)
+struct proc_dir_entry *proc_create_single_data(const char *name, umode_t mode,
+		struct proc_dir_entry *parent,
+		int (*show)(struct seq_file *, void *), void *data);
+#define proc_create_single(name, mode, parent, show) \
+	proc_create_single_data(name, mode, parent, show, NULL)
  
 extern struct proc_dir_entry *proc_create_data(const char *, umode_t,
 					       struct proc_dir_entry *,
@@ -38,6 +53,15 @@ extern void proc_remove(struct proc_dir_entry *);
 extern void remove_proc_entry(const char *, struct proc_dir_entry *);
 extern int remove_proc_subtree(const char *, struct proc_dir_entry *);
 
+struct proc_dir_entry *proc_create_net_data(const char *name, umode_t mode,
+		struct proc_dir_entry *parent, const struct seq_operations *ops,
+		unsigned int state_size, void *data);
+#define proc_create_net(name, mode, parent, state_size, ops) \
+	proc_create_net_data(name, mode, parent, state_size, ops, NULL)
+struct proc_dir_entry *proc_create_net_single(const char *name, umode_t mode,
+		struct proc_dir_entry *parent,
+		int (*show)(struct seq_file *, void *), void *data);
+
 #else /* CONFIG_PROC_FS */
 
 static inline void proc_root_init(void)
@@ -57,6 +81,11 @@ static inline struct proc_dir_entry *proc_mkdir_data(const char *name,
 	umode_t mode, struct proc_dir_entry *parent, void *data) { return NULL; }
 static inline struct proc_dir_entry *proc_mkdir_mode(const char *name,
 	umode_t mode, struct proc_dir_entry *parent) { return NULL; }
+#define proc_create_seq_private(name, mode, parent, ops, size, data) ({NULL;})
+#define proc_create_seq_data(name, mode, parent, ops, data) ({NULL;})
+#define proc_create_seq(name, mode, parent, ops) ({NULL;})
+#define proc_create_single(name, mode, parent, show) ({NULL;})
+#define proc_create_single_data(name, mode, parent, show, data) ({NULL;})
 #define proc_create(name, mode, parent, proc_fops) ({NULL;})
 #define proc_create_data(name, mode, parent, proc_fops, data) ({NULL;})
 
@@ -69,6 +98,10 @@ static inline void proc_remove(struct proc_dir_entry *de) {}
 #define remove_proc_entry(name, parent) do {} while (0)
 static inline int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) { return 0; }
 
+#define proc_create_net_data(name, mode, parent, ops, state_size, data) ({NULL;})
+#define proc_create_net(name, mode, parent, state_size, ops) ({NULL;})
+#define proc_create_net_single(name, mode, parent, show, data) ({NULL;})
+
 #endif /* CONFIG_PROC_FS */
 
 struct net;
@@ -83,4 +116,10 @@ struct ns_common;
 int open_related_ns(struct ns_common *ns,
 		   struct ns_common *(*get_ns)(struct ns_common *ns));
 
+/* get the associated pid namespace for a file in procfs */
+static inline struct pid_namespace *proc_pid_ns(struct inode *inode)
+{
+	return inode->i_sb->s_fs_info;
+}
+
 #endif /* _LINUX_PROC_FS_H */
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 919b2a0..037bf0e 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -345,7 +345,6 @@ extern void user_single_step_siginfo(struct task_struct *tsk,
 static inline void user_single_step_siginfo(struct task_struct *tsk,
 				struct pt_regs *regs, siginfo_t *info)
 {
-	memset(info, 0, sizeof(*info));
 	info->si_signo = SIGTRAP;
 }
 #endif
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 36360d0..e679b17 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -108,7 +108,6 @@ void rcu_sched_qs(void);
 void rcu_bh_qs(void);
 void rcu_check_callbacks(int user);
 void rcu_report_dead(unsigned int cpu);
-void rcu_cpu_starting(unsigned int cpu);
 void rcutree_migrate_callbacks(int cpu);
 
 #ifdef CONFIG_RCU_STALL_COMMON
@@ -188,13 +187,13 @@ static inline void exit_tasks_rcu_finish(void) { }
 #endif /* #else #ifdef CONFIG_TASKS_RCU */
 
 /**
- * cond_resched_rcu_qs - Report potential quiescent states to RCU
+ * cond_resched_tasks_rcu_qs - Report potential quiescent states to RCU
  *
  * This macro resembles cond_resched(), except that it is defined to
  * report potential quiescent states to RCU-tasks even if the cond_resched()
  * machinery were to be shut off, as some advocate for PREEMPT kernels.
  */
-#define cond_resched_rcu_qs() \
+#define cond_resched_tasks_rcu_qs() \
 do { \
 	if (!cond_resched()) \
 		rcu_note_voluntary_context_switch_lite(current); \
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index ce9beec..7b3c82e 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -132,5 +132,6 @@ static inline void rcu_all_qs(void) { barrier(); }
 #define rcutree_offline_cpu      NULL
 #define rcutree_dead_cpu         NULL
 #define rcutree_dying_cpu        NULL
+static inline void rcu_cpu_starting(unsigned int cpu) { }
 
 #endif /* __LINUX_RCUTINY_H */
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index fd996cd..9146558 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -74,6 +74,7 @@ static inline void synchronize_rcu_bh_expedited(void)
 void rcu_barrier(void);
 void rcu_barrier_bh(void);
 void rcu_barrier_sched(void);
+bool rcu_eqs_special_set(int cpu);
 unsigned long get_state_synchronize_rcu(void);
 void cond_synchronize_rcu(unsigned long oldstate);
 unsigned long get_state_synchronize_sched(void);
@@ -100,5 +101,6 @@ int rcutree_online_cpu(unsigned int cpu);
 int rcutree_offline_cpu(unsigned int cpu);
 int rcutree_dead_cpu(unsigned int cpu);
 int rcutree_dying_cpu(unsigned int cpu);
+void rcu_cpu_starting(unsigned int cpu);
 
 #endif /* __LINUX_RCUTREE_H */
diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index 5f7ad05..4f38068 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -15,6 +15,7 @@
 
 #include <linux/list.h>
 #include <linux/rbtree.h>
+#include <linux/ktime.h>
 #include <linux/delay.h>
 #include <linux/err.h>
 #include <linux/bug.h>
@@ -587,7 +588,10 @@ struct regmap *__devm_regmap_init_sdw(struct sdw_slave *sdw,
 				 const struct regmap_config *config,
 				 struct lock_class_key *lock_key,
 				 const char *lock_name);
-
+struct regmap *__devm_regmap_init_slimbus(struct slim_device *slimbus,
+				 const struct regmap_config *config,
+				 struct lock_class_key *lock_key,
+				 const char *lock_name);
 /*
  * Wrapper for regmap_init macros to include a unique lockdep key and name
  * for each call. No-op if CONFIG_LOCKDEP is not set.
@@ -906,6 +910,19 @@ bool regmap_ac97_default_volatile(struct device *dev, unsigned int reg);
 	__regmap_lockdep_wrapper(__devm_regmap_init_sdw, #config,	\
 				sdw, config)
 
+/**
+ * devm_regmap_init_slimbus() - Initialise managed register map
+ *
+ * @slimbus: Device that will be interacted with
+ * @config: Configuration for register map
+ *
+ * The return value will be an ERR_PTR() on error or a valid pointer
+ * to a struct regmap. The regmap will be automatically freed by the
+ * device management code.
+ */
+#define devm_regmap_init_slimbus(slimbus, config)			\
+	__regmap_lockdep_wrapper(__devm_regmap_init_slimbus, #config,	\
+				slimbus, config)
 int regmap_mmio_attach_clk(struct regmap *map, struct clk *clk);
 void regmap_mmio_detach_clk(struct regmap *map);
 void regmap_exit(struct regmap *map);
diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index 841585f..e653953 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -127,6 +127,12 @@ struct sbitmap_queue {
 	 * @round_robin: Allocate bits in strict round-robin order.
 	 */
 	bool round_robin;
+
+	/**
+	 * @min_shallow_depth: The minimum shallow depth which may be passed to
+	 * sbitmap_queue_get_shallow() or __sbitmap_queue_get_shallow().
+	 */
+	unsigned int min_shallow_depth;
 };
 
 /**
@@ -390,6 +396,9 @@ int __sbitmap_queue_get(struct sbitmap_queue *sbq);
  * @shallow_depth: The maximum number of bits to allocate from a single word.
  * See sbitmap_get_shallow().
  *
+ * If you call this, make sure to call sbitmap_queue_min_shallow_depth() after
+ * initializing @sbq.
+ *
  * Return: Non-negative allocated bit number if successful, -1 otherwise.
  */
 int __sbitmap_queue_get_shallow(struct sbitmap_queue *sbq,
@@ -424,6 +433,9 @@ static inline int sbitmap_queue_get(struct sbitmap_queue *sbq,
  * @shallow_depth: The maximum number of bits to allocate from a single word.
  * See sbitmap_get_shallow().
  *
+ * If you call this, make sure to call sbitmap_queue_min_shallow_depth() after
+ * initializing @sbq.
+ *
  * Return: Non-negative allocated bit number if successful, -1 otherwise.
  */
 static inline int sbitmap_queue_get_shallow(struct sbitmap_queue *sbq,
@@ -439,6 +451,23 @@ static inline int sbitmap_queue_get_shallow(struct sbitmap_queue *sbq,
 }
 
 /**
+ * sbitmap_queue_min_shallow_depth() - Inform a &struct sbitmap_queue of the
+ * minimum shallow depth that will be used.
+ * @sbq: Bitmap queue in question.
+ * @min_shallow_depth: The minimum shallow depth that will be passed to
+ * sbitmap_queue_get_shallow() or __sbitmap_queue_get_shallow().
+ *
+ * sbitmap_queue_clear() batches wakeups as an optimization. The batch size
+ * depends on the depth of the bitmap. Since the shallow allocation functions
+ * effectively operate with a different depth, the shallow depth must be taken
+ * into account when calculating the batch size. This function must be called
+ * with the minimum shallow depth that will be used. Failure to do so can result
+ * in missed wakeups.
+ */
+void sbitmap_queue_min_shallow_depth(struct sbitmap_queue *sbq,
+				     unsigned int min_shallow_depth);
+
+/**
  * sbitmap_queue_clear() - Free an allocated bit and wake up waiters on a
  * &struct sbitmap_queue.
  * @sbq: Bitmap to free from.
@@ -484,6 +513,13 @@ static inline struct sbq_wait_state *sbq_wait_ptr(struct sbitmap_queue *sbq,
 void sbitmap_queue_wake_all(struct sbitmap_queue *sbq);
 
 /**
+ * sbitmap_queue_wake_up() - Wake up some of waiters in one waitqueue
+ * on a &struct sbitmap_queue.
+ * @sbq: Bitmap queue to wake up.
+ */
+void sbitmap_queue_wake_up(struct sbitmap_queue *sbq);
+
+/**
  * sbitmap_queue_show() - Dump &struct sbitmap_queue information to a &struct
  * seq_file.
  * @sbq: Bitmap queue to show.
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ca3f3ea..14e4f9c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1512,6 +1512,7 @@ static inline int task_nice(const struct task_struct *p)
 extern int can_nice(const struct task_struct *p, const int nice);
 extern int task_curr(const struct task_struct *p);
 extern int idle_cpu(int cpu);
+extern int available_idle_cpu(int cpu);
 extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *);
 extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *);
 extern int sched_setattr(struct task_struct *, const struct sched_attr *);
@@ -1661,7 +1662,6 @@ static inline int test_tsk_need_resched(struct task_struct *tsk)
  * explicit rescheduling in places that are safe. The return
  * value indicates whether a reschedule was done in fact.
  * cond_resched_lock() will drop the spinlock before scheduling,
- * cond_resched_softirq() will enable bhs before scheduling.
  */
 #ifndef CONFIG_PREEMPT
 extern int _cond_resched(void);
@@ -1681,13 +1681,6 @@ extern int __cond_resched_lock(spinlock_t *lock);
 	__cond_resched_lock(lock);				\
 })
 
-extern int __cond_resched_softirq(void);
-
-#define cond_resched_softirq() ({					\
-	___might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET);	\
-	__cond_resched_softirq();					\
-})
-
 static inline void cond_resched_rcu(void)
 {
 #if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU)
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 2c570cd9..76a8cb4 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -28,7 +28,7 @@ extern struct mm_struct *mm_alloc(void);
  *
  * Use mmdrop() to release the reference acquired by mmgrab().
  *
- * See also <Documentation/vm/active_mm.txt> for an in-depth explanation
+ * See also <Documentation/vm/active_mm.rst> for an in-depth explanation
  * of &mm_struct.mm_count vs &mm_struct.mm_users.
  */
 static inline void mmgrab(struct mm_struct *mm)
@@ -62,7 +62,7 @@ static inline void mmdrop(struct mm_struct *mm)
  *
  * Use mmput() to release the reference acquired by mmget().
  *
- * See also <Documentation/vm/active_mm.txt> for an in-depth explanation
+ * See also <Documentation/vm/active_mm.rst> for an in-depth explanation
  * of &mm_struct.mm_count vs &mm_struct.mm_users.
  */
 static inline void mmget(struct mm_struct *mm)
@@ -170,6 +170,17 @@ static inline void fs_reclaim_acquire(gfp_t gfp_mask) { }
 static inline void fs_reclaim_release(gfp_t gfp_mask) { }
 #endif
 
+/**
+ * memalloc_noio_save - Marks implicit GFP_NOIO allocation scope.
+ *
+ * This functions marks the beginning of the GFP_NOIO allocation scope.
+ * All further allocations will implicitly drop __GFP_IO flag and so
+ * they are safe for the IO critical section from the allocation recursion
+ * point of view. Use memalloc_noio_restore to end the scope with flags
+ * returned by this function.
+ *
+ * This function is safe to be used from any context.
+ */
 static inline unsigned int memalloc_noio_save(void)
 {
 	unsigned int flags = current->flags & PF_MEMALLOC_NOIO;
@@ -177,11 +188,30 @@ static inline unsigned int memalloc_noio_save(void)
 	return flags;
 }
 
+/**
+ * memalloc_noio_restore - Ends the implicit GFP_NOIO scope.
+ * @flags: Flags to restore.
+ *
+ * Ends the implicit GFP_NOIO scope started by memalloc_noio_save function.
+ * Always make sure that that the given flags is the return value from the
+ * pairing memalloc_noio_save call.
+ */
 static inline void memalloc_noio_restore(unsigned int flags)
 {
 	current->flags = (current->flags & ~PF_MEMALLOC_NOIO) | flags;
 }
 
+/**
+ * memalloc_nofs_save - Marks implicit GFP_NOFS allocation scope.
+ *
+ * This functions marks the beginning of the GFP_NOFS allocation scope.
+ * All further allocations will implicitly drop __GFP_FS flag and so
+ * they are safe for the FS critical section from the allocation recursion
+ * point of view. Use memalloc_nofs_restore to end the scope with flags
+ * returned by this function.
+ *
+ * This function is safe to be used from any context.
+ */
 static inline unsigned int memalloc_nofs_save(void)
 {
 	unsigned int flags = current->flags & PF_MEMALLOC_NOFS;
@@ -189,6 +219,14 @@ static inline unsigned int memalloc_nofs_save(void)
 	return flags;
 }
 
+/**
+ * memalloc_nofs_restore - Ends the implicit GFP_NOFS scope.
+ * @flags: Flags to restore.
+ *
+ * Ends the implicit GFP_NOFS scope started by memalloc_nofs_save function.
+ * Always make sure that that the given flags is the return value from the
+ * pairing memalloc_nofs_save call.
+ */
 static inline void memalloc_nofs_restore(unsigned int flags)
 {
 	current->flags = (current->flags & ~PF_MEMALLOC_NOFS) | flags;
diff --git a/include/linux/seq_file_net.h b/include/linux/seq_file_net.h
index 43ccd84..0fdbe1d 100644
--- a/include/linux/seq_file_net.h
+++ b/include/linux/seq_file_net.h
@@ -13,12 +13,6 @@ struct seq_net_private {
 #endif
 };
 
-int seq_open_net(struct inode *, struct file *,
-		 const struct seq_operations *, int);
-int single_open_net(struct inode *, struct file *file,
-		int (*show)(struct seq_file *, void *));
-int seq_release_net(struct inode *, struct file *);
-int single_release_net(struct inode *, struct file *);
 static inline struct net *seq_file_net(struct seq_file *seq)
 {
 #ifdef CONFIG_NET_NS
@@ -28,4 +22,17 @@ static inline struct net *seq_file_net(struct seq_file *seq)
 #endif
 }
 
+/*
+ * This one is needed for proc_create_net_single since net is stored directly
+ * in private not as a struct i.e. seq_file_net can't be used.
+ */
+static inline struct net *seq_file_single_net(struct seq_file *seq)
+{
+#ifdef CONFIG_NET_NS
+	return (struct net *)seq->private;
+#else
+	return &init_net;
+#endif
+}
+
 #endif
diff --git a/include/linux/signal.h b/include/linux/signal.h
index a9bc7e1..3c52001 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -28,6 +28,9 @@ enum siginfo_layout {
 	SIL_TIMER,
 	SIL_POLL,
 	SIL_FAULT,
+	SIL_FAULT_MCEERR,
+	SIL_FAULT_BNDERR,
+	SIL_FAULT_PKUERR,
 	SIL_CHLD,
 	SIL_RT,
 	SIL_SYS,
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 9065477..8919837 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3250,8 +3250,7 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned flags,
 				    int *peeked, int *off, int *err);
 struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, int noblock,
 				  int *err);
-__poll_t datagram_poll(struct file *file, struct socket *sock,
-			   struct poll_table_struct *wait);
+__poll_t datagram_poll_mask(struct socket *sock, __poll_t events);
 int skb_copy_datagram_iter(const struct sk_buff *from, int offset,
 			   struct iov_iter *to, int size);
 static inline int skb_copy_datagram_msg(const struct sk_buff *from, int offset,
diff --git a/include/linux/spi/spi-mem.h b/include/linux/spi/spi-mem.h
new file mode 100644
index 0000000..bb4bd15
--- /dev/null
+++ b/include/linux/spi/spi-mem.h
@@ -0,0 +1,249 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Copyright (C) 2018 Exceet Electronics GmbH
+ * Copyright (C) 2018 Bootlin
+ *
+ * Author: Boris Brezillon <boris.brezillon@bootlin.com>
+ */
+
+#ifndef __LINUX_SPI_MEM_H
+#define __LINUX_SPI_MEM_H
+
+#include <linux/spi/spi.h>
+
+#define SPI_MEM_OP_CMD(__opcode, __buswidth)			\
+	{							\
+		.buswidth = __buswidth,				\
+		.opcode = __opcode,				\
+	}
+
+#define SPI_MEM_OP_ADDR(__nbytes, __val, __buswidth)		\
+	{							\
+		.nbytes = __nbytes,				\
+		.val = __val,					\
+		.buswidth = __buswidth,				\
+	}
+
+#define SPI_MEM_OP_NO_ADDR	{ }
+
+#define SPI_MEM_OP_DUMMY(__nbytes, __buswidth)			\
+	{							\
+		.nbytes = __nbytes,				\
+		.buswidth = __buswidth,				\
+	}
+
+#define SPI_MEM_OP_NO_DUMMY	{ }
+
+#define SPI_MEM_OP_DATA_IN(__nbytes, __buf, __buswidth)		\
+	{							\
+		.dir = SPI_MEM_DATA_IN,				\
+		.nbytes = __nbytes,				\
+		.buf.in = __buf,				\
+		.buswidth = __buswidth,				\
+	}
+
+#define SPI_MEM_OP_DATA_OUT(__nbytes, __buf, __buswidth)	\
+	{							\
+		.dir = SPI_MEM_DATA_OUT,			\
+		.nbytes = __nbytes,				\
+		.buf.out = __buf,				\
+		.buswidth = __buswidth,				\
+	}
+
+#define SPI_MEM_OP_NO_DATA	{ }
+
+/**
+ * enum spi_mem_data_dir - describes the direction of a SPI memory data
+ *			   transfer from the controller perspective
+ * @SPI_MEM_DATA_IN: data coming from the SPI memory
+ * @SPI_MEM_DATA_OUT: data sent the SPI memory
+ */
+enum spi_mem_data_dir {
+	SPI_MEM_DATA_IN,
+	SPI_MEM_DATA_OUT,
+};
+
+/**
+ * struct spi_mem_op - describes a SPI memory operation
+ * @cmd.buswidth: number of IO lines used to transmit the command
+ * @cmd.opcode: operation opcode
+ * @addr.nbytes: number of address bytes to send. Can be zero if the operation
+ *		 does not need to send an address
+ * @addr.buswidth: number of IO lines used to transmit the address cycles
+ * @addr.val: address value. This value is always sent MSB first on the bus.
+ *	      Note that only @addr.nbytes are taken into account in this
+ *	      address value, so users should make sure the value fits in the
+ *	      assigned number of bytes.
+ * @dummy.nbytes: number of dummy bytes to send after an opcode or address. Can
+ *		  be zero if the operation does not require dummy bytes
+ * @dummy.buswidth: number of IO lanes used to transmit the dummy bytes
+ * @data.buswidth: number of IO lanes used to send/receive the data
+ * @data.dir: direction of the transfer
+ * @data.buf.in: input buffer
+ * @data.buf.out: output buffer
+ */
+struct spi_mem_op {
+	struct {
+		u8 buswidth;
+		u8 opcode;
+	} cmd;
+
+	struct {
+		u8 nbytes;
+		u8 buswidth;
+		u64 val;
+	} addr;
+
+	struct {
+		u8 nbytes;
+		u8 buswidth;
+	} dummy;
+
+	struct {
+		u8 buswidth;
+		enum spi_mem_data_dir dir;
+		unsigned int nbytes;
+		/* buf.{in,out} must be DMA-able. */
+		union {
+			void *in;
+			const void *out;
+		} buf;
+	} data;
+};
+
+#define SPI_MEM_OP(__cmd, __addr, __dummy, __data)		\
+	{							\
+		.cmd = __cmd,					\
+		.addr = __addr,					\
+		.dummy = __dummy,				\
+		.data = __data,					\
+	}
+
+/**
+ * struct spi_mem - describes a SPI memory device
+ * @spi: the underlying SPI device
+ * @drvpriv: spi_mem_drviver private data
+ *
+ * Extra information that describe the SPI memory device and may be needed by
+ * the controller to properly handle this device should be placed here.
+ *
+ * One example would be the device size since some controller expose their SPI
+ * mem devices through a io-mapped region.
+ */
+struct spi_mem {
+	struct spi_device *spi;
+	void *drvpriv;
+};
+
+/**
+ * struct spi_mem_set_drvdata() - attach driver private data to a SPI mem
+ *				  device
+ * @mem: memory device
+ * @data: data to attach to the memory device
+ */
+static inline void spi_mem_set_drvdata(struct spi_mem *mem, void *data)
+{
+	mem->drvpriv = data;
+}
+
+/**
+ * struct spi_mem_get_drvdata() - get driver private data attached to a SPI mem
+ *				  device
+ * @mem: memory device
+ *
+ * Return: the data attached to the mem device.
+ */
+static inline void *spi_mem_get_drvdata(struct spi_mem *mem)
+{
+	return mem->drvpriv;
+}
+
+/**
+ * struct spi_controller_mem_ops - SPI memory operations
+ * @adjust_op_size: shrink the data xfer of an operation to match controller's
+ *		    limitations (can be alignment of max RX/TX size
+ *		    limitations)
+ * @supports_op: check if an operation is supported by the controller
+ * @exec_op: execute a SPI memory operation
+ *
+ * This interface should be implemented by SPI controllers providing an
+ * high-level interface to execute SPI memory operation, which is usually the
+ * case for QSPI controllers.
+ */
+struct spi_controller_mem_ops {
+	int (*adjust_op_size)(struct spi_mem *mem, struct spi_mem_op *op);
+	bool (*supports_op)(struct spi_mem *mem,
+			    const struct spi_mem_op *op);
+	int (*exec_op)(struct spi_mem *mem,
+		       const struct spi_mem_op *op);
+};
+
+/**
+ * struct spi_mem_driver - SPI memory driver
+ * @spidrv: inherit from a SPI driver
+ * @probe: probe a SPI memory. Usually where detection/initialization takes
+ *	   place
+ * @remove: remove a SPI memory
+ * @shutdown: take appropriate action when the system is shutdown
+ *
+ * This is just a thin wrapper around a spi_driver. The core takes care of
+ * allocating the spi_mem object and forwarding the probe/remove/shutdown
+ * request to the spi_mem_driver. The reason we use this wrapper is because
+ * we might have to stuff more information into the spi_mem struct to let
+ * SPI controllers know more about the SPI memory they interact with, and
+ * having this intermediate layer allows us to do that without adding more
+ * useless fields to the spi_device object.
+ */
+struct spi_mem_driver {
+	struct spi_driver spidrv;
+	int (*probe)(struct spi_mem *mem);
+	int (*remove)(struct spi_mem *mem);
+	void (*shutdown)(struct spi_mem *mem);
+};
+
+#if IS_ENABLED(CONFIG_SPI_MEM)
+int spi_controller_dma_map_mem_op_data(struct spi_controller *ctlr,
+				       const struct spi_mem_op *op,
+				       struct sg_table *sg);
+
+void spi_controller_dma_unmap_mem_op_data(struct spi_controller *ctlr,
+					  const struct spi_mem_op *op,
+					  struct sg_table *sg);
+#else
+static inline int
+spi_controller_dma_map_mem_op_data(struct spi_controller *ctlr,
+				   const struct spi_mem_op *op,
+				   struct sg_table *sg)
+{
+	return -ENOTSUPP;
+}
+
+static inline void
+spi_controller_dma_unmap_mem_op_data(struct spi_controller *ctlr,
+				     const struct spi_mem_op *op,
+				     struct sg_table *sg)
+{
+}
+#endif /* CONFIG_SPI_MEM */
+
+int spi_mem_adjust_op_size(struct spi_mem *mem, struct spi_mem_op *op);
+
+bool spi_mem_supports_op(struct spi_mem *mem,
+			 const struct spi_mem_op *op);
+
+int spi_mem_exec_op(struct spi_mem *mem,
+		    const struct spi_mem_op *op);
+
+int spi_mem_driver_register_with_owner(struct spi_mem_driver *drv,
+				       struct module *owner);
+
+void spi_mem_driver_unregister(struct spi_mem_driver *drv);
+
+#define spi_mem_driver_register(__drv)                                  \
+	spi_mem_driver_register_with_owner(__drv, THIS_MODULE)
+
+#define module_spi_mem_driver(__drv)                                    \
+	module_driver(__drv, spi_mem_driver_register,                   \
+		      spi_mem_driver_unregister)
+
+#endif /* __LINUX_SPI_MEM_H */
diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index bc6bb32..a64235e 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -26,7 +26,7 @@ struct dma_chan;
 struct property_entry;
 struct spi_controller;
 struct spi_transfer;
-struct spi_flash_read_message;
+struct spi_controller_mem_ops;
 
 /*
  * INTERFACES between SPI master-side drivers and SPI slave protocol handlers,
@@ -376,13 +376,11 @@ static inline void spi_unregister_driver(struct spi_driver *sdrv)
  *                    transfer_one callback.
  * @handle_err: the subsystem calls the driver to handle an error that occurs
  *		in the generic implementation of transfer_one_message().
+ * @mem_ops: optimized/dedicated operations for interactions with SPI memory.
+ *	     This field is optional and should only be implemented if the
+ *	     controller has native support for memory like operations.
  * @unprepare_message: undo any work done by prepare_message().
  * @slave_abort: abort the ongoing transfer request on an SPI slave controller
- * @spi_flash_read: to support spi-controller hardwares that provide
- *                  accelerated interface to read from flash devices.
- * @spi_flash_can_dma: analogous to can_dma() interface, but for
- *		       controllers implementing spi_flash_read.
- * @flash_read_supported: spi device supports flash read
  * @cs_gpios: Array of GPIOs to use as chip select lines; one per CS
  *	number. Any individual value may be -ENOENT for CS lines that
  *	are not GPIOs (driven by the SPI controller itself).
@@ -548,11 +546,6 @@ struct spi_controller {
 	int (*unprepare_message)(struct spi_controller *ctlr,
 				 struct spi_message *message);
 	int (*slave_abort)(struct spi_controller *ctlr);
-	int (*spi_flash_read)(struct  spi_device *spi,
-			      struct spi_flash_read_message *msg);
-	bool (*spi_flash_can_dma)(struct spi_device *spi,
-				  struct spi_flash_read_message *msg);
-	bool (*flash_read_supported)(struct spi_device *spi);
 
 	/*
 	 * These hooks are for drivers that use a generic implementation
@@ -564,6 +557,9 @@ struct spi_controller {
 	void (*handle_err)(struct spi_controller *ctlr,
 			   struct spi_message *message);
 
+	/* Optimized handlers for SPI memory-like operations. */
+	const struct spi_controller_mem_ops *mem_ops;
+
 	/* gpio chip select */
 	int			*cs_gpios;
 
@@ -1183,48 +1179,6 @@ static inline ssize_t spi_w8r16be(struct spi_device *spi, u8 cmd)
 	return be16_to_cpu(result);
 }
 
-/**
- * struct spi_flash_read_message - flash specific information for
- * spi-masters that provide accelerated flash read interfaces
- * @buf: buffer to read data
- * @from: offset within the flash from where data is to be read
- * @len: length of data to be read
- * @retlen: actual length of data read
- * @read_opcode: read_opcode to be used to communicate with flash
- * @addr_width: number of address bytes
- * @dummy_bytes: number of dummy bytes
- * @opcode_nbits: number of lines to send opcode
- * @addr_nbits: number of lines to send address
- * @data_nbits: number of lines for data
- * @rx_sg: Scatterlist for receive data read from flash
- * @cur_msg_mapped: message has been mapped for DMA
- */
-struct spi_flash_read_message {
-	void *buf;
-	loff_t from;
-	size_t len;
-	size_t retlen;
-	u8 read_opcode;
-	u8 addr_width;
-	u8 dummy_bytes;
-	u8 opcode_nbits;
-	u8 addr_nbits;
-	u8 data_nbits;
-	struct sg_table rx_sg;
-	bool cur_msg_mapped;
-};
-
-/* SPI core interface for flash read support */
-static inline bool spi_flash_read_supported(struct spi_device *spi)
-{
-	return spi->controller->spi_flash_read &&
-	       (!spi->controller->flash_read_supported ||
-	       spi->controller->flash_read_supported(spi));
-}
-
-int spi_flash_read(struct spi_device *spi,
-		   struct spi_flash_read_message *msg);
-
 /*---------------------------------------------------------------------------*/
 
 /*
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index 4894d32..1e8a464 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -380,6 +380,24 @@ static __always_inline int spin_trylock_irq(spinlock_t *lock)
 	raw_spin_trylock_irqsave(spinlock_check(lock), flags); \
 })
 
+/**
+ * spin_is_locked() - Check whether a spinlock is locked.
+ * @lock: Pointer to the spinlock.
+ *
+ * This function is NOT required to provide any memory ordering
+ * guarantees; it could be used for debugging purposes or, when
+ * additional synchronization is needed, accompanied with other
+ * constructs (memory barriers) enforcing the synchronization.
+ *
+ * Returns: 1 if @lock is locked, 0 otherwise.
+ *
+ * Note that the function only tells you that the spinlock is
+ * seen to be locked, not that it is locked on your CPU.
+ *
+ * Further, on CONFIG_SMP=n builds with CONFIG_DEBUG_SPINLOCK=n,
+ * the return value is always 0 (see include/linux/spinlock_up.h).
+ * Therefore you should not rely heavily on the return value.
+ */
 static __always_inline int spin_is_locked(spinlock_t *lock)
 {
 	return raw_spin_is_locked(&lock->rlock);
diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index 33c1c69..91494d7 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -69,11 +69,45 @@ struct srcu_struct { };
 
 void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
 		void (*func)(struct rcu_head *head));
-void cleanup_srcu_struct(struct srcu_struct *sp);
+void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced);
 int __srcu_read_lock(struct srcu_struct *sp) __acquires(sp);
 void __srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp);
 void synchronize_srcu(struct srcu_struct *sp);
 
+/**
+ * cleanup_srcu_struct - deconstruct a sleep-RCU structure
+ * @sp: structure to clean up.
+ *
+ * Must invoke this after you are finished using a given srcu_struct that
+ * was initialized via init_srcu_struct(), else you leak memory.
+ */
+static inline void cleanup_srcu_struct(struct srcu_struct *sp)
+{
+	_cleanup_srcu_struct(sp, false);
+}
+
+/**
+ * cleanup_srcu_struct_quiesced - deconstruct a quiesced sleep-RCU structure
+ * @sp: structure to clean up.
+ *
+ * Must invoke this after you are finished using a given srcu_struct that
+ * was initialized via init_srcu_struct(), else you leak memory.  Also,
+ * all grace-period processing must have completed.
+ *
+ * "Completed" means that the last synchronize_srcu() and
+ * synchronize_srcu_expedited() calls must have returned before the call
+ * to cleanup_srcu_struct_quiesced().  It also means that the callback
+ * from the last call_srcu() must have been invoked before the call to
+ * cleanup_srcu_struct_quiesced(), but you can use srcu_barrier() to help
+ * with this last.  Violating these rules will get you a WARN_ON() splat
+ * (with high probability, anyway), and will also cause the srcu_struct
+ * to be leaked.
+ */
+static inline void cleanup_srcu_struct_quiesced(struct srcu_struct *sp)
+{
+	_cleanup_srcu_struct(sp, true);
+}
+
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 
 /**
diff --git a/include/linux/string.h b/include/linux/string.h
index dd39a69..4a5a0eb 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -147,8 +147,8 @@ extern int memcmp(const void *,const void *,__kernel_size_t);
 extern void * memchr(const void *,int,__kernel_size_t);
 #endif
 #ifndef __HAVE_ARCH_MEMCPY_MCSAFE
-static inline __must_check int memcpy_mcsafe(void *dst, const void *src,
-		size_t cnt)
+static inline __must_check unsigned long memcpy_mcsafe(void *dst,
+		const void *src, size_t cnt)
 {
 	memcpy(dst, src, cnt);
 	return 0;
diff --git a/include/linux/sunrpc/rpc_pipe_fs.h b/include/linux/sunrpc/rpc_pipe_fs.h
index a5704da..e90b9bd 100644
--- a/include/linux/sunrpc/rpc_pipe_fs.h
+++ b/include/linux/sunrpc/rpc_pipe_fs.h
@@ -122,8 +122,6 @@ extern struct dentry *rpc_create_cache_dir(struct dentry *,
 					   struct cache_detail *);
 extern void rpc_remove_cache_dir(struct dentry *);
 
-extern int rpc_rmdir(struct dentry *dentry);
-
 struct rpc_pipe *rpc_mkpipe_data(const struct rpc_pipe_ops *ops, int flags);
 void rpc_destroy_pipe_data(struct rpc_pipe *pipe);
 extern struct dentry *rpc_mkpipe_dentry(struct dentry *, const char *, void *,
diff --git a/include/linux/swait.h b/include/linux/swait.h
index c98aaf6..bf8cb0d 100644
--- a/include/linux/swait.h
+++ b/include/linux/swait.h
@@ -5,10 +5,23 @@
 #include <linux/list.h>
 #include <linux/stddef.h>
 #include <linux/spinlock.h>
+#include <linux/wait.h>
 #include <asm/current.h>
 
 /*
- * Simple wait queues
+ * BROKEN wait-queues.
+ *
+ * These "simple" wait-queues are broken garbage, and should never be
+ * used. The comments below claim that they are "similar" to regular
+ * wait-queues, but the semantics are actually completely different, and
+ * every single user we have ever had has been buggy (or pointless).
+ *
+ * A "swake_up()" only wakes up _one_ waiter, which is not at all what
+ * "wake_up()" does, and has led to problems. In other cases, it has
+ * been fine, because there's only ever one waiter (kvm), but in that
+ * case gthe whole "simple" wait-queue is just pointless to begin with,
+ * since there is no "queue". Use "wake_up_process()" with a direct
+ * pointer instead.
  *
  * While these are very similar to regular wait queues (wait.h) the most
  * important difference is that the simple waitqueue allows for deterministic
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 2417d28..c063443 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -53,7 +53,7 @@ static inline int current_is_kswapd(void)
 
 /*
  * Unaddressable device memory support. See include/linux/hmm.h and
- * Documentation/vm/hmm.txt. Short description is we need struct pages for
+ * Documentation/vm/hmm.rst. Short description is we need struct pages for
  * device memory that is unaddressable (inaccessible) by CPU, so that we can
  * migrate part of a process memory to device memory.
  *
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 70fcda1..811172f 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -290,6 +290,12 @@ asmlinkage long sys_io_getevents(aio_context_t ctx_id,
 				long nr,
 				struct io_event __user *events,
 				struct timespec __user *timeout);
+asmlinkage long sys_io_pgetevents(aio_context_t ctx_id,
+				long min_nr,
+				long nr,
+				struct io_event __user *events,
+				struct timespec __user *timeout,
+				const struct __aio_sigset *sig);
 
 /* fs/xattr.c */
 asmlinkage long sys_setxattr(const char __user *path, const char __user *name,
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 26c1521..4a88419 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -124,6 +124,7 @@ static inline void tracehook_report_syscall_exit(struct pt_regs *regs, int step)
 {
 	if (step) {
 		siginfo_t info;
+		clear_siginfo(&info);
 		user_single_step_siginfo(current, regs, &info);
 		force_sig_info(SIGTRAP, &info, current);
 		return;
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 1dd587ba..9bd7d37 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -10,6 +10,7 @@
 #include <linux/tty_ldisc.h>
 #include <linux/mutex.h>
 #include <linux/tty_flags.h>
+#include <linux/seq_file.h>
 #include <uapi/linux/tty.h>
 #include <linux/rwsem.h>
 #include <linux/llist.h>
@@ -535,7 +536,7 @@ extern void tty_ldisc_deref(struct tty_ldisc *);
 extern struct tty_ldisc *tty_ldisc_ref_wait(struct tty_struct *);
 extern void tty_ldisc_hangup(struct tty_struct *tty, bool reset);
 extern int tty_ldisc_reinit(struct tty_struct *tty, int disc);
-extern const struct file_operations tty_ldiscs_proc_fops;
+extern const struct seq_operations tty_ldiscs_seq_ops;
 
 extern void tty_wakeup(struct tty_struct *tty);
 extern void tty_ldisc_flush(struct tty_struct *tty);
diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h
index 31c2b5b..71dbc89 100644
--- a/include/linux/tty_driver.h
+++ b/include/linux/tty_driver.h
@@ -293,7 +293,7 @@ struct tty_operations {
 	int (*poll_get_char)(struct tty_driver *driver, int line);
 	void (*poll_put_char)(struct tty_driver *driver, int line, char ch);
 #endif
-	const struct file_operations *proc_fops;
+	int (*proc_show)(struct seq_file *, void *);
 } __randomize_layout;
 
 struct tty_driver {
diff --git a/include/linux/uio.h b/include/linux/uio.h
index e67e12a..f5766e8 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -154,6 +154,12 @@ size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i);
 #define _copy_from_iter_flushcache _copy_from_iter_nocache
 #endif
 
+#ifdef CONFIG_ARCH_HAS_UACCESS_MCSAFE
+size_t _copy_to_iter_mcsafe(void *addr, size_t bytes, struct iov_iter *i);
+#else
+#define _copy_to_iter_mcsafe _copy_to_iter
+#endif
+
 static __always_inline __must_check
 size_t copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
 {
@@ -163,6 +169,15 @@ size_t copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
 		return _copy_from_iter_flushcache(addr, bytes, i);
 }
 
+static __always_inline __must_check
+size_t copy_to_iter_mcsafe(void *addr, size_t bytes, struct iov_iter *i)
+{
+	if (unlikely(!check_copy_size(addr, bytes, false)))
+		return 0;
+	else
+		return _copy_to_iter_mcsafe(addr, bytes, i);
+}
+
 size_t iov_iter_zero(size_t bytes, struct iov_iter *);
 unsigned long iov_iter_alignment(const struct iov_iter *i);
 unsigned long iov_iter_gap_alignment(const struct iov_iter *i);
diff --git a/include/linux/xattr.h b/include/linux/xattr.h
index d70f77a4..6dad031 100644
--- a/include/linux/xattr.h
+++ b/include/linux/xattr.h
@@ -46,7 +46,6 @@ struct xattr {
 	size_t value_len;
 };
 
-ssize_t xattr_getsecurity(struct inode *, const char *, void *, size_t);
 ssize_t __vfs_getxattr(struct dentry *, struct inode *, const char *, void *, size_t);
 ssize_t vfs_getxattr(struct dentry *, const char *, void *, size_t);
 ssize_t vfs_listxattr(struct dentry *d, char *list, size_t size);
diff --git a/include/net/ax25.h b/include/net/ax25.h
index c91bc87..3f9aea8 100644
--- a/include/net/ax25.h
+++ b/include/net/ax25.h
@@ -15,6 +15,7 @@
 #include <linux/refcount.h>
 #include <net/neighbour.h>
 #include <net/sock.h>
+#include <linux/seq_file.h>
 
 #define	AX25_T1CLAMPLO  		1
 #define	AX25_T1CLAMPHI 			(30 * HZ)
@@ -399,7 +400,7 @@ int ax25_check_iframes_acked(ax25_cb *, unsigned short);
 /* ax25_route.c */
 void ax25_rt_device_down(struct net_device *);
 int ax25_rt_ioctl(unsigned int, void __user *);
-extern const struct file_operations ax25_route_fops;
+extern const struct seq_operations ax25_rt_seqops;
 ax25_route *ax25_get_route(ax25_address *addr, struct net_device *dev);
 int ax25_rt_autobind(ax25_cb *, ax25_address *);
 struct sk_buff *ax25_rt_build_path(struct sk_buff *, ax25_address *,
@@ -455,7 +456,7 @@ unsigned long ax25_display_timer(struct timer_list *);
 extern int  ax25_uid_policy;
 ax25_uid_assoc *ax25_findbyuid(kuid_t);
 int __must_check ax25_uid_ioctl(int, struct sockaddr_ax25 *);
-extern const struct file_operations ax25_uid_fops;
+extern const struct seq_operations ax25_uid_seqops;
 void ax25_uid_free(void);
 
 /* sysctl_net_ax25.c */
diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h
index ec9d6bc..53ce817 100644
--- a/include/net/bluetooth/bluetooth.h
+++ b/include/net/bluetooth/bluetooth.h
@@ -271,7 +271,7 @@ int  bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 		     int flags);
 int  bt_sock_stream_recvmsg(struct socket *sock, struct msghdr *msg,
 			    size_t len, int flags);
-__poll_t bt_sock_poll(struct file *file, struct socket *sock, poll_table *wait);
+__poll_t bt_sock_poll_mask(struct socket *sock, __poll_t events);
 int  bt_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
 int  bt_sock_wait_state(struct sock *sk, int state, unsigned long timeo);
 int  bt_sock_wait_ready(struct sock *sk, unsigned long flags);
diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h
index 71c72a9..c518743 100644
--- a/include/net/busy_poll.h
+++ b/include/net/busy_poll.h
@@ -121,6 +121,21 @@ static inline void sk_busy_loop(struct sock *sk, int nonblock)
 #endif
 }
 
+static inline void sock_poll_busy_loop(struct socket *sock, __poll_t events)
+{
+	if (sk_can_busy_loop(sock->sk) &&
+	    events && (events & POLL_BUSY_LOOP)) {
+		/* once, only if requested by syscall */
+		sk_busy_loop(sock->sk, 1);
+	}
+}
+
+/* if this socket can poll_ll, tell the system call */
+static inline __poll_t sock_poll_busy_flag(struct socket *sock)
+{
+	return sk_can_busy_loop(sock->sk) ? POLL_BUSY_LOOP : 0;
+}
+
 /* used in the NIC receive handler to mark the skb */
 static inline void skb_mark_napi_id(struct sk_buff *skb,
 				    struct napi_struct *napi)
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 5e86fd9..0e79c34 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -394,7 +394,15 @@ void fib6_gc_cleanup(void);
 
 int fib6_init(void);
 
-int ipv6_route_open(struct inode *inode, struct file *file);
+struct ipv6_route_iter {
+	struct seq_net_private p;
+	struct fib6_walker w;
+	loff_t skip;
+	struct fib6_table *tbl;
+	int sernum;
+};
+
+extern const struct seq_operations ipv6_route_seq_ops;
 
 int call_fib6_notifier(struct notifier_block *nb, struct net *net,
 		       enum fib_event_type event_type,
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index eb0bec0..aea7a12 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -41,18 +41,6 @@ static inline struct netns_ipvs *net_ipvs(struct net* net)
 	return net->ipvs;
 }
 
-/* This one needed for single_open_net since net is stored directly in
- * private not as a struct i.e. seq_file_net can't be used.
- */
-static inline struct net *seq_file_single_net(struct seq_file *seq)
-{
-#ifdef CONFIG_NET_NS
-	return (struct net *)seq->private;
-#else
-	return &init_net;
-#endif
-}
-
 /* Connections' size value needed by ip_vs_ctl.c */
 extern int ip_vs_conn_tab_size;
 
diff --git a/include/net/iucv/af_iucv.h b/include/net/iucv/af_iucv.h
index f4c21b5..b0eaeb0 100644
--- a/include/net/iucv/af_iucv.h
+++ b/include/net/iucv/af_iucv.h
@@ -153,8 +153,6 @@ struct iucv_sock_list {
 	atomic_t	  autobind_name;
 };
 
-__poll_t iucv_sock_poll(struct file *file, struct socket *sock,
-			    poll_table *wait);
 void iucv_sock_link(struct iucv_sock_list *l, struct sock *s);
 void iucv_sock_unlink(struct iucv_sock_list *l, struct sock *s);
 void iucv_accept_enqueue(struct sock *parent, struct sock *sk);
diff --git a/include/net/netrom.h b/include/net/netrom.h
index 0dad2dd..5a0714f 100644
--- a/include/net/netrom.h
+++ b/include/net/netrom.h
@@ -13,6 +13,7 @@
 #include <linux/slab.h>
 #include <net/sock.h>
 #include <linux/refcount.h>
+#include <linux/seq_file.h>
 
 #define	NR_NETWORK_LEN			15
 #define	NR_TRANSPORT_LEN		5
@@ -216,8 +217,8 @@ struct net_device *nr_dev_get(ax25_address *);
 int nr_rt_ioctl(unsigned int, void __user *);
 void nr_link_failed(ax25_cb *, int);
 int nr_route_frame(struct sk_buff *, ax25_cb *);
-extern const struct file_operations nr_nodes_fops;
-extern const struct file_operations nr_neigh_fops;
+extern const struct seq_operations nr_node_seqops;
+extern const struct seq_operations nr_neigh_seqops;
 void nr_rt_free(void);
 
 /* nr_subr.c */
diff --git a/include/net/phonet/pn_dev.h b/include/net/phonet/pn_dev.h
index 8639de5..cbee32b 100644
--- a/include/net/phonet/pn_dev.h
+++ b/include/net/phonet/pn_dev.h
@@ -56,7 +56,7 @@ struct net_device *phonet_route_output(struct net *net, u8 daddr);
 
 #define PN_NO_ADDR	0xff
 
-extern const struct file_operations pn_sock_seq_fops;
-extern const struct file_operations pn_res_seq_fops;
+extern const struct seq_operations pn_sock_seq_ops;
+extern const struct seq_operations pn_res_seq_ops;
 
 #endif
diff --git a/include/net/ping.h b/include/net/ping.h
index 4cd90d6..fd080e0 100644
--- a/include/net/ping.h
+++ b/include/net/ping.h
@@ -83,20 +83,9 @@ int  ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
 bool ping_rcv(struct sk_buff *skb);
 
 #ifdef CONFIG_PROC_FS
-struct ping_seq_afinfo {
-	char				*name;
-	sa_family_t			family;
-	const struct file_operations	*seq_fops;
-	const struct seq_operations	seq_ops;
-};
-
-extern const struct file_operations ping_seq_fops;
-
 void *ping_seq_start(struct seq_file *seq, loff_t *pos, sa_family_t family);
 void *ping_seq_next(struct seq_file *seq, void *v, loff_t *pos);
 void ping_seq_stop(struct seq_file *seq, void *v);
-int ping_proc_register(struct net *net, struct ping_seq_afinfo *afinfo);
-void ping_proc_unregister(struct net *net, struct ping_seq_afinfo *afinfo);
 
 int __init ping_proc_init(void);
 void ping_proc_exit(void);
diff --git a/include/net/raw.h b/include/net/raw.h
index 99d26d0..9c9fa98 100644
--- a/include/net/raw.h
+++ b/include/net/raw.h
@@ -48,7 +48,6 @@ void raw_proc_exit(void);
 struct raw_iter_state {
 	struct seq_net_private p;
 	int bucket;
-	struct raw_hashinfo *h;
 };
 
 static inline struct raw_iter_state *raw_seq_private(struct seq_file *seq)
@@ -58,9 +57,6 @@ static inline struct raw_iter_state *raw_seq_private(struct seq_file *seq)
 void *raw_seq_start(struct seq_file *seq, loff_t *pos);
 void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos);
 void raw_seq_stop(struct seq_file *seq, void *v);
-int raw_seq_open(struct inode *ino, struct file *file,
-		 struct raw_hashinfo *h, const struct seq_operations *ops);
-
 #endif
 
 int raw_hash_sk(struct sock *sk);
diff --git a/include/net/rose.h b/include/net/rose.h
index 04b7268..cf517d3 100644
--- a/include/net/rose.h
+++ b/include/net/rose.h
@@ -200,9 +200,9 @@ void rose_enquiry_response(struct sock *);
 
 /* rose_route.c */
 extern struct rose_neigh *rose_loopback_neigh;
-extern const struct file_operations rose_neigh_fops;
-extern const struct file_operations rose_nodes_fops;
-extern const struct file_operations rose_routes_fops;
+extern const struct seq_operations rose_neigh_seqops;
+extern const struct seq_operations rose_node_seqops;
+extern struct seq_operations rose_route_seqops;
 
 void rose_add_loopback_neigh(void);
 int __must_check rose_add_loopback_node(rose_address *);
diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 35498e6..e6d349b 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -109,8 +109,7 @@ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb);
 int sctp_inet_listen(struct socket *sock, int backlog);
 void sctp_write_space(struct sock *sk);
 void sctp_data_ready(struct sock *sk);
-__poll_t sctp_poll(struct file *file, struct socket *sock,
-		poll_table *wait);
+__poll_t sctp_poll_mask(struct socket *sock, __poll_t events);
 void sctp_sock_rfree(struct sk_buff *skb);
 void sctp_copy_sock(struct sock *newsk, struct sock *sk,
 		    struct sctp_association *asoc);
diff --git a/include/net/sock.h b/include/net/sock.h
index 74d725f..4d2e8ad 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1591,8 +1591,6 @@ int sock_no_connect(struct socket *, struct sockaddr *, int, int);
 int sock_no_socketpair(struct socket *, struct socket *);
 int sock_no_accept(struct socket *, struct socket *, int, bool);
 int sock_no_getname(struct socket *, struct sockaddr *, int);
-__poll_t sock_no_poll(struct file *, struct socket *,
-			  struct poll_table_struct *);
 int sock_no_ioctl(struct socket *, unsigned int, unsigned long);
 int sock_no_listen(struct socket *, int);
 int sock_no_shutdown(struct socket *, int);
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 9c9b376..f88f8a2 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -388,8 +388,7 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst);
 void tcp_close(struct sock *sk, long timeout);
 void tcp_init_sock(struct sock *sk);
 void tcp_init_transfer(struct sock *sk, int bpf_op);
-__poll_t tcp_poll(struct file *file, struct socket *sock,
-		      struct poll_table_struct *wait);
+__poll_t tcp_poll_mask(struct socket *sock, __poll_t events);
 int tcp_getsockopt(struct sock *sk, int level, int optname,
 		   char __user *optval, int __user *optlen);
 int tcp_setsockopt(struct sock *sk, int level, int optname,
@@ -1747,27 +1746,22 @@ enum tcp_seq_states {
 	TCP_SEQ_STATE_ESTABLISHED,
 };
 
-int tcp_seq_open(struct inode *inode, struct file *file);
+void *tcp_seq_start(struct seq_file *seq, loff_t *pos);
+void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos);
+void tcp_seq_stop(struct seq_file *seq, void *v);
 
 struct tcp_seq_afinfo {
-	char				*name;
 	sa_family_t			family;
-	const struct file_operations	*seq_fops;
-	struct seq_operations		seq_ops;
 };
 
 struct tcp_iter_state {
 	struct seq_net_private	p;
-	sa_family_t		family;
 	enum tcp_seq_states	state;
 	struct sock		*syn_wait_sk;
 	int			bucket, offset, sbucket, num;
 	loff_t			last_pos;
 };
 
-int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo);
-void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo);
-
 extern struct request_sock_ops tcp_request_sock_ops;
 extern struct request_sock_ops tcp6_request_sock_ops;
 
diff --git a/include/net/udp.h b/include/net/udp.h
index 0676b27..d8ca3b2 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -276,7 +276,7 @@ int udp_init_sock(struct sock *sk);
 int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
 int __udp_disconnect(struct sock *sk, int flags);
 int udp_disconnect(struct sock *sk, int flags);
-__poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait);
+__poll_t udp_poll_mask(struct socket *sock, __poll_t events);
 struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
 				       netdev_features_t features,
 				       bool is_ipv6);
@@ -408,31 +408,27 @@ do {									\
 #define __UDPX_INC_STATS(sk, field) __UDP_INC_STATS(sock_net(sk), field, 0)
 #endif
 
-/* /proc */
-int udp_seq_open(struct inode *inode, struct file *file);
-
+#ifdef CONFIG_PROC_FS
 struct udp_seq_afinfo {
-	char				*name;
 	sa_family_t			family;
 	struct udp_table		*udp_table;
-	const struct file_operations	*seq_fops;
-	struct seq_operations		seq_ops;
 };
 
 struct udp_iter_state {
 	struct seq_net_private  p;
-	sa_family_t		family;
 	int			bucket;
-	struct udp_table	*udp_table;
 };
 
-#ifdef CONFIG_PROC_FS
-int udp_proc_register(struct net *net, struct udp_seq_afinfo *afinfo);
-void udp_proc_unregister(struct net *net, struct udp_seq_afinfo *afinfo);
+void *udp_seq_start(struct seq_file *seq, loff_t *pos);
+void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos);
+void udp_seq_stop(struct seq_file *seq, void *v);
+
+extern const struct seq_operations udp_seq_ops;
+extern const struct seq_operations udp6_seq_ops;
 
 int udp4_proc_init(void);
 void udp4_proc_exit(void);
-#endif
+#endif /* CONFIG_PROC_FS */
 
 int udpv4_offload_init(void);
 
diff --git a/include/scsi/osd_initiator.h b/include/scsi/osd_initiator.h
index a29d308..86a569d 100644
--- a/include/scsi/osd_initiator.h
+++ b/include/scsi/osd_initiator.h
@@ -148,7 +148,6 @@ struct osd_request {
 		u8 *pad_buff;
 	} out, in;
 
-	gfp_t alloc_flags;
 	unsigned timeout;
 	unsigned retries;
 	unsigned sense_len;
@@ -202,14 +201,11 @@ static inline bool osd_req_is_ver1(struct osd_request *or)
  *
  * @osd_dev:    OSD device that holds the scsi-device and default values
  *              that the request is associated with.
- * @gfp:        The allocation flags to use for request allocation, and all
- *              subsequent allocations. This will be stored at
- *              osd_request->alloc_flags, can be changed by user later
  *
  * Allocate osd_request and initialize all members to the
  * default/initial state.
  */
-struct osd_request *osd_start_request(struct osd_dev *od, gfp_t gfp);
+struct osd_request *osd_start_request(struct osd_dev *od);
 
 enum osd_req_options {
 	OSD_REQ_FUA = 0x08,	/* Force Unit Access */
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index 12f454c..53b485f 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -307,7 +307,7 @@ struct scsi_host_template {
 	 * EH_HANDLED:		I fixed the error, please complete the command
 	 * EH_RESET_TIMER:	I need more time, reset the timer and
 	 *			begin counting again
-	 * EH_NOT_HANDLED	Begin normal error recovery
+	 * EH_DONE:		Begin normal error recovery
 	 *
 	 * Status: OPTIONAL
 	 */
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 965c650..39b94ec 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -121,9 +121,9 @@ TRACE_EVENT(btrfs_transaction_commit,
 		__entry->root_objectid	= root->root_key.objectid;
 	),
 
-	TP_printk_btrfs("root = %llu(%s), gen = %llu",
+	TP_printk_btrfs("root=%llu(%s) gen=%llu",
 		  show_root_type(__entry->root_objectid),
-		  (unsigned long long)__entry->generation)
+		  __entry->generation)
 );
 
 DECLARE_EVENT_CLASS(btrfs__inode,
@@ -133,7 +133,7 @@ DECLARE_EVENT_CLASS(btrfs__inode,
 	TP_ARGS(inode),
 
 	TP_STRUCT__entry_btrfs(
-		__field(	ino_t,  ino			)
+		__field(	u64,  ino			)
 		__field(	blkcnt_t,  blocks		)
 		__field(	u64,  disk_i_size		)
 		__field(	u64,  generation		)
@@ -143,7 +143,7 @@ DECLARE_EVENT_CLASS(btrfs__inode,
 	),
 
 	TP_fast_assign_btrfs(btrfs_sb(inode->i_sb),
-		__entry->ino	= inode->i_ino;
+		__entry->ino	= btrfs_ino(BTRFS_I(inode));
 		__entry->blocks	= inode->i_blocks;
 		__entry->disk_i_size  = BTRFS_I(inode)->disk_i_size;
 		__entry->generation = BTRFS_I(inode)->generation;
@@ -153,15 +153,15 @@ DECLARE_EVENT_CLASS(btrfs__inode,
 				BTRFS_I(inode)->root->root_key.objectid;
 	),
 
-	TP_printk_btrfs("root=%llu(%s) gen=%llu ino=%lu blocks=%llu "
+	TP_printk_btrfs("root=%llu(%s) gen=%llu ino=%llu blocks=%llu "
 		  "disk_i_size=%llu last_trans=%llu logged_trans=%llu",
 		  show_root_type(__entry->root_objectid),
-		  (unsigned long long)__entry->generation,
-		  (unsigned long)__entry->ino,
+		  __entry->generation,
+		  __entry->ino,
 		  (unsigned long long)__entry->blocks,
-		  (unsigned long long)__entry->disk_i_size,
-		  (unsigned long long)__entry->last_trans,
-		  (unsigned long long)__entry->logged_trans)
+		  __entry->disk_i_size,
+		  __entry->last_trans,
+		  __entry->logged_trans)
 );
 
 DEFINE_EVENT(btrfs__inode, btrfs_inode_new,
@@ -244,23 +244,25 @@ TRACE_EVENT_CONDITION(btrfs_get_extent,
 		  "block_len=%llu flags=%s refs=%u "
 		  "compress_type=%u",
 		  show_root_type(__entry->root_objectid),
-		  (unsigned long long)__entry->ino,
-		  (unsigned long long)__entry->start,
-		  (unsigned long long)__entry->len,
-		  (unsigned long long)__entry->orig_start,
+		  __entry->ino,
+		  __entry->start,
+		  __entry->len,
+		  __entry->orig_start,
 		  show_map_type(__entry->block_start),
-		  (unsigned long long)__entry->block_len,
+		  __entry->block_len,
 		  show_map_flags(__entry->flags),
 		  __entry->refs, __entry->compress_type)
 );
 
 TRACE_EVENT(btrfs_handle_em_exist,
 
-	TP_PROTO(const struct extent_map *existing, const struct extent_map *map, u64 start, u64 len),
+	TP_PROTO(struct btrfs_fs_info *fs_info,
+		const struct extent_map *existing, const struct extent_map *map,
+		u64 start, u64 len),
 
-	TP_ARGS(existing, map, start, len),
+	TP_ARGS(fs_info, existing, map, start, len),
 
-	TP_STRUCT__entry(
+	TP_STRUCT__entry_btrfs(
 		__field(	u64,  e_start		)
 		__field(	u64,  e_len		)
 		__field(	u64,  map_start		)
@@ -269,7 +271,7 @@ TRACE_EVENT(btrfs_handle_em_exist,
 		__field(	u64,  len		)
 	),
 
-	TP_fast_assign(
+	TP_fast_assign_btrfs(fs_info,
 		__entry->e_start	= existing->start;
 		__entry->e_len		= existing->len;
 		__entry->map_start	= map->start;
@@ -278,15 +280,15 @@ TRACE_EVENT(btrfs_handle_em_exist,
 		__entry->len		= len;
 	),
 
-	TP_printk("start=%llu len=%llu "
+	TP_printk_btrfs("start=%llu len=%llu "
 		  "existing(start=%llu len=%llu) "
 		  "em(start=%llu len=%llu)",
-		  (unsigned long long)__entry->start,
-		  (unsigned long long)__entry->len,
-		  (unsigned long long)__entry->e_start,
-		  (unsigned long long)__entry->e_len,
-		  (unsigned long long)__entry->map_start,
-		  (unsigned long long)__entry->map_len)
+		  __entry->start,
+		  __entry->len,
+		  __entry->e_start,
+		  __entry->e_len,
+		  __entry->map_start,
+		  __entry->map_len)
 );
 
 /* file extent item */
@@ -443,7 +445,7 @@ DECLARE_EVENT_CLASS(btrfs__ordered_extent,
 	TP_ARGS(inode, ordered),
 
 	TP_STRUCT__entry_btrfs(
-		__field(	ino_t,  ino		)
+		__field(	u64,  ino		)
 		__field(	u64,  file_offset	)
 		__field(	u64,  start		)
 		__field(	u64,  len		)
@@ -457,7 +459,7 @@ DECLARE_EVENT_CLASS(btrfs__ordered_extent,
 	),
 
 	TP_fast_assign_btrfs(btrfs_sb(inode->i_sb),
-		__entry->ino 		= inode->i_ino;
+		__entry->ino 		= btrfs_ino(BTRFS_I(inode));
 		__entry->file_offset	= ordered->file_offset;
 		__entry->start		= ordered->start;
 		__entry->len		= ordered->len;
@@ -477,13 +479,13 @@ DECLARE_EVENT_CLASS(btrfs__ordered_extent,
 		  "bytes_left=%llu flags=%s compress_type=%d "
 		  "refs=%d",
 		  show_root_type(__entry->root_objectid),
-		  (unsigned long long)__entry->ino,
-		  (unsigned long long)__entry->file_offset,
-		  (unsigned long long)__entry->start,
-		  (unsigned long long)__entry->len,
-		  (unsigned long long)__entry->disk_len,
-		  (unsigned long long)__entry->truncated_len,
-		  (unsigned long long)__entry->bytes_left,
+		  __entry->ino,
+		  __entry->file_offset,
+		  __entry->start,
+		  __entry->len,
+		  __entry->disk_len,
+		  __entry->truncated_len,
+		  __entry->bytes_left,
 		  show_ordered_flags(__entry->flags),
 		  __entry->compress_type, __entry->refs)
 );
@@ -528,7 +530,7 @@ DECLARE_EVENT_CLASS(btrfs__writepage,
 	TP_ARGS(page, inode, wbc),
 
 	TP_STRUCT__entry_btrfs(
-		__field(	ino_t,  ino			)
+		__field(	u64,	ino			)
 		__field(	pgoff_t,  index			)
 		__field(	long,   nr_to_write		)
 		__field(	long,   pages_skipped		)
@@ -542,7 +544,7 @@ DECLARE_EVENT_CLASS(btrfs__writepage,
 	),
 
 	TP_fast_assign_btrfs(btrfs_sb(inode->i_sb),
-		__entry->ino		= inode->i_ino;
+		__entry->ino		= btrfs_ino(BTRFS_I(inode));
 		__entry->index		= page->index;
 		__entry->nr_to_write	= wbc->nr_to_write;
 		__entry->pages_skipped	= wbc->pages_skipped;
@@ -556,12 +558,12 @@ DECLARE_EVENT_CLASS(btrfs__writepage,
 				 BTRFS_I(inode)->root->root_key.objectid;
 	),
 
-	TP_printk_btrfs("root=%llu(%s) ino=%lu page_index=%lu "
+	TP_printk_btrfs("root=%llu(%s) ino=%llu page_index=%lu "
 		  "nr_to_write=%ld pages_skipped=%ld range_start=%llu "
 		  "range_end=%llu for_kupdate=%d "
 		  "for_reclaim=%d range_cyclic=%d writeback_index=%lu",
 		  show_root_type(__entry->root_objectid),
-		  (unsigned long)__entry->ino, __entry->index,
+		  __entry->ino, __entry->index,
 		  __entry->nr_to_write, __entry->pages_skipped,
 		  __entry->range_start, __entry->range_end,
 		  __entry->for_kupdate,
@@ -584,7 +586,7 @@ TRACE_EVENT(btrfs_writepage_end_io_hook,
 	TP_ARGS(page, start, end, uptodate),
 
 	TP_STRUCT__entry_btrfs(
-		__field(	ino_t,	 ino		)
+		__field(	u64,	 ino		)
 		__field(	pgoff_t, index		)
 		__field(	u64,	 start		)
 		__field(	u64,	 end		)
@@ -593,7 +595,7 @@ TRACE_EVENT(btrfs_writepage_end_io_hook,
 	),
 
 	TP_fast_assign_btrfs(btrfs_sb(page->mapping->host->i_sb),
-		__entry->ino	= page->mapping->host->i_ino;
+		__entry->ino	= btrfs_ino(BTRFS_I(page->mapping->host));
 		__entry->index	= page->index;
 		__entry->start	= start;
 		__entry->end	= end;
@@ -602,12 +604,12 @@ TRACE_EVENT(btrfs_writepage_end_io_hook,
 			 BTRFS_I(page->mapping->host)->root->root_key.objectid;
 	),
 
-	TP_printk_btrfs("root=%llu(%s) ino=%lu page_index=%lu start=%llu "
+	TP_printk_btrfs("root=%llu(%s) ino=%llu page_index=%lu start=%llu "
 		  "end=%llu uptodate=%d",
 		  show_root_type(__entry->root_objectid),
-		  (unsigned long)__entry->ino, (unsigned long)__entry->index,
-		  (unsigned long long)__entry->start,
-		  (unsigned long long)__entry->end, __entry->uptodate)
+		  __entry->ino, (unsigned long)__entry->index,
+		  __entry->start,
+		  __entry->end, __entry->uptodate)
 );
 
 TRACE_EVENT(btrfs_sync_file,
@@ -617,8 +619,8 @@ TRACE_EVENT(btrfs_sync_file,
 	TP_ARGS(file, datasync),
 
 	TP_STRUCT__entry_btrfs(
-		__field(	ino_t,  ino		)
-		__field(	ino_t,  parent		)
+		__field(	u64,	ino		)
+		__field(	u64,	parent		)
 		__field(	int,    datasync	)
 		__field(	u64,    root_objectid	)
 	),
@@ -628,16 +630,17 @@ TRACE_EVENT(btrfs_sync_file,
 		const struct inode *inode = d_inode(dentry);
 
 		TP_fast_assign_fsid(btrfs_sb(file->f_path.dentry->d_sb));
-		__entry->ino		= inode->i_ino;
-		__entry->parent		= d_inode(dentry->d_parent)->i_ino;
+		__entry->ino		= btrfs_ino(BTRFS_I(inode));
+		__entry->parent		= btrfs_ino(BTRFS_I(d_inode(dentry->d_parent)));
 		__entry->datasync	= datasync;
 		__entry->root_objectid	=
 				 BTRFS_I(inode)->root->root_key.objectid;
 	),
 
-	TP_printk_btrfs("root=%llu(%s) ino=%ld parent=%ld datasync=%d",
+	TP_printk_btrfs("root=%llu(%s) ino=%llu parent=%llu datasync=%d",
 		  show_root_type(__entry->root_objectid),
-		  (unsigned long)__entry->ino, (unsigned long)__entry->parent,
+		  __entry->ino,
+		  __entry->parent,
 		  __entry->datasync)
 );
 
@@ -655,7 +658,7 @@ TRACE_EVENT(btrfs_sync_fs,
 		__entry->wait	= wait;
 	),
 
-	TP_printk_btrfs("wait = %d", __entry->wait)
+	TP_printk_btrfs("wait=%d", __entry->wait)
 );
 
 TRACE_EVENT(btrfs_add_block_group,
@@ -665,8 +668,7 @@ TRACE_EVENT(btrfs_add_block_group,
 
 	TP_ARGS(fs_info, block_group, create),
 
-	TP_STRUCT__entry(
-		__array(	u8,	fsid,	BTRFS_FSID_SIZE	)
+	TP_STRUCT__entry_btrfs(
 		__field(	u64,	offset			)
 		__field(	u64,	size			)
 		__field(	u64,	flags			)
@@ -675,8 +677,7 @@ TRACE_EVENT(btrfs_add_block_group,
 		__field(	int,	create			)
 	),
 
-	TP_fast_assign(
-		memcpy(__entry->fsid, fs_info->fsid, BTRFS_FSID_SIZE);
+	TP_fast_assign_btrfs(fs_info,
 		__entry->offset		= block_group->key.objectid;
 		__entry->size		= block_group->key.offset;
 		__entry->flags		= block_group->flags;
@@ -686,16 +687,16 @@ TRACE_EVENT(btrfs_add_block_group,
 		__entry->create		= create;
 	),
 
-	TP_printk("%pU: block_group offset=%llu size=%llu "
+	TP_printk_btrfs("block_group offset=%llu size=%llu "
 		  "flags=%llu(%s) bytes_used=%llu bytes_super=%llu "
-		  "create=%d", __entry->fsid,
-		  (unsigned long long)__entry->offset,
-		  (unsigned long long)__entry->size,
-		  (unsigned long long)__entry->flags,
+		  "create=%d",
+		  __entry->offset,
+		  __entry->size,
+		  __entry->flags,
 		  __print_flags((unsigned long)__entry->flags, "|",
 				BTRFS_GROUP_FLAGS),
-		  (unsigned long long)__entry->bytes_used,
-		  (unsigned long long)__entry->bytes_super, __entry->create)
+		  __entry->bytes_used,
+		  __entry->bytes_super, __entry->create)
 );
 
 #define show_ref_action(action)						\
@@ -740,13 +741,13 @@ DECLARE_EVENT_CLASS(btrfs_delayed_tree_ref,
 	TP_printk_btrfs("bytenr=%llu num_bytes=%llu action=%s "
 		  "parent=%llu(%s) ref_root=%llu(%s) level=%d "
 		  "type=%s seq=%llu",
-		  (unsigned long long)__entry->bytenr,
-		  (unsigned long long)__entry->num_bytes,
+		  __entry->bytenr,
+		  __entry->num_bytes,
 		  show_ref_action(__entry->action),
 		  show_root_type(__entry->parent),
 		  show_root_type(__entry->ref_root),
 		  __entry->level, show_ref_type(__entry->type),
-		  (unsigned long long)__entry->seq)
+		  __entry->seq)
 );
 
 DEFINE_EVENT(btrfs_delayed_tree_ref,  add_delayed_tree_ref,
@@ -805,15 +806,15 @@ DECLARE_EVENT_CLASS(btrfs_delayed_data_ref,
 	TP_printk_btrfs("bytenr=%llu num_bytes=%llu action=%s "
 		  "parent=%llu(%s) ref_root=%llu(%s) owner=%llu "
 		  "offset=%llu type=%s seq=%llu",
-		  (unsigned long long)__entry->bytenr,
-		  (unsigned long long)__entry->num_bytes,
+		  __entry->bytenr,
+		  __entry->num_bytes,
 		  show_ref_action(__entry->action),
 		  show_root_type(__entry->parent),
 		  show_root_type(__entry->ref_root),
-		  (unsigned long long)__entry->owner,
-		  (unsigned long long)__entry->offset,
+		  __entry->owner,
+		  __entry->offset,
 		  show_ref_type(__entry->type),
-		  (unsigned long long)__entry->seq)
+		  __entry->seq)
 );
 
 DEFINE_EVENT(btrfs_delayed_data_ref,  add_delayed_data_ref,
@@ -859,8 +860,8 @@ DECLARE_EVENT_CLASS(btrfs_delayed_ref_head,
 	),
 
 	TP_printk_btrfs("bytenr=%llu num_bytes=%llu action=%s is_data=%d",
-		  (unsigned long long)__entry->bytenr,
-		  (unsigned long long)__entry->num_bytes,
+		  __entry->bytenr,
+		  __entry->num_bytes,
 		  show_ref_action(__entry->action),
 		  __entry->is_data)
 );
@@ -923,8 +924,8 @@ DECLARE_EVENT_CLASS(btrfs__chunk,
 	TP_printk_btrfs("root=%llu(%s) offset=%llu size=%llu "
 		  "num_stripes=%d sub_stripes=%d type=%s",
 		  show_root_type(__entry->root_objectid),
-		  (unsigned long long)__entry->offset,
-		  (unsigned long long)__entry->size,
+		  __entry->offset,
+		  __entry->size,
 		  __entry->num_stripes, __entry->sub_stripes,
 		  show_chunk_type(__entry->type))
 );
@@ -974,9 +975,9 @@ TRACE_EVENT(btrfs_cow_block,
 		  "(orig_level=%d) cow_buf=%llu (cow_level=%d)",
 		  show_root_type(__entry->root_objectid),
 		  __entry->refs,
-		  (unsigned long long)__entry->buf_start,
+		  __entry->buf_start,
 		  __entry->buf_level,
-		  (unsigned long long)__entry->cow_start,
+		  __entry->cow_start,
 		  __entry->cow_level)
 );
 
@@ -1001,7 +1002,7 @@ TRACE_EVENT(btrfs_space_reservation,
 		__entry->reserve	= reserve;
 	),
 
-	TP_printk_btrfs("%s: %Lu %s %Lu", __get_str(type), __entry->val,
+	TP_printk_btrfs("%s: %llu %s %llu", __get_str(type), __entry->val,
 			__entry->reserve ? "reserve" : "release",
 			__entry->bytes)
 );
@@ -1019,29 +1020,27 @@ TRACE_EVENT(btrfs_trigger_flush,
 
 	TP_ARGS(fs_info, flags, bytes, flush, reason),
 
-	TP_STRUCT__entry(
-		__array(	u8,	fsid,	BTRFS_FSID_SIZE	)
+	TP_STRUCT__entry_btrfs(
 		__field(	u64,	flags			)
 		__field(	u64,	bytes			)
 		__field(	int,	flush			)
 		__string(	reason,	reason			)
 	),
 
-	TP_fast_assign(
-		memcpy(__entry->fsid, fs_info->fsid, BTRFS_FSID_SIZE);
+	TP_fast_assign_btrfs(fs_info,
 		__entry->flags	= flags;
 		__entry->bytes	= bytes;
 		__entry->flush	= flush;
 		__assign_str(reason, reason)
 	),
 
-	TP_printk("%pU: %s: flush=%d(%s) flags=%llu(%s) bytes=%llu",
-		  __entry->fsid, __get_str(reason), __entry->flush,
+	TP_printk_btrfs("%s: flush=%d(%s) flags=%llu(%s) bytes=%llu",
+		  __get_str(reason), __entry->flush,
 		  show_flush_action(__entry->flush),
-		  (unsigned long long)__entry->flags,
+		  __entry->flags,
 		  __print_flags((unsigned long)__entry->flags, "|",
 				BTRFS_GROUP_FLAGS),
-		  (unsigned long long)__entry->bytes)
+		  __entry->bytes)
 );
 
 #define show_flush_state(state)							\
@@ -1060,29 +1059,27 @@ TRACE_EVENT(btrfs_flush_space,
 
 	TP_ARGS(fs_info, flags, num_bytes, state, ret),
 
-	TP_STRUCT__entry(
-		__array(	u8,	fsid,	BTRFS_FSID_SIZE	)
+	TP_STRUCT__entry_btrfs(
 		__field(	u64,	flags			)
 		__field(	u64,	num_bytes		)
 		__field(	int,	state			)
 		__field(	int,	ret			)
 	),
 
-	TP_fast_assign(
-		memcpy(__entry->fsid, fs_info->fsid, BTRFS_FSID_SIZE);
+	TP_fast_assign_btrfs(fs_info,
 		__entry->flags		=	flags;
 		__entry->num_bytes	=	num_bytes;
 		__entry->state		=	state;
 		__entry->ret		=	ret;
 	),
 
-	TP_printk("%pU: state=%d(%s) flags=%llu(%s) num_bytes=%llu ret=%d",
-		  __entry->fsid, __entry->state,
+	TP_printk_btrfs("state=%d(%s) flags=%llu(%s) num_bytes=%llu ret=%d",
+		  __entry->state,
 		  show_flush_state(__entry->state),
-		  (unsigned long long)__entry->flags,
+		  __entry->flags,
 		  __print_flags((unsigned long)__entry->flags, "|",
 				BTRFS_GROUP_FLAGS),
-		  (unsigned long long)__entry->num_bytes, __entry->ret)
+		  __entry->num_bytes, __entry->ret)
 );
 
 DECLARE_EVENT_CLASS(btrfs__reserved_extent,
@@ -1103,8 +1100,8 @@ DECLARE_EVENT_CLASS(btrfs__reserved_extent,
 
 	TP_printk_btrfs("root=%llu(%s) start=%llu len=%llu",
 		  show_root_type(BTRFS_EXTENT_TREE_OBJECTID),
-		  (unsigned long long)__entry->start,
-		  (unsigned long long)__entry->len)
+		  __entry->start,
+		  __entry->len)
 );
 
 DEFINE_EVENT(btrfs__reserved_extent,  btrfs_reserved_extent_alloc,
@@ -1140,7 +1137,7 @@ TRACE_EVENT(find_free_extent,
 		__entry->data		= data;
 	),
 
-	TP_printk_btrfs("root=%Lu(%s) len=%Lu empty_size=%Lu flags=%Lu(%s)",
+	TP_printk_btrfs("root=%llu(%s) len=%llu empty_size=%llu flags=%llu(%s)",
 		  show_root_type(BTRFS_EXTENT_TREE_OBJECTID),
 		  __entry->num_bytes, __entry->empty_size, __entry->data,
 		  __print_flags((unsigned long)__entry->data, "|",
@@ -1149,11 +1146,10 @@ TRACE_EVENT(find_free_extent,
 
 DECLARE_EVENT_CLASS(btrfs__reserve_extent,
 
-	TP_PROTO(const struct btrfs_fs_info *fs_info,
-		 const struct btrfs_block_group_cache *block_group, u64 start,
+	TP_PROTO(const struct btrfs_block_group_cache *block_group, u64 start,
 		 u64 len),
 
-	TP_ARGS(fs_info, block_group, start, len),
+	TP_ARGS(block_group, start, len),
 
 	TP_STRUCT__entry_btrfs(
 		__field(	u64,	bg_objectid		)
@@ -1162,15 +1158,15 @@ DECLARE_EVENT_CLASS(btrfs__reserve_extent,
 		__field(	u64,	len			)
 	),
 
-	TP_fast_assign_btrfs(fs_info,
+	TP_fast_assign_btrfs(block_group->fs_info,
 		__entry->bg_objectid	= block_group->key.objectid;
 		__entry->flags		= block_group->flags;
 		__entry->start		= start;
 		__entry->len		= len;
 	),
 
-	TP_printk_btrfs("root=%Lu(%s) block_group=%Lu flags=%Lu(%s) "
-		  "start=%Lu len=%Lu",
+	TP_printk_btrfs("root=%llu(%s) block_group=%llu flags=%llu(%s) "
+		  "start=%llu len=%llu",
 		  show_root_type(BTRFS_EXTENT_TREE_OBJECTID),
 		  __entry->bg_objectid,
 		  __entry->flags, __print_flags((unsigned long)__entry->flags,
@@ -1180,20 +1176,18 @@ DECLARE_EVENT_CLASS(btrfs__reserve_extent,
 
 DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent,
 
-	TP_PROTO(const struct btrfs_fs_info *fs_info,
-		 const struct btrfs_block_group_cache *block_group, u64 start,
+	TP_PROTO(const struct btrfs_block_group_cache *block_group, u64 start,
 		 u64 len),
 
-	TP_ARGS(fs_info, block_group, start, len)
+	TP_ARGS(block_group, start, len)
 );
 
 DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent_cluster,
 
-	TP_PROTO(const struct btrfs_fs_info *fs_info,
-		 const struct btrfs_block_group_cache *block_group, u64 start,
+	TP_PROTO(const struct btrfs_block_group_cache *block_group, u64 start,
 		 u64 len),
 
-	TP_ARGS(fs_info, block_group, start, len)
+	TP_ARGS(block_group, start, len)
 );
 
 TRACE_EVENT(btrfs_find_cluster,
@@ -1221,8 +1215,8 @@ TRACE_EVENT(btrfs_find_cluster,
 		__entry->min_bytes	= min_bytes;
 	),
 
-	TP_printk_btrfs("block_group=%Lu flags=%Lu(%s) start=%Lu len=%Lu "
-		  "empty_size=%Lu min_bytes=%Lu", __entry->bg_objectid,
+	TP_printk_btrfs("block_group=%llu flags=%llu(%s) start=%llu len=%llu "
+		  "empty_size=%llu min_bytes=%llu", __entry->bg_objectid,
 		  __entry->flags,
 		  __print_flags((unsigned long)__entry->flags, "|",
 				BTRFS_GROUP_FLAGS), __entry->start,
@@ -1243,7 +1237,7 @@ TRACE_EVENT(btrfs_failed_cluster_setup,
 		__entry->bg_objectid	= block_group->key.objectid;
 	),
 
-	TP_printk_btrfs("block_group=%Lu", __entry->bg_objectid)
+	TP_printk_btrfs("block_group=%llu", __entry->bg_objectid)
 );
 
 TRACE_EVENT(btrfs_setup_cluster,
@@ -1272,8 +1266,8 @@ TRACE_EVENT(btrfs_setup_cluster,
 		__entry->bitmap		= bitmap;
 	),
 
-	TP_printk_btrfs("block_group=%Lu flags=%Lu(%s) window_start=%Lu "
-		  "size=%Lu max_size=%Lu bitmap=%d",
+	TP_printk_btrfs("block_group=%llu flags=%llu(%s) window_start=%llu "
+		  "size=%llu max_size=%llu bitmap=%d",
 		  __entry->bg_objectid,
 		  __entry->flags,
 		  __print_flags((unsigned long)__entry->flags, "|",
@@ -1476,7 +1470,7 @@ DECLARE_EVENT_CLASS(btrfs__qgroup_rsv_data,
 
 	TP_STRUCT__entry_btrfs(
 		__field(	u64,		rootid		)
-		__field(	unsigned long,	ino		)
+		__field(	u64,		ino		)
 		__field(	u64,		start		)
 		__field(	u64,		len		)
 		__field(	u64,		reserved	)
@@ -1485,14 +1479,14 @@ DECLARE_EVENT_CLASS(btrfs__qgroup_rsv_data,
 
 	TP_fast_assign_btrfs(btrfs_sb(inode->i_sb),
 		__entry->rootid		= BTRFS_I(inode)->root->objectid;
-		__entry->ino		= inode->i_ino;
+		__entry->ino		= btrfs_ino(BTRFS_I(inode));
 		__entry->start		= start;
 		__entry->len		= len;
 		__entry->reserved	= reserved;
 		__entry->op		= op;
 	),
 
-	TP_printk_btrfs("root=%llu ino=%lu start=%llu len=%llu reserved=%llu op=%s",
+	TP_printk_btrfs("root=%llu ino=%llu start=%llu len=%llu reserved=%llu op=%s",
 		  __entry->rootid, __entry->ino, __entry->start, __entry->len,
 		  __entry->reserved,
 		  __print_flags((unsigned long)__entry->op, "",
@@ -1584,12 +1578,14 @@ DEFINE_EVENT(btrfs_qgroup_extent, btrfs_qgroup_trace_extent,
 
 TRACE_EVENT(btrfs_qgroup_account_extent,
 
-	TP_PROTO(const struct btrfs_fs_info *fs_info, u64 bytenr,
+	TP_PROTO(const struct btrfs_fs_info *fs_info, u64 transid, u64 bytenr,
 		 u64 num_bytes, u64 nr_old_roots, u64 nr_new_roots),
 
-	TP_ARGS(fs_info, bytenr, num_bytes, nr_old_roots, nr_new_roots),
+	TP_ARGS(fs_info, transid, bytenr, num_bytes, nr_old_roots,
+		nr_new_roots),
 
 	TP_STRUCT__entry_btrfs(
+		__field(	u64,  transid			)
 		__field(	u64,  bytenr			)
 		__field(	u64,  num_bytes			)
 		__field(	u64,  nr_old_roots		)
@@ -1597,43 +1593,49 @@ TRACE_EVENT(btrfs_qgroup_account_extent,
 	),
 
 	TP_fast_assign_btrfs(fs_info,
+		__entry->transid	= transid;
 		__entry->bytenr		= bytenr;
 		__entry->num_bytes	= num_bytes;
 		__entry->nr_old_roots	= nr_old_roots;
 		__entry->nr_new_roots	= nr_new_roots;
 	),
 
-	TP_printk_btrfs("bytenr=%llu num_bytes=%llu nr_old_roots=%llu "
-		  "nr_new_roots=%llu",
-		  __entry->bytenr,
-		  __entry->num_bytes,
-		  __entry->nr_old_roots,
-		  __entry->nr_new_roots)
+	TP_printk_btrfs(
+"transid=%llu bytenr=%llu num_bytes=%llu nr_old_roots=%llu nr_new_roots=%llu",
+		__entry->transid,
+		__entry->bytenr,
+		__entry->num_bytes,
+		__entry->nr_old_roots,
+		__entry->nr_new_roots)
 );
 
 TRACE_EVENT(qgroup_update_counters,
 
-	TP_PROTO(const struct btrfs_fs_info *fs_info, u64 qgid,
+	TP_PROTO(const struct btrfs_fs_info *fs_info,
+		 struct btrfs_qgroup *qgroup,
 		 u64 cur_old_count, u64 cur_new_count),
 
-	TP_ARGS(fs_info, qgid, cur_old_count, cur_new_count),
+	TP_ARGS(fs_info, qgroup, cur_old_count, cur_new_count),
 
 	TP_STRUCT__entry_btrfs(
 		__field(	u64,  qgid			)
+		__field(	u64,  old_rfer			)
+		__field(	u64,  old_excl			)
 		__field(	u64,  cur_old_count		)
 		__field(	u64,  cur_new_count		)
 	),
 
 	TP_fast_assign_btrfs(fs_info,
-		__entry->qgid		= qgid;
+		__entry->qgid		= qgroup->qgroupid;
+		__entry->old_rfer	= qgroup->rfer;
+		__entry->old_excl	= qgroup->excl;
 		__entry->cur_old_count	= cur_old_count;
 		__entry->cur_new_count	= cur_new_count;
 	),
 
-	TP_printk_btrfs("qgid=%llu cur_old_count=%llu cur_new_count=%llu",
-		  __entry->qgid,
-		  __entry->cur_old_count,
-		  __entry->cur_new_count)
+	TP_printk_btrfs("qgid=%llu old_rfer=%llu old_excl=%llu cur_old_count=%llu cur_new_count=%llu",
+		  __entry->qgid, __entry->old_rfer, __entry->old_excl,
+		  __entry->cur_old_count, __entry->cur_new_count)
 );
 
 TRACE_EVENT(qgroup_update_reserve,
@@ -1765,14 +1767,14 @@ DECLARE_EVENT_CLASS(btrfs__prelim_ref,
 	),
 
 	TP_printk_btrfs("root_id=%llu key=[%llu,%u,%llu] level=%d count=[%d+%d=%d] parent=%llu wanted_disk_byte=%llu nodes=%llu",
-			(unsigned long long)__entry->root_id,
-			(unsigned long long)__entry->objectid, __entry->type,
-			(unsigned long long)__entry->offset, __entry->level,
+			__entry->root_id,
+			__entry->objectid, __entry->type,
+			__entry->offset, __entry->level,
 			__entry->old_count, __entry->mod_count,
 			__entry->old_count + __entry->mod_count,
-			(unsigned long long)__entry->parent,
-			(unsigned long long)__entry->bytenr,
-			(unsigned long long)__entry->tree_size)
+			__entry->parent,
+			__entry->bytenr,
+			__entry->tree_size)
 );
 
 DEFINE_EVENT(btrfs__prelim_ref, btrfs_prelim_ref_merge,
@@ -1808,8 +1810,51 @@ TRACE_EVENT(btrfs_inode_mod_outstanding_extents,
 
 	TP_printk_btrfs("root=%llu(%s) ino=%llu mod=%d",
 			show_root_type(__entry->root_objectid),
-			(unsigned long long)__entry->ino, __entry->mod)
+			__entry->ino, __entry->mod)
 );
+
+DECLARE_EVENT_CLASS(btrfs__block_group,
+	TP_PROTO(const struct btrfs_block_group_cache *bg_cache),
+
+	TP_ARGS(bg_cache),
+
+	TP_STRUCT__entry_btrfs(
+		__field(	u64,	bytenr		)
+		__field(	u64,	len		)
+		__field(	u64,	used		)
+		__field(	u64,	flags		)
+	),
+
+	TP_fast_assign_btrfs(bg_cache->fs_info,
+		__entry->bytenr = bg_cache->key.objectid,
+		__entry->len	= bg_cache->key.offset,
+		__entry->used	= btrfs_block_group_used(&bg_cache->item);
+		__entry->flags	= bg_cache->flags;
+	),
+
+	TP_printk_btrfs("bg bytenr=%llu len=%llu used=%llu flags=%llu(%s)",
+		__entry->bytenr, __entry->len, __entry->used, __entry->flags,
+		__print_flags(__entry->flags, "|", BTRFS_GROUP_FLAGS))
+);
+
+DEFINE_EVENT(btrfs__block_group, btrfs_remove_block_group,
+	TP_PROTO(const struct btrfs_block_group_cache *bg_cache),
+
+	TP_ARGS(bg_cache)
+);
+
+DEFINE_EVENT(btrfs__block_group, btrfs_add_unused_block_group,
+	TP_PROTO(const struct btrfs_block_group_cache *bg_cache),
+
+	TP_ARGS(bg_cache)
+);
+
+DEFINE_EVENT(btrfs__block_group, btrfs_skip_unused_block_group,
+	TP_PROTO(const struct btrfs_block_group_cache *bg_cache),
+
+	TP_ARGS(bg_cache)
+);
+
 #endif /* _TRACE_BTRFS_H */
 
 /* This part must be outside protection */
diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
index d8c3329..5936aac 100644
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@@ -84,20 +84,21 @@ TRACE_EVENT(rcu_grace_period,
 );
 
 /*
- * Tracepoint for future grace-period events, including those for no-callbacks
- * CPUs.  The caller should pull the data from the rcu_node structure,
- * other than rcuname, which comes from the rcu_state structure, and event,
- * which is one of the following:
+ * Tracepoint for future grace-period events.  The caller should pull
+ * the data from the rcu_node structure, other than rcuname, which comes
+ * from the rcu_state structure, and event, which is one of the following:
  *
- * "Startleaf": Request a nocb grace period based on leaf-node data.
+ * "Startleaf": Request a grace period based on leaf-node data.
+ * "Prestarted": Someone beat us to the request
  * "Startedleaf": Leaf-node start proved sufficient.
  * "Startedleafroot": Leaf-node start proved sufficient after checking root.
  * "Startedroot": Requested a nocb grace period based on root-node data.
+ * "NoGPkthread": The RCU grace-period kthread has not yet started.
  * "StartWait": Start waiting for the requested grace period.
  * "ResumeWait": Resume waiting after signal.
  * "EndWait": Complete wait.
  * "Cleanup": Clean up rcu_node structure after previous GP.
- * "CleanupMore": Clean up, and another no-CB GP is needed.
+ * "CleanupMore": Clean up, and another GP is needed.
  */
 TRACE_EVENT(rcu_future_grace_period,
 
diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h
index 558b902..80e2a72 100644
--- a/include/uapi/asm-generic/siginfo.h
+++ b/include/uapi/asm-generic/siginfo.h
@@ -249,7 +249,8 @@ typedef struct siginfo {
 #define TRAP_TRACE	2	/* process trace trap */
 #define TRAP_BRANCH     3	/* process taken branch trap */
 #define TRAP_HWBKPT     4	/* hardware breakpoint/watchpoint */
-#define NSIGTRAP	4
+#define TRAP_UNK	5	/* undiagnosed trap */
+#define NSIGTRAP	5
 
 /*
  * There is an additional set of SIGTRAP si_codes used by ptrace
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 8bcb186..4299067 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -732,9 +732,11 @@ __SYSCALL(__NR_pkey_alloc,    sys_pkey_alloc)
 __SYSCALL(__NR_pkey_free,     sys_pkey_free)
 #define __NR_statx 291
 __SYSCALL(__NR_statx,     sys_statx)
+#define __NR_io_pgetevents 292
+__SC_COMP(__NR_io_pgetevents, sys_io_pgetevents, compat_sys_io_pgetevents)
 
 #undef __NR_syscalls
-#define __NR_syscalls 292
+#define __NR_syscalls 293
 
 /*
  * 32 bit systems traditionally used different
diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h
index a04adbc..ed01859 100644
--- a/include/uapi/linux/aio_abi.h
+++ b/include/uapi/linux/aio_abi.h
@@ -29,6 +29,7 @@
 
 #include <linux/types.h>
 #include <linux/fs.h>
+#include <linux/signal.h>
 #include <asm/byteorder.h>
 
 typedef __kernel_ulong_t aio_context_t;
@@ -38,10 +39,8 @@ enum {
 	IOCB_CMD_PWRITE = 1,
 	IOCB_CMD_FSYNC = 2,
 	IOCB_CMD_FDSYNC = 3,
-	/* These two are experimental.
-	 * IOCB_CMD_PREADX = 4,
-	 * IOCB_CMD_POLL = 5,
-	 */
+	/* 4 was the experimental IOCB_CMD_PREADX */
+	IOCB_CMD_POLL = 5,
 	IOCB_CMD_NOOP = 6,
 	IOCB_CMD_PREADV = 7,
 	IOCB_CMD_PWRITEV = 8,
@@ -108,5 +107,10 @@ struct iocb {
 #undef IFBIG
 #undef IFLITTLE
 
+struct __aio_sigset {
+	sigset_t __user	*sigmask;
+	size_t		sigsetsize;
+};
+
 #endif /* __LINUX__AIO_ABI_H */
 
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index c8d99b9..5ca1d21 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -422,6 +422,21 @@ struct btrfs_ioctl_ino_lookup_args {
 	char name[BTRFS_INO_LOOKUP_PATH_MAX];
 };
 
+#define BTRFS_INO_LOOKUP_USER_PATH_MAX (4080 - BTRFS_VOL_NAME_MAX - 1)
+struct btrfs_ioctl_ino_lookup_user_args {
+	/* in, inode number containing the subvolume of 'subvolid' */
+	__u64 dirid;
+	/* in */
+	__u64 treeid;
+	/* out, name of the subvolume of 'treeid' */
+	char name[BTRFS_VOL_NAME_MAX + 1];
+	/*
+	 * out, constructed path from the directory with which the ioctl is
+	 * called to dirid
+	 */
+	char path[BTRFS_INO_LOOKUP_USER_PATH_MAX];
+};
+
 /* Search criteria for the btrfs SEARCH ioctl family. */
 struct btrfs_ioctl_search_key {
 	/*
@@ -725,6 +740,82 @@ struct btrfs_ioctl_send_args {
 	__u64 reserved[4];		/* in */
 };
 
+/*
+ * Information about a fs tree root.
+ *
+ * All items are filled by the ioctl
+ */
+struct btrfs_ioctl_get_subvol_info_args {
+	/* Id of this subvolume */
+	__u64 treeid;
+
+	/* Name of this subvolume, used to get the real name at mount point */
+	char name[BTRFS_VOL_NAME_MAX + 1];
+
+	/*
+	 * Id of the subvolume which contains this subvolume.
+	 * Zero for top-level subvolume or a deleted subvolume.
+	 */
+	__u64 parent_id;
+
+	/*
+	 * Inode number of the directory which contains this subvolume.
+	 * Zero for top-level subvolume or a deleted subvolume
+	 */
+	__u64 dirid;
+
+	/* Latest transaction id of this subvolume */
+	__u64 generation;
+
+	/* Flags of this subvolume */
+	__u64 flags;
+
+	/* UUID of this subvolume */
+	__u8 uuid[BTRFS_UUID_SIZE];
+
+	/*
+	 * UUID of the subvolume of which this subvolume is a snapshot.
+	 * All zero for a non-snapshot subvolume.
+	 */
+	__u8 parent_uuid[BTRFS_UUID_SIZE];
+
+	/*
+	 * UUID of the subvolume from which this subvolume was received.
+	 * All zero for non-received subvolume.
+	 */
+	__u8 received_uuid[BTRFS_UUID_SIZE];
+
+	/* Transaction id indicating when change/create/send/receive happened */
+	__u64 ctransid;
+	__u64 otransid;
+	__u64 stransid;
+	__u64 rtransid;
+	/* Time corresponding to c/o/s/rtransid */
+	struct btrfs_ioctl_timespec ctime;
+	struct btrfs_ioctl_timespec otime;
+	struct btrfs_ioctl_timespec stime;
+	struct btrfs_ioctl_timespec rtime;
+
+	/* Must be zero */
+	__u64 reserved[8];
+};
+
+#define BTRFS_MAX_ROOTREF_BUFFER_NUM 255
+struct btrfs_ioctl_get_subvol_rootref_args {
+		/* in/out, minimum id of rootref's treeid to be searched */
+		__u64 min_treeid;
+
+		/* out */
+		struct {
+			__u64 treeid;
+			__u64 dirid;
+		} rootref[BTRFS_MAX_ROOTREF_BUFFER_NUM];
+
+		/* out, number of found items */
+		__u8 num_items;
+		__u8 align[7];
+};
+
 /* Error codes as returned by the kernel */
 enum btrfs_err_code {
 	BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET = 1,
@@ -843,5 +934,11 @@ enum btrfs_err_code {
 				   struct btrfs_ioctl_vol_args_v2)
 #define BTRFS_IOC_LOGICAL_INO_V2 _IOWR(BTRFS_IOCTL_MAGIC, 59, \
 					struct btrfs_ioctl_logical_ino_args)
+#define BTRFS_IOC_GET_SUBVOL_INFO _IOR(BTRFS_IOCTL_MAGIC, 60, \
+				struct btrfs_ioctl_get_subvol_info_args)
+#define BTRFS_IOC_GET_SUBVOL_ROOTREF _IOWR(BTRFS_IOCTL_MAGIC, 61, \
+				struct btrfs_ioctl_get_subvol_rootref_args)
+#define BTRFS_IOC_INO_LOOKUP_USER _IOWR(BTRFS_IOCTL_MAGIC, 62, \
+				struct btrfs_ioctl_ino_lookup_user_args)
 
 #endif /* _UAPI_LINUX_BTRFS_H */
diff --git a/include/uapi/linux/signalfd.h b/include/uapi/linux/signalfd.h
index 6f0da42..83429a0 100644
--- a/include/uapi/linux/signalfd.h
+++ b/include/uapi/linux/signalfd.h
@@ -35,6 +35,10 @@ struct signalfd_siginfo {
 	__u64 ssi_stime;
 	__u64 ssi_addr;
 	__u16 ssi_addr_lsb;
+	__u16 __pad2;
+	__s32 ssi_syscall;
+	__u64 ssi_call_addr;
+	__u32 ssi_arch;
 
 	/*
 	 * Pad strcture to 128 bytes. Remember to update the
@@ -45,7 +49,7 @@ struct signalfd_siginfo {
 	 * comes out of a read(2) and we really don't want to have
 	 * a compat on read(2).
 	 */
-	__u8 __pad[46];
+	__u8 __pad[28];
 };
 
 
diff --git a/include/uapi/linux/types.h b/include/uapi/linux/types.h
index cd4f0b8..2fce8b6 100644
--- a/include/uapi/linux/types.h
+++ b/include/uapi/linux/types.h
@@ -49,11 +49,7 @@ typedef __u32 __bitwise __wsum;
 #define __aligned_be64 __be64 __attribute__((aligned(8)))
 #define __aligned_le64 __le64 __attribute__((aligned(8)))
 
-#ifdef __CHECK_POLL
 typedef unsigned __bitwise __poll_t;
-#else
-typedef unsigned __poll_t;
-#endif
 
 #endif /*  __ASSEMBLY__ */
 #endif /* _UAPI_LINUX_TYPES_H */
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index b928b27..0808a33 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -218,9 +218,9 @@ extern const struct proc_ns_operations cgroupns_operations;
  * cgroup-v1.c
  */
 extern struct cftype cgroup1_base_files[];
-extern const struct file_operations proc_cgroupstats_operations;
 extern struct kernfs_syscall_ops cgroup1_kf_syscall_ops;
 
+int proc_cgroupstats_show(struct seq_file *m, void *v);
 bool cgroup1_ssid_disabled(int ssid);
 void cgroup1_pidlist_destroy_all(struct cgroup *cgrp);
 void cgroup1_release_agent(struct work_struct *work);
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index a2c05d2..e06c97f 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -682,7 +682,7 @@ struct cftype cgroup1_base_files[] = {
 };
 
 /* Display information about each subsystem and each hierarchy */
-static int proc_cgroupstats_show(struct seq_file *m, void *v)
+int proc_cgroupstats_show(struct seq_file *m, void *v)
 {
 	struct cgroup_subsys *ss;
 	int i;
@@ -705,18 +705,6 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int cgroupstats_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, proc_cgroupstats_show, NULL);
-}
-
-const struct file_operations proc_cgroupstats_operations = {
-	.open = cgroupstats_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
-
 /**
  * cgroupstats_build - build and fill cgroupstats
  * @stats: cgroupstats to fill information into
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index a662bfc..1288365 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -5335,7 +5335,7 @@ int __init cgroup_init(void)
 	WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
 	WARN_ON(register_filesystem(&cgroup_fs_type));
 	WARN_ON(register_filesystem(&cgroup2_fs_type));
-	WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations));
+	WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show));
 
 	return 0;
 }
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index e2764d7..ca8ac28 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -44,23 +44,24 @@ void __delayacct_tsk_init(struct task_struct *tsk)
 {
 	tsk->delays = kmem_cache_zalloc(delayacct_cache, GFP_KERNEL);
 	if (tsk->delays)
-		spin_lock_init(&tsk->delays->lock);
+		raw_spin_lock_init(&tsk->delays->lock);
 }
 
 /*
  * Finish delay accounting for a statistic using its timestamps (@start),
  * accumalator (@total) and @count
  */
-static void delayacct_end(spinlock_t *lock, u64 *start, u64 *total, u32 *count)
+static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total,
+			  u32 *count)
 {
 	s64 ns = ktime_get_ns() - *start;
 	unsigned long flags;
 
 	if (ns > 0) {
-		spin_lock_irqsave(lock, flags);
+		raw_spin_lock_irqsave(lock, flags);
 		*total += ns;
 		(*count)++;
-		spin_unlock_irqrestore(lock, flags);
+		raw_spin_unlock_irqrestore(lock, flags);
 	}
 }
 
@@ -127,7 +128,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
 
 	/* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */
 
-	spin_lock_irqsave(&tsk->delays->lock, flags);
+	raw_spin_lock_irqsave(&tsk->delays->lock, flags);
 	tmp = d->blkio_delay_total + tsk->delays->blkio_delay;
 	d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp;
 	tmp = d->swapin_delay_total + tsk->delays->swapin_delay;
@@ -137,7 +138,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
 	d->blkio_count += tsk->delays->blkio_count;
 	d->swapin_count += tsk->delays->swapin_count;
 	d->freepages_count += tsk->delays->freepages_count;
-	spin_unlock_irqrestore(&tsk->delays->lock, flags);
+	raw_spin_unlock_irqrestore(&tsk->delays->lock, flags);
 
 	return 0;
 }
@@ -147,10 +148,10 @@ __u64 __delayacct_blkio_ticks(struct task_struct *tsk)
 	__u64 ret;
 	unsigned long flags;
 
-	spin_lock_irqsave(&tsk->delays->lock, flags);
+	raw_spin_lock_irqsave(&tsk->delays->lock, flags);
 	ret = nsec_to_clock_t(tsk->delays->blkio_delay +
 				tsk->delays->swapin_delay);
-	spin_unlock_irqrestore(&tsk->delays->lock, flags);
+	raw_spin_unlock_irqrestore(&tsk->delays->lock, flags);
 	return ret;
 }
 
diff --git a/kernel/dma.c b/kernel/dma.c
index 3506fc3..40f1529 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -135,21 +135,9 @@ static int proc_dma_show(struct seq_file *m, void *v)
 }
 #endif /* MAX_DMA_CHANNELS */
 
-static int proc_dma_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, proc_dma_show, NULL);
-}
-
-static const struct file_operations proc_dma_operations = {
-	.open		= proc_dma_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int __init proc_dma_init(void)
 {
-	proc_create("dma", 0, NULL, &proc_dma_operations);
+	proc_create_single("dma", 0, NULL, proc_dma_show);
 	return 0;
 }
 
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index a569711..33f07c5 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -27,21 +27,9 @@ static int execdomains_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int execdomains_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, execdomains_proc_show, NULL);
-}
-
-static const struct file_operations execdomains_proc_fops = {
-	.open		= execdomains_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int __init proc_execdomains_init(void)
 {
-	proc_create("execdomains", 0, NULL, &execdomains_proc_fops);
+	proc_create_single("execdomains", 0, NULL, execdomains_proc_show);
 	return 0;
 }
 module_init(proc_execdomains_init);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 7cb091d..37eda10 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -185,11 +185,6 @@ static int irq_affinity_list_proc_open(struct inode *inode, struct file *file)
 	return single_open(file, irq_affinity_list_proc_show, PDE_DATA(inode));
 }
 
-static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, irq_affinity_hint_proc_show, PDE_DATA(inode));
-}
-
 static const struct file_operations irq_affinity_proc_fops = {
 	.open		= irq_affinity_proc_open,
 	.read		= seq_read,
@@ -198,13 +193,6 @@ static const struct file_operations irq_affinity_proc_fops = {
 	.write		= irq_affinity_proc_write,
 };
 
-static const struct file_operations irq_affinity_hint_proc_fops = {
-	.open		= irq_affinity_hint_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static const struct file_operations irq_affinity_list_proc_fops = {
 	.open		= irq_affinity_list_proc_open,
 	.read		= seq_read,
@@ -223,32 +211,6 @@ static int irq_effective_aff_list_proc_show(struct seq_file *m, void *v)
 {
 	return show_irq_affinity(EFFECTIVE_LIST, m);
 }
-
-static int irq_effective_aff_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, irq_effective_aff_proc_show, PDE_DATA(inode));
-}
-
-static int irq_effective_aff_list_proc_open(struct inode *inode,
-					    struct file *file)
-{
-	return single_open(file, irq_effective_aff_list_proc_show,
-			   PDE_DATA(inode));
-}
-
-static const struct file_operations irq_effective_aff_proc_fops = {
-	.open		= irq_effective_aff_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
-static const struct file_operations irq_effective_aff_list_proc_fops = {
-	.open		= irq_effective_aff_list_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
 #endif
 
 static int default_affinity_show(struct seq_file *m, void *v)
@@ -313,18 +275,6 @@ static int irq_node_proc_show(struct seq_file *m, void *v)
 	seq_printf(m, "%d\n", irq_desc_get_node(desc));
 	return 0;
 }
-
-static int irq_node_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, irq_node_proc_show, PDE_DATA(inode));
-}
-
-static const struct file_operations irq_node_proc_fops = {
-	.open		= irq_node_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
 #endif
 
 static int irq_spurious_proc_show(struct seq_file *m, void *v)
@@ -337,18 +287,6 @@ static int irq_spurious_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int irq_spurious_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, irq_spurious_proc_show, PDE_DATA(inode));
-}
-
-static const struct file_operations irq_spurious_proc_fops = {
-	.open		= irq_spurious_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 #define MAX_NAMELEN 128
 
 static int name_unique(unsigned int irq, struct irqaction *new_action)
@@ -421,24 +359,24 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
 			 &irq_affinity_proc_fops, irqp);
 
 	/* create /proc/irq/<irq>/affinity_hint */
-	proc_create_data("affinity_hint", 0444, desc->dir,
-			 &irq_affinity_hint_proc_fops, irqp);
+	proc_create_single_data("affinity_hint", 0444, desc->dir,
+			irq_affinity_hint_proc_show, irqp);
 
 	/* create /proc/irq/<irq>/smp_affinity_list */
 	proc_create_data("smp_affinity_list", 0644, desc->dir,
 			 &irq_affinity_list_proc_fops, irqp);
 
-	proc_create_data("node", 0444, desc->dir,
-			 &irq_node_proc_fops, irqp);
+	proc_create_single_data("node", 0444, desc->dir, irq_node_proc_show,
+			irqp);
 # ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
-	proc_create_data("effective_affinity", 0444, desc->dir,
-			 &irq_effective_aff_proc_fops, irqp);
-	proc_create_data("effective_affinity_list", 0444, desc->dir,
-			 &irq_effective_aff_list_proc_fops, irqp);
+	proc_create_single_data("effective_affinity", 0444, desc->dir,
+			irq_effective_aff_proc_show, irqp);
+	proc_create_single_data("effective_affinity_list", 0444, desc->dir,
+			irq_effective_aff_list_proc_show, irqp);
 # endif
 #endif
-	proc_create_data("spurious", 0444, desc->dir,
-			 &irq_spurious_proc_fops, (void *)(long)irq);
+	proc_create_single_data("spurious", 0444, desc->dir,
+			irq_spurious_proc_show, (void *)(long)irq);
 
 out_unlock:
 	mutex_unlock(&register_lock);
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 0233863..edcac5d 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -561,20 +561,24 @@ static void print_lock(struct held_lock *hlock)
 	printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip);
 }
 
-static void lockdep_print_held_locks(struct task_struct *curr)
+static void lockdep_print_held_locks(struct task_struct *p)
 {
-	int i, depth = curr->lockdep_depth;
+	int i, depth = READ_ONCE(p->lockdep_depth);
 
-	if (!depth) {
-		printk("no locks held by %s/%d.\n", curr->comm, task_pid_nr(curr));
+	if (!depth)
+		printk("no locks held by %s/%d.\n", p->comm, task_pid_nr(p));
+	else
+		printk("%d lock%s held by %s/%d:\n", depth,
+		       depth > 1 ? "s" : "", p->comm, task_pid_nr(p));
+	/*
+	 * It's not reliable to print a task's held locks if it's not sleeping
+	 * and it's not the current task.
+	 */
+	if (p->state == TASK_RUNNING && p != current)
 		return;
-	}
-	printk("%d lock%s held by %s/%d:\n",
-		depth, depth > 1 ? "s" : "", curr->comm, task_pid_nr(curr));
-
 	for (i = 0; i < depth; i++) {
 		printk(" #%d: ", i);
-		print_lock(curr->held_locks + i);
+		print_lock(p->held_locks + i);
 	}
 }
 
@@ -4451,8 +4455,6 @@ EXPORT_SYMBOL_GPL(debug_check_no_locks_held);
 void debug_show_all_locks(void)
 {
 	struct task_struct *g, *p;
-	int count = 10;
-	int unlock = 1;
 
 	if (unlikely(!debug_locks)) {
 		pr_warn("INFO: lockdep is turned off.\n");
@@ -4460,50 +4462,18 @@ void debug_show_all_locks(void)
 	}
 	pr_warn("\nShowing all locks held in the system:\n");
 
-	/*
-	 * Here we try to get the tasklist_lock as hard as possible,
-	 * if not successful after 2 seconds we ignore it (but keep
-	 * trying). This is to enable a debug printout even if a
-	 * tasklist_lock-holding task deadlocks or crashes.
-	 */
-retry:
-	if (!read_trylock(&tasklist_lock)) {
-		if (count == 10)
-			pr_warn("hm, tasklist_lock locked, retrying... ");
-		if (count) {
-			count--;
-			pr_cont(" #%d", 10-count);
-			mdelay(200);
-			goto retry;
-		}
-		pr_cont(" ignoring it.\n");
-		unlock = 0;
-	} else {
-		if (count != 10)
-			pr_cont(" locked it.\n");
-	}
-
-	do_each_thread(g, p) {
-		/*
-		 * It's not reliable to print a task's held locks
-		 * if it's not sleeping (or if it's not the current
-		 * task):
-		 */
-		if (p->state == TASK_RUNNING && p != current)
+	rcu_read_lock();
+	for_each_process_thread(g, p) {
+		if (!p->lockdep_depth)
 			continue;
-		if (p->lockdep_depth)
-			lockdep_print_held_locks(p);
-		if (!unlock)
-			if (read_trylock(&tasklist_lock))
-				unlock = 1;
+		lockdep_print_held_locks(p);
 		touch_nmi_watchdog();
-	} while_each_thread(g, p);
+		touch_all_softlockup_watchdogs();
+	}
+	rcu_read_unlock();
 
 	pr_warn("\n");
 	pr_warn("=============================================\n\n");
-
-	if (unlock)
-		read_unlock(&tasklist_lock);
 }
 EXPORT_SYMBOL_GPL(debug_show_all_locks);
 #endif
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index ad69bbc..3dd980d 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -101,18 +101,6 @@ static const struct seq_operations lockdep_ops = {
 	.show	= l_show,
 };
 
-static int lockdep_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &lockdep_ops);
-}
-
-static const struct file_operations proc_lockdep_operations = {
-	.open		= lockdep_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
 #ifdef CONFIG_PROVE_LOCKING
 static void *lc_start(struct seq_file *m, loff_t *pos)
 {
@@ -170,18 +158,6 @@ static const struct seq_operations lockdep_chains_ops = {
 	.stop	= lc_stop,
 	.show	= lc_show,
 };
-
-static int lockdep_chains_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &lockdep_chains_ops);
-}
-
-static const struct file_operations proc_lockdep_chains_operations = {
-	.open		= lockdep_chains_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
 #endif /* CONFIG_PROVE_LOCKING */
 
 static void lockdep_stats_debug_show(struct seq_file *m)
@@ -355,18 +331,6 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int lockdep_stats_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, lockdep_stats_show, NULL);
-}
-
-static const struct file_operations proc_lockdep_stats_operations = {
-	.open		= lockdep_stats_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 #ifdef CONFIG_LOCK_STAT
 
 struct lock_stat_data {
@@ -682,14 +646,11 @@ static const struct file_operations proc_lock_stat_operations = {
 
 static int __init lockdep_proc_init(void)
 {
-	proc_create("lockdep", S_IRUSR, NULL, &proc_lockdep_operations);
+	proc_create_seq("lockdep", S_IRUSR, NULL, &lockdep_ops);
 #ifdef CONFIG_PROVE_LOCKING
-	proc_create("lockdep_chains", S_IRUSR, NULL,
-		    &proc_lockdep_chains_operations);
+	proc_create_seq("lockdep_chains", S_IRUSR, NULL, &lockdep_chains_ops);
 #endif
-	proc_create("lockdep_stats", S_IRUSR, NULL,
-		    &proc_lockdep_stats_operations);
-
+	proc_create_single("lockdep_stats", S_IRUSR, NULL, lockdep_stats_show);
 #ifdef CONFIG_LOCK_STAT
 	proc_create("lock_stat", S_IRUSR | S_IWUSR, NULL,
 		    &proc_lock_stat_operations);
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index f046b7c..5e10153 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -23,13 +23,15 @@ struct mcs_spinlock {
 
 #ifndef arch_mcs_spin_lock_contended
 /*
- * Using smp_load_acquire() provides a memory barrier that ensures
- * subsequent operations happen after the lock is acquired.
+ * Using smp_cond_load_acquire() provides the acquire semantics
+ * required so that subsequent operations happen after the
+ * lock is acquired. Additionally, some architectures such as
+ * ARM64 would like to do spin-waiting instead of purely
+ * spinning, and smp_cond_load_acquire() provides that behavior.
  */
 #define arch_mcs_spin_lock_contended(l)					\
 do {									\
-	while (!(smp_load_acquire(l)))					\
-		cpu_relax();						\
+	smp_cond_load_acquire(l, VAL);					\
 } while (0)
 #endif
 
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 2048359..f44f658 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -139,8 +139,9 @@ static inline bool __mutex_trylock(struct mutex *lock)
 static __always_inline bool __mutex_trylock_fast(struct mutex *lock)
 {
 	unsigned long curr = (unsigned long)current;
+	unsigned long zero = 0UL;
 
-	if (!atomic_long_cmpxchg_acquire(&lock->owner, 0UL, curr))
+	if (atomic_long_try_cmpxchg_acquire(&lock->owner, &zero, curr))
 		return true;
 
 	return false;
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index d880296..bfaeb05 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -12,11 +12,11 @@
  * GNU General Public License for more details.
  *
  * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
- * (C) Copyright 2013-2014 Red Hat, Inc.
+ * (C) Copyright 2013-2014,2018 Red Hat, Inc.
  * (C) Copyright 2015 Intel Corp.
  * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP
  *
- * Authors: Waiman Long <waiman.long@hpe.com>
+ * Authors: Waiman Long <longman@redhat.com>
  *          Peter Zijlstra <peterz@infradead.org>
  */
 
@@ -33,6 +33,11 @@
 #include <asm/qspinlock.h>
 
 /*
+ * Include queued spinlock statistics code
+ */
+#include "qspinlock_stat.h"
+
+/*
  * The basic principle of a queue-based spinlock can best be understood
  * by studying a classic queue-based spinlock implementation called the
  * MCS lock. The paper below provides a good description for this kind
@@ -77,6 +82,18 @@
 #endif
 
 /*
+ * The pending bit spinning loop count.
+ * This heuristic is used to limit the number of lockword accesses
+ * made by atomic_cond_read_relaxed when waiting for the lock to
+ * transition out of the "== _Q_PENDING_VAL" state. We don't spin
+ * indefinitely because there's no guarantee that we'll make forward
+ * progress.
+ */
+#ifndef _Q_PENDING_LOOPS
+#define _Q_PENDING_LOOPS	1
+#endif
+
+/*
  * Per-CPU queue node structures; we can never have more than 4 nested
  * contexts: task, softirq, hardirq, nmi.
  *
@@ -114,41 +131,18 @@ static inline __pure struct mcs_spinlock *decode_tail(u32 tail)
 
 #define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)
 
-/*
- * By using the whole 2nd least significant byte for the pending bit, we
- * can allow better optimization of the lock acquisition for the pending
- * bit holder.
+#if _Q_PENDING_BITS == 8
+/**
+ * clear_pending - clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
  *
- * This internal structure is also used by the set_locked function which
- * is not restricted to _Q_PENDING_BITS == 8.
+ * *,1,* -> *,0,*
  */
-struct __qspinlock {
-	union {
-		atomic_t val;
-#ifdef __LITTLE_ENDIAN
-		struct {
-			u8	locked;
-			u8	pending;
-		};
-		struct {
-			u16	locked_pending;
-			u16	tail;
-		};
-#else
-		struct {
-			u16	tail;
-			u16	locked_pending;
-		};
-		struct {
-			u8	reserved[2];
-			u8	pending;
-			u8	locked;
-		};
-#endif
-	};
-};
+static __always_inline void clear_pending(struct qspinlock *lock)
+{
+	WRITE_ONCE(lock->pending, 0);
+}
 
-#if _Q_PENDING_BITS == 8
 /**
  * clear_pending_set_locked - take ownership and clear the pending bit.
  * @lock: Pointer to queued spinlock structure
@@ -159,9 +153,7 @@ struct __qspinlock {
  */
 static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
 {
-	struct __qspinlock *l = (void *)lock;
-
-	WRITE_ONCE(l->locked_pending, _Q_LOCKED_VAL);
+	WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL);
 }
 
 /*
@@ -176,19 +168,28 @@ static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
  */
 static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
 {
-	struct __qspinlock *l = (void *)lock;
-
 	/*
-	 * Use release semantics to make sure that the MCS node is properly
-	 * initialized before changing the tail code.
+	 * We can use relaxed semantics since the caller ensures that the
+	 * MCS node is properly initialized before updating the tail.
 	 */
-	return (u32)xchg_release(&l->tail,
+	return (u32)xchg_relaxed(&lock->tail,
 				 tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
 }
 
 #else /* _Q_PENDING_BITS == 8 */
 
 /**
+ * clear_pending - clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,1,* -> *,0,*
+ */
+static __always_inline void clear_pending(struct qspinlock *lock)
+{
+	atomic_andnot(_Q_PENDING_VAL, &lock->val);
+}
+
+/**
  * clear_pending_set_locked - take ownership and clear the pending bit.
  * @lock: Pointer to queued spinlock structure
  *
@@ -216,10 +217,11 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
 	for (;;) {
 		new = (val & _Q_LOCKED_PENDING_MASK) | tail;
 		/*
-		 * Use release semantics to make sure that the MCS node is
-		 * properly initialized before changing the tail code.
+		 * We can use relaxed semantics since the caller ensures that
+		 * the MCS node is properly initialized before updating the
+		 * tail.
 		 */
-		old = atomic_cmpxchg_release(&lock->val, val, new);
+		old = atomic_cmpxchg_relaxed(&lock->val, val, new);
 		if (old == val)
 			break;
 
@@ -237,9 +239,7 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
  */
 static __always_inline void set_locked(struct qspinlock *lock)
 {
-	struct __qspinlock *l = (void *)lock;
-
-	WRITE_ONCE(l->locked, _Q_LOCKED_VAL);
+	WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
 }
 
 
@@ -294,86 +294,83 @@ static __always_inline u32  __pv_wait_head_or_lock(struct qspinlock *lock,
 void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
 {
 	struct mcs_spinlock *prev, *next, *node;
-	u32 new, old, tail;
+	u32 old, tail;
 	int idx;
 
 	BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
 
 	if (pv_enabled())
-		goto queue;
+		goto pv_queue;
 
 	if (virt_spin_lock(lock))
 		return;
 
 	/*
-	 * wait for in-progress pending->locked hand-overs
+	 * Wait for in-progress pending->locked hand-overs with a bounded
+	 * number of spins so that we guarantee forward progress.
 	 *
 	 * 0,1,0 -> 0,0,1
 	 */
 	if (val == _Q_PENDING_VAL) {
-		while ((val = atomic_read(&lock->val)) == _Q_PENDING_VAL)
-			cpu_relax();
+		int cnt = _Q_PENDING_LOOPS;
+		val = atomic_cond_read_relaxed(&lock->val,
+					       (VAL != _Q_PENDING_VAL) || !cnt--);
 	}
 
 	/*
+	 * If we observe any contention; queue.
+	 */
+	if (val & ~_Q_LOCKED_MASK)
+		goto queue;
+
+	/*
 	 * trylock || pending
 	 *
 	 * 0,0,0 -> 0,0,1 ; trylock
 	 * 0,0,1 -> 0,1,1 ; pending
 	 */
-	for (;;) {
+	val = atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val);
+	if (!(val & ~_Q_LOCKED_MASK)) {
 		/*
-		 * If we observe any contention; queue.
+		 * We're pending, wait for the owner to go away.
+		 *
+		 * *,1,1 -> *,1,0
+		 *
+		 * this wait loop must be a load-acquire such that we match the
+		 * store-release that clears the locked bit and create lock
+		 * sequentiality; this is because not all
+		 * clear_pending_set_locked() implementations imply full
+		 * barriers.
 		 */
-		if (val & ~_Q_LOCKED_MASK)
-			goto queue;
-
-		new = _Q_LOCKED_VAL;
-		if (val == new)
-			new |= _Q_PENDING_VAL;
+		if (val & _Q_LOCKED_MASK) {
+			atomic_cond_read_acquire(&lock->val,
+						 !(VAL & _Q_LOCKED_MASK));
+		}
 
 		/*
-		 * Acquire semantic is required here as the function may
-		 * return immediately if the lock was free.
+		 * take ownership and clear the pending bit.
+		 *
+		 * *,1,0 -> *,0,1
 		 */
-		old = atomic_cmpxchg_acquire(&lock->val, val, new);
-		if (old == val)
-			break;
-
-		val = old;
-	}
-
-	/*
-	 * we won the trylock
-	 */
-	if (new == _Q_LOCKED_VAL)
+		clear_pending_set_locked(lock);
+		qstat_inc(qstat_lock_pending, true);
 		return;
+	}
 
 	/*
-	 * we're pending, wait for the owner to go away.
-	 *
-	 * *,1,1 -> *,1,0
-	 *
-	 * this wait loop must be a load-acquire such that we match the
-	 * store-release that clears the locked bit and create lock
-	 * sequentiality; this is because not all clear_pending_set_locked()
-	 * implementations imply full barriers.
-	 */
-	smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_MASK));
-
-	/*
-	 * take ownership and clear the pending bit.
-	 *
-	 * *,1,0 -> *,0,1
+	 * If pending was clear but there are waiters in the queue, then
+	 * we need to undo our setting of pending before we queue ourselves.
 	 */
-	clear_pending_set_locked(lock);
-	return;
+	if (!(val & _Q_PENDING_MASK))
+		clear_pending(lock);
 
 	/*
 	 * End of pending bit optimistic spinning and beginning of MCS
 	 * queuing.
 	 */
 queue:
+	qstat_inc(qstat_lock_slowpath, true);
+pv_queue:
 	node = this_cpu_ptr(&mcs_nodes[0]);
 	idx = node->count++;
 	tail = encode_tail(smp_processor_id(), idx);
@@ -400,12 +397,18 @@ queue:
 		goto release;
 
 	/*
+	 * Ensure that the initialisation of @node is complete before we
+	 * publish the updated tail via xchg_tail() and potentially link
+	 * @node into the waitqueue via WRITE_ONCE(prev->next, node) below.
+	 */
+	smp_wmb();
+
+	/*
+	 * Publish the updated tail.
 	 * We have already touched the queueing cacheline; don't bother with
 	 * pending stuff.
 	 *
 	 * p,*,* -> n,*,*
-	 *
-	 * RELEASE, such that the stores to @node must be complete.
 	 */
 	old = xchg_tail(lock, tail);
 	next = NULL;
@@ -417,14 +420,8 @@ queue:
 	if (old & _Q_TAIL_MASK) {
 		prev = decode_tail(old);
 
-		/*
-		 * We must ensure that the stores to @node are observed before
-		 * the write to prev->next. The address dependency from
-		 * xchg_tail is not sufficient to ensure this because the read
-		 * component of xchg_tail is unordered with respect to the
-		 * initialisation of @node.
-		 */
-		smp_store_release(&prev->next, node);
+		/* Link @node into the waitqueue. */
+		WRITE_ONCE(prev->next, node);
 
 		pv_wait_node(node, prev);
 		arch_mcs_spin_lock_contended(&node->locked);
@@ -453,8 +450,8 @@ queue:
 	 *
 	 * The PV pv_wait_head_or_lock function, if active, will acquire
 	 * the lock and return a non-zero value. So we have to skip the
-	 * smp_cond_load_acquire() call. As the next PV queue head hasn't been
-	 * designated yet, there is no way for the locked value to become
+	 * atomic_cond_read_acquire() call. As the next PV queue head hasn't
+	 * been designated yet, there is no way for the locked value to become
 	 * _Q_SLOW_VAL. So both the set_locked() and the
 	 * atomic_cmpxchg_relaxed() calls will be safe.
 	 *
@@ -464,44 +461,38 @@ queue:
 	if ((val = pv_wait_head_or_lock(lock, node)))
 		goto locked;
 
-	val = smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_PENDING_MASK));
+	val = atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK));
 
 locked:
 	/*
 	 * claim the lock:
 	 *
 	 * n,0,0 -> 0,0,1 : lock, uncontended
-	 * *,0,0 -> *,0,1 : lock, contended
+	 * *,*,0 -> *,*,1 : lock, contended
 	 *
-	 * If the queue head is the only one in the queue (lock value == tail),
-	 * clear the tail code and grab the lock. Otherwise, we only need
-	 * to grab the lock.
+	 * If the queue head is the only one in the queue (lock value == tail)
+	 * and nobody is pending, clear the tail code and grab the lock.
+	 * Otherwise, we only need to grab the lock.
 	 */
-	for (;;) {
-		/* In the PV case we might already have _Q_LOCKED_VAL set */
-		if ((val & _Q_TAIL_MASK) != tail) {
-			set_locked(lock);
-			break;
-		}
-		/*
-		 * The smp_cond_load_acquire() call above has provided the
-		 * necessary acquire semantics required for locking. At most
-		 * two iterations of this loop may be ran.
-		 */
-		old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL);
-		if (old == val)
-			goto release;	/* No contention */
 
-		val = old;
-	}
+	/*
+	 * In the PV case we might already have _Q_LOCKED_VAL set.
+	 *
+	 * The atomic_cond_read_acquire() call above has provided the
+	 * necessary acquire semantics required for locking.
+	 */
+	if (((val & _Q_TAIL_MASK) == tail) &&
+	    atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
+		goto release; /* No contention */
+
+	/* Either somebody is queued behind us or _Q_PENDING_VAL is set */
+	set_locked(lock);
 
 	/*
 	 * contended path; wait for next if not observed yet, release.
 	 */
-	if (!next) {
-		while (!(next = READ_ONCE(node->next)))
-			cpu_relax();
-	}
+	if (!next)
+		next = smp_cond_load_relaxed(&node->next, (VAL));
 
 	arch_mcs_spin_unlock_contended(&next->locked);
 	pv_kick_node(lock, next);
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 6ee4777..5a0cf5f 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -56,11 +56,6 @@ struct pv_node {
 };
 
 /*
- * Include queued spinlock statistics code
- */
-#include "qspinlock_stat.h"
-
-/*
  * Hybrid PV queued/unfair lock
  *
  * By replacing the regular queued_spin_trylock() with the function below,
@@ -87,8 +82,6 @@ struct pv_node {
 #define queued_spin_trylock(l)	pv_hybrid_queued_unfair_trylock(l)
 static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock)
 {
-	struct __qspinlock *l = (void *)lock;
-
 	/*
 	 * Stay in unfair lock mode as long as queued mode waiters are
 	 * present in the MCS wait queue but the pending bit isn't set.
@@ -97,7 +90,7 @@ static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock)
 		int val = atomic_read(&lock->val);
 
 		if (!(val & _Q_LOCKED_PENDING_MASK) &&
-		   (cmpxchg_acquire(&l->locked, 0, _Q_LOCKED_VAL) == 0)) {
+		   (cmpxchg_acquire(&lock->locked, 0, _Q_LOCKED_VAL) == 0)) {
 			qstat_inc(qstat_pv_lock_stealing, true);
 			return true;
 		}
@@ -117,16 +110,7 @@ static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock)
 #if _Q_PENDING_BITS == 8
 static __always_inline void set_pending(struct qspinlock *lock)
 {
-	struct __qspinlock *l = (void *)lock;
-
-	WRITE_ONCE(l->pending, 1);
-}
-
-static __always_inline void clear_pending(struct qspinlock *lock)
-{
-	struct __qspinlock *l = (void *)lock;
-
-	WRITE_ONCE(l->pending, 0);
+	WRITE_ONCE(lock->pending, 1);
 }
 
 /*
@@ -136,10 +120,8 @@ static __always_inline void clear_pending(struct qspinlock *lock)
  */
 static __always_inline int trylock_clear_pending(struct qspinlock *lock)
 {
-	struct __qspinlock *l = (void *)lock;
-
-	return !READ_ONCE(l->locked) &&
-	       (cmpxchg_acquire(&l->locked_pending, _Q_PENDING_VAL,
+	return !READ_ONCE(lock->locked) &&
+	       (cmpxchg_acquire(&lock->locked_pending, _Q_PENDING_VAL,
 				_Q_LOCKED_VAL) == _Q_PENDING_VAL);
 }
 #else /* _Q_PENDING_BITS == 8 */
@@ -148,11 +130,6 @@ static __always_inline void set_pending(struct qspinlock *lock)
 	atomic_or(_Q_PENDING_VAL, &lock->val);
 }
 
-static __always_inline void clear_pending(struct qspinlock *lock)
-{
-	atomic_andnot(_Q_PENDING_VAL, &lock->val);
-}
-
 static __always_inline int trylock_clear_pending(struct qspinlock *lock)
 {
 	int val = atomic_read(&lock->val);
@@ -384,7 +361,6 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
 static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
 {
 	struct pv_node *pn = (struct pv_node *)node;
-	struct __qspinlock *l = (void *)lock;
 
 	/*
 	 * If the vCPU is indeed halted, advance its state to match that of
@@ -413,7 +389,7 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
 	 * the hash table later on at unlock time, no atomic instruction is
 	 * needed.
 	 */
-	WRITE_ONCE(l->locked, _Q_SLOW_VAL);
+	WRITE_ONCE(lock->locked, _Q_SLOW_VAL);
 	(void)pv_hash(lock, pn);
 }
 
@@ -428,7 +404,6 @@ static u32
 pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
 {
 	struct pv_node *pn = (struct pv_node *)node;
-	struct __qspinlock *l = (void *)lock;
 	struct qspinlock **lp = NULL;
 	int waitcnt = 0;
 	int loop;
@@ -443,7 +418,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
 	/*
 	 * Tracking # of slowpath locking operations
 	 */
-	qstat_inc(qstat_pv_lock_slowpath, true);
+	qstat_inc(qstat_lock_slowpath, true);
 
 	for (;; waitcnt++) {
 		/*
@@ -479,13 +454,13 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
 			 *
 			 * Matches the smp_rmb() in __pv_queued_spin_unlock().
 			 */
-			if (xchg(&l->locked, _Q_SLOW_VAL) == 0) {
+			if (xchg(&lock->locked, _Q_SLOW_VAL) == 0) {
 				/*
 				 * The lock was free and now we own the lock.
 				 * Change the lock value back to _Q_LOCKED_VAL
 				 * and unhash the table.
 				 */
-				WRITE_ONCE(l->locked, _Q_LOCKED_VAL);
+				WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
 				WRITE_ONCE(*lp, NULL);
 				goto gotlock;
 			}
@@ -493,7 +468,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
 		WRITE_ONCE(pn->state, vcpu_hashed);
 		qstat_inc(qstat_pv_wait_head, true);
 		qstat_inc(qstat_pv_wait_again, waitcnt);
-		pv_wait(&l->locked, _Q_SLOW_VAL);
+		pv_wait(&lock->locked, _Q_SLOW_VAL);
 
 		/*
 		 * Because of lock stealing, the queue head vCPU may not be
@@ -518,7 +493,6 @@ gotlock:
 __visible void
 __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
 {
-	struct __qspinlock *l = (void *)lock;
 	struct pv_node *node;
 
 	if (unlikely(locked != _Q_SLOW_VAL)) {
@@ -547,7 +521,7 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
 	 * Now that we have a reference to the (likely) blocked pv_node,
 	 * release the lock.
 	 */
-	smp_store_release(&l->locked, 0);
+	smp_store_release(&lock->locked, 0);
 
 	/*
 	 * At this point the memory pointed at by lock can be freed/reused,
@@ -573,7 +547,6 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
 #ifndef __pv_queued_spin_unlock
 __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
 {
-	struct __qspinlock *l = (void *)lock;
 	u8 locked;
 
 	/*
@@ -581,7 +554,7 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
 	 * unhash. Otherwise it would be possible to have multiple @lock
 	 * entries, which would be BAD.
 	 */
-	locked = cmpxchg_release(&l->locked, _Q_LOCKED_VAL, 0);
+	locked = cmpxchg_release(&lock->locked, _Q_LOCKED_VAL, 0);
 	if (likely(locked == _Q_LOCKED_VAL))
 		return;
 
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
index 4a30ef6..6bd78c0 100644
--- a/kernel/locking/qspinlock_stat.h
+++ b/kernel/locking/qspinlock_stat.h
@@ -22,13 +22,14 @@
  *   pv_kick_wake	- # of vCPU kicks used for computing pv_latency_wake
  *   pv_latency_kick	- average latency (ns) of vCPU kick operation
  *   pv_latency_wake	- average latency (ns) from vCPU kick to wakeup
- *   pv_lock_slowpath	- # of locking operations via the slowpath
  *   pv_lock_stealing	- # of lock stealing operations
  *   pv_spurious_wakeup	- # of spurious wakeups in non-head vCPUs
  *   pv_wait_again	- # of wait's after a queue head vCPU kick
  *   pv_wait_early	- # of early vCPU wait's
  *   pv_wait_head	- # of vCPU wait's at the queue head
  *   pv_wait_node	- # of vCPU wait's at a non-head queue node
+ *   lock_pending	- # of locking operations via pending code
+ *   lock_slowpath	- # of locking operations via MCS lock queue
  *
  * Writing to the "reset_counters" file will reset all the above counter
  * values.
@@ -46,13 +47,14 @@ enum qlock_stats {
 	qstat_pv_kick_wake,
 	qstat_pv_latency_kick,
 	qstat_pv_latency_wake,
-	qstat_pv_lock_slowpath,
 	qstat_pv_lock_stealing,
 	qstat_pv_spurious_wakeup,
 	qstat_pv_wait_again,
 	qstat_pv_wait_early,
 	qstat_pv_wait_head,
 	qstat_pv_wait_node,
+	qstat_lock_pending,
+	qstat_lock_slowpath,
 	qstat_num,	/* Total number of statistical counters */
 	qstat_reset_cnts = qstat_num,
 };
@@ -73,12 +75,13 @@ static const char * const qstat_names[qstat_num + 1] = {
 	[qstat_pv_spurious_wakeup] = "pv_spurious_wakeup",
 	[qstat_pv_latency_kick]	   = "pv_latency_kick",
 	[qstat_pv_latency_wake]    = "pv_latency_wake",
-	[qstat_pv_lock_slowpath]   = "pv_lock_slowpath",
 	[qstat_pv_lock_stealing]   = "pv_lock_stealing",
 	[qstat_pv_wait_again]      = "pv_wait_again",
 	[qstat_pv_wait_early]      = "pv_wait_early",
 	[qstat_pv_wait_head]       = "pv_wait_head",
 	[qstat_pv_wait_node]       = "pv_wait_node",
+	[qstat_lock_pending]       = "lock_pending",
+	[qstat_lock_slowpath]      = "lock_slowpath",
 	[qstat_reset_cnts]         = "reset_counters",
 };
 
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index a903367..3064c50 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -347,6 +347,15 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
 	}
 }
 
+static inline bool owner_on_cpu(struct task_struct *owner)
+{
+	/*
+	 * As lock holder preemption issue, we both skip spinning if
+	 * task is not on cpu or its cpu is preempted
+	 */
+	return owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
+}
+
 static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
 {
 	struct task_struct *owner;
@@ -359,17 +368,10 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
 
 	rcu_read_lock();
 	owner = READ_ONCE(sem->owner);
-	if (!owner || !is_rwsem_owner_spinnable(owner)) {
-		ret = !owner;	/* !owner is spinnable */
-		goto done;
+	if (owner) {
+		ret = is_rwsem_owner_spinnable(owner) &&
+		      owner_on_cpu(owner);
 	}
-
-	/*
-	 * As lock holder preemption issue, we both skip spinning if task is not
-	 * on cpu or its cpu is preempted
-	 */
-	ret = owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
-done:
 	rcu_read_unlock();
 	return ret;
 }
@@ -398,8 +400,7 @@ static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem)
 		 * abort spinning when need_resched or owner is not running or
 		 * owner's cpu is preempted.
 		 */
-		if (!owner->on_cpu || need_resched() ||
-				vcpu_is_preempted(task_cpu(owner))) {
+		if (need_resched() || !owner_on_cpu(owner)) {
 			rcu_read_unlock();
 			return false;
 		}
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 11b4282..1efcb5b 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -269,7 +269,7 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr,
 	struct bio *bio;
 	int error = 0;
 
-	bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1);
+	bio = bio_alloc(GFP_NOIO | __GFP_HIGH, 1);
 	bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9);
 	bio_set_dev(bio, hib_resume_bdev);
 	bio_set_op_attrs(bio, op, op_flags);
@@ -376,7 +376,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb)
 		return -ENOSPC;
 
 	if (hb) {
-		src = (void *)__get_free_page(__GFP_RECLAIM | __GFP_NOWARN |
+		src = (void *)__get_free_page(GFP_NOIO | __GFP_NOWARN |
 		                              __GFP_NORETRY);
 		if (src) {
 			copy_page(src, buf);
@@ -384,7 +384,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb)
 			ret = hib_wait_io(hb); /* Free pages */
 			if (ret)
 				return ret;
-			src = (void *)__get_free_page(__GFP_RECLAIM |
+			src = (void *)__get_free_page(GFP_NOIO |
 			                              __GFP_NOWARN |
 			                              __GFP_NORETRY);
 			if (src) {
@@ -691,7 +691,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
 	nr_threads = num_online_cpus() - 1;
 	nr_threads = clamp_val(nr_threads, 1, LZO_THREADS);
 
-	page = (void *)__get_free_page(__GFP_RECLAIM | __GFP_HIGH);
+	page = (void *)__get_free_page(GFP_NOIO | __GFP_HIGH);
 	if (!page) {
 		pr_err("Failed to allocate LZO page\n");
 		ret = -ENOMEM;
@@ -989,7 +989,7 @@ static int get_swap_reader(struct swap_map_handle *handle,
 		last = tmp;
 
 		tmp->map = (struct swap_map_page *)
-			   __get_free_page(__GFP_RECLAIM | __GFP_HIGH);
+			   __get_free_page(GFP_NOIO | __GFP_HIGH);
 		if (!tmp->map) {
 			release_swap_reader(handle);
 			return -ENOMEM;
@@ -1261,8 +1261,8 @@ static int load_image_lzo(struct swap_map_handle *handle,
 
 	for (i = 0; i < read_pages; i++) {
 		page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ?
-						  __GFP_RECLAIM | __GFP_HIGH :
-						  __GFP_RECLAIM | __GFP_NOWARN |
+						  GFP_NOIO | __GFP_HIGH :
+						  GFP_NOIO | __GFP_NOWARN |
 						  __GFP_NORETRY);
 
 		if (!page[i]) {
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 7a693e3..40cea67 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -270,6 +270,12 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt)
 	}
 }
 
+/* Returns first leaf rcu_node of the specified RCU flavor. */
+#define rcu_first_leaf_node(rsp) ((rsp)->level[rcu_num_lvls - 1])
+
+/* Is this rcu_node a leaf? */
+#define rcu_is_leaf_node(rnp) ((rnp)->level == rcu_num_lvls - 1)
+
 /*
  * Do a full breadth-first scan of the rcu_node structures for the
  * specified rcu_state structure.
@@ -284,8 +290,7 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt)
  * rcu_node tree with but one rcu_node structure, this loop is a no-op.
  */
 #define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
-	for ((rnp) = &(rsp)->node[0]; \
-	     (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++)
+	for ((rnp) = &(rsp)->node[0]; !rcu_is_leaf_node(rsp, rnp); (rnp)++)
 
 /*
  * Scan the leaves of the rcu_node hierarchy for the specified rcu_state
@@ -294,7 +299,7 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt)
  * It is still a leaf node, even if it is also the root node.
  */
 #define rcu_for_each_leaf_node(rsp, rnp) \
-	for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \
+	for ((rnp) = rcu_first_leaf_node(rsp); \
 	     (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
 
 /*
@@ -486,6 +491,7 @@ void rcu_force_quiescent_state(void);
 void rcu_bh_force_quiescent_state(void);
 void rcu_sched_force_quiescent_state(void);
 extern struct workqueue_struct *rcu_gp_wq;
+extern struct workqueue_struct *rcu_par_gp_wq;
 #endif /* #else #ifdef CONFIG_TINY_RCU */
 
 #ifdef CONFIG_RCU_NOCB_CPU
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 88cba7c..5aff271 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -404,24 +404,6 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq)
 }
 
 /*
- * Scan the specified rcu_segcblist structure for callbacks that need
- * a grace period later than the one specified by "seq".  We don't look
- * at the RCU_DONE_TAIL or RCU_NEXT_TAIL segments because they don't
- * have a grace-period sequence number.
- */
-bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp,
-				    unsigned long seq)
-{
-	int i;
-
-	for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++)
-		if (rsclp->tails[i - 1] != rsclp->tails[i] &&
-		    ULONG_CMP_LT(seq, rsclp->gp_seq[i]))
-			return true;
-	return false;
-}
-
-/*
  * Merge the source rcu_segcblist structure into the destination
  * rcu_segcblist structure, then initialize the source.  Any pending
  * callbacks from the source get to start over.  It is best to
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index 581c12b..948470c 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -134,7 +134,5 @@ void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp,
 				   struct rcu_cblist *rclp);
 void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq);
 bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq);
-bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp,
-				    unsigned long seq);
 void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp,
 			 struct rcu_segcblist *src_rsclp);
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index 777e7a6..e232846 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -369,7 +369,7 @@ static bool __maybe_unused torturing_tasks(void)
  */
 static void rcu_perf_wait_shutdown(void)
 {
-	cond_resched_rcu_qs();
+	cond_resched_tasks_rcu_qs();
 	if (atomic_read(&n_rcu_perf_writer_finished) < nrealwriters)
 		return;
 	while (!torture_must_stop())
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 680c96d..e628fcf 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -593,7 +593,12 @@ static void srcu_torture_init(void)
 
 static void srcu_torture_cleanup(void)
 {
-	cleanup_srcu_struct(&srcu_ctld);
+	static DEFINE_TORTURE_RANDOM(rand);
+
+	if (torture_random(&rand) & 0x800)
+		cleanup_srcu_struct(&srcu_ctld);
+	else
+		cleanup_srcu_struct_quiesced(&srcu_ctld);
 	srcu_ctlp = &srcu_ctl; /* In case of a later rcutorture run. */
 }
 
@@ -1609,6 +1614,9 @@ static enum cpuhp_state rcutor_hp;
 static void
 rcu_torture_cleanup(void)
 {
+	int flags = 0;
+	unsigned long gpnum = 0;
+	unsigned long completed = 0;
 	int i;
 
 	rcutorture_record_test_transition();
@@ -1639,6 +1647,11 @@ rcu_torture_cleanup(void)
 		fakewriter_tasks = NULL;
 	}
 
+	rcutorture_get_gp_data(cur_ops->ttype, &flags, &gpnum, &completed);
+	srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp,
+				&flags, &gpnum, &completed);
+	pr_alert("%s:  End-test grace-period state: g%lu c%lu f%#x\n",
+		 cur_ops->name, gpnum, completed, flags);
 	torture_stop_kthread(rcu_torture_stats, stats_task);
 	torture_stop_kthread(rcu_torture_fqs, fqs_task);
 	for (i = 0; i < ncbflooders; i++)
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 76ac5f5..622792a 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -86,16 +86,19 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
  * Must invoke this after you are finished using a given srcu_struct that
  * was initialized via init_srcu_struct(), else you leak memory.
  */
-void cleanup_srcu_struct(struct srcu_struct *sp)
+void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced)
 {
 	WARN_ON(sp->srcu_lock_nesting[0] || sp->srcu_lock_nesting[1]);
-	flush_work(&sp->srcu_work);
+	if (quiesced)
+		WARN_ON(work_pending(&sp->srcu_work));
+	else
+		flush_work(&sp->srcu_work);
 	WARN_ON(sp->srcu_gp_running);
 	WARN_ON(sp->srcu_gp_waiting);
 	WARN_ON(sp->srcu_cb_head);
 	WARN_ON(&sp->srcu_cb_head != sp->srcu_cb_tail);
 }
-EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
+EXPORT_SYMBOL_GPL(_cleanup_srcu_struct);
 
 /*
  * Removes the count for the old reader from the appropriate element of
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index fb560fc..b4123d7 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -366,24 +366,28 @@ static unsigned long srcu_get_delay(struct srcu_struct *sp)
 	return SRCU_INTERVAL;
 }
 
-/**
- * cleanup_srcu_struct - deconstruct a sleep-RCU structure
- * @sp: structure to clean up.
- *
- * Must invoke this after you are finished using a given srcu_struct that
- * was initialized via init_srcu_struct(), else you leak memory.
- */
-void cleanup_srcu_struct(struct srcu_struct *sp)
+/* Helper for cleanup_srcu_struct() and cleanup_srcu_struct_quiesced(). */
+void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced)
 {
 	int cpu;
 
 	if (WARN_ON(!srcu_get_delay(sp)))
-		return; /* Leakage unless caller handles error. */
+		return; /* Just leak it! */
 	if (WARN_ON(srcu_readers_active(sp)))
-		return; /* Leakage unless caller handles error. */
-	flush_delayed_work(&sp->work);
+		return; /* Just leak it! */
+	if (quiesced) {
+		if (WARN_ON(delayed_work_pending(&sp->work)))
+			return; /* Just leak it! */
+	} else {
+		flush_delayed_work(&sp->work);
+	}
 	for_each_possible_cpu(cpu)
-		flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work);
+		if (quiesced) {
+			if (WARN_ON(delayed_work_pending(&per_cpu_ptr(sp->sda, cpu)->work)))
+				return; /* Just leak it! */
+		} else {
+			flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work);
+		}
 	if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) ||
 	    WARN_ON(srcu_readers_active(sp))) {
 		pr_info("%s: Active srcu_struct %p state: %d\n", __func__, sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)));
@@ -392,7 +396,7 @@ void cleanup_srcu_struct(struct srcu_struct *sp)
 	free_percpu(sp->sda);
 	sp->sda = NULL;
 }
-EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
+EXPORT_SYMBOL_GPL(_cleanup_srcu_struct);
 
 /*
  * Counts the new reader in the appropriate per-CPU element of the
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 2a73469..aa7cade 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -524,8 +524,6 @@ module_param(rcu_kick_kthreads, bool, 0644);
 static ulong jiffies_till_sched_qs = HZ / 10;
 module_param(jiffies_till_sched_qs, ulong, 0444);
 
-static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
-				  struct rcu_data *rdp);
 static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp));
 static void force_quiescent_state(struct rcu_state *rsp);
 static int rcu_pending(void);
@@ -711,44 +709,6 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
 }
 
 /*
- * Is there any need for future grace periods?
- * Interrupts must be disabled.  If the caller does not hold the root
- * rnp_node structure's ->lock, the results are advisory only.
- */
-static int rcu_future_needs_gp(struct rcu_state *rsp)
-{
-	struct rcu_node *rnp = rcu_get_root(rsp);
-	int idx = (READ_ONCE(rnp->completed) + 1) & 0x1;
-	int *fp = &rnp->need_future_gp[idx];
-
-	lockdep_assert_irqs_disabled();
-	return READ_ONCE(*fp);
-}
-
-/*
- * Does the current CPU require a not-yet-started grace period?
- * The caller must have disabled interrupts to prevent races with
- * normal callback registry.
- */
-static bool
-cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
-{
-	lockdep_assert_irqs_disabled();
-	if (rcu_gp_in_progress(rsp))
-		return false;  /* No, a grace period is already in progress. */
-	if (rcu_future_needs_gp(rsp))
-		return true;  /* Yes, a no-CBs CPU needs one. */
-	if (!rcu_segcblist_is_enabled(&rdp->cblist))
-		return false;  /* No, this is a no-CBs (or offline) CPU. */
-	if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
-		return true;  /* Yes, CPU has newly registered callbacks. */
-	if (rcu_segcblist_future_gp_needed(&rdp->cblist,
-					   READ_ONCE(rsp->completed)))
-		return true;  /* Yes, CBs for future grace period. */
-	return false; /* No grace period needed. */
-}
-
-/*
  * Enter an RCU extended quiescent state, which can be either the
  * idle loop or adaptive-tickless usermode execution.
  *
@@ -1234,10 +1194,10 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
 	}
 
 	/*
-	 * Has this CPU encountered a cond_resched_rcu_qs() since the
-	 * beginning of the grace period?  For this to be the case,
-	 * the CPU has to have noticed the current grace period.  This
-	 * might not be the case for nohz_full CPUs looping in the kernel.
+	 * Has this CPU encountered a cond_resched() since the beginning
+	 * of the grace period?  For this to be the case, the CPU has to
+	 * have noticed the current grace period.  This might not be the
+	 * case for nohz_full CPUs looping in the kernel.
 	 */
 	jtsq = jiffies_till_sched_qs;
 	ruqp = per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, rdp->cpu);
@@ -1642,18 +1602,30 @@ static unsigned long rcu_cbs_completed(struct rcu_state *rsp,
 		return rnp->completed + 1;
 
 	/*
+	 * If the current rcu_node structure believes that RCU is
+	 * idle, and if the rcu_state structure does not yet reflect
+	 * the start of a new grace period, then the next grace period
+	 * will suffice.  The memory barrier is needed to accurately
+	 * sample the rsp->gpnum, and pairs with the second lock
+	 * acquisition in rcu_gp_init(), which is augmented with
+	 * smp_mb__after_unlock_lock() for this purpose.
+	 */
+	if (rnp->gpnum == rnp->completed) {
+		smp_mb(); /* See above block comment. */
+		if (READ_ONCE(rsp->gpnum) == rnp->completed)
+			return rnp->completed + 1;
+	}
+
+	/*
 	 * Otherwise, wait for a possible partial grace period and
 	 * then the subsequent full grace period.
 	 */
 	return rnp->completed + 2;
 }
 
-/*
- * Trace-event helper function for rcu_start_future_gp() and
- * rcu_nocb_wait_gp().
- */
-static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
-				unsigned long c, const char *s)
+/* Trace-event wrapper function for trace_rcu_future_grace_period.  */
+static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp,
+			      unsigned long c, const char *s)
 {
 	trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum,
 				      rnp->completed, c, rnp->level,
@@ -1661,96 +1633,67 @@ static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
 }
 
 /*
- * Start some future grace period, as needed to handle newly arrived
+ * Start the specified grace period, as needed to handle newly arrived
  * callbacks.  The required future grace periods are recorded in each
- * rcu_node structure's ->need_future_gp field.  Returns true if there
+ * rcu_node structure's ->need_future_gp[] field.  Returns true if there
  * is reason to awaken the grace-period kthread.
  *
- * The caller must hold the specified rcu_node structure's ->lock.
+ * The caller must hold the specified rcu_node structure's ->lock, which
+ * is why the caller is responsible for waking the grace-period kthread.
  */
-static bool __maybe_unused
-rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
-		    unsigned long *c_out)
+static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp,
+			      unsigned long c)
 {
-	unsigned long c;
 	bool ret = false;
-	struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
-
-	raw_lockdep_assert_held_rcu_node(rnp);
-
-	/*
-	 * Pick up grace-period number for new callbacks.  If this
-	 * grace period is already marked as needed, return to the caller.
-	 */
-	c = rcu_cbs_completed(rdp->rsp, rnp);
-	trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf"));
-	if (rnp->need_future_gp[c & 0x1]) {
-		trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf"));
-		goto out;
-	}
+	struct rcu_state *rsp = rdp->rsp;
+	struct rcu_node *rnp_root;
 
 	/*
-	 * If either this rcu_node structure or the root rcu_node structure
-	 * believe that a grace period is in progress, then we must wait
-	 * for the one following, which is in "c".  Because our request
-	 * will be noticed at the end of the current grace period, we don't
-	 * need to explicitly start one.  We only do the lockless check
-	 * of rnp_root's fields if the current rcu_node structure thinks
-	 * there is no grace period in flight, and because we hold rnp->lock,
-	 * the only possible change is when rnp_root's two fields are
-	 * equal, in which case rnp_root->gpnum might be concurrently
-	 * incremented.  But that is OK, as it will just result in our
-	 * doing some extra useless work.
+	 * Use funnel locking to either acquire the root rcu_node
+	 * structure's lock or bail out if the need for this grace period
+	 * has already been recorded -- or has already started.  If there
+	 * is already a grace period in progress in a non-leaf node, no
+	 * recording is needed because the end of the grace period will
+	 * scan the leaf rcu_node structures.  Note that rnp->lock must
+	 * not be released.
 	 */
-	if (rnp->gpnum != rnp->completed ||
-	    READ_ONCE(rnp_root->gpnum) != READ_ONCE(rnp_root->completed)) {
-		rnp->need_future_gp[c & 0x1]++;
-		trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf"));
-		goto out;
+	raw_lockdep_assert_held_rcu_node(rnp);
+	trace_rcu_this_gp(rnp, rdp, c, TPS("Startleaf"));
+	for (rnp_root = rnp; 1; rnp_root = rnp_root->parent) {
+		if (rnp_root != rnp)
+			raw_spin_lock_rcu_node(rnp_root);
+		WARN_ON_ONCE(ULONG_CMP_LT(rnp_root->gpnum +
+					  need_future_gp_mask(), c));
+		if (need_future_gp_element(rnp_root, c) ||
+		    ULONG_CMP_GE(rnp_root->gpnum, c) ||
+		    (rnp != rnp_root &&
+		     rnp_root->gpnum != rnp_root->completed)) {
+			trace_rcu_this_gp(rnp_root, rdp, c, TPS("Prestarted"));
+			goto unlock_out;
+		}
+		need_future_gp_element(rnp_root, c) = true;
+		if (rnp_root != rnp && rnp_root->parent != NULL)
+			raw_spin_unlock_rcu_node(rnp_root);
+		if (!rnp_root->parent)
+			break;  /* At root, and perhaps also leaf. */
 	}
 
-	/*
-	 * There might be no grace period in progress.  If we don't already
-	 * hold it, acquire the root rcu_node structure's lock in order to
-	 * start one (if needed).
-	 */
-	if (rnp != rnp_root)
-		raw_spin_lock_rcu_node(rnp_root);
-
-	/*
-	 * Get a new grace-period number.  If there really is no grace
-	 * period in progress, it will be smaller than the one we obtained
-	 * earlier.  Adjust callbacks as needed.
-	 */
-	c = rcu_cbs_completed(rdp->rsp, rnp_root);
-	if (!rcu_is_nocb_cpu(rdp->cpu))
-		(void)rcu_segcblist_accelerate(&rdp->cblist, c);
-
-	/*
-	 * If the needed for the required grace period is already
-	 * recorded, trace and leave.
-	 */
-	if (rnp_root->need_future_gp[c & 0x1]) {
-		trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartedroot"));
+	/* If GP already in progress, just leave, otherwise start one. */
+	if (rnp_root->gpnum != rnp_root->completed) {
+		trace_rcu_this_gp(rnp_root, rdp, c, TPS("Startedleafroot"));
 		goto unlock_out;
 	}
-
-	/* Record the need for the future grace period. */
-	rnp_root->need_future_gp[c & 0x1]++;
-
-	/* If a grace period is not already in progress, start one. */
-	if (rnp_root->gpnum != rnp_root->completed) {
-		trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot"));
-	} else {
-		trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot"));
-		ret = rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp);
+	trace_rcu_this_gp(rnp_root, rdp, c, TPS("Startedroot"));
+	WRITE_ONCE(rsp->gp_flags, rsp->gp_flags | RCU_GP_FLAG_INIT);
+	if (!rsp->gp_kthread) {
+		trace_rcu_this_gp(rnp_root, rdp, c, TPS("NoGPkthread"));
+		goto unlock_out;
 	}
+	trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("newreq"));
+	ret = true;  /* Caller must wake GP kthread. */
 unlock_out:
 	if (rnp != rnp_root)
 		raw_spin_unlock_rcu_node(rnp_root);
-out:
-	if (c_out != NULL)
-		*c_out = c;
 	return ret;
 }
 
@@ -1758,16 +1701,16 @@ out:
  * Clean up any old requests for the just-ended grace period.  Also return
  * whether any additional grace periods have been requested.
  */
-static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
+static bool rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
 {
-	int c = rnp->completed;
-	int needmore;
+	unsigned long c = rnp->completed;
+	bool needmore;
 	struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
 
-	rnp->need_future_gp[c & 0x1] = 0;
-	needmore = rnp->need_future_gp[(c + 1) & 0x1];
-	trace_rcu_future_gp(rnp, rdp, c,
-			    needmore ? TPS("CleanupMore") : TPS("Cleanup"));
+	need_future_gp_element(rnp, c) = false;
+	needmore = need_any_future_gp(rnp);
+	trace_rcu_this_gp(rnp, rdp, c,
+			  needmore ? TPS("CleanupMore") : TPS("Cleanup"));
 	return needmore;
 }
 
@@ -1802,6 +1745,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp)
 static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
 			       struct rcu_data *rdp)
 {
+	unsigned long c;
 	bool ret = false;
 
 	raw_lockdep_assert_held_rcu_node(rnp);
@@ -1820,8 +1764,9 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
 	 * accelerating callback invocation to an earlier grace-period
 	 * number.
 	 */
-	if (rcu_segcblist_accelerate(&rdp->cblist, rcu_cbs_completed(rsp, rnp)))
-		ret = rcu_start_future_gp(rnp, rdp, NULL);
+	c = rcu_cbs_completed(rsp, rnp);
+	if (rcu_segcblist_accelerate(&rdp->cblist, c))
+		ret = rcu_start_this_gp(rnp, rdp, c);
 
 	/* Trace depending on how much we were able to accelerate. */
 	if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL))
@@ -2049,7 +1994,7 @@ static bool rcu_gp_init(struct rcu_state *rsp)
 					    rnp->level, rnp->grplo,
 					    rnp->grphi, rnp->qsmask);
 		raw_spin_unlock_irq_rcu_node(rnp);
-		cond_resched_rcu_qs();
+		cond_resched_tasks_rcu_qs();
 		WRITE_ONCE(rsp->gp_activity, jiffies);
 	}
 
@@ -2108,7 +2053,6 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
 {
 	unsigned long gp_duration;
 	bool needgp = false;
-	int nocb = 0;
 	struct rcu_data *rdp;
 	struct rcu_node *rnp = rcu_get_root(rsp);
 	struct swait_queue_head *sq;
@@ -2147,31 +2091,35 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
 		if (rnp == rdp->mynode)
 			needgp = __note_gp_changes(rsp, rnp, rdp) || needgp;
 		/* smp_mb() provided by prior unlock-lock pair. */
-		nocb += rcu_future_gp_cleanup(rsp, rnp);
+		needgp = rcu_future_gp_cleanup(rsp, rnp) || needgp;
 		sq = rcu_nocb_gp_get(rnp);
 		raw_spin_unlock_irq_rcu_node(rnp);
 		rcu_nocb_gp_cleanup(sq);
-		cond_resched_rcu_qs();
+		cond_resched_tasks_rcu_qs();
 		WRITE_ONCE(rsp->gp_activity, jiffies);
 		rcu_gp_slow(rsp, gp_cleanup_delay);
 	}
 	rnp = rcu_get_root(rsp);
 	raw_spin_lock_irq_rcu_node(rnp); /* Order GP before ->completed update. */
-	rcu_nocb_gp_set(rnp, nocb);
 
 	/* Declare grace period done. */
 	WRITE_ONCE(rsp->completed, rsp->gpnum);
 	trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end"));
 	rsp->gp_state = RCU_GP_IDLE;
+	/* Check for GP requests since above loop. */
 	rdp = this_cpu_ptr(rsp->rda);
+	if (need_any_future_gp(rnp)) {
+		trace_rcu_this_gp(rnp, rdp, rsp->completed - 1,
+				  TPS("CleanupMore"));
+		needgp = true;
+	}
 	/* Advance CBs to reduce false positives below. */
-	needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp;
-	if (needgp || cpu_needs_another_gp(rsp, rdp)) {
+	if (!rcu_accelerate_cbs(rsp, rnp, rdp) && needgp) {
 		WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT);
-		trace_rcu_grace_period(rsp->name,
-				       READ_ONCE(rsp->gpnum),
+		trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum),
 				       TPS("newreq"));
 	}
+	WRITE_ONCE(rsp->gp_flags, rsp->gp_flags & RCU_GP_FLAG_INIT);
 	raw_spin_unlock_irq_rcu_node(rnp);
 }
 
@@ -2202,7 +2150,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
 			/* Locking provides needed memory barrier. */
 			if (rcu_gp_init(rsp))
 				break;
-			cond_resched_rcu_qs();
+			cond_resched_tasks_rcu_qs();
 			WRITE_ONCE(rsp->gp_activity, jiffies);
 			WARN_ON(signal_pending(current));
 			trace_rcu_grace_period(rsp->name,
@@ -2247,7 +2195,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
 				trace_rcu_grace_period(rsp->name,
 						       READ_ONCE(rsp->gpnum),
 						       TPS("fqsend"));
-				cond_resched_rcu_qs();
+				cond_resched_tasks_rcu_qs();
 				WRITE_ONCE(rsp->gp_activity, jiffies);
 				ret = 0; /* Force full wait till next FQS. */
 				j = jiffies_till_next_fqs;
@@ -2260,7 +2208,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
 				}
 			} else {
 				/* Deal with stray signal. */
-				cond_resched_rcu_qs();
+				cond_resched_tasks_rcu_qs();
 				WRITE_ONCE(rsp->gp_activity, jiffies);
 				WARN_ON(signal_pending(current));
 				trace_rcu_grace_period(rsp->name,
@@ -2283,71 +2231,6 @@ static int __noreturn rcu_gp_kthread(void *arg)
 }
 
 /*
- * Start a new RCU grace period if warranted, re-initializing the hierarchy
- * in preparation for detecting the next grace period.  The caller must hold
- * the root node's ->lock and hard irqs must be disabled.
- *
- * Note that it is legal for a dying CPU (which is marked as offline) to
- * invoke this function.  This can happen when the dying CPU reports its
- * quiescent state.
- *
- * Returns true if the grace-period kthread must be awakened.
- */
-static bool
-rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
-		      struct rcu_data *rdp)
-{
-	raw_lockdep_assert_held_rcu_node(rnp);
-	if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) {
-		/*
-		 * Either we have not yet spawned the grace-period
-		 * task, this CPU does not need another grace period,
-		 * or a grace period is already in progress.
-		 * Either way, don't start a new grace period.
-		 */
-		return false;
-	}
-	WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT);
-	trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum),
-			       TPS("newreq"));
-
-	/*
-	 * We can't do wakeups while holding the rnp->lock, as that
-	 * could cause possible deadlocks with the rq->lock. Defer
-	 * the wakeup to our caller.
-	 */
-	return true;
-}
-
-/*
- * Similar to rcu_start_gp_advanced(), but also advance the calling CPU's
- * callbacks.  Note that rcu_start_gp_advanced() cannot do this because it
- * is invoked indirectly from rcu_advance_cbs(), which would result in
- * endless recursion -- or would do so if it wasn't for the self-deadlock
- * that is encountered beforehand.
- *
- * Returns true if the grace-period kthread needs to be awakened.
- */
-static bool rcu_start_gp(struct rcu_state *rsp)
-{
-	struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
-	struct rcu_node *rnp = rcu_get_root(rsp);
-	bool ret = false;
-
-	/*
-	 * If there is no grace period in progress right now, any
-	 * callbacks we have up to this point will be satisfied by the
-	 * next grace period.  Also, advancing the callbacks reduces the
-	 * probability of false positives from cpu_needs_another_gp()
-	 * resulting in pointless grace periods.  So, advance callbacks
-	 * then start the grace period!
-	 */
-	ret = rcu_advance_cbs(rsp, rnp, rdp) || ret;
-	ret = rcu_start_gp_advanced(rsp, rnp, rdp) || ret;
-	return ret;
-}
-
-/*
  * Report a full set of quiescent states to the specified rcu_state data
  * structure.  Invoke rcu_gp_kthread_wake() to awaken the grace-period
  * kthread if another grace period is required.  Whether we wake
@@ -2398,7 +2281,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
 			return;
 		}
 		WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */
-		WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1 &&
+		WARN_ON_ONCE(!rcu_is_leaf_node(rnp) &&
 			     rcu_preempt_blocked_readers_cgp(rnp));
 		rnp->qsmask &= ~mask;
 		trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum,
@@ -2782,7 +2665,7 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp))
 	struct rcu_node *rnp;
 
 	rcu_for_each_leaf_node(rsp, rnp) {
-		cond_resched_rcu_qs();
+		cond_resched_tasks_rcu_qs();
 		mask = 0;
 		raw_spin_lock_irqsave_rcu_node(rnp, flags);
 		if (rnp->qsmask == 0) {
@@ -2874,22 +2757,27 @@ __rcu_process_callbacks(struct rcu_state *rsp)
 	unsigned long flags;
 	bool needwake;
 	struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
+	struct rcu_node *rnp;
 
 	WARN_ON_ONCE(!rdp->beenonline);
 
 	/* Update RCU state based on any recent quiescent states. */
 	rcu_check_quiescent_state(rsp, rdp);
 
-	/* Does this CPU require a not-yet-started grace period? */
-	local_irq_save(flags);
-	if (cpu_needs_another_gp(rsp, rdp)) {
-		raw_spin_lock_rcu_node(rcu_get_root(rsp)); /* irqs disabled. */
-		needwake = rcu_start_gp(rsp);
-		raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags);
-		if (needwake)
-			rcu_gp_kthread_wake(rsp);
-	} else {
-		local_irq_restore(flags);
+	/* No grace period and unregistered callbacks? */
+	if (!rcu_gp_in_progress(rsp) &&
+	    rcu_segcblist_is_enabled(&rdp->cblist)) {
+		local_irq_save(flags);
+		if (rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) {
+			local_irq_restore(flags);
+		} else {
+			rnp = rdp->mynode;
+			raw_spin_lock_rcu_node(rnp); /* irqs disabled. */
+			needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
+			raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+			if (needwake)
+				rcu_gp_kthread_wake(rsp);
+		}
 	}
 
 	/* If there are callbacks ready, invoke them. */
@@ -2973,11 +2861,11 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
 
 		/* Start a new grace period if one not already started. */
 		if (!rcu_gp_in_progress(rsp)) {
-			struct rcu_node *rnp_root = rcu_get_root(rsp);
+			struct rcu_node *rnp = rdp->mynode;
 
-			raw_spin_lock_rcu_node(rnp_root);
-			needwake = rcu_start_gp(rsp);
-			raw_spin_unlock_rcu_node(rnp_root);
+			raw_spin_lock_rcu_node(rnp);
+			needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
+			raw_spin_unlock_rcu_node(rnp);
 			if (needwake)
 				rcu_gp_kthread_wake(rsp);
 		} else {
@@ -3368,7 +3256,9 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
 		return 1;
 
 	/* Has RCU gone idle with this CPU needing another grace period? */
-	if (cpu_needs_another_gp(rsp, rdp))
+	if (!rcu_gp_in_progress(rsp) &&
+	    rcu_segcblist_is_enabled(&rdp->cblist) &&
+	    !rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
 		return 1;
 
 	/* Has another RCU grace period completed?  */
@@ -3775,6 +3665,8 @@ int rcutree_dead_cpu(unsigned int cpu)
 	return 0;
 }
 
+static DEFINE_PER_CPU(int, rcu_cpu_started);
+
 /*
  * Mark the specified CPU as being online so that subsequent grace periods
  * (both expedited and normal) will wait on it.  Note that this means that
@@ -3796,6 +3688,11 @@ void rcu_cpu_starting(unsigned int cpu)
 	struct rcu_node *rnp;
 	struct rcu_state *rsp;
 
+	if (per_cpu(rcu_cpu_started, cpu))
+		return;
+
+	per_cpu(rcu_cpu_started, cpu) = 1;
+
 	for_each_rcu_flavor(rsp) {
 		rdp = per_cpu_ptr(rsp->rda, cpu);
 		rnp = rdp->mynode;
@@ -3852,6 +3749,8 @@ void rcu_report_dead(unsigned int cpu)
 	preempt_enable();
 	for_each_rcu_flavor(rsp)
 		rcu_cleanup_dying_idle_cpu(cpu, rsp);
+
+	per_cpu(rcu_cpu_started, cpu) = 0;
 }
 
 /* Migrate the dead CPU's callbacks to the current CPU. */
@@ -3861,6 +3760,7 @@ static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp)
 	struct rcu_data *my_rdp;
 	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
 	struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
+	bool needwake;
 
 	if (rcu_is_nocb_cpu(cpu) || rcu_segcblist_empty(&rdp->cblist))
 		return;  /* No callbacks to migrate. */
@@ -3872,12 +3772,15 @@ static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp)
 		return;
 	}
 	raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */
-	rcu_advance_cbs(rsp, rnp_root, rdp); /* Leverage recent GPs. */
-	rcu_advance_cbs(rsp, rnp_root, my_rdp); /* Assign GP to pending CBs. */
+	/* Leverage recent GPs and set GP for new callbacks. */
+	needwake = rcu_advance_cbs(rsp, rnp_root, rdp) ||
+		   rcu_advance_cbs(rsp, rnp_root, my_rdp);
 	rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist);
 	WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) !=
 		     !rcu_segcblist_n_cbs(&my_rdp->cblist));
 	raw_spin_unlock_irqrestore_rcu_node(rnp_root, flags);
+	if (needwake)
+		rcu_gp_kthread_wake(rsp);
 	WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 ||
 		  !rcu_segcblist_empty(&rdp->cblist),
 		  "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n",
@@ -4056,7 +3959,7 @@ static void __init rcu_init_one(struct rcu_state *rsp)
 
 	init_swait_queue_head(&rsp->gp_wq);
 	init_swait_queue_head(&rsp->expedited_wq);
-	rnp = rsp->level[rcu_num_lvls - 1];
+	rnp = rcu_first_leaf_node(rsp);
 	for_each_possible_cpu(i) {
 		while (i > rnp->grphi)
 			rnp++;
@@ -4168,6 +4071,7 @@ static void __init rcu_dump_rcu_node_tree(struct rcu_state *rsp)
 }
 
 struct workqueue_struct *rcu_gp_wq;
+struct workqueue_struct *rcu_par_gp_wq;
 
 void __init rcu_init(void)
 {
@@ -4199,6 +4103,8 @@ void __init rcu_init(void)
 	/* Create workqueue for expedited GPs and for Tree SRCU. */
 	rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
 	WARN_ON(!rcu_gp_wq);
+	rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
+	WARN_ON(!rcu_par_gp_wq);
 }
 
 #include "tree_exp.h"
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index f491ab4..78e051d 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -58,6 +58,14 @@ struct rcu_dynticks {
 #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
 };
 
+/* Communicate arguments to a workqueue handler. */
+struct rcu_exp_work {
+	smp_call_func_t rew_func;
+	struct rcu_state *rew_rsp;
+	unsigned long rew_s;
+	struct work_struct rew_work;
+};
+
 /* RCU's kthread states for tracing. */
 #define RCU_KTHREAD_STOPPED  0
 #define RCU_KTHREAD_RUNNING  1
@@ -150,15 +158,32 @@ struct rcu_node {
 	struct swait_queue_head nocb_gp_wq[2];
 				/* Place for rcu_nocb_kthread() to wait GP. */
 #endif /* #ifdef CONFIG_RCU_NOCB_CPU */
-	int need_future_gp[2];
-				/* Counts of upcoming no-CB GP requests. */
+	u8 need_future_gp[4];	/* Counts of upcoming GP requests. */
 	raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp;
 
 	spinlock_t exp_lock ____cacheline_internodealigned_in_smp;
 	unsigned long exp_seq_rq;
 	wait_queue_head_t exp_wq[4];
+	struct rcu_exp_work rew;
+	bool exp_need_flush;	/* Need to flush workitem? */
 } ____cacheline_internodealigned_in_smp;
 
+/* Accessors for ->need_future_gp[] array. */
+#define need_future_gp_mask() \
+	(ARRAY_SIZE(((struct rcu_node *)NULL)->need_future_gp) - 1)
+#define need_future_gp_element(rnp, c) \
+	((rnp)->need_future_gp[(c) & need_future_gp_mask()])
+#define need_any_future_gp(rnp)						\
+({									\
+	int __i;							\
+	bool __nonzero = false;						\
+									\
+	for (__i = 0; __i < ARRAY_SIZE((rnp)->need_future_gp); __i++)	\
+		__nonzero = __nonzero ||				\
+			    READ_ONCE((rnp)->need_future_gp[__i]);	\
+	__nonzero;							\
+})
+
 /*
  * Bitmasks in an rcu_node cover the interval [grplo, grphi] of CPU IDs, and
  * are indexed relative to this interval rather than the global CPU ID space.
@@ -224,10 +249,6 @@ struct rcu_data {
 #ifdef CONFIG_RCU_FAST_NO_HZ
 	struct rcu_head oom_head;
 #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
-	atomic_long_t exp_workdone0;	/* # done by workqueue. */
-	atomic_long_t exp_workdone1;	/* # done by others #1. */
-	atomic_long_t exp_workdone2;	/* # done by others #2. */
-	atomic_long_t exp_workdone3;	/* # done by others #3. */
 	int exp_dynticks_snap;		/* Double-check need for IPI. */
 
 	/* 6) Callback offloading. */
@@ -408,7 +429,6 @@ extern struct rcu_state rcu_preempt_state;
 #endif /* #ifdef CONFIG_PREEMPT_RCU */
 
 int rcu_dynticks_snap(struct rcu_dynticks *rdtp);
-bool rcu_eqs_special_set(int cpu);
 
 #ifdef CONFIG_RCU_BOOST
 DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
@@ -438,7 +458,6 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
 static void invoke_rcu_callbacks_kthread(void);
 static bool rcu_is_callbacks_kthread(void);
 #ifdef CONFIG_RCU_BOOST
-static void rcu_preempt_do_callbacks(void);
 static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
 						 struct rcu_node *rnp);
 #endif /* #ifdef CONFIG_RCU_BOOST */
@@ -454,7 +473,6 @@ static void print_cpu_stall_info_end(void);
 static void zero_cpu_stall_ticks(struct rcu_data *rdp);
 static void increment_cpu_stall_ticks(void);
 static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu);
-static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
 static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
 static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
 static void rcu_init_one_nocb(struct rcu_node *rnp);
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index f72eefa..d40708e 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -20,6 +20,8 @@
  * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
  */
 
+#include <linux/lockdep.h>
+
 /*
  * Record the start of an expedited grace period.
  */
@@ -154,15 +156,35 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp)
  * for the current expedited grace period.  Works only for preemptible
  * RCU -- other RCU implementation use other means.
  *
- * Caller must hold the rcu_state's exp_mutex.
+ * Caller must hold the specificed rcu_node structure's ->lock
  */
 static bool sync_rcu_preempt_exp_done(struct rcu_node *rnp)
 {
+	raw_lockdep_assert_held_rcu_node(rnp);
+
 	return rnp->exp_tasks == NULL &&
 	       READ_ONCE(rnp->expmask) == 0;
 }
 
 /*
+ * Like sync_rcu_preempt_exp_done(), but this function assumes the caller
+ * doesn't hold the rcu_node's ->lock, and will acquire and release the lock
+ * itself
+ */
+static bool sync_rcu_preempt_exp_done_unlocked(struct rcu_node *rnp)
+{
+	unsigned long flags;
+	bool ret;
+
+	raw_spin_lock_irqsave_rcu_node(rnp, flags);
+	ret = sync_rcu_preempt_exp_done(rnp);
+	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+
+	return ret;
+}
+
+
+/*
  * Report the exit from RCU read-side critical section for the last task
  * that queued itself during or before the current expedited preemptible-RCU
  * grace period.  This event is reported either to the rcu_node structure on
@@ -170,8 +192,7 @@ static bool sync_rcu_preempt_exp_done(struct rcu_node *rnp)
  * recursively up the tree.  (Calm down, calm down, we do the recursion
  * iteratively!)
  *
- * Caller must hold the rcu_state's exp_mutex and the specified rcu_node
- * structure's ->lock.
+ * Caller must hold the specified rcu_node structure's ->lock.
  */
 static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
 				 bool wake, unsigned long flags)
@@ -207,8 +228,6 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
 /*
  * Report expedited quiescent state for specified node.  This is a
  * lock-acquisition wrapper function for __rcu_report_exp_rnp().
- *
- * Caller must hold the rcu_state's exp_mutex.
  */
 static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp,
 					      struct rcu_node *rnp, bool wake)
@@ -221,8 +240,7 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp,
 
 /*
  * Report expedited quiescent state for multiple CPUs, all covered by the
- * specified leaf rcu_node structure.  Caller must hold the rcu_state's
- * exp_mutex.
+ * specified leaf rcu_node structure.
  */
 static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp,
 				    unsigned long mask, bool wake)
@@ -248,14 +266,12 @@ static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp,
 }
 
 /* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */
-static bool sync_exp_work_done(struct rcu_state *rsp, atomic_long_t *stat,
-			       unsigned long s)
+static bool sync_exp_work_done(struct rcu_state *rsp, unsigned long s)
 {
 	if (rcu_exp_gp_seq_done(rsp, s)) {
 		trace_rcu_exp_grace_period(rsp->name, s, TPS("done"));
 		/* Ensure test happens before caller kfree(). */
 		smp_mb__before_atomic(); /* ^^^ */
-		atomic_long_inc(stat);
 		return true;
 	}
 	return false;
@@ -289,7 +305,7 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
 	 * promoting locality and is not strictly needed for correctness.
 	 */
 	for (; rnp != NULL; rnp = rnp->parent) {
-		if (sync_exp_work_done(rsp, &rdp->exp_workdone1, s))
+		if (sync_exp_work_done(rsp, s))
 			return true;
 
 		/* Work not done, either wait here or go up. */
@@ -302,8 +318,7 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
 						  rnp->grplo, rnp->grphi,
 						  TPS("wait"));
 			wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
-				   sync_exp_work_done(rsp,
-						      &rdp->exp_workdone2, s));
+				   sync_exp_work_done(rsp, s));
 			return true;
 		}
 		rnp->exp_seq_rq = s; /* Followers can wait on us. */
@@ -313,7 +328,7 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
 	}
 	mutex_lock(&rsp->exp_mutex);
 fastpath:
-	if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) {
+	if (sync_exp_work_done(rsp, s)) {
 		mutex_unlock(&rsp->exp_mutex);
 		return true;
 	}
@@ -362,93 +377,129 @@ static void sync_sched_exp_online_cleanup(int cpu)
 }
 
 /*
- * Select the nodes that the upcoming expedited grace period needs
- * to wait for.
+ * Select the CPUs within the specified rcu_node that the upcoming
+ * expedited grace period needs to wait for.
  */
-static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
-				     smp_call_func_t func)
+static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
 {
 	int cpu;
 	unsigned long flags;
+	smp_call_func_t func;
 	unsigned long mask_ofl_test;
 	unsigned long mask_ofl_ipi;
 	int ret;
-	struct rcu_node *rnp;
-
-	trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("reset"));
-	sync_exp_reset_tree(rsp);
-	trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("select"));
-	rcu_for_each_leaf_node(rsp, rnp) {
-		raw_spin_lock_irqsave_rcu_node(rnp, flags);
+	struct rcu_exp_work *rewp =
+		container_of(wp, struct rcu_exp_work, rew_work);
+	struct rcu_node *rnp = container_of(rewp, struct rcu_node, rew);
+	struct rcu_state *rsp = rewp->rew_rsp;
 
-		/* Each pass checks a CPU for identity, offline, and idle. */
-		mask_ofl_test = 0;
-		for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
-			unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
-			struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
-			struct rcu_dynticks *rdtp = per_cpu_ptr(&rcu_dynticks, cpu);
-			int snap;
+	func = rewp->rew_func;
+	raw_spin_lock_irqsave_rcu_node(rnp, flags);
 
-			if (raw_smp_processor_id() == cpu ||
-			    !(rnp->qsmaskinitnext & mask)) {
+	/* Each pass checks a CPU for identity, offline, and idle. */
+	mask_ofl_test = 0;
+	for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
+		unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
+		struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+		struct rcu_dynticks *rdtp = per_cpu_ptr(&rcu_dynticks, cpu);
+		int snap;
+
+		if (raw_smp_processor_id() == cpu ||
+		    !(rnp->qsmaskinitnext & mask)) {
+			mask_ofl_test |= mask;
+		} else {
+			snap = rcu_dynticks_snap(rdtp);
+			if (rcu_dynticks_in_eqs(snap))
 				mask_ofl_test |= mask;
-			} else {
-				snap = rcu_dynticks_snap(rdtp);
-				if (rcu_dynticks_in_eqs(snap))
-					mask_ofl_test |= mask;
-				else
-					rdp->exp_dynticks_snap = snap;
-			}
+			else
+				rdp->exp_dynticks_snap = snap;
 		}
-		mask_ofl_ipi = rnp->expmask & ~mask_ofl_test;
-
-		/*
-		 * Need to wait for any blocked tasks as well.  Note that
-		 * additional blocking tasks will also block the expedited
-		 * GP until such time as the ->expmask bits are cleared.
-		 */
-		if (rcu_preempt_has_tasks(rnp))
-			rnp->exp_tasks = rnp->blkd_tasks.next;
-		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+	}
+	mask_ofl_ipi = rnp->expmask & ~mask_ofl_test;
 
-		/* IPI the remaining CPUs for expedited quiescent state. */
-		for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
-			unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
-			struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+	/*
+	 * Need to wait for any blocked tasks as well.	Note that
+	 * additional blocking tasks will also block the expedited GP
+	 * until such time as the ->expmask bits are cleared.
+	 */
+	if (rcu_preempt_has_tasks(rnp))
+		rnp->exp_tasks = rnp->blkd_tasks.next;
+	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 
-			if (!(mask_ofl_ipi & mask))
-				continue;
+	/* IPI the remaining CPUs for expedited quiescent state. */
+	for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
+		unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
+		struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+
+		if (!(mask_ofl_ipi & mask))
+			continue;
 retry_ipi:
-			if (rcu_dynticks_in_eqs_since(rdp->dynticks,
-						      rdp->exp_dynticks_snap)) {
-				mask_ofl_test |= mask;
-				continue;
-			}
-			ret = smp_call_function_single(cpu, func, rsp, 0);
-			if (!ret) {
-				mask_ofl_ipi &= ~mask;
-				continue;
-			}
-			/* Failed, raced with CPU hotplug operation. */
-			raw_spin_lock_irqsave_rcu_node(rnp, flags);
-			if ((rnp->qsmaskinitnext & mask) &&
-			    (rnp->expmask & mask)) {
-				/* Online, so delay for a bit and try again. */
-				raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
-				trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("selectofl"));
-				schedule_timeout_uninterruptible(1);
-				goto retry_ipi;
-			}
-			/* CPU really is offline, so we can ignore it. */
-			if (!(rnp->expmask & mask))
-				mask_ofl_ipi &= ~mask;
+		if (rcu_dynticks_in_eqs_since(rdp->dynticks,
+					      rdp->exp_dynticks_snap)) {
+			mask_ofl_test |= mask;
+			continue;
+		}
+		ret = smp_call_function_single(cpu, func, rsp, 0);
+		if (!ret) {
+			mask_ofl_ipi &= ~mask;
+			continue;
+		}
+		/* Failed, raced with CPU hotplug operation. */
+		raw_spin_lock_irqsave_rcu_node(rnp, flags);
+		if ((rnp->qsmaskinitnext & mask) &&
+		    (rnp->expmask & mask)) {
+			/* Online, so delay for a bit and try again. */
 			raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+			trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("selectofl"));
+			schedule_timeout_uninterruptible(1);
+			goto retry_ipi;
+		}
+		/* CPU really is offline, so we can ignore it. */
+		if (!(rnp->expmask & mask))
+			mask_ofl_ipi &= ~mask;
+		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+	}
+	/* Report quiescent states for those that went offline. */
+	mask_ofl_test |= mask_ofl_ipi;
+	if (mask_ofl_test)
+		rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false);
+}
+
+/*
+ * Select the nodes that the upcoming expedited grace period needs
+ * to wait for.
+ */
+static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
+				     smp_call_func_t func)
+{
+	struct rcu_node *rnp;
+
+	trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("reset"));
+	sync_exp_reset_tree(rsp);
+	trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("select"));
+
+	/* Schedule work for each leaf rcu_node structure. */
+	rcu_for_each_leaf_node(rsp, rnp) {
+		rnp->exp_need_flush = false;
+		if (!READ_ONCE(rnp->expmask))
+			continue; /* Avoid early boot non-existent wq. */
+		rnp->rew.rew_func = func;
+		rnp->rew.rew_rsp = rsp;
+		if (!READ_ONCE(rcu_par_gp_wq) ||
+		    rcu_scheduler_active != RCU_SCHEDULER_RUNNING) {
+			/* No workqueues yet. */
+			sync_rcu_exp_select_node_cpus(&rnp->rew.rew_work);
+			continue;
 		}
-		/* Report quiescent states for those that went offline. */
-		mask_ofl_test |= mask_ofl_ipi;
-		if (mask_ofl_test)
-			rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false);
+		INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus);
+		queue_work_on(rnp->grplo, rcu_par_gp_wq, &rnp->rew.rew_work);
+		rnp->exp_need_flush = true;
 	}
+
+	/* Wait for workqueue jobs (if any) to complete. */
+	rcu_for_each_leaf_node(rsp, rnp)
+		if (rnp->exp_need_flush)
+			flush_work(&rnp->rew.rew_work);
 }
 
 static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
@@ -469,9 +520,9 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
 	for (;;) {
 		ret = swait_event_timeout(
 				rsp->expedited_wq,
-				sync_rcu_preempt_exp_done(rnp_root),
+				sync_rcu_preempt_exp_done_unlocked(rnp_root),
 				jiffies_stall);
-		if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root))
+		if (ret > 0 || sync_rcu_preempt_exp_done_unlocked(rnp_root))
 			return;
 		WARN_ON(ret < 0);  /* workqueues should not be signaled. */
 		if (rcu_cpu_stall_suppress)
@@ -504,7 +555,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
 			rcu_for_each_node_breadth_first(rsp, rnp) {
 				if (rnp == rnp_root)
 					continue; /* printed unconditionally */
-				if (sync_rcu_preempt_exp_done(rnp))
+				if (sync_rcu_preempt_exp_done_unlocked(rnp))
 					continue;
 				pr_cont(" l=%u:%d-%d:%#lx/%c",
 					rnp->level, rnp->grplo, rnp->grphi,
@@ -560,14 +611,6 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s)
 	mutex_unlock(&rsp->exp_wake_mutex);
 }
 
-/* Let the workqueue handler know what it is supposed to do. */
-struct rcu_exp_work {
-	smp_call_func_t rew_func;
-	struct rcu_state *rew_rsp;
-	unsigned long rew_s;
-	struct work_struct rew_work;
-};
-
 /*
  * Common code to drive an expedited grace period forward, used by
  * workqueues and mid-boot-time tasks.
@@ -633,7 +676,7 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp,
 	rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
 	rnp = rcu_get_root(rsp);
 	wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
-		   sync_exp_work_done(rsp, &rdp->exp_workdone0, s));
+		   sync_exp_work_done(rsp, s));
 	smp_mb(); /* Workqueue actions happen before return. */
 
 	/* Let the next expedited grace period start. */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 84fbee4..7fd1203 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -182,7 +182,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
 
 	raw_lockdep_assert_held_rcu_node(rnp);
 	WARN_ON_ONCE(rdp->mynode != rnp);
-	WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1);
+	WARN_ON_ONCE(!rcu_is_leaf_node(rnp));
 
 	/*
 	 * Decide where to queue the newly blocked task.  In theory,
@@ -384,6 +384,50 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
 }
 
 /*
+ * Preemptible RCU implementation for rcu_read_lock().
+ * Just increment ->rcu_read_lock_nesting, shared state will be updated
+ * if we block.
+ */
+void __rcu_read_lock(void)
+{
+	current->rcu_read_lock_nesting++;
+	barrier();  /* critical section after entry code. */
+}
+EXPORT_SYMBOL_GPL(__rcu_read_lock);
+
+/*
+ * Preemptible RCU implementation for rcu_read_unlock().
+ * Decrement ->rcu_read_lock_nesting.  If the result is zero (outermost
+ * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
+ * invoke rcu_read_unlock_special() to clean up after a context switch
+ * in an RCU read-side critical section and other special cases.
+ */
+void __rcu_read_unlock(void)
+{
+	struct task_struct *t = current;
+
+	if (t->rcu_read_lock_nesting != 1) {
+		--t->rcu_read_lock_nesting;
+	} else {
+		barrier();  /* critical section before exit code. */
+		t->rcu_read_lock_nesting = INT_MIN;
+		barrier();  /* assign before ->rcu_read_unlock_special load */
+		if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s)))
+			rcu_read_unlock_special(t);
+		barrier();  /* ->rcu_read_unlock_special load before assign */
+		t->rcu_read_lock_nesting = 0;
+	}
+#ifdef CONFIG_PROVE_LOCKING
+	{
+		int rrln = READ_ONCE(t->rcu_read_lock_nesting);
+
+		WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
+	}
+#endif /* #ifdef CONFIG_PROVE_LOCKING */
+}
+EXPORT_SYMBOL_GPL(__rcu_read_unlock);
+
+/*
  * Advance a ->blkd_tasks-list pointer to the next entry, instead
  * returning NULL if at the end of the list.
  */
@@ -489,7 +533,7 @@ void rcu_read_unlock_special(struct task_struct *t)
 		rnp = t->rcu_blocked_node;
 		raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
 		WARN_ON_ONCE(rnp != t->rcu_blocked_node);
-		WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1);
+		WARN_ON_ONCE(!rcu_is_leaf_node(rnp));
 		empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
 		empty_exp = sync_rcu_preempt_exp_done(rnp);
 		smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
@@ -685,15 +729,6 @@ static void rcu_preempt_check_callbacks(void)
 		t->rcu_read_unlock_special.b.need_qs = true;
 }
 
-#ifdef CONFIG_RCU_BOOST
-
-static void rcu_preempt_do_callbacks(void)
-{
-	rcu_do_batch(rcu_state_p, this_cpu_ptr(rcu_data_p));
-}
-
-#endif /* #ifdef CONFIG_RCU_BOOST */
-
 /**
  * call_rcu() - Queue an RCU callback for invocation after a grace period.
  * @head: structure to be used for queueing the RCU updates.
@@ -1140,7 +1175,7 @@ static void rcu_kthread_do_work(void)
 {
 	rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data));
 	rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data));
-	rcu_preempt_do_callbacks();
+	rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data));
 }
 
 static void rcu_cpu_kthread_setup(unsigned int cpu)
@@ -1607,7 +1642,7 @@ static int rcu_oom_notify(struct notifier_block *self,
 
 	for_each_online_cpu(cpu) {
 		smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1);
-		cond_resched_rcu_qs();
+		cond_resched_tasks_rcu_qs();
 	}
 
 	/* Unconditionally decrement: no need to wake ourselves up. */
@@ -1780,19 +1815,6 @@ static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
 	swake_up_all(sq);
 }
 
-/*
- * Set the root rcu_node structure's ->need_future_gp field
- * based on the sum of those of all rcu_node structures.  This does
- * double-count the root rcu_node structure's requests, but this
- * is necessary to handle the possibility of a rcu_nocb_kthread()
- * having awakened during the time that the rcu_node structures
- * were being updated for the end of the previous grace period.
- */
-static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
-{
-	rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq;
-}
-
 static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
 {
 	return &rnp->nocb_gp_wq[rnp->completed & 0x1];
@@ -1966,7 +1988,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
 			trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
 					    TPS("WakeOvf"));
 		} else {
-			wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE,
+			wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE_FORCE,
 					       TPS("WakeOvfIsDeferred"));
 		}
 		rdp->qlen_last_fqs_check = LONG_MAX / 2;
@@ -2048,7 +2070,8 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
 	struct rcu_node *rnp = rdp->mynode;
 
 	raw_spin_lock_irqsave_rcu_node(rnp, flags);
-	needwake = rcu_start_future_gp(rnp, rdp, &c);
+	c = rcu_cbs_completed(rdp->rsp, rnp);
+	needwake = rcu_start_this_gp(rnp, rdp, c);
 	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 	if (needwake)
 		rcu_gp_kthread_wake(rdp->rsp);
@@ -2057,7 +2080,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
 	 * Wait for the grace period.  Do so interruptibly to avoid messing
 	 * up the load average.
 	 */
-	trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait"));
+	trace_rcu_this_gp(rnp, rdp, c, TPS("StartWait"));
 	for (;;) {
 		swait_event_interruptible(
 			rnp->nocb_gp_wq[c & 0x1],
@@ -2065,9 +2088,9 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
 		if (likely(d))
 			break;
 		WARN_ON(signal_pending(current));
-		trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait"));
+		trace_rcu_this_gp(rnp, rdp, c, TPS("ResumeWait"));
 	}
-	trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait"));
+	trace_rcu_this_gp(rnp, rdp, c, TPS("EndWait"));
 	smp_mb(); /* Ensure that CB invocation happens after GP end. */
 }
 
@@ -2236,7 +2259,7 @@ static int rcu_nocb_kthread(void *arg)
 				cl++;
 			c++;
 			local_bh_enable();
-			cond_resched_rcu_qs();
+			cond_resched_tasks_rcu_qs();
 			list = next;
 		}
 		trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1);
@@ -2292,7 +2315,7 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
 void __init rcu_init_nohz(void)
 {
 	int cpu;
-	bool need_rcu_nocb_mask = true;
+	bool need_rcu_nocb_mask = false;
 	struct rcu_state *rsp;
 
 #if defined(CONFIG_NO_HZ_FULL)
@@ -2315,7 +2338,7 @@ void __init rcu_init_nohz(void)
 #endif /* #if defined(CONFIG_NO_HZ_FULL) */
 
 	if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
-		pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n");
+		pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n");
 		cpumask_and(rcu_nocb_mask, cpu_possible_mask,
 			    rcu_nocb_mask);
 	}
@@ -2495,10 +2518,6 @@ static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
 {
 }
 
-static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
-{
-}
-
 static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
 {
 	return NULL;
@@ -2587,8 +2606,7 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp)
 }
 
 /*
- * Bind the grace-period kthread for the sysidle flavor of RCU to the
- * timekeeping CPU.
+ * Bind the RCU grace-period kthreads to the housekeeping CPU.
  */
 static void rcu_bind_gp_kthread(void)
 {
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 68fa19a..4c230a6 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -226,54 +226,6 @@ core_initcall(rcu_set_runtime_mode);
 
 #endif /* #if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU) */
 
-#ifdef CONFIG_PREEMPT_RCU
-
-/*
- * Preemptible RCU implementation for rcu_read_lock().
- * Just increment ->rcu_read_lock_nesting, shared state will be updated
- * if we block.
- */
-void __rcu_read_lock(void)
-{
-	current->rcu_read_lock_nesting++;
-	barrier();  /* critical section after entry code. */
-}
-EXPORT_SYMBOL_GPL(__rcu_read_lock);
-
-/*
- * Preemptible RCU implementation for rcu_read_unlock().
- * Decrement ->rcu_read_lock_nesting.  If the result is zero (outermost
- * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
- * invoke rcu_read_unlock_special() to clean up after a context switch
- * in an RCU read-side critical section and other special cases.
- */
-void __rcu_read_unlock(void)
-{
-	struct task_struct *t = current;
-
-	if (t->rcu_read_lock_nesting != 1) {
-		--t->rcu_read_lock_nesting;
-	} else {
-		barrier();  /* critical section before exit code. */
-		t->rcu_read_lock_nesting = INT_MIN;
-		barrier();  /* assign before ->rcu_read_unlock_special load */
-		if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s)))
-			rcu_read_unlock_special(t);
-		barrier();  /* ->rcu_read_unlock_special load before assign */
-		t->rcu_read_lock_nesting = 0;
-	}
-#ifdef CONFIG_PROVE_LOCKING
-	{
-		int rrln = READ_ONCE(t->rcu_read_lock_nesting);
-
-		WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
-	}
-#endif /* #ifdef CONFIG_PROVE_LOCKING */
-}
-EXPORT_SYMBOL_GPL(__rcu_read_unlock);
-
-#endif /* #ifdef CONFIG_PREEMPT_RCU */
-
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 static struct lock_class_key rcu_lock_key;
 struct lockdep_map rcu_lock_map =
@@ -624,7 +576,7 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks);
  * grace period has elapsed, in other words after all currently
  * executing rcu-tasks read-side critical sections have elapsed.  These
  * read-side critical sections are delimited by calls to schedule(),
- * cond_resched_rcu_qs(), idle execution, userspace execution, calls
+ * cond_resched_tasks_rcu_qs(), idle execution, userspace execution, calls
  * to synchronize_rcu_tasks(), and (in theory, anyway) cond_resched().
  *
  * This is a very specialized primitive, intended only for a few uses in
diff --git a/kernel/resource.c b/kernel/resource.c
index 2af6c03..b589dda 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -87,7 +87,7 @@ enum { MAX_IORES_LEVEL = 5 };
 static void *r_start(struct seq_file *m, loff_t *pos)
 	__acquires(resource_lock)
 {
-	struct resource *p = m->private;
+	struct resource *p = PDE_DATA(file_inode(m->file));
 	loff_t l = 0;
 	read_lock(&resource_lock);
 	for (p = p->child; p && l < *pos; p = r_next(m, p, &l))
@@ -103,7 +103,7 @@ static void r_stop(struct seq_file *m, void *v)
 
 static int r_show(struct seq_file *m, void *v)
 {
-	struct resource *root = m->private;
+	struct resource *root = PDE_DATA(file_inode(m->file));
 	struct resource *r = v, *p;
 	unsigned long long start, end;
 	int width = root->end < 0x10000 ? 4 : 8;
@@ -135,44 +135,11 @@ static const struct seq_operations resource_op = {
 	.show	= r_show,
 };
 
-static int ioports_open(struct inode *inode, struct file *file)
-{
-	int res = seq_open(file, &resource_op);
-	if (!res) {
-		struct seq_file *m = file->private_data;
-		m->private = &ioport_resource;
-	}
-	return res;
-}
-
-static int iomem_open(struct inode *inode, struct file *file)
-{
-	int res = seq_open(file, &resource_op);
-	if (!res) {
-		struct seq_file *m = file->private_data;
-		m->private = &iomem_resource;
-	}
-	return res;
-}
-
-static const struct file_operations proc_ioports_operations = {
-	.open		= ioports_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
-static const struct file_operations proc_iomem_operations = {
-	.open		= iomem_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
 static int __init ioresources_init(void)
 {
-	proc_create("ioports", 0, NULL, &proc_ioports_operations);
-	proc_create("iomem", 0, NULL, &proc_iomem_operations);
+	proc_create_seq_data("ioports", 0, NULL, &resource_op,
+			&ioport_resource);
+	proc_create_seq_data("iomem", 0, NULL, &resource_op, &iomem_resource);
 	return 0;
 }
 __initcall(ioresources_init);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 211890e..e9866f8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2194,27 +2194,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 	INIT_HLIST_HEAD(&p->preempt_notifiers);
 #endif
 
-#ifdef CONFIG_NUMA_BALANCING
-	if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
-		p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
-		p->mm->numa_scan_seq = 0;
-	}
-
-	if (clone_flags & CLONE_VM)
-		p->numa_preferred_nid = current->numa_preferred_nid;
-	else
-		p->numa_preferred_nid = -1;
-
-	p->node_stamp = 0ULL;
-	p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
-	p->numa_scan_period = sysctl_numa_balancing_scan_delay;
-	p->numa_work.next = &p->numa_work;
-	p->numa_faults = NULL;
-	p->last_task_numa_placement = 0;
-	p->last_sum_exec_runtime = 0;
-
-	p->numa_group = NULL;
-#endif /* CONFIG_NUMA_BALANCING */
+	init_numa_balancing(clone_flags, p);
 }
 
 DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
@@ -4050,6 +4030,23 @@ int idle_cpu(int cpu)
 }
 
 /**
+ * available_idle_cpu - is a given CPU idle for enqueuing work.
+ * @cpu: the CPU in question.
+ *
+ * Return: 1 if the CPU is currently idle. 0 otherwise.
+ */
+int available_idle_cpu(int cpu)
+{
+	if (!idle_cpu(cpu))
+		return 0;
+
+	if (vcpu_is_preempted(cpu))
+		return 0;
+
+	return 1;
+}
+
+/**
  * idle_task - return the idle task for a given CPU.
  * @cpu: the processor in question.
  *
@@ -5025,20 +5022,6 @@ int __cond_resched_lock(spinlock_t *lock)
 }
 EXPORT_SYMBOL(__cond_resched_lock);
 
-int __sched __cond_resched_softirq(void)
-{
-	BUG_ON(!in_softirq());
-
-	if (should_resched(SOFTIRQ_DISABLE_OFFSET)) {
-		local_bh_enable();
-		preempt_schedule_common();
-		local_bh_disable();
-		return 1;
-	}
-	return 0;
-}
-EXPORT_SYMBOL(__cond_resched_softirq);
-
 /**
  * yield - yield the current processor to other threads.
  *
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index e13df95..28592b6 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -183,22 +183,21 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu)
 static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)
 {
 	struct rq *rq = cpu_rq(sg_cpu->cpu);
-	unsigned long util;
 
-	if (rq->rt.rt_nr_running) {
-		util = sg_cpu->max;
-	} else {
-		util = sg_cpu->util_dl;
-		if (rq->cfs.h_nr_running)
-			util += sg_cpu->util_cfs;
-	}
+	if (rq->rt.rt_nr_running)
+		return sg_cpu->max;
 
 	/*
+	 * Utilization required by DEADLINE must always be granted while, for
+	 * FAIR, we use blocked utilization of IDLE CPUs as a mechanism to
+	 * gracefully reduce the frequency when no tasks show up for longer
+	 * periods of time.
+	 *
 	 * Ideally we would like to set util_dl as min/guaranteed freq and
 	 * util_cfs + util_dl as requested freq. However, cpufreq is not yet
 	 * ready for such an interface. So, we only do the latter for now.
 	 */
-	return min(util, sg_cpu->max);
+	return min(sg_cpu->max, (sg_cpu->util_dl + sg_cpu->util_cfs));
 }
 
 static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, unsigned int flags)
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 15b10e2..e593b41 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -823,35 +823,9 @@ static const struct seq_operations sched_debug_sops = {
 	.show		= sched_debug_show,
 };
 
-static int sched_debug_release(struct inode *inode, struct file *file)
-{
-	seq_release(inode, file);
-
-	return 0;
-}
-
-static int sched_debug_open(struct inode *inode, struct file *filp)
-{
-	int ret = 0;
-
-	ret = seq_open(filp, &sched_debug_sops);
-
-	return ret;
-}
-
-static const struct file_operations sched_debug_fops = {
-	.open		= sched_debug_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= sched_debug_release,
-};
-
 static int __init init_sched_debug_procfs(void)
 {
-	struct proc_dir_entry *pe;
-
-	pe = proc_create("sched_debug", 0444, NULL, &sched_debug_fops);
-	if (!pe)
+	if (!proc_create_seq("sched_debug", 0444, NULL, &sched_debug_sops))
 		return -ENOMEM;
 	return 0;
 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 79f574d..e497c05 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1139,6 +1139,47 @@ static unsigned int task_scan_max(struct task_struct *p)
 	return max(smin, smax);
 }
 
+void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
+{
+	int mm_users = 0;
+	struct mm_struct *mm = p->mm;
+
+	if (mm) {
+		mm_users = atomic_read(&mm->mm_users);
+		if (mm_users == 1) {
+			mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
+			mm->numa_scan_seq = 0;
+		}
+	}
+	p->node_stamp			= 0;
+	p->numa_scan_seq		= mm ? mm->numa_scan_seq : 0;
+	p->numa_scan_period		= sysctl_numa_balancing_scan_delay;
+	p->numa_work.next		= &p->numa_work;
+	p->numa_faults			= NULL;
+	p->numa_group			= NULL;
+	p->last_task_numa_placement	= 0;
+	p->last_sum_exec_runtime	= 0;
+
+	/* New address space, reset the preferred nid */
+	if (!(clone_flags & CLONE_VM)) {
+		p->numa_preferred_nid = -1;
+		return;
+	}
+
+	/*
+	 * New thread, keep existing numa_preferred_nid which should be copied
+	 * already by arch_dup_task_struct but stagger when scans start.
+	 */
+	if (mm) {
+		unsigned int delay;
+
+		delay = min_t(unsigned int, task_scan_max(current),
+			current->numa_scan_period * mm_users * NSEC_PER_MSEC);
+		delay += 2 * TICK_NSEC;
+		p->node_stamp = delay;
+	}
+}
+
 static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
 {
 	rq->nr_numa_running += (p->numa_preferred_nid != -1);
@@ -5345,6 +5386,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	struct sched_entity *se = &p->se;
 
 	/*
+	 * The code below (indirectly) updates schedutil which looks at
+	 * the cfs_rq utilization to select a frequency.
+	 * Let's add the task's estimated utilization to the cfs_rq's
+	 * estimated utilization, before we update schedutil.
+	 */
+	util_est_enqueue(&rq->cfs, p);
+
+	/*
 	 * If in_iowait is set, the code below may not trigger any cpufreq
 	 * utilization updates, so do it here explicitly with the IOWAIT flag
 	 * passed.
@@ -5385,7 +5434,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	if (!se)
 		add_nr_running(rq, 1);
 
-	util_est_enqueue(&rq->cfs, p);
 	hrtick_update(rq);
 }
 
@@ -5858,8 +5906,8 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync)
 	 * a cpufreq perspective, it's better to have higher utilisation
 	 * on one CPU.
 	 */
-	if (idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
-		return idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
+	if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
+		return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
 
 	if (sync && cpu_rq(this_cpu)->nr_running == 1)
 		return this_cpu;
@@ -6102,7 +6150,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
 
 	/* Traverse only the allowed CPUs */
 	for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) {
-		if (idle_cpu(i)) {
+		if (available_idle_cpu(i)) {
 			struct rq *rq = cpu_rq(i);
 			struct cpuidle_state *idle = idle_get_state(rq);
 			if (idle && idle->exit_latency < min_exit_latency) {
@@ -6144,6 +6192,13 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
 	if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
 		return prev_cpu;
 
+	/*
+	 * We need task's util for capacity_spare_wake, sync it up to prev_cpu's
+	 * last_update_time.
+	 */
+	if (!(sd_flag & SD_BALANCE_FORK))
+		sync_entity_load_avg(&p->se);
+
 	while (sd) {
 		struct sched_group *group;
 		struct sched_domain *tmp;
@@ -6224,7 +6279,7 @@ void __update_idle_core(struct rq *rq)
 		if (cpu == core)
 			continue;
 
-		if (!idle_cpu(cpu))
+		if (!available_idle_cpu(cpu))
 			goto unlock;
 	}
 
@@ -6256,7 +6311,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
 
 		for_each_cpu(cpu, cpu_smt_mask(core)) {
 			cpumask_clear_cpu(cpu, cpus);
-			if (!idle_cpu(cpu))
+			if (!available_idle_cpu(cpu))
 				idle = false;
 		}
 
@@ -6285,7 +6340,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
 	for_each_cpu(cpu, cpu_smt_mask(target)) {
 		if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
 			continue;
-		if (idle_cpu(cpu))
+		if (available_idle_cpu(cpu))
 			return cpu;
 	}
 
@@ -6348,7 +6403,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
 			return -1;
 		if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
 			continue;
-		if (idle_cpu(cpu))
+		if (available_idle_cpu(cpu))
 			break;
 	}
 
@@ -6368,13 +6423,13 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	struct sched_domain *sd;
 	int i, recent_used_cpu;
 
-	if (idle_cpu(target))
+	if (available_idle_cpu(target))
 		return target;
 
 	/*
 	 * If the previous CPU is cache affine and idle, don't be stupid:
 	 */
-	if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
+	if (prev != target && cpus_share_cache(prev, target) && available_idle_cpu(prev))
 		return prev;
 
 	/* Check a recently used CPU as a potential idle candidate: */
@@ -6382,7 +6437,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	if (recent_used_cpu != prev &&
 	    recent_used_cpu != target &&
 	    cpus_share_cache(recent_used_cpu, target) &&
-	    idle_cpu(recent_used_cpu) &&
+	    available_idle_cpu(recent_used_cpu) &&
 	    cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
 		/*
 		 * Replace recent_used_cpu with prev as it is a potential
@@ -6558,7 +6613,7 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
 static int
 select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
 {
-	struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
+	struct sched_domain *tmp, *sd = NULL;
 	int cpu = smp_processor_id();
 	int new_cpu = prev_cpu;
 	int want_affine = 0;
@@ -6581,7 +6636,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 		 */
 		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
 		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
-			affine_sd = tmp;
+			if (cpu != prev_cpu)
+				new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync);
+
+			sd = NULL; /* Prefer wake_affine over balance flags */
 			break;
 		}
 
@@ -6591,33 +6649,16 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 			break;
 	}
 
-	if (affine_sd) {
-		sd = NULL; /* Prefer wake_affine over balance flags */
-		if (cpu == prev_cpu)
-			goto pick_cpu;
-
-		new_cpu = wake_affine(affine_sd, p, cpu, prev_cpu, sync);
-	}
-
-	if (sd && !(sd_flag & SD_BALANCE_FORK)) {
-		/*
-		 * We're going to need the task's util for capacity_spare_wake
-		 * in find_idlest_group. Sync it up to prev_cpu's
-		 * last_update_time.
-		 */
-		sync_entity_load_avg(&p->se);
-	}
+	if (unlikely(sd)) {
+		/* Slow path */
+		new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
+	} else if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */
+		/* Fast path */
 
-	if (!sd) {
-pick_cpu:
-		if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */
-			new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
+		new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
 
-			if (want_affine)
-				current->recent_used_cpu = cpu;
-		}
-	} else {
-		new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
+		if (want_affine)
+			current->recent_used_cpu = cpu;
 	}
 	rcu_read_unlock();
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index cb467c22..6601baf 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1069,6 +1069,12 @@ enum numa_faults_stats {
 extern void sched_setnuma(struct task_struct *p, int node);
 extern int migrate_task_to(struct task_struct *p, int cpu);
 extern int migrate_swap(struct task_struct *, struct task_struct *);
+extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p);
+#else
+static inline void
+init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
+{
+}
 #endif /* CONFIG_NUMA_BALANCING */
 
 #ifdef CONFIG_SMP
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index ab112cb..750fb3c 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -120,22 +120,9 @@ static const struct seq_operations schedstat_sops = {
 	.show  = show_schedstat,
 };
 
-static int schedstat_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &schedstat_sops);
-}
-
-static const struct file_operations proc_schedstat_operations = {
-	.open    = schedstat_open,
-	.read    = seq_read,
-	.llseek  = seq_lseek,
-	.release = seq_release,
-};
-
 static int __init proc_schedstat_init(void)
 {
-	proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
-
+	proc_create_seq("schedstat", 0, NULL, &schedstat_sops);
 	return 0;
 }
 subsys_initcall(proc_schedstat_init);
diff --git a/kernel/signal.c b/kernel/signal.c
index 9c33163..0f865d6 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1539,7 +1539,6 @@ int send_sig_fault(int sig, int code, void __user *addr
 	return send_sig_info(info.si_signo, &info, t);
 }
 
-#if defined(BUS_MCEERR_AO) && defined(BUS_MCEERR_AR)
 int force_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t)
 {
 	struct siginfo info;
@@ -1568,9 +1567,7 @@ int send_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *
 	return send_sig_info(info.si_signo, &info, t);
 }
 EXPORT_SYMBOL(send_sig_mceerr);
-#endif
 
-#ifdef SEGV_BNDERR
 int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper)
 {
 	struct siginfo info;
@@ -1584,7 +1581,6 @@ int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper)
 	info.si_upper = upper;
 	return force_sig_info(info.si_signo, &info, current);
 }
-#endif
 
 #ifdef SEGV_PKUERR
 int force_sig_pkuerr(void __user *addr, u32 pkey)
@@ -2837,8 +2833,19 @@ enum siginfo_layout siginfo_layout(int sig, int si_code)
 			[SIGPOLL] = { NSIGPOLL, SIL_POLL },
 			[SIGSYS]  = { NSIGSYS,  SIL_SYS },
 		};
-		if ((sig < ARRAY_SIZE(filter)) && (si_code <= filter[sig].limit))
+		if ((sig < ARRAY_SIZE(filter)) && (si_code <= filter[sig].limit)) {
 			layout = filter[sig].layout;
+			/* Handle the exceptions */
+			if ((sig == SIGBUS) &&
+			    (si_code >= BUS_MCEERR_AR) && (si_code <= BUS_MCEERR_AO))
+				layout = SIL_FAULT_MCEERR;
+			else if ((sig == SIGSEGV) && (si_code == SEGV_BNDERR))
+				layout = SIL_FAULT_BNDERR;
+#ifdef SEGV_PKUERR
+			else if ((sig == SIGSEGV) && (si_code == SEGV_PKUERR))
+				layout = SIL_FAULT_PKUERR;
+#endif
+		}
 		else if (si_code <= NSIGPOLL)
 			layout = SIL_POLL;
 	} else {
@@ -2848,104 +2855,15 @@ enum siginfo_layout siginfo_layout(int sig, int si_code)
 			layout = SIL_POLL;
 		else if (si_code < 0)
 			layout = SIL_RT;
-		/* Tests to support buggy kernel ABIs */
-#ifdef TRAP_FIXME
-		if ((sig == SIGTRAP) && (si_code == TRAP_FIXME))
-			layout = SIL_FAULT;
-#endif
-#ifdef FPE_FIXME
-		if ((sig == SIGFPE) && (si_code == FPE_FIXME))
-			layout = SIL_FAULT;
-#endif
 	}
 	return layout;
 }
 
 int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
 {
-	int err;
-
-	if (!access_ok (VERIFY_WRITE, to, sizeof(siginfo_t)))
+	if (copy_to_user(to, from , sizeof(struct siginfo)))
 		return -EFAULT;
-	if (from->si_code < 0)
-		return __copy_to_user(to, from, sizeof(siginfo_t))
-			? -EFAULT : 0;
-	/*
-	 * If you change siginfo_t structure, please be sure
-	 * this code is fixed accordingly.
-	 * Please remember to update the signalfd_copyinfo() function
-	 * inside fs/signalfd.c too, in case siginfo_t changes.
-	 * It should never copy any pad contained in the structure
-	 * to avoid security leaks, but must copy the generic
-	 * 3 ints plus the relevant union member.
-	 */
-	err = __put_user(from->si_signo, &to->si_signo);
-	err |= __put_user(from->si_errno, &to->si_errno);
-	err |= __put_user(from->si_code, &to->si_code);
-	switch (siginfo_layout(from->si_signo, from->si_code)) {
-	case SIL_KILL:
-		err |= __put_user(from->si_pid, &to->si_pid);
-		err |= __put_user(from->si_uid, &to->si_uid);
-		break;
-	case SIL_TIMER:
-		/* Unreached SI_TIMER is negative */
-		break;
-	case SIL_POLL:
-		err |= __put_user(from->si_band, &to->si_band);
-		err |= __put_user(from->si_fd, &to->si_fd);
-		break;
-	case SIL_FAULT:
-		err |= __put_user(from->si_addr, &to->si_addr);
-#ifdef __ARCH_SI_TRAPNO
-		err |= __put_user(from->si_trapno, &to->si_trapno);
-#endif
-#ifdef __ia64__
-		err |= __put_user(from->si_imm, &to->si_imm);
-		err |= __put_user(from->si_flags, &to->si_flags);
-		err |= __put_user(from->si_isr, &to->si_isr);
-#endif
-		/*
-		 * Other callers might not initialize the si_lsb field,
-		 * so check explicitly for the right codes here.
-		 */
-#ifdef BUS_MCEERR_AR
-		if (from->si_signo == SIGBUS && from->si_code == BUS_MCEERR_AR)
-			err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb);
-#endif
-#ifdef BUS_MCEERR_AO
-		if (from->si_signo == SIGBUS && from->si_code == BUS_MCEERR_AO)
-			err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb);
-#endif
-#ifdef SEGV_BNDERR
-		if (from->si_signo == SIGSEGV && from->si_code == SEGV_BNDERR) {
-			err |= __put_user(from->si_lower, &to->si_lower);
-			err |= __put_user(from->si_upper, &to->si_upper);
-		}
-#endif
-#ifdef SEGV_PKUERR
-		if (from->si_signo == SIGSEGV && from->si_code == SEGV_PKUERR)
-			err |= __put_user(from->si_pkey, &to->si_pkey);
-#endif
-		break;
-	case SIL_CHLD:
-		err |= __put_user(from->si_pid, &to->si_pid);
-		err |= __put_user(from->si_uid, &to->si_uid);
-		err |= __put_user(from->si_status, &to->si_status);
-		err |= __put_user(from->si_utime, &to->si_utime);
-		err |= __put_user(from->si_stime, &to->si_stime);
-		break;
-	case SIL_RT:
-		err |= __put_user(from->si_pid, &to->si_pid);
-		err |= __put_user(from->si_uid, &to->si_uid);
-		err |= __put_user(from->si_ptr, &to->si_ptr);
-		break;
-	case SIL_SYS:
-		err |= __put_user(from->si_call_addr, &to->si_call_addr);
-		err |= __put_user(from->si_syscall, &to->si_syscall);
-		err |= __put_user(from->si_arch, &to->si_arch);
-		break;
-	}
-	return err;
+	return 0;
 }
 
 #ifdef CONFIG_COMPAT
@@ -2984,27 +2902,28 @@ int __copy_siginfo_to_user32(struct compat_siginfo __user *to,
 #ifdef __ARCH_SI_TRAPNO
 		new.si_trapno = from->si_trapno;
 #endif
-#ifdef BUS_MCEERR_AR
-		if ((from->si_signo == SIGBUS) && (from->si_code == BUS_MCEERR_AR))
-			new.si_addr_lsb = from->si_addr_lsb;
-#endif
-#ifdef BUS_MCEERR_AO
-		if ((from->si_signo == SIGBUS) && (from->si_code == BUS_MCEERR_AO))
-			new.si_addr_lsb = from->si_addr_lsb;
+		break;
+	case SIL_FAULT_MCEERR:
+		new.si_addr = ptr_to_compat(from->si_addr);
+#ifdef __ARCH_SI_TRAPNO
+		new.si_trapno = from->si_trapno;
 #endif
-#ifdef SEGV_BNDERR
-		if ((from->si_signo == SIGSEGV) &&
-		    (from->si_code == SEGV_BNDERR)) {
-			new.si_lower = ptr_to_compat(from->si_lower);
-			new.si_upper = ptr_to_compat(from->si_upper);
-		}
+		new.si_addr_lsb = from->si_addr_lsb;
+		break;
+	case SIL_FAULT_BNDERR:
+		new.si_addr = ptr_to_compat(from->si_addr);
+#ifdef __ARCH_SI_TRAPNO
+		new.si_trapno = from->si_trapno;
 #endif
-#ifdef SEGV_PKUERR
-		if ((from->si_signo == SIGSEGV) &&
-		    (from->si_code == SEGV_PKUERR))
-			new.si_pkey = from->si_pkey;
+		new.si_lower = ptr_to_compat(from->si_lower);
+		new.si_upper = ptr_to_compat(from->si_upper);
+		break;
+	case SIL_FAULT_PKUERR:
+		new.si_addr = ptr_to_compat(from->si_addr);
+#ifdef __ARCH_SI_TRAPNO
+		new.si_trapno = from->si_trapno;
 #endif
-
+		new.si_pkey = from->si_pkey;
 		break;
 	case SIL_CHLD:
 		new.si_pid    = from->si_pid;
@@ -3070,24 +2989,28 @@ int copy_siginfo_from_user32(struct siginfo *to,
 #ifdef __ARCH_SI_TRAPNO
 		to->si_trapno = from.si_trapno;
 #endif
-#ifdef BUS_MCEERR_AR
-		if ((from.si_signo == SIGBUS) && (from.si_code == BUS_MCEERR_AR))
-			to->si_addr_lsb = from.si_addr_lsb;
-#endif
-#ifdef BUS_MCEER_AO
-		if ((from.si_signo == SIGBUS) && (from.si_code == BUS_MCEERR_AO))
-			to->si_addr_lsb = from.si_addr_lsb;
+		break;
+	case SIL_FAULT_MCEERR:
+		to->si_addr = compat_ptr(from.si_addr);
+#ifdef __ARCH_SI_TRAPNO
+		to->si_trapno = from.si_trapno;
 #endif
-#ifdef SEGV_BNDERR
-		if ((from.si_signo == SIGSEGV) && (from.si_code == SEGV_BNDERR)) {
-			to->si_lower = compat_ptr(from.si_lower);
-			to->si_upper = compat_ptr(from.si_upper);
-		}
+		to->si_addr_lsb = from.si_addr_lsb;
+		break;
+	case SIL_FAULT_BNDERR:
+		to->si_addr = compat_ptr(from.si_addr);
+#ifdef __ARCH_SI_TRAPNO
+		to->si_trapno = from.si_trapno;
 #endif
-#ifdef SEGV_PKUERR
-		if ((from.si_signo == SIGSEGV) && (from.si_code == SEGV_PKUERR))
-			to->si_pkey = from.si_pkey;
+		to->si_lower = compat_ptr(from.si_lower);
+		to->si_upper = compat_ptr(from.si_upper);
+		break;
+	case SIL_FAULT_PKUERR:
+		to->si_addr = compat_ptr(from.si_addr);
+#ifdef __ARCH_SI_TRAPNO
+		to->si_trapno = from.si_trapno;
 #endif
+		to->si_pkey = from.si_pkey;
 		break;
 	case SIL_CHLD:
 		to->si_pid    = from.si_pid;
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 177de36..03981f1 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -145,8 +145,7 @@ static void __local_bh_enable(unsigned int cnt)
 }
 
 /*
- * Special-case - softirqs can safely be enabled in
- * cond_resched_softirq(), or by __do_softirq(),
+ * Special-case - softirqs can safely be enabled by __do_softirq(),
  * without processing still-pending softirqs:
  */
 void _local_bh_enable(void)
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 64c0291..f89014a 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -37,7 +37,7 @@ struct cpu_stop_done {
 struct cpu_stopper {
 	struct task_struct	*thread;
 
-	spinlock_t		lock;
+	raw_spinlock_t		lock;
 	bool			enabled;	/* is this stopper enabled? */
 	struct list_head	works;		/* list of pending works */
 
@@ -81,13 +81,13 @@ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
 	unsigned long flags;
 	bool enabled;
 
-	spin_lock_irqsave(&stopper->lock, flags);
+	raw_spin_lock_irqsave(&stopper->lock, flags);
 	enabled = stopper->enabled;
 	if (enabled)
 		__cpu_stop_queue_work(stopper, work, &wakeq);
 	else if (work->done)
 		cpu_stop_signal_done(work->done);
-	spin_unlock_irqrestore(&stopper->lock, flags);
+	raw_spin_unlock_irqrestore(&stopper->lock, flags);
 
 	wake_up_q(&wakeq);
 
@@ -237,8 +237,8 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
 	DEFINE_WAKE_Q(wakeq);
 	int err;
 retry:
-	spin_lock_irq(&stopper1->lock);
-	spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
+	raw_spin_lock_irq(&stopper1->lock);
+	raw_spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
 
 	err = -ENOENT;
 	if (!stopper1->enabled || !stopper2->enabled)
@@ -261,8 +261,8 @@ retry:
 	__cpu_stop_queue_work(stopper1, work1, &wakeq);
 	__cpu_stop_queue_work(stopper2, work2, &wakeq);
 unlock:
-	spin_unlock(&stopper2->lock);
-	spin_unlock_irq(&stopper1->lock);
+	raw_spin_unlock(&stopper2->lock);
+	raw_spin_unlock_irq(&stopper1->lock);
 
 	if (unlikely(err == -EDEADLK)) {
 		while (stop_cpus_in_progress)
@@ -457,9 +457,9 @@ static int cpu_stop_should_run(unsigned int cpu)
 	unsigned long flags;
 	int run;
 
-	spin_lock_irqsave(&stopper->lock, flags);
+	raw_spin_lock_irqsave(&stopper->lock, flags);
 	run = !list_empty(&stopper->works);
-	spin_unlock_irqrestore(&stopper->lock, flags);
+	raw_spin_unlock_irqrestore(&stopper->lock, flags);
 	return run;
 }
 
@@ -470,13 +470,13 @@ static void cpu_stopper_thread(unsigned int cpu)
 
 repeat:
 	work = NULL;
-	spin_lock_irq(&stopper->lock);
+	raw_spin_lock_irq(&stopper->lock);
 	if (!list_empty(&stopper->works)) {
 		work = list_first_entry(&stopper->works,
 					struct cpu_stop_work, list);
 		list_del_init(&work->list);
 	}
-	spin_unlock_irq(&stopper->lock);
+	raw_spin_unlock_irq(&stopper->lock);
 
 	if (work) {
 		cpu_stop_fn_t fn = work->fn;
@@ -550,7 +550,7 @@ static int __init cpu_stop_init(void)
 	for_each_possible_cpu(cpu) {
 		struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
 
-		spin_lock_init(&stopper->lock);
+		raw_spin_lock_init(&stopper->lock);
 		INIT_LIST_HEAD(&stopper->works);
 	}
 
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 9791364..183169c 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -43,7 +43,9 @@ COND_SYSCALL(io_submit);
 COND_SYSCALL_COMPAT(io_submit);
 COND_SYSCALL(io_cancel);
 COND_SYSCALL(io_getevents);
+COND_SYSCALL(io_pgetevents);
 COND_SYSCALL_COMPAT(io_getevents);
+COND_SYSCALL_COMPAT(io_pgetevents);
 
 /* fs/xattr.c */
 
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 0ed768b..675c4e9 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -372,24 +372,12 @@ static const struct seq_operations timer_list_sops = {
 	.show = timer_list_show,
 };
 
-static int timer_list_open(struct inode *inode, struct file *filp)
-{
-	return seq_open_private(filp, &timer_list_sops,
-			sizeof(struct timer_list_iter));
-}
-
-static const struct file_operations timer_list_fops = {
-	.open		= timer_list_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release_private,
-};
-
 static int __init init_timer_list_procfs(void)
 {
 	struct proc_dir_entry *pe;
 
-	pe = proc_create("timer_list", 0400, NULL, &timer_list_fops);
+	pe = proc_create_seq_private("timer_list", 0400, NULL, &timer_list_sops,
+			sizeof(struct timer_list_iter), NULL);
 	if (!pe)
 		return -ENOMEM;
 	return 0;
diff --git a/kernel/torture.c b/kernel/torture.c
index 37b9401..3de1efb 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -574,7 +574,7 @@ void stutter_wait(const char *title)
 {
 	int spt;
 
-	cond_resched_rcu_qs();
+	cond_resched_tasks_rcu_qs();
 	spt = READ_ONCE(stutter_pause_test);
 	for (; spt; spt = READ_ONCE(stutter_pause_test)) {
 		if (spt == 1) {
diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c
index 22fee76..80e0b2a 100644
--- a/kernel/trace/trace_benchmark.c
+++ b/kernel/trace/trace_benchmark.c
@@ -159,13 +159,13 @@ static int benchmark_event_kthread(void *arg)
 		 * wants to run, schedule in, but if the CPU is idle,
 		 * we'll keep burning cycles.
 		 *
-		 * Note the _rcu_qs() version of cond_resched() will
+		 * Note the tasks_rcu_qs() version of cond_resched() will
 		 * notify synchronize_rcu_tasks() that this thread has
 		 * passed a quiescent state for rcu_tasks. Otherwise
 		 * this thread will never voluntarily schedule which would
 		 * block synchronize_rcu_tasks() indefinitely.
 		 */
-		cond_resched();
+		cond_resched_tasks_rcu_qs();
 	}
 
 	return 0;
diff --git a/lib/Kconfig b/lib/Kconfig
index 5fe5776..7a91393 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -429,15 +429,50 @@ config SGL_ALLOC
 	bool
 	default n
 
+config NEED_SG_DMA_LENGTH
+	bool
+
+config NEED_DMA_MAP_STATE
+	bool
+
+config ARCH_DMA_ADDR_T_64BIT
+	def_bool 64BIT || PHYS_ADDR_T_64BIT
+
+config IOMMU_HELPER
+	bool
+
+config ARCH_HAS_SYNC_DMA_FOR_DEVICE
+	bool
+
+config ARCH_HAS_SYNC_DMA_FOR_CPU
+	bool
+	select NEED_DMA_MAP_STATE
+
 config DMA_DIRECT_OPS
 	bool
-	depends on HAS_DMA && (!64BIT || ARCH_DMA_ADDR_T_64BIT)
-	default n
+	depends on HAS_DMA
+
+config DMA_NONCOHERENT_OPS
+	bool
+	depends on HAS_DMA
+	select DMA_DIRECT_OPS
+
+config DMA_NONCOHERENT_MMAP
+	bool
+	depends on DMA_NONCOHERENT_OPS
+
+config DMA_NONCOHERENT_CACHE_SYNC
+	bool
+	depends on DMA_NONCOHERENT_OPS
 
 config DMA_VIRT_OPS
 	bool
-	depends on HAS_DMA && (!64BIT || ARCH_DMA_ADDR_T_64BIT)
-	default n
+	depends on HAS_DMA
+
+config SWIOTLB
+	bool
+	select DMA_DIRECT_OPS
+	select NEED_DMA_MAP_STATE
 
 config CHECK_SIGNATURE
 	bool
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index c40c7b7..7655547 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1634,7 +1634,7 @@ config PROVIDE_OHCI1394_DMA_INIT
 
 config DMA_API_DEBUG
 	bool "Enable debugging of DMA-API usage"
-	depends on HAVE_DMA_API_DEBUG
+	select NEED_DMA_MAP_STATE
 	help
 	  Enable this option to debug the use of the DMA API by device drivers.
 	  With this option you will be able to detect common bugs in device
@@ -1651,6 +1651,23 @@ config DMA_API_DEBUG
 
 	  If unsure, say N.
 
+config DMA_API_DEBUG_SG
+	bool "Debug DMA scatter-gather usage"
+	default y
+	depends on DMA_API_DEBUG
+	help
+	  Perform extra checking that callers of dma_map_sg() have respected the
+	  appropriate segment length/boundary limits for the given device when
+	  preparing DMA scatterlists.
+
+	  This is particularly likely to have been overlooked in cases where the
+	  dma_map_sg() API is used for general bulk mapping of pages rather than
+	  preparing literal scatter-gather descriptors, where there is a risk of
+	  unexpected behaviour from DMA API implementations if the scatterlist
+	  is technically out-of-spec.
+
+	  If unsure, say N.
+
 menuconfig RUNTIME_TESTING_MENU
 	bool "Runtime Testing"
 	def_bool y
diff --git a/lib/Makefile b/lib/Makefile
index ce20696..9f18c81 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -30,6 +30,7 @@ lib-$(CONFIG_PRINTK) += dump_stack.o
 lib-$(CONFIG_MMU) += ioremap.o
 lib-$(CONFIG_SMP) += cpumask.o
 lib-$(CONFIG_DMA_DIRECT_OPS) += dma-direct.o
+lib-$(CONFIG_DMA_NONCOHERENT_OPS) += dma-noncoherent.o
 lib-$(CONFIG_DMA_VIRT_OPS) += dma-virt.o
 
 lib-y	+= kobject.o klist.o
@@ -147,7 +148,7 @@ obj-$(CONFIG_AUDIT_GENERIC) += audit.o
 obj-$(CONFIG_AUDIT_COMPAT_GENERIC) += compat_audit.o
 
 obj-$(CONFIG_SWIOTLB) += swiotlb.o
-obj-$(CONFIG_IOMMU_HELPER) += iommu-helper.o iommu-common.o
+obj-$(CONFIG_IOMMU_HELPER) += iommu-helper.o
 obj-$(CONFIG_FAULT_INJECTION) += fault-inject.o
 obj-$(CONFIG_NOTIFIER_ERROR_INJECTION) += notifier-error-inject.o
 obj-$(CONFIG_PM_NOTIFIER_ERROR_INJECT) += pm-notifier-error-inject.o
diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index 7f5cdc1..c007d25 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -41,6 +41,11 @@
 #define HASH_FN_SHIFT   13
 #define HASH_FN_MASK    (HASH_SIZE - 1)
 
+/* allow architectures to override this if absolutely required */
+#ifndef PREALLOC_DMA_DEBUG_ENTRIES
+#define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16)
+#endif
+
 enum {
 	dma_debug_single,
 	dma_debug_page,
@@ -127,7 +132,7 @@ static u32 min_free_entries;
 static u32 nr_total_entries;
 
 /* number of preallocated entries requested by kernel cmdline */
-static u32 req_entries;
+static u32 nr_prealloc_entries = PREALLOC_DMA_DEBUG_ENTRIES;
 
 /* debugfs dentry's for the stuff above */
 static struct dentry *dma_debug_dent        __read_mostly;
@@ -439,7 +444,6 @@ void debug_dma_dump_mappings(struct device *dev)
 		spin_unlock_irqrestore(&bucket->lock, flags);
 	}
 }
-EXPORT_SYMBOL(debug_dma_dump_mappings);
 
 /*
  * For each mapping (initial cacheline in the case of
@@ -748,7 +752,6 @@ int dma_debug_resize_entries(u32 num_entries)
 
 	return ret;
 }
-EXPORT_SYMBOL(dma_debug_resize_entries);
 
 /*
  * DMA-API debugging init code
@@ -1004,10 +1007,7 @@ void dma_debug_add_bus(struct bus_type *bus)
 	bus_register_notifier(bus, nb);
 }
 
-/*
- * Let the architectures decide how many entries should be preallocated.
- */
-void dma_debug_init(u32 num_entries)
+static int dma_debug_init(void)
 {
 	int i;
 
@@ -1015,7 +1015,7 @@ void dma_debug_init(u32 num_entries)
 	 * called to set dma_debug_initialized
 	 */
 	if (global_disable)
-		return;
+		return 0;
 
 	for (i = 0; i < HASH_SIZE; ++i) {
 		INIT_LIST_HEAD(&dma_entry_hash[i].list);
@@ -1026,17 +1026,14 @@ void dma_debug_init(u32 num_entries)
 		pr_err("DMA-API: error creating debugfs entries - disabling\n");
 		global_disable = true;
 
-		return;
+		return 0;
 	}
 
-	if (req_entries)
-		num_entries = req_entries;
-
-	if (prealloc_memory(num_entries) != 0) {
+	if (prealloc_memory(nr_prealloc_entries) != 0) {
 		pr_err("DMA-API: debugging out of memory error - disabled\n");
 		global_disable = true;
 
-		return;
+		return 0;
 	}
 
 	nr_total_entries = num_free_entries;
@@ -1044,7 +1041,9 @@ void dma_debug_init(u32 num_entries)
 	dma_debug_initialized = true;
 
 	pr_info("DMA-API: debugging enabled by kernel config\n");
+	return 0;
 }
+core_initcall(dma_debug_init);
 
 static __init int dma_debug_cmdline(char *str)
 {
@@ -1061,16 +1060,10 @@ static __init int dma_debug_cmdline(char *str)
 
 static __init int dma_debug_entries_cmdline(char *str)
 {
-	int res;
-
 	if (!str)
 		return -EINVAL;
-
-	res = get_option(&str, &req_entries);
-
-	if (!res)
-		req_entries = 0;
-
+	if (!get_option(&str, &nr_prealloc_entries))
+		nr_prealloc_entries = PREALLOC_DMA_DEBUG_ENTRIES;
 	return 0;
 }
 
@@ -1293,6 +1286,32 @@ out:
 	put_hash_bucket(bucket, &flags);
 }
 
+static void check_sg_segment(struct device *dev, struct scatterlist *sg)
+{
+#ifdef CONFIG_DMA_API_DEBUG_SG
+	unsigned int max_seg = dma_get_max_seg_size(dev);
+	u64 start, end, boundary = dma_get_seg_boundary(dev);
+
+	/*
+	 * Either the driver forgot to set dma_parms appropriately, or
+	 * whoever generated the list forgot to check them.
+	 */
+	if (sg->length > max_seg)
+		err_printk(dev, NULL, "DMA-API: mapping sg segment longer than device claims to support [len=%u] [max=%u]\n",
+			   sg->length, max_seg);
+	/*
+	 * In some cases this could potentially be the DMA API
+	 * implementation's fault, but it would usually imply that
+	 * the scatterlist was built inappropriately to begin with.
+	 */
+	start = sg_dma_address(sg);
+	end = start + sg_dma_len(sg) - 1;
+	if ((start ^ end) & ~boundary)
+		err_printk(dev, NULL, "DMA-API: mapping sg segment across boundary [start=0x%016llx] [end=0x%016llx] [boundary=0x%016llx]\n",
+			   start, end, boundary);
+#endif
+}
+
 void debug_dma_map_page(struct device *dev, struct page *page, size_t offset,
 			size_t size, int direction, dma_addr_t dma_addr,
 			bool map_single)
@@ -1423,6 +1442,8 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
 			check_for_illegal_area(dev, sg_virt(s), sg_dma_len(s));
 		}
 
+		check_sg_segment(dev, s);
+
 		add_dma_entry(entry);
 	}
 }
diff --git a/lib/dma-direct.c b/lib/dma-direct.c
index bbfb229..8be8106 100644
--- a/lib/dma-direct.c
+++ b/lib/dma-direct.c
@@ -34,6 +34,13 @@ check_addr(struct device *dev, dma_addr_t dma_addr, size_t size,
 		const char *caller)
 {
 	if (unlikely(dev && !dma_capable(dev, dma_addr, size))) {
+		if (!dev->dma_mask) {
+			dev_err(dev,
+				"%s: call on device without dma_mask\n",
+				caller);
+			return false;
+		}
+
 		if (*dev->dma_mask >= DMA_BIT_MASK(32)) {
 			dev_err(dev,
 				"%s: overflow %pad+%zu of device mask %llx\n",
@@ -84,6 +91,13 @@ again:
 		__free_pages(page, page_order);
 		page = NULL;
 
+		if (IS_ENABLED(CONFIG_ZONE_DMA32) &&
+		    dev->coherent_dma_mask < DMA_BIT_MASK(64) &&
+		    !(gfp & (GFP_DMA32 | GFP_DMA))) {
+			gfp |= GFP_DMA32;
+			goto again;
+		}
+
 		if (IS_ENABLED(CONFIG_ZONE_DMA) &&
 		    dev->coherent_dma_mask < DMA_BIT_MASK(32) &&
 		    !(gfp & GFP_DMA)) {
@@ -121,7 +135,7 @@ void dma_direct_free(struct device *dev, size_t size, void *cpu_addr,
 		free_pages((unsigned long)cpu_addr, page_order);
 }
 
-static dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
+dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
 		unsigned long offset, size_t size, enum dma_data_direction dir,
 		unsigned long attrs)
 {
@@ -132,8 +146,8 @@ static dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
 	return dma_addr;
 }
 
-static int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl,
-		int nents, enum dma_data_direction dir, unsigned long attrs)
+int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
+		enum dma_data_direction dir, unsigned long attrs)
 {
 	int i;
 	struct scatterlist *sg;
@@ -165,10 +179,16 @@ int dma_direct_supported(struct device *dev, u64 mask)
 	if (mask < DMA_BIT_MASK(32))
 		return 0;
 #endif
+	/*
+	 * Various PCI/PCIe bridges have broken support for > 32bit DMA even
+	 * if the device itself might support it.
+	 */
+	if (dev->dma_32bit_limit && mask > DMA_BIT_MASK(32))
+		return 0;
 	return 1;
 }
 
-static int dma_direct_mapping_error(struct device *dev, dma_addr_t dma_addr)
+int dma_direct_mapping_error(struct device *dev, dma_addr_t dma_addr)
 {
 	return dma_addr == DIRECT_MAPPING_ERROR;
 }
@@ -180,6 +200,5 @@ const struct dma_map_ops dma_direct_ops = {
 	.map_sg			= dma_direct_map_sg,
 	.dma_supported		= dma_direct_supported,
 	.mapping_error		= dma_direct_mapping_error,
-	.is_phys		= 1,
 };
 EXPORT_SYMBOL(dma_direct_ops);
diff --git a/lib/dma-noncoherent.c b/lib/dma-noncoherent.c
new file mode 100644
index 0000000..79e9a75
--- /dev/null
+++ b/lib/dma-noncoherent.c
@@ -0,0 +1,102 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2018 Christoph Hellwig.
+ *
+ * DMA operations that map physical memory directly without providing cache
+ * coherence.
+ */
+#include <linux/export.h>
+#include <linux/mm.h>
+#include <linux/dma-direct.h>
+#include <linux/dma-noncoherent.h>
+#include <linux/scatterlist.h>
+
+static void dma_noncoherent_sync_single_for_device(struct device *dev,
+		dma_addr_t addr, size_t size, enum dma_data_direction dir)
+{
+	arch_sync_dma_for_device(dev, dma_to_phys(dev, addr), size, dir);
+}
+
+static void dma_noncoherent_sync_sg_for_device(struct device *dev,
+		struct scatterlist *sgl, int nents, enum dma_data_direction dir)
+{
+	struct scatterlist *sg;
+	int i;
+
+	for_each_sg(sgl, sg, nents, i)
+		arch_sync_dma_for_device(dev, sg_phys(sg), sg->length, dir);
+}
+
+static dma_addr_t dma_noncoherent_map_page(struct device *dev, struct page *page,
+		unsigned long offset, size_t size, enum dma_data_direction dir,
+		unsigned long attrs)
+{
+	dma_addr_t addr;
+
+	addr = dma_direct_map_page(dev, page, offset, size, dir, attrs);
+	if (!dma_mapping_error(dev, addr) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		arch_sync_dma_for_device(dev, page_to_phys(page) + offset,
+				size, dir);
+	return addr;
+}
+
+static int dma_noncoherent_map_sg(struct device *dev, struct scatterlist *sgl,
+		int nents, enum dma_data_direction dir, unsigned long attrs)
+{
+	nents = dma_direct_map_sg(dev, sgl, nents, dir, attrs);
+	if (nents > 0 && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		dma_noncoherent_sync_sg_for_device(dev, sgl, nents, dir);
+	return nents;
+}
+
+#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU
+static void dma_noncoherent_sync_single_for_cpu(struct device *dev,
+		dma_addr_t addr, size_t size, enum dma_data_direction dir)
+{
+	arch_sync_dma_for_cpu(dev, dma_to_phys(dev, addr), size, dir);
+}
+
+static void dma_noncoherent_sync_sg_for_cpu(struct device *dev,
+		struct scatterlist *sgl, int nents, enum dma_data_direction dir)
+{
+	struct scatterlist *sg;
+	int i;
+
+	for_each_sg(sgl, sg, nents, i)
+		arch_sync_dma_for_cpu(dev, sg_phys(sg), sg->length, dir);
+}
+
+static void dma_noncoherent_unmap_page(struct device *dev, dma_addr_t addr,
+		size_t size, enum dma_data_direction dir, unsigned long attrs)
+{
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		dma_noncoherent_sync_single_for_cpu(dev, addr, size, dir);
+}
+
+static void dma_noncoherent_unmap_sg(struct device *dev, struct scatterlist *sgl,
+		int nents, enum dma_data_direction dir, unsigned long attrs)
+{
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		dma_noncoherent_sync_sg_for_cpu(dev, sgl, nents, dir);
+}
+#endif
+
+const struct dma_map_ops dma_noncoherent_ops = {
+	.alloc			= arch_dma_alloc,
+	.free			= arch_dma_free,
+	.mmap			= arch_dma_mmap,
+	.sync_single_for_device	= dma_noncoherent_sync_single_for_device,
+	.sync_sg_for_device	= dma_noncoherent_sync_sg_for_device,
+	.map_page		= dma_noncoherent_map_page,
+	.map_sg			= dma_noncoherent_map_sg,
+#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU
+	.sync_single_for_cpu	= dma_noncoherent_sync_single_for_cpu,
+	.sync_sg_for_cpu	= dma_noncoherent_sync_sg_for_cpu,
+	.unmap_page		= dma_noncoherent_unmap_page,
+	.unmap_sg		= dma_noncoherent_unmap_sg,
+#endif
+	.dma_supported		= dma_direct_supported,
+	.mapping_error		= dma_direct_mapping_error,
+	.cache_sync		= arch_dma_cache_sync,
+};
+EXPORT_SYMBOL(dma_noncoherent_ops);
diff --git a/lib/iommu-helper.c b/lib/iommu-helper.c
index 23633c0..92a9f24 100644
--- a/lib/iommu-helper.c
+++ b/lib/iommu-helper.c
@@ -3,19 +3,8 @@
  * IOMMU helper functions for the free area management
  */
 
-#include <linux/export.h>
 #include <linux/bitmap.h>
-#include <linux/bug.h>
-
-int iommu_is_span_boundary(unsigned int index, unsigned int nr,
-			   unsigned long shift,
-			   unsigned long boundary_size)
-{
-	BUG_ON(!is_power_of_2(boundary_size));
-
-	shift = (shift + index) & (boundary_size - 1);
-	return shift + nr > boundary_size;
-}
+#include <linux/iommu-helper.h>
 
 unsigned long iommu_area_alloc(unsigned long *map, unsigned long size,
 			       unsigned long start, unsigned int nr,
@@ -38,4 +27,3 @@ again:
 	}
 	return -1;
 }
-EXPORT_SYMBOL(iommu_area_alloc);
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index fdae394..7e43cd5 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -573,6 +573,67 @@ size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
 }
 EXPORT_SYMBOL(_copy_to_iter);
 
+#ifdef CONFIG_ARCH_HAS_UACCESS_MCSAFE
+static int copyout_mcsafe(void __user *to, const void *from, size_t n)
+{
+	if (access_ok(VERIFY_WRITE, to, n)) {
+		kasan_check_read(from, n);
+		n = copy_to_user_mcsafe((__force void *) to, from, n);
+	}
+	return n;
+}
+
+static unsigned long memcpy_mcsafe_to_page(struct page *page, size_t offset,
+		const char *from, size_t len)
+{
+	unsigned long ret;
+	char *to;
+
+	to = kmap_atomic(page);
+	ret = memcpy_mcsafe(to + offset, from, len);
+	kunmap_atomic(to);
+
+	return ret;
+}
+
+size_t _copy_to_iter_mcsafe(const void *addr, size_t bytes, struct iov_iter *i)
+{
+	const char *from = addr;
+	unsigned long rem, curr_addr, s_addr = (unsigned long) addr;
+
+	if (unlikely(i->type & ITER_PIPE)) {
+		WARN_ON(1);
+		return 0;
+	}
+	if (iter_is_iovec(i))
+		might_fault();
+	iterate_and_advance(i, bytes, v,
+		copyout_mcsafe(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len),
+		({
+		rem = memcpy_mcsafe_to_page(v.bv_page, v.bv_offset,
+                               (from += v.bv_len) - v.bv_len, v.bv_len);
+		if (rem) {
+			curr_addr = (unsigned long) from;
+			bytes = curr_addr - s_addr - rem;
+			return bytes;
+		}
+		}),
+		({
+		rem = memcpy_mcsafe(v.iov_base, (from += v.iov_len) - v.iov_len,
+				v.iov_len);
+		if (rem) {
+			curr_addr = (unsigned long) from;
+			bytes = curr_addr - s_addr - rem;
+			return bytes;
+		}
+		})
+	)
+
+	return bytes;
+}
+EXPORT_SYMBOL_GPL(_copy_to_iter_mcsafe);
+#endif /* CONFIG_ARCH_HAS_UACCESS_MCSAFE */
+
 size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
 {
 	char *to = addr;
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index e6a9c06..6fdc626 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -270,18 +270,33 @@ void sbitmap_bitmap_show(struct sbitmap *sb, struct seq_file *m)
 }
 EXPORT_SYMBOL_GPL(sbitmap_bitmap_show);
 
-static unsigned int sbq_calc_wake_batch(unsigned int depth)
+static unsigned int sbq_calc_wake_batch(struct sbitmap_queue *sbq,
+					unsigned int depth)
 {
 	unsigned int wake_batch;
+	unsigned int shallow_depth;
 
 	/*
 	 * For each batch, we wake up one queue. We need to make sure that our
-	 * batch size is small enough that the full depth of the bitmap is
-	 * enough to wake up all of the queues.
+	 * batch size is small enough that the full depth of the bitmap,
+	 * potentially limited by a shallow depth, is enough to wake up all of
+	 * the queues.
+	 *
+	 * Each full word of the bitmap has bits_per_word bits, and there might
+	 * be a partial word. There are depth / bits_per_word full words and
+	 * depth % bits_per_word bits left over. In bitwise arithmetic:
+	 *
+	 * bits_per_word = 1 << shift
+	 * depth / bits_per_word = depth >> shift
+	 * depth % bits_per_word = depth & ((1 << shift) - 1)
+	 *
+	 * Each word can be limited to sbq->min_shallow_depth bits.
 	 */
-	wake_batch = SBQ_WAKE_BATCH;
-	if (wake_batch > depth / SBQ_WAIT_QUEUES)
-		wake_batch = max(1U, depth / SBQ_WAIT_QUEUES);
+	shallow_depth = min(1U << sbq->sb.shift, sbq->min_shallow_depth);
+	depth = ((depth >> sbq->sb.shift) * shallow_depth +
+		 min(depth & ((1U << sbq->sb.shift) - 1), shallow_depth));
+	wake_batch = clamp_t(unsigned int, depth / SBQ_WAIT_QUEUES, 1,
+			     SBQ_WAKE_BATCH);
 
 	return wake_batch;
 }
@@ -307,7 +322,8 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
 			*per_cpu_ptr(sbq->alloc_hint, i) = prandom_u32() % depth;
 	}
 
-	sbq->wake_batch = sbq_calc_wake_batch(depth);
+	sbq->min_shallow_depth = UINT_MAX;
+	sbq->wake_batch = sbq_calc_wake_batch(sbq, depth);
 	atomic_set(&sbq->wake_index, 0);
 
 	sbq->ws = kzalloc_node(SBQ_WAIT_QUEUES * sizeof(*sbq->ws), flags, node);
@@ -327,21 +343,28 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
 }
 EXPORT_SYMBOL_GPL(sbitmap_queue_init_node);
 
-void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth)
+static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
+					    unsigned int depth)
 {
-	unsigned int wake_batch = sbq_calc_wake_batch(depth);
+	unsigned int wake_batch = sbq_calc_wake_batch(sbq, depth);
 	int i;
 
 	if (sbq->wake_batch != wake_batch) {
 		WRITE_ONCE(sbq->wake_batch, wake_batch);
 		/*
-		 * Pairs with the memory barrier in sbq_wake_up() to ensure that
-		 * the batch size is updated before the wait counts.
+		 * Pairs with the memory barrier in sbitmap_queue_wake_up()
+		 * to ensure that the batch size is updated before the wait
+		 * counts.
 		 */
 		smp_mb__before_atomic();
 		for (i = 0; i < SBQ_WAIT_QUEUES; i++)
 			atomic_set(&sbq->ws[i].wait_cnt, 1);
 	}
+}
+
+void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth)
+{
+	sbitmap_queue_update_wake_batch(sbq, depth);
 	sbitmap_resize(&sbq->sb, depth);
 }
 EXPORT_SYMBOL_GPL(sbitmap_queue_resize);
@@ -380,6 +403,8 @@ int __sbitmap_queue_get_shallow(struct sbitmap_queue *sbq,
 	unsigned int hint, depth;
 	int nr;
 
+	WARN_ON_ONCE(shallow_depth < sbq->min_shallow_depth);
+
 	hint = this_cpu_read(*sbq->alloc_hint);
 	depth = READ_ONCE(sbq->sb.depth);
 	if (unlikely(hint >= depth)) {
@@ -403,6 +428,14 @@ int __sbitmap_queue_get_shallow(struct sbitmap_queue *sbq,
 }
 EXPORT_SYMBOL_GPL(__sbitmap_queue_get_shallow);
 
+void sbitmap_queue_min_shallow_depth(struct sbitmap_queue *sbq,
+				     unsigned int min_shallow_depth)
+{
+	sbq->min_shallow_depth = min_shallow_depth;
+	sbitmap_queue_update_wake_batch(sbq, sbq->sb.depth);
+}
+EXPORT_SYMBOL_GPL(sbitmap_queue_min_shallow_depth);
+
 static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq)
 {
 	int i, wake_index;
@@ -425,52 +458,67 @@ static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq)
 	return NULL;
 }
 
-static void sbq_wake_up(struct sbitmap_queue *sbq)
+static bool __sbq_wake_up(struct sbitmap_queue *sbq)
 {
 	struct sbq_wait_state *ws;
 	unsigned int wake_batch;
 	int wait_cnt;
 
-	/*
-	 * Pairs with the memory barrier in set_current_state() to ensure the
-	 * proper ordering of clear_bit()/waitqueue_active() in the waker and
-	 * test_and_set_bit_lock()/prepare_to_wait()/finish_wait() in the
-	 * waiter. See the comment on waitqueue_active(). This is __after_atomic
-	 * because we just did clear_bit_unlock() in the caller.
-	 */
-	smp_mb__after_atomic();
-
 	ws = sbq_wake_ptr(sbq);
 	if (!ws)
-		return;
+		return false;
 
 	wait_cnt = atomic_dec_return(&ws->wait_cnt);
 	if (wait_cnt <= 0) {
+		int ret;
+
 		wake_batch = READ_ONCE(sbq->wake_batch);
+
 		/*
 		 * Pairs with the memory barrier in sbitmap_queue_resize() to
 		 * ensure that we see the batch size update before the wait
 		 * count is reset.
 		 */
 		smp_mb__before_atomic();
+
 		/*
-		 * If there are concurrent callers to sbq_wake_up(), the last
-		 * one to decrement the wait count below zero will bump it back
-		 * up. If there is a concurrent resize, the count reset will
-		 * either cause the cmpxchg to fail or overwrite after the
-		 * cmpxchg.
+		 * For concurrent callers of this, the one that failed the
+		 * atomic_cmpxhcg() race should call this function again
+		 * to wakeup a new batch on a different 'ws'.
 		 */
-		atomic_cmpxchg(&ws->wait_cnt, wait_cnt, wait_cnt + wake_batch);
-		sbq_index_atomic_inc(&sbq->wake_index);
-		wake_up_nr(&ws->wait, wake_batch);
+		ret = atomic_cmpxchg(&ws->wait_cnt, wait_cnt, wake_batch);
+		if (ret == wait_cnt) {
+			sbq_index_atomic_inc(&sbq->wake_index);
+			wake_up_nr(&ws->wait, wake_batch);
+			return false;
+		}
+
+		return true;
 	}
+
+	return false;
+}
+
+void sbitmap_queue_wake_up(struct sbitmap_queue *sbq)
+{
+	while (__sbq_wake_up(sbq))
+		;
 }
+EXPORT_SYMBOL_GPL(sbitmap_queue_wake_up);
 
 void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
 			 unsigned int cpu)
 {
 	sbitmap_clear_bit_unlock(&sbq->sb, nr);
-	sbq_wake_up(sbq);
+	/*
+	 * Pairs with the memory barrier in set_current_state() to ensure the
+	 * proper ordering of clear_bit_unlock()/waitqueue_active() in the waker
+	 * and test_and_set_bit_lock()/prepare_to_wait()/finish_wait() in the
+	 * waiter. See the comment on waitqueue_active().
+	 */
+	smp_mb__after_atomic();
+	sbitmap_queue_wake_up(sbq);
+
 	if (likely(!sbq->round_robin && nr < sbq->sb.depth))
 		*per_cpu_ptr(sbq->alloc_hint, cpu) = nr;
 }
@@ -482,7 +530,7 @@ void sbitmap_queue_wake_all(struct sbitmap_queue *sbq)
 
 	/*
 	 * Pairs with the memory barrier in set_current_state() like in
-	 * sbq_wake_up().
+	 * sbitmap_queue_wake_up().
 	 */
 	smp_mb();
 	wake_index = atomic_read(&sbq->wake_index);
@@ -528,5 +576,6 @@ void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m)
 	seq_puts(m, "}\n");
 
 	seq_printf(m, "round_robin=%d\n", sbq->round_robin);
+	seq_printf(m, "min_shallow_depth=%u\n", sbq->min_shallow_depth);
 }
 EXPORT_SYMBOL_GPL(sbitmap_queue_show);
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index cc64058..04b68d9 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -593,9 +593,8 @@ found:
 }
 
 /*
- * Allocates bounce buffer and returns its kernel virtual address.
+ * Allocates bounce buffer and returns its physical address.
  */
-
 static phys_addr_t
 map_single(struct device *hwdev, phys_addr_t phys, size_t size,
 	   enum dma_data_direction dir, unsigned long attrs)
@@ -614,7 +613,7 @@ map_single(struct device *hwdev, phys_addr_t phys, size_t size,
 }
 
 /*
- * dma_addr is the kernel virtual address of the bounce buffer to unmap.
+ * tlb_addr is the physical address of the bounce buffer to unmap.
  */
 void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
 			      size_t size, enum dma_data_direction dir,
@@ -692,7 +691,6 @@ void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr,
 	}
 }
 
-#ifdef CONFIG_DMA_DIRECT_OPS
 static inline bool dma_coherent_ok(struct device *dev, dma_addr_t addr,
 		size_t size)
 {
@@ -727,7 +725,7 @@ swiotlb_alloc_buffer(struct device *dev, size_t size, dma_addr_t *dma_handle,
 
 out_unmap:
 	dev_warn(dev, "hwdev DMA mask = 0x%016Lx, dev_addr = 0x%016Lx\n",
-		(unsigned long long)(dev ? dev->coherent_dma_mask : 0),
+		(unsigned long long)dev->coherent_dma_mask,
 		(unsigned long long)*dma_handle);
 
 	/*
@@ -764,7 +762,6 @@ static bool swiotlb_free_buffer(struct device *dev, size_t size,
 				 DMA_ATTR_SKIP_CPU_SYNC);
 	return true;
 }
-#endif
 
 static void
 swiotlb_full(struct device *dev, size_t size, enum dma_data_direction dir,
@@ -1045,7 +1042,6 @@ swiotlb_dma_supported(struct device *hwdev, u64 mask)
 	return __phys_to_dma(hwdev, io_tlb_end - 1) <= mask;
 }
 
-#ifdef CONFIG_DMA_DIRECT_OPS
 void *swiotlb_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
 		gfp_t gfp, unsigned long attrs)
 {
@@ -1089,4 +1085,3 @@ const struct dma_map_ops swiotlb_dma_ops = {
 	.unmap_page		= swiotlb_unmap_page,
 	.dma_supported		= dma_direct_supported,
 };
-#endif /* CONFIG_DMA_DIRECT_OPS */
diff --git a/mm/Kconfig b/mm/Kconfig
index e14c015..3e0b6e8 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -266,7 +266,7 @@ config ARCH_ENABLE_THP_MIGRATION
 	bool
 
 config PHYS_ADDR_T_64BIT
-	def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT
+	def_bool 64BIT
 
 config BOUNCE
 	bool "Enable bounce buffers"
@@ -305,7 +305,7 @@ config KSM
 	  the many instances by a single page with that content, so
 	  saving memory until one or another app needs to modify the content.
 	  Recommended for use with KVM, or with other duplicative applications.
-	  See Documentation/vm/ksm.txt for more information: KSM is inactive
+	  See Documentation/vm/ksm.rst for more information: KSM is inactive
 	  until a program has madvised that an area is MADV_MERGEABLE, and
 	  root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set).
 
@@ -530,7 +530,7 @@ config MEM_SOFT_DIRTY
 	  into a page just as regular dirty bit, but unlike the latter
 	  it can be cleared by hands.
 
-	  See Documentation/vm/soft-dirty.txt for more details.
+	  See Documentation/admin-guide/mm/soft-dirty.rst for more details.
 
 config ZSWAP
 	bool "Compressed cache for swap pages (EXPERIMENTAL)"
@@ -657,7 +657,8 @@ config IDLE_PAGE_TRACKING
 	  be useful to tune memory cgroup limits and/or for job placement
 	  within a compute cluster.
 
-	  See Documentation/vm/idle_page_tracking.txt for more details.
+	  See Documentation/admin-guide/mm/idle_page_tracking.rst for
+	  more details.
 
 # arch_add_memory() comprehends device memory
 config ARCH_HAS_ZONE_DEVICE
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 7441bd9..8fe3ebd 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -412,6 +412,7 @@ static void wb_exit(struct bdi_writeback *wb)
  * protected.
  */
 static DEFINE_SPINLOCK(cgwb_lock);
+static struct workqueue_struct *cgwb_release_wq;
 
 /**
  * wb_congested_get_create - get or create a wb_congested
@@ -522,7 +523,7 @@ static void cgwb_release(struct percpu_ref *refcnt)
 {
 	struct bdi_writeback *wb = container_of(refcnt, struct bdi_writeback,
 						refcnt);
-	schedule_work(&wb->release_work);
+	queue_work(cgwb_release_wq, &wb->release_work);
 }
 
 static void cgwb_kill(struct bdi_writeback *wb)
@@ -784,6 +785,21 @@ static void cgwb_bdi_register(struct backing_dev_info *bdi)
 	spin_unlock_irq(&cgwb_lock);
 }
 
+static int __init cgwb_init(void)
+{
+	/*
+	 * There can be many concurrent release work items overwhelming
+	 * system_wq.  Put them in a separate wq and limit concurrency.
+	 * There's no point in executing many of these in parallel.
+	 */
+	cgwb_release_wq = alloc_workqueue("cgwb_release", 0, 1);
+	if (!cgwb_release_wq)
+		return -ENOMEM;
+
+	return 0;
+}
+subsys_initcall(cgwb_init);
+
 #else	/* CONFIG_CGROUP_WRITEBACK */
 
 static int cgwb_bdi_init(struct backing_dev_info *bdi)
diff --git a/mm/cleancache.c b/mm/cleancache.c
index f7b9fdc..126548b 100644
--- a/mm/cleancache.c
+++ b/mm/cleancache.c
@@ -3,7 +3,7 @@
  *
  * This code provides the generic "frontend" layer to call a matching
  * "backend" driver implementation of cleancache.  See
- * Documentation/vm/cleancache.txt for more information.
+ * Documentation/vm/cleancache.rst for more information.
  *
  * Copyright (C) 2009-2010 Oracle Corp. All rights reserved.
  * Author: Dan Magenheimer
diff --git a/mm/frontswap.c b/mm/frontswap.c
index fec8b50..4f5476a 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -3,7 +3,7 @@
  *
  * This code provides the generic "frontend" layer to call a matching
  * "backend" driver implementation of frontswap.  See
- * Documentation/vm/frontswap.txt for more information.
+ * Documentation/vm/frontswap.rst for more information.
  *
  * Copyright (C) 2009-2012 Oracle Corp.  All rights reserved.
  * Author: Dan Magenheimer
diff --git a/mm/hmm.c b/mm/hmm.c
index 486dc39..e63e353 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -37,7 +37,7 @@
 
 #if defined(CONFIG_DEVICE_PRIVATE) || defined(CONFIG_DEVICE_PUBLIC)
 /*
- * Device private memory see HMM (Documentation/vm/hmm.txt) or hmm.h
+ * Device private memory see HMM (Documentation/vm/hmm.rst) or hmm.h
  */
 DEFINE_STATIC_KEY_FALSE(device_private_key);
 EXPORT_SYMBOL(device_private_key);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b9f3dbd..ac5591d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1185,7 +1185,7 @@ static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd,
 	 * mmu_notifier_invalidate_range_end() happens which can lead to a
 	 * device seeing memory write in different order than CPU.
 	 *
-	 * See Documentation/vm/mmu_notifier.txt
+	 * See Documentation/vm/mmu_notifier.rst
 	 */
 	pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
 
@@ -2037,7 +2037,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
 	 * replacing a zero pmd write protected page with a zero pte write
 	 * protected page.
 	 *
-	 * See Documentation/vm/mmu_notifier.txt
+	 * See Documentation/vm/mmu_notifier.rst
 	 */
 	pmdp_huge_clear_flush(vma, haddr, pmd);
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 2186791..1290887 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3291,7 +3291,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 				 * table protection not changing it to point
 				 * to a new page.
 				 *
-				 * See Documentation/vm/mmu_notifier.txt
+				 * See Documentation/vm/mmu_notifier.rst
 				 */
 				huge_ptep_set_wrprotect(src, addr, src_pte);
 			}
@@ -4357,7 +4357,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 	 * No need to call mmu_notifier_invalidate_range() we are downgrading
 	 * page table protection not changing it to point to a new page.
 	 *
-	 * See Documentation/vm/mmu_notifier.txt
+	 * See Documentation/vm/mmu_notifier.rst
 	 */
 	i_mmap_unlock_write(vma->vm_file->f_mapping);
 	mmu_notifier_invalidate_range_end(mm, start, end);
diff --git a/mm/ksm.c b/mm/ksm.c
index e3cbf9a..7d6558f 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -51,7 +51,9 @@
 #define DO_NUMA(x)	do { } while (0)
 #endif
 
-/*
+/**
+ * DOC: Overview
+ *
  * A few notes about the KSM scanning process,
  * to make it easier to understand the data structures below:
  *
@@ -67,6 +69,21 @@
  * this tree is fully assured to be working (except when pages are unmapped),
  * and therefore this tree is called the stable tree.
  *
+ * The stable tree node includes information required for reverse
+ * mapping from a KSM page to virtual addresses that map this page.
+ *
+ * In order to avoid large latencies of the rmap walks on KSM pages,
+ * KSM maintains two types of nodes in the stable tree:
+ *
+ * * the regular nodes that keep the reverse mapping structures in a
+ *   linked list
+ * * the "chains" that link nodes ("dups") that represent the same
+ *   write protected memory content, but each "dup" corresponds to a
+ *   different KSM page copy of that content
+ *
+ * Internally, the regular nodes, "dups" and "chains" are represented
+ * using the same :c:type:`struct stable_node` structure.
+ *
  * In addition to the stable tree, KSM uses a second data structure called the
  * unstable tree: this tree holds pointers to pages which have been found to
  * be "unchanged for a period of time".  The unstable tree sorts these pages
@@ -1049,7 +1066,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
 		 * No need to notify as we are downgrading page table to read
 		 * only not changing it to point to a new page.
 		 *
-		 * See Documentation/vm/mmu_notifier.txt
+		 * See Documentation/vm/mmu_notifier.rst
 		 */
 		entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte);
 		/*
@@ -1145,7 +1162,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
 	 * No need to notify as we are replacing a read only page with another
 	 * read only page with the same content.
 	 *
-	 * See Documentation/vm/mmu_notifier.txt
+	 * See Documentation/vm/mmu_notifier.rst
 	 */
 	ptep_clear_flush(vma, addr, ptep);
 	set_pte_at_notify(mm, addr, ptep, newpte);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2bd3df3..1695f38 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3849,7 +3849,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
 	if (ret)
 		goto out_put_css;
 
-	efile.file->f_op->poll(efile.file, &event->pt);
+	vfs_poll(efile.file, &event->pt);
 
 	spin_lock(&memcg->event_list_lock);
 	list_add(&event->list, &memcg->event_list);
diff --git a/mm/mempool.c b/mm/mempool.c
index 5c9dce3..b54f2c2 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -138,6 +138,28 @@ static void *remove_element(mempool_t *pool, gfp_t flags)
 }
 
 /**
+ * mempool_exit - exit a mempool initialized with mempool_init()
+ * @pool:      pointer to the memory pool which was initialized with
+ *             mempool_init().
+ *
+ * Free all reserved elements in @pool and @pool itself.  This function
+ * only sleeps if the free_fn() function sleeps.
+ *
+ * May be called on a zeroed but uninitialized mempool (i.e. allocated with
+ * kzalloc()).
+ */
+void mempool_exit(mempool_t *pool)
+{
+	while (pool->curr_nr) {
+		void *element = remove_element(pool, GFP_KERNEL);
+		pool->free(element, pool->pool_data);
+	}
+	kfree(pool->elements);
+	pool->elements = NULL;
+}
+EXPORT_SYMBOL(mempool_exit);
+
+/**
  * mempool_destroy - deallocate a memory pool
  * @pool:      pointer to the memory pool which was allocated via
  *             mempool_create().
@@ -150,15 +172,65 @@ void mempool_destroy(mempool_t *pool)
 	if (unlikely(!pool))
 		return;
 
-	while (pool->curr_nr) {
-		void *element = remove_element(pool, GFP_KERNEL);
-		pool->free(element, pool->pool_data);
-	}
-	kfree(pool->elements);
+	mempool_exit(pool);
 	kfree(pool);
 }
 EXPORT_SYMBOL(mempool_destroy);
 
+int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
+		      mempool_free_t *free_fn, void *pool_data,
+		      gfp_t gfp_mask, int node_id)
+{
+	spin_lock_init(&pool->lock);
+	pool->min_nr	= min_nr;
+	pool->pool_data = pool_data;
+	pool->alloc	= alloc_fn;
+	pool->free	= free_fn;
+	init_waitqueue_head(&pool->wait);
+
+	pool->elements = kmalloc_array_node(min_nr, sizeof(void *),
+					    gfp_mask, node_id);
+	if (!pool->elements)
+		return -ENOMEM;
+
+	/*
+	 * First pre-allocate the guaranteed number of buffers.
+	 */
+	while (pool->curr_nr < pool->min_nr) {
+		void *element;
+
+		element = pool->alloc(gfp_mask, pool->pool_data);
+		if (unlikely(!element)) {
+			mempool_exit(pool);
+			return -ENOMEM;
+		}
+		add_element(pool, element);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(mempool_init_node);
+
+/**
+ * mempool_init - initialize a memory pool
+ * @min_nr:    the minimum number of elements guaranteed to be
+ *             allocated for this pool.
+ * @alloc_fn:  user-defined element-allocation function.
+ * @free_fn:   user-defined element-freeing function.
+ * @pool_data: optional private data available to the user-defined functions.
+ *
+ * Like mempool_create(), but initializes the pool in (i.e. embedded in another
+ * structure).
+ */
+int mempool_init(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
+		 mempool_free_t *free_fn, void *pool_data)
+{
+	return mempool_init_node(pool, min_nr, alloc_fn, free_fn,
+				 pool_data, GFP_KERNEL, NUMA_NO_NODE);
+
+}
+EXPORT_SYMBOL(mempool_init);
+
 /**
  * mempool_create - create a memory pool
  * @min_nr:    the minimum number of elements guaranteed to be
@@ -186,35 +258,17 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
 			       gfp_t gfp_mask, int node_id)
 {
 	mempool_t *pool;
+
 	pool = kzalloc_node(sizeof(*pool), gfp_mask, node_id);
 	if (!pool)
 		return NULL;
-	pool->elements = kmalloc_array_node(min_nr, sizeof(void *),
-				      gfp_mask, node_id);
-	if (!pool->elements) {
+
+	if (mempool_init_node(pool, min_nr, alloc_fn, free_fn, pool_data,
+			      gfp_mask, node_id)) {
 		kfree(pool);
 		return NULL;
 	}
-	spin_lock_init(&pool->lock);
-	pool->min_nr = min_nr;
-	pool->pool_data = pool_data;
-	init_waitqueue_head(&pool->wait);
-	pool->alloc = alloc_fn;
-	pool->free = free_fn;
 
-	/*
-	 * First pre-allocate the guaranteed number of buffers.
-	 */
-	while (pool->curr_nr < pool->min_nr) {
-		void *element;
-
-		element = pool->alloc(gfp_mask, pool->pool_data);
-		if (unlikely(!element)) {
-			mempool_destroy(pool);
-			return NULL;
-		}
-		add_element(pool, element);
-	}
 	return pool;
 }
 EXPORT_SYMBOL(mempool_create_node);
diff --git a/mm/mmap.c b/mm/mmap.c
index fc41c05..d817764 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2828,7 +2828,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 	unsigned long ret = -EINVAL;
 	struct file *file;
 
-	pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/vm/remap_file_pages.txt.\n",
+	pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/vm/remap_file_pages.rst.\n",
 		     current->comm, current->pid);
 
 	if (prot)
diff --git a/mm/rmap.c b/mm/rmap.c
index 8d5337f..6db729d 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -942,7 +942,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
 		 * downgrading page table protection not changing it to point
 		 * to a new page.
 		 *
-		 * See Documentation/vm/mmu_notifier.txt
+		 * See Documentation/vm/mmu_notifier.rst
 		 */
 		if (ret)
 			(*cleaned)++;
@@ -1599,7 +1599,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 			 * point at new page while a device still is using this
 			 * page.
 			 *
-			 * See Documentation/vm/mmu_notifier.txt
+			 * See Documentation/vm/mmu_notifier.rst
 			 */
 			dec_mm_counter(mm, mm_counter_file(page));
 		}
@@ -1609,7 +1609,7 @@ discard:
 		 * done above for all cases requiring it to happen under page
 		 * table lock before mmu_notifier_invalidate_range_end()
 		 *
-		 * See Documentation/vm/mmu_notifier.txt
+		 * See Documentation/vm/mmu_notifier.rst
 		 */
 		page_remove_rmap(subpage, PageHuge(page));
 		put_page(page);
diff --git a/mm/util.c b/mm/util.c
index 45fc316..c2d0a7c 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -621,7 +621,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed);
  * succeed and -ENOMEM implies there is not.
  *
  * We currently support three overcommit policies, which are set via the
- * vm.overcommit_memory sysctl.  See Documentation/vm/overcommit-accounting
+ * vm.overcommit_memory sysctl.  See Documentation/vm/overcommit-accounting.rst
  *
  * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
  * Additional code 2002 Jul 20 by Robert Love.
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ebff729..63a5f50 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2751,25 +2751,14 @@ static const struct seq_operations vmalloc_op = {
 	.show = s_show,
 };
 
-static int vmalloc_open(struct inode *inode, struct file *file)
+static int __init proc_vmalloc_init(void)
 {
 	if (IS_ENABLED(CONFIG_NUMA))
-		return seq_open_private(file, &vmalloc_op,
-					nr_node_ids * sizeof(unsigned int));
+		proc_create_seq_private("vmallocinfo", S_IRUSR, NULL,
+				&vmalloc_op,
+				nr_node_ids * sizeof(unsigned int), NULL);
 	else
-		return seq_open(file, &vmalloc_op);
-}
-
-static const struct file_operations proc_vmalloc_operations = {
-	.open		= vmalloc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release_private,
-};
-
-static int __init proc_vmalloc_init(void)
-{
-	proc_create("vmallocinfo", S_IRUSR, NULL, &proc_vmalloc_operations);
+		proc_create_seq("vmallocinfo", S_IRUSR, NULL, &vmalloc_op);
 	return 0;
 }
 module_init(proc_vmalloc_init);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index a2b9518..75eda9c2 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1516,18 +1516,6 @@ static const struct seq_operations fragmentation_op = {
 	.show	= frag_show,
 };
 
-static int fragmentation_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &fragmentation_op);
-}
-
-static const struct file_operations buddyinfo_file_operations = {
-	.open		= fragmentation_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
 static const struct seq_operations pagetypeinfo_op = {
 	.start	= frag_start,
 	.next	= frag_next,
@@ -1535,18 +1523,6 @@ static const struct seq_operations pagetypeinfo_op = {
 	.show	= pagetypeinfo_show,
 };
 
-static int pagetypeinfo_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &pagetypeinfo_op);
-}
-
-static const struct file_operations pagetypeinfo_file_operations = {
-	.open		= pagetypeinfo_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
 static bool is_zone_first_populated(pg_data_t *pgdat, struct zone *zone)
 {
 	int zid;
@@ -1663,18 +1639,6 @@ static const struct seq_operations zoneinfo_op = {
 	.show	= zoneinfo_show,
 };
 
-static int zoneinfo_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &zoneinfo_op);
-}
-
-static const struct file_operations zoneinfo_file_operations = {
-	.open		= zoneinfo_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
 enum writeback_stat_item {
 	NR_DIRTY_THRESHOLD,
 	NR_DIRTY_BG_THRESHOLD,
@@ -1762,18 +1726,6 @@ static const struct seq_operations vmstat_op = {
 	.stop	= vmstat_stop,
 	.show	= vmstat_show,
 };
-
-static int vmstat_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &vmstat_op);
-}
-
-static const struct file_operations vmstat_file_operations = {
-	.open		= vmstat_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
 #endif /* CONFIG_PROC_FS */
 
 #ifdef CONFIG_SMP
@@ -2020,10 +1972,10 @@ void __init init_mm_internals(void)
 	start_shepherd_timer();
 #endif
 #ifdef CONFIG_PROC_FS
-	proc_create("buddyinfo", 0444, NULL, &buddyinfo_file_operations);
-	proc_create("pagetypeinfo", 0444, NULL, &pagetypeinfo_file_operations);
-	proc_create("vmstat", 0444, NULL, &vmstat_file_operations);
-	proc_create("zoneinfo", 0444, NULL, &zoneinfo_file_operations);
+	proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op);
+	proc_create_seq("pagetypeinfo", 0444, NULL, &pagetypeinfo_op);
+	proc_create_seq("vmstat", 0444, NULL, &vmstat_op);
+	proc_create_seq("zoneinfo", 0444, NULL, &zoneinfo_op);
 #endif
 }
 
diff --git a/net/8021q/vlanproc.c b/net/8021q/vlanproc.c
index a627a5d..d36e8c4 100644
--- a/net/8021q/vlanproc.c
+++ b/net/8021q/vlanproc.c
@@ -73,35 +73,6 @@ static const struct seq_operations vlan_seq_ops = {
 	.show = vlan_seq_show,
 };
 
-static int vlan_seq_open(struct inode *inode, struct file *file)
-{
-	return seq_open_net(inode, file, &vlan_seq_ops,
-			sizeof(struct seq_net_private));
-}
-
-static const struct file_operations vlan_fops = {
-	.open    = vlan_seq_open,
-	.read    = seq_read,
-	.llseek  = seq_lseek,
-	.release = seq_release_net,
-};
-
-/*
- *	/proc/net/vlan/<device> file and inode operations
- */
-
-static int vlandev_seq_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, vlandev_seq_show, PDE_DATA(inode));
-}
-
-static const struct file_operations vlandev_fops = {
-	.open    = vlandev_seq_open,
-	.read    = seq_read,
-	.llseek  = seq_lseek,
-	.release = single_release,
-};
-
 /*
  * Proc filesystem directory entries.
  */
@@ -148,8 +119,9 @@ int __net_init vlan_proc_init(struct net *net)
 	if (!vn->proc_vlan_dir)
 		goto err;
 
-	vn->proc_vlan_conf = proc_create(name_conf, S_IFREG | 0600,
-					 vn->proc_vlan_dir, &vlan_fops);
+	vn->proc_vlan_conf = proc_create_net(name_conf, S_IFREG | 0600,
+			vn->proc_vlan_dir, &vlan_seq_ops,
+			sizeof(struct seq_net_private));
 	if (!vn->proc_vlan_conf)
 		goto err;
 	return 0;
@@ -171,9 +143,8 @@ int vlan_proc_add_dev(struct net_device *vlandev)
 
 	if (!strcmp(vlandev->name, name_conf))
 		return -EINVAL;
-	vlan->dent =
-		proc_create_data(vlandev->name, S_IFREG | 0600,
-				 vn->proc_vlan_dir, &vlandev_fops, vlandev);
+	vlan->dent = proc_create_single_data(vlandev->name, S_IFREG | 0600,
+			vn->proc_vlan_dir, vlandev_seq_show, vlandev);
 	if (!vlan->dent)
 		return -ENOBUFS;
 	return 0;
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index 848969f..588bf88 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -231,7 +231,7 @@ static void p9_conn_cancel(struct p9_conn *m, int err)
 static __poll_t
 p9_fd_poll(struct p9_client *client, struct poll_table_struct *pt, int *err)
 {
-	__poll_t ret, n;
+	__poll_t ret;
 	struct p9_trans_fd *ts = NULL;
 
 	if (client && client->status == Connected)
@@ -243,19 +243,9 @@ p9_fd_poll(struct p9_client *client, struct poll_table_struct *pt, int *err)
 		return EPOLLERR;
 	}
 
-	if (!ts->rd->f_op->poll)
-		ret = DEFAULT_POLLMASK;
-	else
-		ret = ts->rd->f_op->poll(ts->rd, pt);
-
-	if (ts->rd != ts->wr) {
-		if (!ts->wr->f_op->poll)
-			n = DEFAULT_POLLMASK;
-		else
-			n = ts->wr->f_op->poll(ts->wr, pt);
-		ret = (ret & ~EPOLLOUT) | (n & ~EPOLLIN);
-	}
-
+	ret = vfs_poll(ts->rd, pt);
+	if (ts->rd != ts->wr)
+		ret = (ret & ~EPOLLOUT) | (vfs_poll(ts->wr, pt) & ~EPOLLIN);
 	return ret;
 }
 
diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c
index d4c1021..49a16ce 100644
--- a/net/appletalk/aarp.c
+++ b/net/appletalk/aarp.c
@@ -907,11 +907,6 @@ void aarp_device_down(struct net_device *dev)
 }
 
 #ifdef CONFIG_PROC_FS
-struct aarp_iter_state {
-	int bucket;
-	struct aarp_entry **table;
-};
-
 /*
  * Get the aarp entry that is in the chain described
  * by the iterator.
@@ -1033,25 +1028,12 @@ static int aarp_seq_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
-static const struct seq_operations aarp_seq_ops = {
+const struct seq_operations aarp_seq_ops = {
 	.start  = aarp_seq_start,
 	.next   = aarp_seq_next,
 	.stop   = aarp_seq_stop,
 	.show   = aarp_seq_show,
 };
-
-static int aarp_seq_open(struct inode *inode, struct file *file)
-{
-	return seq_open_private(file, &aarp_seq_ops,
-			sizeof(struct aarp_iter_state));
-}
-
-const struct file_operations atalk_seq_arp_fops = {
-	.open           = aarp_seq_open,
-	.read           = seq_read,
-	.llseek         = seq_lseek,
-	.release	= seq_release_private,
-};
 #endif
 
 /* General module cleanup. Called from cleanup_module() in ddp.c. */
diff --git a/net/appletalk/atalk_proc.c b/net/appletalk/atalk_proc.c
index 7214aea..8006295 100644
--- a/net/appletalk/atalk_proc.c
+++ b/net/appletalk/atalk_proc.c
@@ -210,42 +210,6 @@ static const struct seq_operations atalk_seq_socket_ops = {
 	.show   = atalk_seq_socket_show,
 };
 
-static int atalk_seq_interface_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &atalk_seq_interface_ops);
-}
-
-static int atalk_seq_route_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &atalk_seq_route_ops);
-}
-
-static int atalk_seq_socket_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &atalk_seq_socket_ops);
-}
-
-static const struct file_operations atalk_seq_interface_fops = {
-	.open		= atalk_seq_interface_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
-static const struct file_operations atalk_seq_route_fops = {
-	.open		= atalk_seq_route_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
-static const struct file_operations atalk_seq_socket_fops = {
-	.open		= atalk_seq_socket_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
 static struct proc_dir_entry *atalk_proc_dir;
 
 int __init atalk_proc_init(void)
@@ -257,22 +221,23 @@ int __init atalk_proc_init(void)
 	if (!atalk_proc_dir)
 		goto out;
 
-	p = proc_create("interface", 0444, atalk_proc_dir,
-			&atalk_seq_interface_fops);
+	p = proc_create_seq("interface", 0444, atalk_proc_dir,
+			&atalk_seq_interface_ops);
 	if (!p)
 		goto out_interface;
 
-	p = proc_create("route", 0444, atalk_proc_dir,
-			&atalk_seq_route_fops);
+	p = proc_create_seq("route", 0444, atalk_proc_dir,
+			&atalk_seq_route_ops);
 	if (!p)
 		goto out_route;
 
-	p = proc_create("socket", 0444, atalk_proc_dir,
-			&atalk_seq_socket_fops);
+	p = proc_create_seq("socket", 0444, atalk_proc_dir,
+			&atalk_seq_socket_ops);
 	if (!p)
 		goto out_socket;
 
-	p = proc_create("arp", 0444, atalk_proc_dir, &atalk_seq_arp_fops);
+	p = proc_create_seq_private("arp", 0444, atalk_proc_dir, &aarp_seq_ops,
+			sizeof(struct aarp_iter_state), NULL);
 	if (!p)
 		goto out_arp;
 
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index 9b6bc5a..55fdba0 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -1869,7 +1869,7 @@ static const struct proto_ops atalk_dgram_ops = {
 	.socketpair	= sock_no_socketpair,
 	.accept		= sock_no_accept,
 	.getname	= atalk_getname,
-	.poll		= datagram_poll,
+	.poll_mask	= datagram_poll_mask,
 	.ioctl		= atalk_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= atalk_compat_ioctl,
diff --git a/net/atm/br2684.c b/net/atm/br2684.c
index fd94bea..36b3ada 100644
--- a/net/atm/br2684.c
+++ b/net/atm/br2684.c
@@ -818,18 +818,6 @@ static const struct seq_operations br2684_seq_ops = {
 	.show = br2684_seq_show,
 };
 
-static int br2684_proc_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &br2684_seq_ops);
-}
-
-static const struct file_operations br2684_proc_ops = {
-	.open = br2684_proc_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = seq_release,
-};
-
 extern struct proc_dir_entry *atm_proc_root;	/* from proc.c */
 #endif /* CONFIG_PROC_FS */
 
@@ -837,7 +825,7 @@ static int __init br2684_init(void)
 {
 #ifdef CONFIG_PROC_FS
 	struct proc_dir_entry *p;
-	p = proc_create("br2684", 0, atm_proc_root, &br2684_proc_ops);
+	p = proc_create_seq("br2684", 0, atm_proc_root, &br2684_seq_ops);
 	if (p == NULL)
 		return -ENOMEM;
 #endif
diff --git a/net/atm/clip.c b/net/atm/clip.c
index f07dbc6..66caa48 100644
--- a/net/atm/clip.c
+++ b/net/atm/clip.c
@@ -863,20 +863,6 @@ static const struct seq_operations arp_seq_ops = {
 	.stop	= neigh_seq_stop,
 	.show	= clip_seq_show,
 };
-
-static int arp_seq_open(struct inode *inode, struct file *file)
-{
-	return seq_open_net(inode, file, &arp_seq_ops,
-			    sizeof(struct clip_seq_state));
-}
-
-static const struct file_operations arp_seq_fops = {
-	.open		= arp_seq_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release_net,
-	.owner		= THIS_MODULE
-};
 #endif
 
 static void atm_clip_exit_noproc(void);
@@ -893,7 +879,8 @@ static int __init atm_clip_init(void)
 	{
 		struct proc_dir_entry *p;
 
-		p = proc_create("arp", 0444, atm_proc_root, &arp_seq_fops);
+		p = proc_create_net("arp", 0444, atm_proc_root, &arp_seq_ops,
+				sizeof(struct clip_seq_state));
 		if (!p) {
 			pr_err("Unable to initialize /proc/net/atm/arp\n");
 			atm_clip_exit_noproc();
diff --git a/net/atm/common.c b/net/atm/common.c
index fc78a05..1f2af59 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -648,16 +648,11 @@ out:
 	return error;
 }
 
-__poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait)
+__poll_t vcc_poll_mask(struct socket *sock, __poll_t events)
 {
 	struct sock *sk = sock->sk;
-	struct atm_vcc *vcc;
-	__poll_t mask;
-
-	sock_poll_wait(file, sk_sleep(sk), wait);
-	mask = 0;
-
-	vcc = ATM_SD(sock);
+	struct atm_vcc *vcc = ATM_SD(sock);
+	__poll_t mask = 0;
 
 	/* exceptional events */
 	if (sk->sk_err)
diff --git a/net/atm/common.h b/net/atm/common.h
index 58506490..526796a 100644
--- a/net/atm/common.h
+++ b/net/atm/common.h
@@ -17,7 +17,7 @@ int vcc_connect(struct socket *sock, int itf, short vpi, int vci);
 int vcc_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
 		int flags);
 int vcc_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len);
-__poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait);
+__poll_t vcc_poll_mask(struct socket *sock, __poll_t events);
 int vcc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
 int vcc_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
 int vcc_setsockopt(struct socket *sock, int level, int optname,
diff --git a/net/atm/lec.c b/net/atm/lec.c
index 3138a86..5a95fcf 100644
--- a/net/atm/lec.c
+++ b/net/atm/lec.c
@@ -990,18 +990,6 @@ static const struct seq_operations lec_seq_ops = {
 	.stop = lec_seq_stop,
 	.show = lec_seq_show,
 };
-
-static int lec_seq_open(struct inode *inode, struct file *file)
-{
-	return seq_open_private(file, &lec_seq_ops, sizeof(struct lec_state));
-}
-
-static const struct file_operations lec_seq_fops = {
-	.open = lec_seq_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = seq_release_private,
-};
 #endif
 
 static int lane_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
@@ -1047,7 +1035,8 @@ static int __init lane_module_init(void)
 #ifdef CONFIG_PROC_FS
 	struct proc_dir_entry *p;
 
-	p = proc_create("lec", 0444, atm_proc_root, &lec_seq_fops);
+	p = proc_create_seq_private("lec", 0444, atm_proc_root, &lec_seq_ops,
+			sizeof(struct lec_state), NULL);
 	if (!p) {
 		pr_err("Unable to initialize /proc/net/atm/lec\n");
 		return -ENOMEM;
diff --git a/net/atm/proc.c b/net/atm/proc.c
index 55410c0..0b0495a 100644
--- a/net/atm/proc.c
+++ b/net/atm/proc.c
@@ -68,7 +68,6 @@ static void atm_dev_info(struct seq_file *seq, const struct atm_dev *dev)
 struct vcc_state {
 	int bucket;
 	struct sock *sk;
-	int family;
 };
 
 static inline int compare_family(struct sock *sk, int family)
@@ -106,23 +105,13 @@ out:
 	return (l < 0);
 }
 
-static inline void *vcc_walk(struct vcc_state *state, loff_t l)
+static inline void *vcc_walk(struct seq_file *seq, loff_t l)
 {
-	return __vcc_walk(&state->sk, state->family, &state->bucket, l) ?
-	       state : NULL;
-}
-
-static int __vcc_seq_open(struct inode *inode, struct file *file,
-	int family, const struct seq_operations *ops)
-{
-	struct vcc_state *state;
-
-	state = __seq_open_private(file, ops, sizeof(*state));
-	if (state == NULL)
-		return -ENOMEM;
+	struct vcc_state *state = seq->private;
+	int family = (uintptr_t)(PDE_DATA(file_inode(seq->file)));
 
-	state->family = family;
-	return 0;
+	return __vcc_walk(&state->sk, family, &state->bucket, l) ?
+	       state : NULL;
 }
 
 static void *vcc_seq_start(struct seq_file *seq, loff_t *pos)
@@ -133,7 +122,7 @@ static void *vcc_seq_start(struct seq_file *seq, loff_t *pos)
 
 	read_lock(&vcc_sklist_lock);
 	state->sk = SEQ_START_TOKEN;
-	return left ? vcc_walk(state, left) : SEQ_START_TOKEN;
+	return left ? vcc_walk(seq, left) : SEQ_START_TOKEN;
 }
 
 static void vcc_seq_stop(struct seq_file *seq, void *v)
@@ -144,9 +133,7 @@ static void vcc_seq_stop(struct seq_file *seq, void *v)
 
 static void *vcc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
-	struct vcc_state *state = seq->private;
-
-	v = vcc_walk(state, 1);
+	v = vcc_walk(seq, 1);
 	*pos += !!PTR_ERR(v);
 	return v;
 }
@@ -257,18 +244,6 @@ static const struct seq_operations atm_dev_seq_ops = {
 	.show	= atm_dev_seq_show,
 };
 
-static int atm_dev_seq_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &atm_dev_seq_ops);
-}
-
-static const struct file_operations devices_seq_fops = {
-	.open		= atm_dev_seq_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
 static int pvc_seq_show(struct seq_file *seq, void *v)
 {
 	static char atm_pvc_banner[] =
@@ -292,18 +267,6 @@ static const struct seq_operations pvc_seq_ops = {
 	.show	= pvc_seq_show,
 };
 
-static int pvc_seq_open(struct inode *inode, struct file *file)
-{
-	return __vcc_seq_open(inode, file, PF_ATMPVC, &pvc_seq_ops);
-}
-
-static const struct file_operations pvc_seq_fops = {
-	.open		= pvc_seq_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release_private,
-};
-
 static int vcc_seq_show(struct seq_file *seq, void *v)
 {
 	if (v == SEQ_START_TOKEN) {
@@ -326,18 +289,6 @@ static const struct seq_operations vcc_seq_ops = {
 	.show	= vcc_seq_show,
 };
 
-static int vcc_seq_open(struct inode *inode, struct file *file)
-{
-	return __vcc_seq_open(inode, file, 0, &vcc_seq_ops);
-}
-
-static const struct file_operations vcc_seq_fops = {
-	.open		= vcc_seq_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release_private,
-};
-
 static int svc_seq_show(struct seq_file *seq, void *v)
 {
 	static const char atm_svc_banner[] =
@@ -361,18 +312,6 @@ static const struct seq_operations svc_seq_ops = {
 	.show	= svc_seq_show,
 };
 
-static int svc_seq_open(struct inode *inode, struct file *file)
-{
-	return __vcc_seq_open(inode, file, PF_ATMSVC, &svc_seq_ops);
-}
-
-static const struct file_operations svc_seq_fops = {
-	.open		= svc_seq_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release_private,
-};
-
 static ssize_t proc_dev_atm_read(struct file *file, char __user *buf,
 				 size_t count, loff_t *pos)
 {
@@ -440,58 +379,22 @@ void atm_proc_dev_deregister(struct atm_dev *dev)
 	kfree(dev->proc_name);
 }
 
-static struct atm_proc_entry {
-	char *name;
-	const struct file_operations *proc_fops;
-	struct proc_dir_entry *dirent;
-} atm_proc_ents[] = {
-	{ .name = "devices",	.proc_fops = &devices_seq_fops },
-	{ .name = "pvc",	.proc_fops = &pvc_seq_fops },
-	{ .name = "svc",	.proc_fops = &svc_seq_fops },
-	{ .name = "vc",		.proc_fops = &vcc_seq_fops },
-	{ .name = NULL,		.proc_fops = NULL }
-};
-
-static void atm_proc_dirs_remove(void)
-{
-	static struct atm_proc_entry *e;
-
-	for (e = atm_proc_ents; e->name; e++) {
-		if (e->dirent)
-			remove_proc_entry(e->name, atm_proc_root);
-	}
-	remove_proc_entry("atm", init_net.proc_net);
-}
-
 int __init atm_proc_init(void)
 {
-	static struct atm_proc_entry *e;
-	int ret;
-
 	atm_proc_root = proc_net_mkdir(&init_net, "atm", init_net.proc_net);
 	if (!atm_proc_root)
-		goto err_out;
-	for (e = atm_proc_ents; e->name; e++) {
-		struct proc_dir_entry *dirent;
-
-		dirent = proc_create(e->name, 0444,
-				     atm_proc_root, e->proc_fops);
-		if (!dirent)
-			goto err_out_remove;
-		e->dirent = dirent;
-	}
-	ret = 0;
-out:
-	return ret;
-
-err_out_remove:
-	atm_proc_dirs_remove();
-err_out:
-	ret = -ENOMEM;
-	goto out;
+		return -ENOMEM;
+	proc_create_seq("devices", 0444, atm_proc_root, &atm_dev_seq_ops);
+	proc_create_seq_private("pvc", 0444, atm_proc_root, &pvc_seq_ops,
+			sizeof(struct vcc_state), (void *)(uintptr_t)PF_ATMPVC);
+	proc_create_seq_private("svc", 0444, atm_proc_root, &svc_seq_ops,
+			sizeof(struct vcc_state), (void *)(uintptr_t)PF_ATMSVC);
+	proc_create_seq_private("vc", 0444, atm_proc_root, &vcc_seq_ops,
+			sizeof(struct vcc_state), NULL);
+	return 0;
 }
 
 void atm_proc_exit(void)
 {
-	atm_proc_dirs_remove();
+	remove_proc_subtree("atm", init_net.proc_net);
 }
diff --git a/net/atm/pvc.c b/net/atm/pvc.c
index 2cb10af..9f75092 100644
--- a/net/atm/pvc.c
+++ b/net/atm/pvc.c
@@ -113,7 +113,7 @@ static const struct proto_ops pvc_proto_ops = {
 	.socketpair =	sock_no_socketpair,
 	.accept =	sock_no_accept,
 	.getname =	pvc_getname,
-	.poll =		vcc_poll,
+	.poll_mask =	vcc_poll_mask,
 	.ioctl =	vcc_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl = vcc_compat_ioctl,
diff --git a/net/atm/svc.c b/net/atm/svc.c
index 2f91b76..53f4ad7 100644
--- a/net/atm/svc.c
+++ b/net/atm/svc.c
@@ -636,7 +636,7 @@ static const struct proto_ops svc_proto_ops = {
 	.socketpair =	sock_no_socketpair,
 	.accept =	svc_accept,
 	.getname =	svc_getname,
-	.poll =		vcc_poll,
+	.poll_mask =	vcc_poll_mask,
 	.ioctl =	svc_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl =	svc_compat_ioctl,
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index 2b41366..d1d2442 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -1924,19 +1924,6 @@ static const struct seq_operations ax25_info_seqops = {
 	.stop = ax25_info_stop,
 	.show = ax25_info_show,
 };
-
-static int ax25_info_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &ax25_info_seqops);
-}
-
-static const struct file_operations ax25_info_fops = {
-	.open = ax25_info_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = seq_release,
-};
-
 #endif
 
 static const struct net_proto_family ax25_family_ops = {
@@ -1954,7 +1941,7 @@ static const struct proto_ops ax25_proto_ops = {
 	.socketpair	= sock_no_socketpair,
 	.accept		= ax25_accept,
 	.getname	= ax25_getname,
-	.poll		= datagram_poll,
+	.poll_mask	= datagram_poll_mask,
 	.ioctl		= ax25_ioctl,
 	.listen		= ax25_listen,
 	.shutdown	= ax25_shutdown,
@@ -1989,10 +1976,10 @@ static int __init ax25_init(void)
 	dev_add_pack(&ax25_packet_type);
 	register_netdevice_notifier(&ax25_dev_notifier);
 
-	proc_create("ax25_route", 0444, init_net.proc_net,
-		    &ax25_route_fops);
-	proc_create("ax25", 0444, init_net.proc_net, &ax25_info_fops);
-	proc_create("ax25_calls", 0444, init_net.proc_net, &ax25_uid_fops);
+	proc_create_seq("ax25_route", 0444, init_net.proc_net, &ax25_rt_seqops);
+	proc_create_seq("ax25", 0444, init_net.proc_net, &ax25_info_seqops);
+	proc_create_seq("ax25_calls", 0444, init_net.proc_net,
+			&ax25_uid_seqops);
 out:
 	return rc;
 }
diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c
index 5255589..a0eff32 100644
--- a/net/ax25/ax25_route.c
+++ b/net/ax25/ax25_route.c
@@ -323,25 +323,12 @@ static int ax25_rt_seq_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
-static const struct seq_operations ax25_rt_seqops = {
+const struct seq_operations ax25_rt_seqops = {
 	.start = ax25_rt_seq_start,
 	.next = ax25_rt_seq_next,
 	.stop = ax25_rt_seq_stop,
 	.show = ax25_rt_seq_show,
 };
-
-static int ax25_rt_info_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &ax25_rt_seqops);
-}
-
-const struct file_operations ax25_route_fops = {
-	.open = ax25_rt_info_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = seq_release,
-};
-
 #endif
 
 /*
diff --git a/net/ax25/ax25_uid.c b/net/ax25/ax25_uid.c
index 4ebe91b..99d02e3 100644
--- a/net/ax25/ax25_uid.c
+++ b/net/ax25/ax25_uid.c
@@ -181,25 +181,12 @@ static int ax25_uid_seq_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
-static const struct seq_operations ax25_uid_seqops = {
+const struct seq_operations ax25_uid_seqops = {
 	.start = ax25_uid_seq_start,
 	.next = ax25_uid_seq_next,
 	.stop = ax25_uid_seq_stop,
 	.show = ax25_uid_seq_show,
 };
-
-static int ax25_uid_info_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &ax25_uid_seqops);
-}
-
-const struct file_operations ax25_uid_fops = {
-	.open = ax25_uid_info_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = seq_release,
-};
-
 #endif
 
 /*
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index 84d92a0..510ab4f 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -437,16 +437,13 @@ static inline __poll_t bt_accept_poll(struct sock *parent)
 	return 0;
 }
 
-__poll_t bt_sock_poll(struct file *file, struct socket *sock,
-			  poll_table *wait)
+__poll_t bt_sock_poll_mask(struct socket *sock, __poll_t events)
 {
 	struct sock *sk = sock->sk;
 	__poll_t mask = 0;
 
 	BT_DBG("sock %p, sk %p", sock, sk);
 
-	poll_wait(file, sk_sleep(sk), wait);
-
 	if (sk->sk_state == BT_LISTEN)
 		return bt_accept_poll(sk);
 
@@ -478,7 +475,7 @@ __poll_t bt_sock_poll(struct file *file, struct socket *sock,
 
 	return mask;
 }
-EXPORT_SYMBOL(bt_sock_poll);
+EXPORT_SYMBOL(bt_sock_poll_mask);
 
 int bt_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 {
@@ -605,15 +602,10 @@ int bt_sock_wait_ready(struct sock *sk, unsigned long flags)
 EXPORT_SYMBOL(bt_sock_wait_ready);
 
 #ifdef CONFIG_PROC_FS
-struct bt_seq_state {
-	struct bt_sock_list *l;
-};
-
 static void *bt_seq_start(struct seq_file *seq, loff_t *pos)
 	__acquires(seq->private->l->lock)
 {
-	struct bt_seq_state *s = seq->private;
-	struct bt_sock_list *l = s->l;
+	struct bt_sock_list *l = PDE_DATA(file_inode(seq->file));
 
 	read_lock(&l->lock);
 	return seq_hlist_start_head(&l->head, *pos);
@@ -621,8 +613,7 @@ static void *bt_seq_start(struct seq_file *seq, loff_t *pos)
 
 static void *bt_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
-	struct bt_seq_state *s = seq->private;
-	struct bt_sock_list *l = s->l;
+	struct bt_sock_list *l = PDE_DATA(file_inode(seq->file));
 
 	return seq_hlist_next(v, &l->head, pos);
 }
@@ -630,16 +621,14 @@ static void *bt_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 static void bt_seq_stop(struct seq_file *seq, void *v)
 	__releases(seq->private->l->lock)
 {
-	struct bt_seq_state *s = seq->private;
-	struct bt_sock_list *l = s->l;
+	struct bt_sock_list *l = PDE_DATA(file_inode(seq->file));
 
 	read_unlock(&l->lock);
 }
 
 static int bt_seq_show(struct seq_file *seq, void *v)
 {
-	struct bt_seq_state *s = seq->private;
-	struct bt_sock_list *l = s->l;
+	struct bt_sock_list *l = PDE_DATA(file_inode(seq->file));
 
 	if (v == SEQ_START_TOKEN) {
 		seq_puts(seq ,"sk               RefCnt Rmem   Wmem   User   Inode  Parent");
@@ -681,35 +670,13 @@ static const struct seq_operations bt_seq_ops = {
 	.show  = bt_seq_show,
 };
 
-static int bt_seq_open(struct inode *inode, struct file *file)
-{
-	struct bt_sock_list *sk_list;
-	struct bt_seq_state *s;
-
-	sk_list = PDE_DATA(inode);
-	s = __seq_open_private(file, &bt_seq_ops,
-			       sizeof(struct bt_seq_state));
-	if (!s)
-		return -ENOMEM;
-
-	s->l = sk_list;
-	return 0;
-}
-
-static const struct file_operations bt_fops = {
-	.open = bt_seq_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = seq_release_private
-};
-
 int bt_procfs_init(struct net *net, const char *name,
 		   struct bt_sock_list *sk_list,
 		   int (* seq_show)(struct seq_file *, void *))
 {
 	sk_list->custom_seq_show = seq_show;
 
-	if (!proc_create_data(name, 0, net->proc_net, &bt_fops, sk_list))
+	if (!proc_create_seq_data(name, 0, net->proc_net, &bt_seq_ops, sk_list))
 		return -ENOMEM;
 	return 0;
 }
diff --git a/net/bluetooth/bnep/sock.c b/net/bluetooth/bnep/sock.c
index b5116fa..00deacd 100644
--- a/net/bluetooth/bnep/sock.c
+++ b/net/bluetooth/bnep/sock.c
@@ -175,7 +175,6 @@ static const struct proto_ops bnep_sock_ops = {
 	.getname	= sock_no_getname,
 	.sendmsg	= sock_no_sendmsg,
 	.recvmsg	= sock_no_recvmsg,
-	.poll		= sock_no_poll,
 	.listen		= sock_no_listen,
 	.shutdown	= sock_no_shutdown,
 	.setsockopt	= sock_no_setsockopt,
diff --git a/net/bluetooth/cmtp/capi.c b/net/bluetooth/cmtp/capi.c
index 426a92f..eb415560 100644
--- a/net/bluetooth/cmtp/capi.c
+++ b/net/bluetooth/cmtp/capi.c
@@ -521,18 +521,6 @@ static int cmtp_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int cmtp_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, cmtp_proc_show, PDE_DATA(inode));
-}
-
-static const struct file_operations cmtp_proc_fops = {
-	.open		= cmtp_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 int cmtp_attach_device(struct cmtp_session *session)
 {
 	unsigned char buf[4];
@@ -571,7 +559,7 @@ int cmtp_attach_device(struct cmtp_session *session)
 	session->ctrl.send_message  = cmtp_send_message;
 
 	session->ctrl.procinfo      = cmtp_procinfo;
-	session->ctrl.proc_fops = &cmtp_proc_fops;
+	session->ctrl.proc_show     = cmtp_proc_show;
 
 	if (attach_capi_ctr(&session->ctrl) < 0) {
 		BT_ERR("Can't attach new controller");
diff --git a/net/bluetooth/cmtp/sock.c b/net/bluetooth/cmtp/sock.c
index ce86a7b..e08f28fa 100644
--- a/net/bluetooth/cmtp/sock.c
+++ b/net/bluetooth/cmtp/sock.c
@@ -178,7 +178,6 @@ static const struct proto_ops cmtp_sock_ops = {
 	.getname	= sock_no_getname,
 	.sendmsg	= sock_no_sendmsg,
 	.recvmsg	= sock_no_recvmsg,
-	.poll		= sock_no_poll,
 	.listen		= sock_no_listen,
 	.shutdown	= sock_no_shutdown,
 	.setsockopt	= sock_no_setsockopt,
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index 1506e16..d6c0998 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -1975,7 +1975,7 @@ static const struct proto_ops hci_sock_ops = {
 	.sendmsg	= hci_sock_sendmsg,
 	.recvmsg	= hci_sock_recvmsg,
 	.ioctl		= hci_sock_ioctl,
-	.poll		= datagram_poll,
+	.poll_mask	= datagram_poll_mask,
 	.listen		= sock_no_listen,
 	.shutdown	= sock_no_shutdown,
 	.setsockopt	= hci_sock_setsockopt,
diff --git a/net/bluetooth/hidp/sock.c b/net/bluetooth/hidp/sock.c
index 008ba43..1eaac01 100644
--- a/net/bluetooth/hidp/sock.c
+++ b/net/bluetooth/hidp/sock.c
@@ -208,7 +208,6 @@ static const struct proto_ops hidp_sock_ops = {
 	.getname	= sock_no_getname,
 	.sendmsg	= sock_no_sendmsg,
 	.recvmsg	= sock_no_recvmsg,
-	.poll		= sock_no_poll,
 	.listen		= sock_no_listen,
 	.shutdown	= sock_no_shutdown,
 	.setsockopt	= sock_no_setsockopt,
diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c
index 686bdc6..742a190 100644
--- a/net/bluetooth/l2cap_sock.c
+++ b/net/bluetooth/l2cap_sock.c
@@ -1653,7 +1653,7 @@ static const struct proto_ops l2cap_sock_ops = {
 	.getname	= l2cap_sock_getname,
 	.sendmsg	= l2cap_sock_sendmsg,
 	.recvmsg	= l2cap_sock_recvmsg,
-	.poll		= bt_sock_poll,
+	.poll_mask	= bt_sock_poll_mask,
 	.ioctl		= bt_sock_ioctl,
 	.mmap		= sock_no_mmap,
 	.socketpair	= sock_no_socketpair,
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index d606e92..1cf5762 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -1049,7 +1049,7 @@ static const struct proto_ops rfcomm_sock_ops = {
 	.setsockopt	= rfcomm_sock_setsockopt,
 	.getsockopt	= rfcomm_sock_getsockopt,
 	.ioctl		= rfcomm_sock_ioctl,
-	.poll		= bt_sock_poll,
+	.poll_mask	= bt_sock_poll_mask,
 	.socketpair	= sock_no_socketpair,
 	.mmap		= sock_no_mmap
 };
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index 413b8ee..d60dbc6 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -1197,7 +1197,7 @@ static const struct proto_ops sco_sock_ops = {
 	.getname	= sco_sock_getname,
 	.sendmsg	= sco_sock_sendmsg,
 	.recvmsg	= sco_sock_recvmsg,
-	.poll		= bt_sock_poll,
+	.poll_mask	= bt_sock_poll_mask,
 	.ioctl		= bt_sock_ioctl,
 	.mmap		= sock_no_mmap,
 	.socketpair	= sock_no_socketpair,
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index a6fb1b3..c799186 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -934,15 +934,11 @@ static int caif_release(struct socket *sock)
 }
 
 /* Copied from af_unix.c:unix_poll(), added CAIF tx_flow handling */
-static __poll_t caif_poll(struct file *file,
-			      struct socket *sock, poll_table *wait)
+static __poll_t caif_poll_mask(struct socket *sock, __poll_t events)
 {
 	struct sock *sk = sock->sk;
-	__poll_t mask;
 	struct caifsock *cf_sk = container_of(sk, struct caifsock, sk);
-
-	sock_poll_wait(file, sk_sleep(sk), wait);
-	mask = 0;
+	__poll_t mask = 0;
 
 	/* exceptional events? */
 	if (sk->sk_err)
@@ -976,7 +972,7 @@ static const struct proto_ops caif_seqpacket_ops = {
 	.socketpair = sock_no_socketpair,
 	.accept = sock_no_accept,
 	.getname = sock_no_getname,
-	.poll = caif_poll,
+	.poll_mask = caif_poll_mask,
 	.ioctl = sock_no_ioctl,
 	.listen = sock_no_listen,
 	.shutdown = sock_no_shutdown,
@@ -997,7 +993,7 @@ static const struct proto_ops caif_stream_ops = {
 	.socketpair = sock_no_socketpair,
 	.accept = sock_no_accept,
 	.getname = sock_no_getname,
-	.poll = caif_poll,
+	.poll_mask = caif_poll_mask,
 	.ioctl = sock_no_ioctl,
 	.listen = sock_no_listen,
 	.shutdown = sock_no_shutdown,
diff --git a/net/can/bcm.c b/net/can/bcm.c
index ac5e5e3..97fedff 100644
--- a/net/can/bcm.c
+++ b/net/can/bcm.c
@@ -239,18 +239,6 @@ static int bcm_proc_show(struct seq_file *m, void *v)
 	seq_putc(m, '\n');
 	return 0;
 }
-
-static int bcm_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open_net(inode, file, bcm_proc_show);
-}
-
-static const struct file_operations bcm_proc_fops = {
-	.open		= bcm_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
 #endif /* CONFIG_PROC_FS */
 
 /*
@@ -1606,9 +1594,9 @@ static int bcm_connect(struct socket *sock, struct sockaddr *uaddr, int len,
 	if (net->can.bcmproc_dir) {
 		/* unique socket address as filename */
 		sprintf(bo->procname, "%lu", sock_i_ino(sk));
-		bo->bcm_proc_read = proc_create_data(bo->procname, 0644,
+		bo->bcm_proc_read = proc_create_net_single(bo->procname, 0644,
 						     net->can.bcmproc_dir,
-						     &bcm_proc_fops, sk);
+						     bcm_proc_show, sk);
 		if (!bo->bcm_proc_read) {
 			ret = -ENOMEM;
 			goto fail;
@@ -1669,7 +1657,7 @@ static const struct proto_ops bcm_ops = {
 	.socketpair    = sock_no_socketpair,
 	.accept        = sock_no_accept,
 	.getname       = sock_no_getname,
-	.poll          = datagram_poll,
+	.poll_mask     = datagram_poll_mask,
 	.ioctl         = can_ioctl,	/* use can_ioctl() from af_can.c */
 	.listen        = sock_no_listen,
 	.shutdown      = sock_no_shutdown,
diff --git a/net/can/proc.c b/net/can/proc.c
index fdf704e..70fea17 100644
--- a/net/can/proc.c
+++ b/net/can/proc.c
@@ -270,18 +270,6 @@ static int can_stats_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int can_stats_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open_net(inode, file, can_stats_proc_show);
-}
-
-static const struct file_operations can_stats_proc_fops = {
-	.open		= can_stats_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int can_reset_stats_proc_show(struct seq_file *m, void *v)
 {
 	struct net *net = m->private;
@@ -303,36 +291,12 @@ static int can_reset_stats_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int can_reset_stats_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open_net(inode, file, can_reset_stats_proc_show);
-}
-
-static const struct file_operations can_reset_stats_proc_fops = {
-	.open		= can_reset_stats_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int can_version_proc_show(struct seq_file *m, void *v)
 {
 	seq_printf(m, "%s\n", CAN_VERSION_STRING);
 	return 0;
 }
 
-static int can_version_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open_net(inode, file, can_version_proc_show);
-}
-
-static const struct file_operations can_version_proc_fops = {
-	.open		= can_version_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static inline void can_rcvlist_proc_show_one(struct seq_file *m, int idx,
 					     struct net_device *dev,
 					     struct can_dev_rcv_lists *d)
@@ -373,18 +337,6 @@ static int can_rcvlist_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int can_rcvlist_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open_net(inode, file, can_rcvlist_proc_show);
-}
-
-static const struct file_operations can_rcvlist_proc_fops = {
-	.open		= can_rcvlist_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static inline void can_rcvlist_proc_show_array(struct seq_file *m,
 					       struct net_device *dev,
 					       struct hlist_head *rcv_array,
@@ -440,19 +392,6 @@ static int can_rcvlist_sff_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int can_rcvlist_sff_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open_net(inode, file, can_rcvlist_sff_proc_show);
-}
-
-static const struct file_operations can_rcvlist_sff_proc_fops = {
-	.open		= can_rcvlist_sff_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
-
 static int can_rcvlist_eff_proc_show(struct seq_file *m, void *v)
 {
 	struct net_device *dev;
@@ -483,18 +422,6 @@ static int can_rcvlist_eff_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int can_rcvlist_eff_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open_net(inode, file, can_rcvlist_eff_proc_show);
-}
-
-static const struct file_operations can_rcvlist_eff_proc_fops = {
-	.open		= can_rcvlist_eff_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 /*
  * can_init_proc - create main CAN proc directory and procfs entries
  */
@@ -510,37 +437,29 @@ void can_init_proc(struct net *net)
 	}
 
 	/* own procfs entries from the AF_CAN core */
-	net->can.pde_version     = proc_create(CAN_PROC_VERSION, 0644,
-					       net->can.proc_dir,
-					       &can_version_proc_fops);
-	net->can.pde_stats       = proc_create(CAN_PROC_STATS, 0644,
-					       net->can.proc_dir,
-					       &can_stats_proc_fops);
-	net->can.pde_reset_stats = proc_create(CAN_PROC_RESET_STATS, 0644,
-					       net->can.proc_dir,
-					       &can_reset_stats_proc_fops);
-	net->can.pde_rcvlist_err = proc_create_data(CAN_PROC_RCVLIST_ERR, 0644,
-						    net->can.proc_dir,
-						    &can_rcvlist_proc_fops,
-						    (void *)RX_ERR);
-	net->can.pde_rcvlist_all = proc_create_data(CAN_PROC_RCVLIST_ALL, 0644,
-						    net->can.proc_dir,
-						    &can_rcvlist_proc_fops,
-						    (void *)RX_ALL);
-	net->can.pde_rcvlist_fil = proc_create_data(CAN_PROC_RCVLIST_FIL, 0644,
-						    net->can.proc_dir,
-						    &can_rcvlist_proc_fops,
-						    (void *)RX_FIL);
-	net->can.pde_rcvlist_inv = proc_create_data(CAN_PROC_RCVLIST_INV, 0644,
-						    net->can.proc_dir,
-						    &can_rcvlist_proc_fops,
-						    (void *)RX_INV);
-	net->can.pde_rcvlist_eff = proc_create(CAN_PROC_RCVLIST_EFF, 0644,
-					       net->can.proc_dir,
-					       &can_rcvlist_eff_proc_fops);
-	net->can.pde_rcvlist_sff = proc_create(CAN_PROC_RCVLIST_SFF, 0644,
-					       net->can.proc_dir,
-					       &can_rcvlist_sff_proc_fops);
+	net->can.pde_version = proc_create_net_single(CAN_PROC_VERSION, 0644,
+			net->can.proc_dir, can_version_proc_show, NULL);
+	net->can.pde_stats = proc_create_net_single(CAN_PROC_STATS, 0644,
+			net->can.proc_dir, can_stats_proc_show, NULL);
+	net->can.pde_reset_stats = proc_create_net_single(CAN_PROC_RESET_STATS,
+			0644, net->can.proc_dir, can_reset_stats_proc_show,
+			NULL);
+	net->can.pde_rcvlist_err = proc_create_net_single(CAN_PROC_RCVLIST_ERR,
+			0644, net->can.proc_dir, can_rcvlist_proc_show,
+			(void *)RX_ERR);
+	net->can.pde_rcvlist_all = proc_create_net_single(CAN_PROC_RCVLIST_ALL,
+			0644, net->can.proc_dir, can_rcvlist_proc_show,
+			(void *)RX_ALL);
+	net->can.pde_rcvlist_fil = proc_create_net_single(CAN_PROC_RCVLIST_FIL,
+			0644, net->can.proc_dir, can_rcvlist_proc_show,
+			(void *)RX_FIL);
+	net->can.pde_rcvlist_inv = proc_create_net_single(CAN_PROC_RCVLIST_INV,
+			0644, net->can.proc_dir, can_rcvlist_proc_show,
+			(void *)RX_INV);
+	net->can.pde_rcvlist_eff = proc_create_net_single(CAN_PROC_RCVLIST_EFF,
+			0644, net->can.proc_dir, can_rcvlist_eff_proc_show, NULL);
+	net->can.pde_rcvlist_sff = proc_create_net_single(CAN_PROC_RCVLIST_SFF,
+			0644, net->can.proc_dir, can_rcvlist_sff_proc_show, NULL);
 }
 
 /*
diff --git a/net/can/raw.c b/net/can/raw.c
index 1051eee..fd7e2f4 100644
--- a/net/can/raw.c
+++ b/net/can/raw.c
@@ -843,7 +843,7 @@ static const struct proto_ops raw_ops = {
 	.socketpair    = sock_no_socketpair,
 	.accept        = sock_no_accept,
 	.getname       = raw_getname,
-	.poll          = datagram_poll,
+	.poll_mask     = datagram_poll_mask,
 	.ioctl         = can_ioctl,	/* use can_ioctl() from af_can.c */
 	.listen        = sock_no_listen,
 	.shutdown      = sock_no_shutdown,
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 9938952..f19bf3d 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -819,9 +819,8 @@ EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg);
 
 /**
  * 	datagram_poll - generic datagram poll
- *	@file: file struct
  *	@sock: socket
- *	@wait: poll table
+ *	@events to wait for
  *
  *	Datagram poll: Again totally generic. This also handles
  *	sequenced packet sockets providing the socket receive queue
@@ -831,14 +830,10 @@ EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg);
  *	and you use a different write policy from sock_writeable()
  *	then please supply your own write_space callback.
  */
-__poll_t datagram_poll(struct file *file, struct socket *sock,
-			   poll_table *wait)
+__poll_t datagram_poll_mask(struct socket *sock, __poll_t events)
 {
 	struct sock *sk = sock->sk;
-	__poll_t mask;
-
-	sock_poll_wait(file, sk_sleep(sk), wait);
-	mask = 0;
+	__poll_t mask = 0;
 
 	/* exceptional events? */
 	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
@@ -871,4 +866,4 @@ __poll_t datagram_poll(struct file *file, struct socket *sock,
 
 	return mask;
 }
-EXPORT_SYMBOL(datagram_poll);
+EXPORT_SYMBOL(datagram_poll_mask);
diff --git a/net/core/dev.c b/net/core/dev.c
index 2af787e..983b277 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2884,11 +2884,7 @@ void netdev_rx_csum_fault(struct net_device *dev)
 EXPORT_SYMBOL(netdev_rx_csum_fault);
 #endif
 
-/* Actually, we should eliminate this check as soon as we know, that:
- * 1. IOMMU is present and allows to map all the memory.
- * 2. No high memory really exists on this machine.
- */
-
+/* XXX: check that highmem exists at all on the given machine. */
 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
 {
 #ifdef CONFIG_HIGHMEM
@@ -2902,20 +2898,6 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
 				return 1;
 		}
 	}
-
-	if (PCI_DMA_BUS_IS_PHYS) {
-		struct device *pdev = dev->dev.parent;
-
-		if (!pdev)
-			return 0;
-		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
-			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
-			dma_addr_t addr = page_to_phys(skb_frag_page(frag));
-
-			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
-				return 1;
-		}
-	}
 #endif
 	return 0;
 }
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index ce51986..1fb43bf 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -59,7 +59,7 @@ static int pneigh_ifdown_and_unlock(struct neigh_table *tbl,
 				    struct net_device *dev);
 
 #ifdef CONFIG_PROC_FS
-static const struct file_operations neigh_stat_seq_fops;
+static const struct seq_operations neigh_stat_seq_ops;
 #endif
 
 /*
@@ -1558,8 +1558,8 @@ void neigh_table_init(int index, struct neigh_table *tbl)
 		panic("cannot create neighbour cache statistics");
 
 #ifdef CONFIG_PROC_FS
-	if (!proc_create_data(tbl->id, 0, init_net.proc_net_stat,
-			      &neigh_stat_seq_fops, tbl))
+	if (!proc_create_seq_data(tbl->id, 0, init_net.proc_net_stat,
+			      &neigh_stat_seq_ops, tbl))
 		panic("cannot create neighbour proc dir entry");
 #endif
 
@@ -2786,7 +2786,7 @@ EXPORT_SYMBOL(neigh_seq_stop);
 
 static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos)
 {
-	struct neigh_table *tbl = seq->private;
+	struct neigh_table *tbl = PDE_DATA(file_inode(seq->file));
 	int cpu;
 
 	if (*pos == 0)
@@ -2803,7 +2803,7 @@ static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos)
 
 static void *neigh_stat_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
-	struct neigh_table *tbl = seq->private;
+	struct neigh_table *tbl = PDE_DATA(file_inode(seq->file));
 	int cpu;
 
 	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
@@ -2822,7 +2822,7 @@ static void neigh_stat_seq_stop(struct seq_file *seq, void *v)
 
 static int neigh_stat_seq_show(struct seq_file *seq, void *v)
 {
-	struct neigh_table *tbl = seq->private;
+	struct neigh_table *tbl = PDE_DATA(file_inode(seq->file));
 	struct neigh_statistics *st = v;
 
 	if (v == SEQ_START_TOKEN) {
@@ -2861,25 +2861,6 @@ static const struct seq_operations neigh_stat_seq_ops = {
 	.stop	= neigh_stat_seq_stop,
 	.show	= neigh_stat_seq_show,
 };
-
-static int neigh_stat_seq_open(struct inode *inode, struct file *file)
-{
-	int ret = seq_open(file, &neigh_stat_seq_ops);
-
-	if (!ret) {
-		struct seq_file *sf = file->private_data;
-		sf->private = PDE_DATA(inode);
-	}
-	return ret;
-};
-
-static const struct file_operations neigh_stat_seq_fops = {
-	.open 	 = neigh_stat_seq_open,
-	.read	 = seq_read,
-	.llseek	 = seq_lseek,
-	.release = seq_release,
-};
-
 #endif /* CONFIG_PROC_FS */
 
 static inline size_t neigh_nlmsg_size(void)
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
index 9737302..63881f7 100644
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -175,19 +175,6 @@ static const struct seq_operations dev_seq_ops = {
 	.show  = dev_seq_show,
 };
 
-static int dev_seq_open(struct inode *inode, struct file *file)
-{
-	return seq_open_net(inode, file, &dev_seq_ops,
-			    sizeof(struct seq_net_private));
-}
-
-static const struct file_operations dev_seq_fops = {
-	.open    = dev_seq_open,
-	.read    = seq_read,
-	.llseek  = seq_lseek,
-	.release = seq_release_net,
-};
-
 static const struct seq_operations softnet_seq_ops = {
 	.start = softnet_seq_start,
 	.next  = softnet_seq_next,
@@ -195,18 +182,6 @@ static const struct seq_operations softnet_seq_ops = {
 	.show  = softnet_seq_show,
 };
 
-static int softnet_seq_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &softnet_seq_ops);
-}
-
-static const struct file_operations softnet_seq_fops = {
-	.open    = softnet_seq_open,
-	.read    = seq_read,
-	.llseek  = seq_lseek,
-	.release = seq_release,
-};
-
 static void *ptype_get_idx(loff_t pos)
 {
 	struct packet_type *pt = NULL;
@@ -297,30 +272,18 @@ static const struct seq_operations ptype_seq_ops = {
 	.show  = ptype_seq_show,
 };
 
-static int ptype_seq_open(struct inode *inode, struct file *file)
-{
-	return seq_open_net(inode, file, &ptype_seq_ops,
-			sizeof(struct seq_net_private));
-}
-
-static const struct file_operations ptype_seq_fops = {
-	.open    = ptype_seq_open,
-	.read    = seq_read,
-	.llseek  = seq_lseek,
-	.release = seq_release_net,
-};
-
-
 static int __net_init dev_proc_net_init(struct net *net)
 {
 	int rc = -ENOMEM;
 
-	if (!proc_create("dev", 0444, net->proc_net, &dev_seq_fops))
+	if (!proc_create_net("dev", 0444, net->proc_net, &dev_seq_ops,
+			sizeof(struct seq_net_private)))
 		goto out;
-	if (!proc_create("softnet_stat", 0444, net->proc_net,
-			 &softnet_seq_fops))
+	if (!proc_create_seq("softnet_stat", 0444, net->proc_net,
+			 &softnet_seq_ops))
 		goto out_dev;
-	if (!proc_create("ptype", 0444, net->proc_net, &ptype_seq_fops))
+	if (!proc_create_net("ptype", 0444, net->proc_net, &ptype_seq_ops,
+			sizeof(struct seq_net_private)))
 		goto out_softnet;
 
 	if (wext_proc_init(net))
@@ -377,22 +340,10 @@ static const struct seq_operations dev_mc_seq_ops = {
 	.show  = dev_mc_seq_show,
 };
 
-static int dev_mc_seq_open(struct inode *inode, struct file *file)
-{
-	return seq_open_net(inode, file, &dev_mc_seq_ops,
-			    sizeof(struct seq_net_private));
-}
-
-static const struct file_operations dev_mc_seq_fops = {
-	.open    = dev_mc_seq_open,
-	.read    = seq_read,
-	.llseek  = seq_lseek,
-	.release = seq_release_net,
-};
-
 static int __net_init dev_mc_net_init(struct net *net)
 {
-	if (!proc_create("dev_mcast", 0, net->proc_net, &dev_mc_seq_fops))
+	if (!proc_create_net("dev_mcast", 0, net->proc_net, &dev_mc_seq_ops,
+			sizeof(struct seq_net_private)))
 		return -ENOMEM;
 	return 0;
 }
diff --git a/net/core/sock.c b/net/core/sock.c
index 3b6d028..2aed99a 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2567,12 +2567,6 @@ int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
 }
 EXPORT_SYMBOL(sock_no_getname);
 
-__poll_t sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
-{
-	return 0;
-}
-EXPORT_SYMBOL(sock_no_poll);
-
 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 {
 	return -EOPNOTSUPP;
@@ -3439,22 +3433,10 @@ static const struct seq_operations proto_seq_ops = {
 	.show   = proto_seq_show,
 };
 
-static int proto_seq_open(struct inode *inode, struct file *file)
-{
-	return seq_open_net(inode, file, &proto_seq_ops,
-			    sizeof(struct seq_net_private));
-}
-
-static const struct file_operations proto_seq_fops = {
-	.open		= proto_seq_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release_net,
-};
-
 static __net_init int proto_init_net(struct net *net)
 {
-	if (!proc_create("protocols", 0444, net->proc_net, &proto_seq_fops))
+	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
+			sizeof(struct seq_net_private)))
 		return -ENOMEM;
 
 	return 0;
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index f91e381..0ea2ee5 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -316,8 +316,7 @@ int dccp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 		 int flags, int *addr_len);
 void dccp_shutdown(struct sock *sk, int how);
 int inet_dccp_listen(struct socket *sock, int backlog);
-__poll_t dccp_poll(struct file *file, struct socket *sock,
-		       poll_table *wait);
+__poll_t dccp_poll_mask(struct socket *sock, __poll_t events);
 int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
 void dccp_req_err(struct sock *sk, u64 seq);
 
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index b08feb2..a9e478c 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -984,7 +984,7 @@ static const struct proto_ops inet_dccp_ops = {
 	.accept		   = inet_accept,
 	.getname	   = inet_getname,
 	/* FIXME: work on tcp_poll to rename it to inet_csk_poll */
-	.poll		   = dccp_poll,
+	.poll_mask	   = dccp_poll_mask,
 	.ioctl		   = inet_ioctl,
 	/* FIXME: work on inet_listen to rename it to sock_common_listen */
 	.listen		   = inet_dccp_listen,
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 6344f1b..17fc4e0 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -1070,7 +1070,7 @@ static const struct proto_ops inet6_dccp_ops = {
 	.socketpair	   = sock_no_socketpair,
 	.accept		   = inet_accept,
 	.getname	   = inet6_getname,
-	.poll		   = dccp_poll,
+	.poll_mask	   = dccp_poll_mask,
 	.ioctl		   = inet6_ioctl,
 	.listen		   = inet_dccp_listen,
 	.shutdown	   = inet_shutdown,
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 0d56e36..ca21c1c 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -312,20 +312,11 @@ int dccp_disconnect(struct sock *sk, int flags)
 
 EXPORT_SYMBOL_GPL(dccp_disconnect);
 
-/*
- *	Wait for a DCCP event.
- *
- *	Note that we don't need to lock the socket, as the upper poll layers
- *	take care of normal races (between the test and the event) and we don't
- *	go look at any of the socket buffers directly.
- */
-__poll_t dccp_poll(struct file *file, struct socket *sock,
-		       poll_table *wait)
+__poll_t dccp_poll_mask(struct socket *sock, __poll_t events)
 {
 	__poll_t mask;
 	struct sock *sk = sock->sk;
 
-	sock_poll_wait(file, sk_sleep(sk), wait);
 	if (sk->sk_state == DCCP_LISTEN)
 		return inet_csk_listen_poll(sk);
 
@@ -367,7 +358,7 @@ __poll_t dccp_poll(struct file *file, struct socket *sock,
 	return mask;
 }
 
-EXPORT_SYMBOL_GPL(dccp_poll);
+EXPORT_SYMBOL_GPL(dccp_poll_mask);
 
 int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 {
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index 3275160..9a686d8 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -1207,11 +1207,11 @@ static int dn_getname(struct socket *sock, struct sockaddr *uaddr,int peer)
 }
 
 
-static __poll_t dn_poll(struct file *file, struct socket *sock, poll_table  *wait)
+static __poll_t dn_poll_mask(struct socket *sock, __poll_t events)
 {
 	struct sock *sk = sock->sk;
 	struct dn_scp *scp = DN_SK(sk);
-	__poll_t mask = datagram_poll(file, sock, wait);
+	__poll_t mask = datagram_poll_mask(sock, events);
 
 	if (!skb_queue_empty(&scp->other_receive_queue))
 		mask |= EPOLLRDBAND;
@@ -2314,19 +2314,6 @@ static const struct seq_operations dn_socket_seq_ops = {
 	.stop	= dn_socket_seq_stop,
 	.show	= dn_socket_seq_show,
 };
-
-static int dn_socket_seq_open(struct inode *inode, struct file *file)
-{
-	return seq_open_private(file, &dn_socket_seq_ops,
-			sizeof(struct dn_iter_state));
-}
-
-static const struct file_operations dn_socket_seq_fops = {
-	.open		= dn_socket_seq_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release_private,
-};
 #endif
 
 static const struct net_proto_family	dn_family_ops = {
@@ -2344,7 +2331,7 @@ static const struct proto_ops dn_proto_ops = {
 	.socketpair =	sock_no_socketpair,
 	.accept =	dn_accept,
 	.getname =	dn_getname,
-	.poll =		dn_poll,
+	.poll_mask =	dn_poll_mask,
 	.ioctl =	dn_ioctl,
 	.listen =	dn_listen,
 	.shutdown =	dn_shutdown,
@@ -2383,7 +2370,9 @@ static int __init decnet_init(void)
 	dev_add_pack(&dn_dix_packet_type);
 	register_netdevice_notifier(&dn_dev_notifier);
 
-	proc_create("decnet", 0444, init_net.proc_net, &dn_socket_seq_fops);
+	proc_create_seq_private("decnet", 0444, init_net.proc_net,
+			&dn_socket_seq_ops, sizeof(struct dn_iter_state),
+			NULL);
 	dn_register_sysctl();
 out:
 	return rc;
diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c
index c03b046..bfd43e8 100644
--- a/net/decnet/dn_dev.c
+++ b/net/decnet/dn_dev.c
@@ -1382,19 +1382,6 @@ static const struct seq_operations dn_dev_seq_ops = {
 	.stop	= dn_dev_seq_stop,
 	.show	= dn_dev_seq_show,
 };
-
-static int dn_dev_seq_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &dn_dev_seq_ops);
-}
-
-static const struct file_operations dn_dev_seq_fops = {
-	.open	 = dn_dev_seq_open,
-	.read	 = seq_read,
-	.llseek	 = seq_lseek,
-	.release = seq_release,
-};
-
 #endif /* CONFIG_PROC_FS */
 
 static int addr[2];
@@ -1424,7 +1411,7 @@ void __init dn_dev_init(void)
 	rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_GETADDR,
 			     NULL, dn_nl_dump_ifaddr, 0);
 
-	proc_create("decnet_dev", 0444, init_net.proc_net, &dn_dev_seq_fops);
+	proc_create_seq("decnet_dev", 0444, init_net.proc_net, &dn_dev_seq_ops);
 
 #ifdef CONFIG_SYSCTL
 	{
diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c
index 1315616..94b306f 100644
--- a/net/decnet/dn_neigh.c
+++ b/net/decnet/dn_neigh.c
@@ -589,27 +589,13 @@ static const struct seq_operations dn_neigh_seq_ops = {
 	.stop  = neigh_seq_stop,
 	.show  = dn_neigh_seq_show,
 };
-
-static int dn_neigh_seq_open(struct inode *inode, struct file *file)
-{
-	return seq_open_net(inode, file, &dn_neigh_seq_ops,
-			    sizeof(struct neigh_seq_state));
-}
-
-static const struct file_operations dn_neigh_seq_fops = {
-	.open		= dn_neigh_seq_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release_net,
-};
-
 #endif
 
 void __init dn_neigh_init(void)
 {
 	neigh_table_init(NEIGH_DN_TABLE, &dn_neigh_table);
-	proc_create("decnet_neigh", 0444, init_net.proc_net,
-		    &dn_neigh_seq_fops);
+	proc_create_net("decnet_neigh", 0444, init_net.proc_net,
+			&dn_neigh_seq_ops, sizeof(struct neigh_seq_state));
 }
 
 void __exit dn_neigh_cleanup(void)
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index eca0cc6..e747650 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -1852,20 +1852,6 @@ static const struct seq_operations dn_rt_cache_seq_ops = {
 	.stop	= dn_rt_cache_seq_stop,
 	.show	= dn_rt_cache_seq_show,
 };
-
-static int dn_rt_cache_seq_open(struct inode *inode, struct file *file)
-{
-	return seq_open_private(file, &dn_rt_cache_seq_ops,
-			sizeof(struct dn_rt_cache_iter_state));
-}
-
-static const struct file_operations dn_rt_cache_seq_fops = {
-	.open	 = dn_rt_cache_seq_open,
-	.read	 = seq_read,
-	.llseek	 = seq_lseek,
-	.release = seq_release_private,
-};
-
 #endif /* CONFIG_PROC_FS */
 
 void __init dn_route_init(void)
@@ -1918,8 +1904,9 @@ void __init dn_route_init(void)
 
 	dn_dst_ops.gc_thresh = (dn_rt_hash_mask + 1);
 
-	proc_create("decnet_cache", 0444, init_net.proc_net,
-		    &dn_rt_cache_seq_fops);
+	proc_create_seq_private("decnet_cache", 0444, init_net.proc_net,
+			&dn_rt_cache_seq_ops,
+			sizeof(struct dn_rt_cache_iter_state), NULL);
 
 #ifdef CONFIG_DECNET_ROUTER
 	rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_GETROUTE,
diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c
index a60658c..a0768d2 100644
--- a/net/ieee802154/socket.c
+++ b/net/ieee802154/socket.c
@@ -423,7 +423,7 @@ static const struct proto_ops ieee802154_raw_ops = {
 	.socketpair	   = sock_no_socketpair,
 	.accept		   = sock_no_accept,
 	.getname	   = sock_no_getname,
-	.poll		   = datagram_poll,
+	.poll_mask	   = datagram_poll_mask,
 	.ioctl		   = ieee802154_sock_ioctl,
 	.listen		   = sock_no_listen,
 	.shutdown	   = sock_no_shutdown,
@@ -969,7 +969,7 @@ static const struct proto_ops ieee802154_dgram_ops = {
 	.socketpair	   = sock_no_socketpair,
 	.accept		   = sock_no_accept,
 	.getname	   = sock_no_getname,
-	.poll		   = datagram_poll,
+	.poll_mask	   = datagram_poll_mask,
 	.ioctl		   = ieee802154_sock_ioctl,
 	.listen		   = sock_no_listen,
 	.shutdown	   = sock_no_shutdown,
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index eaed036..8a59428 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -986,7 +986,7 @@ const struct proto_ops inet_stream_ops = {
 	.socketpair	   = sock_no_socketpair,
 	.accept		   = inet_accept,
 	.getname	   = inet_getname,
-	.poll		   = tcp_poll,
+	.poll_mask	   = tcp_poll_mask,
 	.ioctl		   = inet_ioctl,
 	.listen		   = inet_listen,
 	.shutdown	   = inet_shutdown,
@@ -1018,7 +1018,7 @@ const struct proto_ops inet_dgram_ops = {
 	.socketpair	   = sock_no_socketpair,
 	.accept		   = sock_no_accept,
 	.getname	   = inet_getname,
-	.poll		   = udp_poll,
+	.poll_mask	   = udp_poll_mask,
 	.ioctl		   = inet_ioctl,
 	.listen		   = sock_no_listen,
 	.shutdown	   = inet_shutdown,
@@ -1039,7 +1039,7 @@ EXPORT_SYMBOL(inet_dgram_ops);
 
 /*
  * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without
- * udp_poll
+ * udp_poll_mask
  */
 static const struct proto_ops inet_sockraw_ops = {
 	.family		   = PF_INET,
@@ -1050,7 +1050,7 @@ static const struct proto_ops inet_sockraw_ops = {
 	.socketpair	   = sock_no_socketpair,
 	.accept		   = sock_no_accept,
 	.getname	   = inet_getname,
-	.poll		   = datagram_poll,
+	.poll_mask	   = datagram_poll_mask,
 	.ioctl		   = inet_ioctl,
 	.listen		   = sock_no_listen,
 	.shutdown	   = inet_shutdown,
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index bf6c2d4..e90c89e 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -1418,23 +1418,12 @@ static const struct seq_operations arp_seq_ops = {
 	.show	= arp_seq_show,
 };
 
-static int arp_seq_open(struct inode *inode, struct file *file)
-{
-	return seq_open_net(inode, file, &arp_seq_ops,
-			    sizeof(struct neigh_seq_state));
-}
-
-static const struct file_operations arp_seq_fops = {
-	.open           = arp_seq_open,
-	.read           = seq_read,
-	.llseek         = seq_lseek,
-	.release	= seq_release_net,
-};
-
+/* ------------------------------------------------------------------------ */
 
 static int __net_init arp_net_init(struct net *net)
 {
-	if (!proc_create("arp", 0444, net->proc_net, &arp_seq_fops))
+	if (!proc_create_net("arp", 0444, net->proc_net, &arp_seq_ops,
+			sizeof(struct neigh_seq_state)))
 		return -ENOMEM;
 	return 0;
 }
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 3dcffd3..99c23a0 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -2348,18 +2348,6 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
-static int fib_triestat_seq_open(struct inode *inode, struct file *file)
-{
-	return single_open_net(inode, file, fib_triestat_seq_show);
-}
-
-static const struct file_operations fib_triestat_fops = {
-	.open	= fib_triestat_seq_open,
-	.read	= seq_read,
-	.llseek	= seq_lseek,
-	.release = single_release_net,
-};
-
 static struct key_vector *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
 {
 	struct fib_trie_iter *iter = seq->private;
@@ -2533,19 +2521,6 @@ static const struct seq_operations fib_trie_seq_ops = {
 	.show   = fib_trie_seq_show,
 };
 
-static int fib_trie_seq_open(struct inode *inode, struct file *file)
-{
-	return seq_open_net(inode, file, &fib_trie_seq_ops,
-			    sizeof(struct fib_trie_iter));
-}
-
-static const struct file_operations fib_trie_fops = {
-	.open   = fib_trie_seq_open,
-	.read   = seq_read,
-	.llseek = seq_lseek,
-	.release = seq_release_net,
-};
-
 struct fib_route_iter {
 	struct seq_net_private p;
 	struct fib_table *main_tb;
@@ -2726,29 +2701,18 @@ static const struct seq_operations fib_route_seq_ops = {
 	.show   = fib_route_seq_show,
 };
 
-static int fib_route_seq_open(struct inode *inode, struct file *file)
-{
-	return seq_open_net(inode, file, &fib_route_seq_ops,
-			    sizeof(struct fib_route_iter));
-}
-
-static const struct file_operations fib_route_fops = {
-	.open   = fib_route_seq_open,
-	.read   = seq_read,
-	.llseek = seq_lseek,
-	.release = seq_release_net,
-};
-
 int __net_init fib_proc_init(struct net *net)
 {
-	if (!proc_create("fib_trie", 0444, net->proc_net, &fib_trie_fops))
+	if (!proc_create_net("fib_trie", 0444, net->proc_net, &fib_trie_seq_ops,
+			sizeof(struct fib_trie_iter)))
 		goto out1;
 
-	if (!proc_create("fib_triestat", 0444, net->proc_net,
-			 &fib_triestat_fops))
+	if (!proc_create_net_single("fib_triestat", 0444, net->proc_net,
+			fib_triestat_seq_show, NULL))
 		goto out2;
 
-	if (!proc_create("route", 0444, net->proc_net, &fib_route_fops))
+	if (!proc_create_net("route", 0444, net->proc_net, &fib_route_seq_ops,
+			sizeof(struct fib_route_iter)))
 		goto out3;
 
 	return 0;
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index b26a81a..85b617b 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -2829,19 +2829,6 @@ static const struct seq_operations igmp_mc_seq_ops = {
 	.show	=	igmp_mc_seq_show,
 };
 
-static int igmp_mc_seq_open(struct inode *inode, struct file *file)
-{
-	return seq_open_net(inode, file, &igmp_mc_seq_ops,
-			sizeof(struct igmp_mc_iter_state));
-}
-
-static const struct file_operations igmp_mc_seq_fops = {
-	.open		=	igmp_mc_seq_open,
-	.read		=	seq_read,
-	.llseek		=	seq_lseek,
-	.release	=	seq_release_net,
-};
-
 struct igmp_mcf_iter_state {
 	struct seq_net_private p;
 	struct net_device *dev;
@@ -2975,29 +2962,17 @@ static const struct seq_operations igmp_mcf_seq_ops = {
 	.show	=	igmp_mcf_seq_show,
 };
 
-static int igmp_mcf_seq_open(struct inode *inode, struct file *file)
-{
-	return seq_open_net(inode, file, &igmp_mcf_seq_ops,
-			sizeof(struct igmp_mcf_iter_state));
-}
-
-static const struct file_operations igmp_mcf_seq_fops = {
-	.open		=	igmp_mcf_seq_open,
-	.read		=	seq_read,
-	.llseek		=	seq_lseek,
-	.release	=	seq_release_net,
-};
-
 static int __net_init igmp_net_init(struct net *net)
 {
 	struct proc_dir_entry *pde;
 	int err;
 
-	pde = proc_create("igmp", 0444, net->proc_net, &igmp_mc_seq_fops);
+	pde = proc_create_net("igmp", 0444, net->proc_net, &igmp_mc_seq_ops,
+			sizeof(struct igmp_mc_iter_state));
 	if (!pde)
 		goto out_igmp;
-	pde = proc_create("mcfilter", 0444, net->proc_net,
-			  &igmp_mcf_seq_fops);
+	pde = proc_create_net("mcfilter", 0444, net->proc_net,
+			&igmp_mcf_seq_ops, sizeof(struct igmp_mcf_iter_state));
 	if (!pde)
 		goto out_mcfilter;
 	err = inet_ctl_sock_create(&net->ipv4.mc_autojoin_sk, AF_INET,
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 43f620f..bbcbcc1 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -1282,18 +1282,6 @@ static int pnp_seq_show(struct seq_file *seq, void *v)
 			   &ic_servaddr);
 	return 0;
 }
-
-static int pnp_seq_open(struct inode *indoe, struct file *file)
-{
-	return single_open(file, pnp_seq_show, NULL);
-}
-
-static const struct file_operations pnp_seq_fops = {
-	.open		= pnp_seq_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
 #endif /* CONFIG_PROC_FS */
 
 /*
@@ -1369,7 +1357,7 @@ static int __init ip_auto_config(void)
 	unsigned int i;
 
 #ifdef CONFIG_PROC_FS
-	proc_create("pnp", 0444, init_net.proc_net, &pnp_seq_fops);
+	proc_create_single("pnp", 0444, init_net.proc_net, pnp_seq_show);
 #endif /* CONFIG_PROC_FS */
 
 	if (!ic_enable)
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 2fb4de3..37c4f88 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -2828,19 +2828,6 @@ static const struct seq_operations ipmr_vif_seq_ops = {
 	.show  = ipmr_vif_seq_show,
 };
 
-static int ipmr_vif_open(struct inode *inode, struct file *file)
-{
-	return seq_open_net(inode, file, &ipmr_vif_seq_ops,
-			    sizeof(struct mr_vif_iter));
-}
-
-static const struct file_operations ipmr_vif_fops = {
-	.open    = ipmr_vif_open,
-	.read    = seq_read,
-	.llseek  = seq_lseek,
-	.release = seq_release_net,
-};
-
 static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
 {
 	struct net *net = seq_file_net(seq);
@@ -2900,19 +2887,6 @@ static const struct seq_operations ipmr_mfc_seq_ops = {
 	.stop  = mr_mfc_seq_stop,
 	.show  = ipmr_mfc_seq_show,
 };
-
-static int ipmr_mfc_open(struct inode *inode, struct file *file)
-{
-	return seq_open_net(inode, file, &ipmr_mfc_seq_ops,
-			    sizeof(struct mr_mfc_iter));
-}
-
-static const struct file_operations ipmr_mfc_fops = {
-	.open    = ipmr_mfc_open,
-	.read    = seq_read,
-	.llseek  = seq_lseek,
-	.release = seq_release_net,
-};
 #endif
 
 #ifdef CONFIG_IP_PIMSM_V2
@@ -2977,9 +2951,11 @@ static int __net_init ipmr_net_init(struct net *net)
 
 #ifdef CONFIG_PROC_FS
 	err = -ENOMEM;
-	if (!proc_create("ip_mr_vif", 0, net->proc_net, &ipmr_vif_fops))
+	if (!proc_create_net("ip_mr_vif", 0, net->proc_net, &ipmr_vif_seq_ops,
+			sizeof(struct mr_vif_iter)))
 		goto proc_vif_fail;
-	if (!proc_create("ip_mr_cache", 0, net->proc_net, &ipmr_mfc_fops))
+	if (!proc_create_net("ip_mr_cache", 0, net->proc_net, &ipmr_mfc_seq_ops,
+			sizeof(struct mr_mfc_iter)))
 		goto proc_cache_fail;
 #endif
 	return 0;
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 56a0106..2ed64bc 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -1150,58 +1150,24 @@ static int ping_v4_seq_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
-static int ping_seq_open(struct inode *inode, struct file *file)
-{
-	struct ping_seq_afinfo *afinfo = PDE_DATA(inode);
-	return seq_open_net(inode, file, &afinfo->seq_ops,
-			   sizeof(struct ping_iter_state));
-}
-
-const struct file_operations ping_seq_fops = {
-	.open		= ping_seq_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release_net,
-};
-EXPORT_SYMBOL_GPL(ping_seq_fops);
-
-static struct ping_seq_afinfo ping_v4_seq_afinfo = {
-	.name		= "icmp",
-	.family		= AF_INET,
-	.seq_fops	= &ping_seq_fops,
-	.seq_ops	= {
-		.start		= ping_v4_seq_start,
-		.show		= ping_v4_seq_show,
-		.next		= ping_seq_next,
-		.stop		= ping_seq_stop,
-	},
+static const struct seq_operations ping_v4_seq_ops = {
+	.start		= ping_v4_seq_start,
+	.show		= ping_v4_seq_show,
+	.next		= ping_seq_next,
+	.stop		= ping_seq_stop,
 };
 
-int ping_proc_register(struct net *net, struct ping_seq_afinfo *afinfo)
+static int __net_init ping_v4_proc_init_net(struct net *net)
 {
-	struct proc_dir_entry *p;
-	p = proc_create_data(afinfo->name, 0444, net->proc_net,
-			     afinfo->seq_fops, afinfo);
-	if (!p)
+	if (!proc_create_net("icmp", 0444, net->proc_net, &ping_v4_seq_ops,
+			sizeof(struct ping_iter_state)))
 		return -ENOMEM;
 	return 0;
 }
-EXPORT_SYMBOL_GPL(ping_proc_register);
-
-void ping_proc_unregister(struct net *net, struct ping_seq_afinfo *afinfo)
-{
-	remove_proc_entry(afinfo->name, net->proc_net);
-}
-EXPORT_SYMBOL_GPL(ping_proc_unregister);
-
-static int __net_init ping_v4_proc_init_net(struct net *net)
-{
-	return ping_proc_register(net, &ping_v4_seq_afinfo);
-}
 
 static void __net_exit ping_v4_proc_exit_net(struct net *net)
 {
-	ping_proc_unregister(net, &ping_v4_seq_afinfo);
+	remove_proc_entry("icmp", net->proc_net);
 }
 
 static struct pernet_operations ping_v4_net_ops = {
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index a058de6..573e43c 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -77,18 +77,6 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
-static int sockstat_seq_open(struct inode *inode, struct file *file)
-{
-	return single_open_net(inode, file, sockstat_seq_show);
-}
-
-static const struct file_operations sockstat_seq_fops = {
-	.open	 = sockstat_seq_open,
-	.read	 = seq_read,
-	.llseek	 = seq_lseek,
-	.release = single_release_net,
-};
-
 /* snmp items */
 static const struct snmp_mib snmp4_ipstats_list[] = {
 	SNMP_MIB_ITEM("InReceives", IPSTATS_MIB_INPKTS),
@@ -460,20 +448,6 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
-static int snmp_seq_open(struct inode *inode, struct file *file)
-{
-	return single_open_net(inode, file, snmp_seq_show);
-}
-
-static const struct file_operations snmp_seq_fops = {
-	.open	 = snmp_seq_open,
-	.read	 = seq_read,
-	.llseek	 = seq_lseek,
-	.release = single_release_net,
-};
-
-
-
 /*
  *	Output /proc/net/netstat
  */
@@ -507,26 +481,16 @@ static int netstat_seq_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
-static int netstat_seq_open(struct inode *inode, struct file *file)
-{
-	return single_open_net(inode, file, netstat_seq_show);
-}
-
-static const struct file_operations netstat_seq_fops = {
-	.open	 = netstat_seq_open,
-	.read	 = seq_read,
-	.llseek	 = seq_lseek,
-	.release = single_release_net,
-};
-
 static __net_init int ip_proc_init_net(struct net *net)
 {
-	if (!proc_create("sockstat", 0444, net->proc_net,
-			 &sockstat_seq_fops))
+	if (!proc_create_net_single("sockstat", 0444, net->proc_net,
+			sockstat_seq_show, NULL))
 		goto out_sockstat;
-	if (!proc_create("netstat", 0444, net->proc_net, &netstat_seq_fops))
+	if (!proc_create_net_single("netstat", 0444, net->proc_net,
+			netstat_seq_show, NULL))
 		goto out_netstat;
-	if (!proc_create("snmp", 0444, net->proc_net, &snmp_seq_fops))
+	if (!proc_create_net_single("snmp", 0444, net->proc_net, snmp_seq_show,
+			NULL))
 		goto out_snmp;
 
 	return 0;
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 1b4d335..abb3c94 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -1003,11 +1003,12 @@ struct proto raw_prot = {
 static struct sock *raw_get_first(struct seq_file *seq)
 {
 	struct sock *sk;
+	struct raw_hashinfo *h = PDE_DATA(file_inode(seq->file));
 	struct raw_iter_state *state = raw_seq_private(seq);
 
 	for (state->bucket = 0; state->bucket < RAW_HTABLE_SIZE;
 			++state->bucket) {
-		sk_for_each(sk, &state->h->ht[state->bucket])
+		sk_for_each(sk, &h->ht[state->bucket])
 			if (sock_net(sk) == seq_file_net(seq))
 				goto found;
 	}
@@ -1018,6 +1019,7 @@ found:
 
 static struct sock *raw_get_next(struct seq_file *seq, struct sock *sk)
 {
+	struct raw_hashinfo *h = PDE_DATA(file_inode(seq->file));
 	struct raw_iter_state *state = raw_seq_private(seq);
 
 	do {
@@ -1027,7 +1029,7 @@ try_again:
 	} while (sk && sock_net(sk) != seq_file_net(seq));
 
 	if (!sk && ++state->bucket < RAW_HTABLE_SIZE) {
-		sk = sk_head(&state->h->ht[state->bucket]);
+		sk = sk_head(&h->ht[state->bucket]);
 		goto try_again;
 	}
 	return sk;
@@ -1045,9 +1047,9 @@ static struct sock *raw_get_idx(struct seq_file *seq, loff_t pos)
 
 void *raw_seq_start(struct seq_file *seq, loff_t *pos)
 {
-	struct raw_iter_state *state = raw_seq_private(seq);
+	struct raw_hashinfo *h = PDE_DATA(file_inode(seq->file));
 
-	read_lock(&state->h->lock);
+	read_lock(&h->lock);
 	return *pos ? raw_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
 }
 EXPORT_SYMBOL_GPL(raw_seq_start);
@@ -1067,9 +1069,9 @@ EXPORT_SYMBOL_GPL(raw_seq_next);
 
 void raw_seq_stop(struct seq_file *seq, void *v)
 {
-	struct raw_iter_state *state = raw_seq_private(seq);
+	struct raw_hashinfo *h = PDE_DATA(file_inode(seq->file));
 
-	read_unlock(&state->h->lock);
+	read_unlock(&h->lock);
 }
 EXPORT_SYMBOL_GPL(raw_seq_stop);
 
@@ -1110,37 +1112,10 @@ static const struct seq_operations raw_seq_ops = {
 	.show  = raw_seq_show,
 };
 
-int raw_seq_open(struct inode *ino, struct file *file,
-		 struct raw_hashinfo *h, const struct seq_operations *ops)
-{
-	int err;
-	struct raw_iter_state *i;
-
-	err = seq_open_net(ino, file, ops, sizeof(struct raw_iter_state));
-	if (err < 0)
-		return err;
-
-	i = raw_seq_private((struct seq_file *)file->private_data);
-	i->h = h;
-	return 0;
-}
-EXPORT_SYMBOL_GPL(raw_seq_open);
-
-static int raw_v4_seq_open(struct inode *inode, struct file *file)
-{
-	return raw_seq_open(inode, file, &raw_v4_hashinfo, &raw_seq_ops);
-}
-
-static const struct file_operations raw_seq_fops = {
-	.open	 = raw_v4_seq_open,
-	.read	 = seq_read,
-	.llseek	 = seq_lseek,
-	.release = seq_release_net,
-};
-
 static __net_init int raw_init_net(struct net *net)
 {
-	if (!proc_create("raw", 0444, net->proc_net, &raw_seq_fops))
+	if (!proc_create_net_data("raw", 0444, net->proc_net, &raw_seq_ops,
+			sizeof(struct raw_iter_state), &raw_v4_hashinfo))
 		return -ENOMEM;
 
 	return 0;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 2cfa1b5..75fb886 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -360,18 +360,6 @@ static int rt_acct_proc_show(struct seq_file *m, void *v)
 	kfree(dst);
 	return 0;
 }
-
-static int rt_acct_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, rt_acct_proc_show, NULL);
-}
-
-static const struct file_operations rt_acct_proc_fops = {
-	.open		= rt_acct_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
 #endif
 
 static int __net_init ip_rt_do_proc_init(struct net *net)
@@ -389,7 +377,8 @@ static int __net_init ip_rt_do_proc_init(struct net *net)
 		goto err2;
 
 #ifdef CONFIG_IP_ROUTE_CLASSID
-	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
+	pde = proc_create_single("rt_acct", 0, net->proc_net,
+			rt_acct_proc_show);
 	if (!pde)
 		goto err3;
 #endif
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c9d00ef..dec47e6 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -494,32 +494,21 @@ static inline bool tcp_stream_is_readable(const struct tcp_sock *tp,
 }
 
 /*
- *	Wait for a TCP event.
- *
- *	Note that we don't need to lock the socket, as the upper poll layers
- *	take care of normal races (between the test and the event) and we don't
- *	go look at any of the socket buffers directly.
+ * Socket is not locked. We are protected from async events by poll logic and
+ * correct handling of state changes made by other threads is impossible in
+ * any case.
  */
-__poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
+__poll_t tcp_poll_mask(struct socket *sock, __poll_t events)
 {
-	__poll_t mask;
 	struct sock *sk = sock->sk;
 	const struct tcp_sock *tp = tcp_sk(sk);
+	__poll_t mask = 0;
 	int state;
 
-	sock_poll_wait(file, sk_sleep(sk), wait);
-
 	state = inet_sk_state_load(sk);
 	if (state == TCP_LISTEN)
 		return inet_csk_listen_poll(sk);
 
-	/* Socket is not locked. We are protected from async events
-	 * by poll logic and correct handling of state changes
-	 * made by other threads is impossible in any case.
-	 */
-
-	mask = 0;
-
 	/*
 	 * EPOLLHUP is certainly not done right. But poll() doesn't
 	 * have a notion of HUP in just one direction, and for a
@@ -600,7 +589,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 
 	return mask;
 }
-EXPORT_SYMBOL(tcp_poll);
+EXPORT_SYMBOL(tcp_poll_mask);
 
 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 {
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index f70586b..2c97062 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1961,6 +1961,7 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock);
  */
 static void *listening_get_next(struct seq_file *seq, void *cur)
 {
+	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
 	struct tcp_iter_state *st = seq->private;
 	struct net *net = seq_file_net(seq);
 	struct inet_listen_hashbucket *ilb;
@@ -1983,7 +1984,7 @@ get_sk:
 	sk_for_each_from(sk) {
 		if (!net_eq(sock_net(sk), net))
 			continue;
-		if (sk->sk_family == st->family)
+		if (sk->sk_family == afinfo->family)
 			return sk;
 	}
 	spin_unlock(&ilb->lock);
@@ -2020,6 +2021,7 @@ static inline bool empty_bucket(const struct tcp_iter_state *st)
  */
 static void *established_get_first(struct seq_file *seq)
 {
+	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
 	struct tcp_iter_state *st = seq->private;
 	struct net *net = seq_file_net(seq);
 	void *rc = NULL;
@@ -2036,7 +2038,7 @@ static void *established_get_first(struct seq_file *seq)
 
 		spin_lock_bh(lock);
 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
-			if (sk->sk_family != st->family ||
+			if (sk->sk_family != afinfo->family ||
 			    !net_eq(sock_net(sk), net)) {
 				continue;
 			}
@@ -2051,6 +2053,7 @@ out:
 
 static void *established_get_next(struct seq_file *seq, void *cur)
 {
+	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
 	struct sock *sk = cur;
 	struct hlist_nulls_node *node;
 	struct tcp_iter_state *st = seq->private;
@@ -2062,7 +2065,8 @@ static void *established_get_next(struct seq_file *seq, void *cur)
 	sk = sk_nulls_next(sk);
 
 	sk_nulls_for_each_from(sk, node) {
-		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
+		if (sk->sk_family == afinfo->family &&
+		    net_eq(sock_net(sk), net))
 			return sk;
 	}
 
@@ -2135,7 +2139,7 @@ static void *tcp_seek_last_pos(struct seq_file *seq)
 	return rc;
 }
 
-static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
+void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
 {
 	struct tcp_iter_state *st = seq->private;
 	void *rc;
@@ -2156,8 +2160,9 @@ out:
 	st->last_pos = *pos;
 	return rc;
 }
+EXPORT_SYMBOL(tcp_seq_start);
 
-static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
 	struct tcp_iter_state *st = seq->private;
 	void *rc = NULL;
@@ -2186,8 +2191,9 @@ out:
 	st->last_pos = *pos;
 	return rc;
 }
+EXPORT_SYMBOL(tcp_seq_next);
 
-static void tcp_seq_stop(struct seq_file *seq, void *v)
+void tcp_seq_stop(struct seq_file *seq, void *v)
 {
 	struct tcp_iter_state *st = seq->private;
 
@@ -2202,47 +2208,7 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)
 		break;
 	}
 }
-
-int tcp_seq_open(struct inode *inode, struct file *file)
-{
-	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
-	struct tcp_iter_state *s;
-	int err;
-
-	err = seq_open_net(inode, file, &afinfo->seq_ops,
-			  sizeof(struct tcp_iter_state));
-	if (err < 0)
-		return err;
-
-	s = ((struct seq_file *)file->private_data)->private;
-	s->family		= afinfo->family;
-	s->last_pos		= 0;
-	return 0;
-}
-EXPORT_SYMBOL(tcp_seq_open);
-
-int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
-{
-	int rc = 0;
-	struct proc_dir_entry *p;
-
-	afinfo->seq_ops.start		= tcp_seq_start;
-	afinfo->seq_ops.next		= tcp_seq_next;
-	afinfo->seq_ops.stop		= tcp_seq_stop;
-
-	p = proc_create_data(afinfo->name, 0444, net->proc_net,
-			     afinfo->seq_fops, afinfo);
-	if (!p)
-		rc = -ENOMEM;
-	return rc;
-}
-EXPORT_SYMBOL(tcp_proc_register);
-
-void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
-{
-	remove_proc_entry(afinfo->name, net->proc_net);
-}
-EXPORT_SYMBOL(tcp_proc_unregister);
+EXPORT_SYMBOL(tcp_seq_stop);
 
 static void get_openreq4(const struct request_sock *req,
 			 struct seq_file *f, int i)
@@ -2377,30 +2343,28 @@ out:
 	return 0;
 }
 
-static const struct file_operations tcp_afinfo_seq_fops = {
-	.open    = tcp_seq_open,
-	.read    = seq_read,
-	.llseek  = seq_lseek,
-	.release = seq_release_net
+static const struct seq_operations tcp4_seq_ops = {
+	.show		= tcp4_seq_show,
+	.start		= tcp_seq_start,
+	.next		= tcp_seq_next,
+	.stop		= tcp_seq_stop,
 };
 
 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
-	.name		= "tcp",
 	.family		= AF_INET,
-	.seq_fops	= &tcp_afinfo_seq_fops,
-	.seq_ops	= {
-		.show		= tcp4_seq_show,
-	},
 };
 
 static int __net_init tcp4_proc_init_net(struct net *net)
 {
-	return tcp_proc_register(net, &tcp4_seq_afinfo);
+	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
+			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
+		return -ENOMEM;
+	return 0;
 }
 
 static void __net_exit tcp4_proc_exit_net(struct net *net)
 {
-	tcp_proc_unregister(net, &tcp4_seq_afinfo);
+	remove_proc_entry("tcp", net->proc_net);
 }
 
 static struct pernet_operations tcp4_net_ops = {
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index b61a770..675433e 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2501,7 +2501,7 @@ int compat_udp_getsockopt(struct sock *sk, int level, int optname,
  * 	udp_poll - wait for a UDP event.
  *	@file - file struct
  *	@sock - socket
- *	@wait - poll table
+ *	@events - events to wait for
  *
  *	This is same as datagram poll, except for the special case of
  *	blocking sockets. If application is using a blocking fd
@@ -2510,23 +2510,23 @@ int compat_udp_getsockopt(struct sock *sk, int level, int optname,
  *	but then block when reading it. Add special case code
  *	to work around these arguably broken applications.
  */
-__poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait)
+__poll_t udp_poll_mask(struct socket *sock, __poll_t events)
 {
-	__poll_t mask = datagram_poll(file, sock, wait);
+	__poll_t mask = datagram_poll_mask(sock, events);
 	struct sock *sk = sock->sk;
 
 	if (!skb_queue_empty(&udp_sk(sk)->reader_queue))
 		mask |= EPOLLIN | EPOLLRDNORM;
 
 	/* Check for false positives due to checksum errors */
-	if ((mask & EPOLLRDNORM) && !(file->f_flags & O_NONBLOCK) &&
+	if ((mask & EPOLLRDNORM) && !(sock->file->f_flags & O_NONBLOCK) &&
 	    !(sk->sk_shutdown & RCV_SHUTDOWN) && first_packet_length(sk) == -1)
 		mask &= ~(EPOLLIN | EPOLLRDNORM);
 
 	return mask;
 
 }
-EXPORT_SYMBOL(udp_poll);
+EXPORT_SYMBOL(udp_poll_mask);
 
 int udp_abort(struct sock *sk, int err)
 {
@@ -2582,12 +2582,13 @@ EXPORT_SYMBOL(udp_prot);
 static struct sock *udp_get_first(struct seq_file *seq, int start)
 {
 	struct sock *sk;
+	struct udp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
 	struct udp_iter_state *state = seq->private;
 	struct net *net = seq_file_net(seq);
 
-	for (state->bucket = start; state->bucket <= state->udp_table->mask;
+	for (state->bucket = start; state->bucket <= afinfo->udp_table->mask;
 	     ++state->bucket) {
-		struct udp_hslot *hslot = &state->udp_table->hash[state->bucket];
+		struct udp_hslot *hslot = &afinfo->udp_table->hash[state->bucket];
 
 		if (hlist_empty(&hslot->head))
 			continue;
@@ -2596,7 +2597,7 @@ static struct sock *udp_get_first(struct seq_file *seq, int start)
 		sk_for_each(sk, &hslot->head) {
 			if (!net_eq(sock_net(sk), net))
 				continue;
-			if (sk->sk_family == state->family)
+			if (sk->sk_family == afinfo->family)
 				goto found;
 		}
 		spin_unlock_bh(&hslot->lock);
@@ -2608,16 +2609,17 @@ found:
 
 static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
 {
+	struct udp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
 	struct udp_iter_state *state = seq->private;
 	struct net *net = seq_file_net(seq);
 
 	do {
 		sk = sk_next(sk);
-	} while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family));
+	} while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != afinfo->family));
 
 	if (!sk) {
-		if (state->bucket <= state->udp_table->mask)
-			spin_unlock_bh(&state->udp_table->hash[state->bucket].lock);
+		if (state->bucket <= afinfo->udp_table->mask)
+			spin_unlock_bh(&afinfo->udp_table->hash[state->bucket].lock);
 		return udp_get_first(seq, state->bucket + 1);
 	}
 	return sk;
@@ -2633,15 +2635,16 @@ static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos)
 	return pos ? NULL : sk;
 }
 
-static void *udp_seq_start(struct seq_file *seq, loff_t *pos)
+void *udp_seq_start(struct seq_file *seq, loff_t *pos)
 {
 	struct udp_iter_state *state = seq->private;
 	state->bucket = MAX_UDP_PORTS;
 
 	return *pos ? udp_get_idx(seq, *pos-1) : SEQ_START_TOKEN;
 }
+EXPORT_SYMBOL(udp_seq_start);
 
-static void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
 	struct sock *sk;
 
@@ -2653,56 +2656,17 @@ static void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 	++*pos;
 	return sk;
 }
+EXPORT_SYMBOL(udp_seq_next);
 
-static void udp_seq_stop(struct seq_file *seq, void *v)
+void udp_seq_stop(struct seq_file *seq, void *v)
 {
+	struct udp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
 	struct udp_iter_state *state = seq->private;
 
-	if (state->bucket <= state->udp_table->mask)
-		spin_unlock_bh(&state->udp_table->hash[state->bucket].lock);
+	if (state->bucket <= afinfo->udp_table->mask)
+		spin_unlock_bh(&afinfo->udp_table->hash[state->bucket].lock);
 }
-
-int udp_seq_open(struct inode *inode, struct file *file)
-{
-	struct udp_seq_afinfo *afinfo = PDE_DATA(inode);
-	struct udp_iter_state *s;
-	int err;
-
-	err = seq_open_net(inode, file, &afinfo->seq_ops,
-			   sizeof(struct udp_iter_state));
-	if (err < 0)
-		return err;
-
-	s = ((struct seq_file *)file->private_data)->private;
-	s->family		= afinfo->family;
-	s->udp_table		= afinfo->udp_table;
-	return err;
-}
-EXPORT_SYMBOL(udp_seq_open);
-
-/* ------------------------------------------------------------------------ */
-int udp_proc_register(struct net *net, struct udp_seq_afinfo *afinfo)
-{
-	struct proc_dir_entry *p;
-	int rc = 0;
-
-	afinfo->seq_ops.start		= udp_seq_start;
-	afinfo->seq_ops.next		= udp_seq_next;
-	afinfo->seq_ops.stop		= udp_seq_stop;
-
-	p = proc_create_data(afinfo->name, 0444, net->proc_net,
-			     afinfo->seq_fops, afinfo);
-	if (!p)
-		rc = -ENOMEM;
-	return rc;
-}
-EXPORT_SYMBOL(udp_proc_register);
-
-void udp_proc_unregister(struct net *net, struct udp_seq_afinfo *afinfo)
-{
-	remove_proc_entry(afinfo->name, net->proc_net);
-}
-EXPORT_SYMBOL(udp_proc_unregister);
+EXPORT_SYMBOL(udp_seq_stop);
 
 /* ------------------------------------------------------------------------ */
 static void udp4_format_sock(struct sock *sp, struct seq_file *f,
@@ -2742,32 +2706,30 @@ int udp4_seq_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
-static const struct file_operations udp_afinfo_seq_fops = {
-	.open     = udp_seq_open,
-	.read     = seq_read,
-	.llseek   = seq_lseek,
-	.release  = seq_release_net
+const struct seq_operations udp_seq_ops = {
+	.start		= udp_seq_start,
+	.next		= udp_seq_next,
+	.stop		= udp_seq_stop,
+	.show		= udp4_seq_show,
 };
+EXPORT_SYMBOL(udp_seq_ops);
 
-/* ------------------------------------------------------------------------ */
 static struct udp_seq_afinfo udp4_seq_afinfo = {
-	.name		= "udp",
 	.family		= AF_INET,
 	.udp_table	= &udp_table,
-	.seq_fops	= &udp_afinfo_seq_fops,
-	.seq_ops	= {
-		.show		= udp4_seq_show,
-	},
 };
 
 static int __net_init udp4_proc_init_net(struct net *net)
 {
-	return udp_proc_register(net, &udp4_seq_afinfo);
+	if (!proc_create_net_data("udp", 0444, net->proc_net, &udp_seq_ops,
+			sizeof(struct udp_iter_state), &udp4_seq_afinfo))
+		return -ENOMEM;
+	return 0;
 }
 
 static void __net_exit udp4_proc_exit_net(struct net *net)
 {
-	udp_proc_unregister(net, &udp4_seq_afinfo);
+	remove_proc_entry("udp", net->proc_net);
 }
 
 static struct pernet_operations udp4_net_ops = {
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index f96614e..8545457 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -14,6 +14,7 @@
 #define pr_fmt(fmt) "UDPLite: " fmt
 
 #include <linux/export.h>
+#include <linux/proc_fs.h>
 #include "udp_impl.h"
 
 struct udp_table 	udplite_table __read_mostly;
@@ -73,32 +74,22 @@ static struct inet_protosw udplite4_protosw = {
 };
 
 #ifdef CONFIG_PROC_FS
-
-static const struct file_operations udplite_afinfo_seq_fops = {
-	.open     = udp_seq_open,
-	.read     = seq_read,
-	.llseek   = seq_lseek,
-	.release  = seq_release_net
-};
-
 static struct udp_seq_afinfo udplite4_seq_afinfo = {
-	.name		= "udplite",
 	.family		= AF_INET,
 	.udp_table 	= &udplite_table,
-	.seq_fops	= &udplite_afinfo_seq_fops,
-	.seq_ops	= {
-		.show		= udp4_seq_show,
-	},
 };
 
 static int __net_init udplite4_proc_init_net(struct net *net)
 {
-	return udp_proc_register(net, &udplite4_seq_afinfo);
+	if (!proc_create_net_data("udplite", 0444, net->proc_net, &udp_seq_ops,
+			sizeof(struct udp_iter_state), &udplite4_seq_afinfo))
+		return -ENOMEM;
+	return 0;
 }
 
 static void __net_exit udplite4_proc_exit_net(struct net *net)
 {
-	udp_proc_unregister(net, &udplite4_seq_afinfo);
+	remove_proc_entry("udplite", net->proc_net);
 }
 
 static struct pernet_operations udplite4_net_ops = {
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 78cef00..1b5ea33 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -4254,22 +4254,10 @@ static const struct seq_operations if6_seq_ops = {
 	.stop	= if6_seq_stop,
 };
 
-static int if6_seq_open(struct inode *inode, struct file *file)
-{
-	return seq_open_net(inode, file, &if6_seq_ops,
-			    sizeof(struct if6_iter_state));
-}
-
-static const struct file_operations if6_fops = {
-	.open		= if6_seq_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release_net,
-};
-
 static int __net_init if6_proc_net_init(struct net *net)
 {
-	if (!proc_create("if_inet6", 0444, net->proc_net, &if6_fops))
+	if (!proc_create_net("if_inet6", 0444, net->proc_net, &if6_seq_ops,
+			sizeof(struct if6_iter_state)))
 		return -ENOMEM;
 	return 0;
 }
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 8da0b51..d443c18 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -571,7 +571,7 @@ const struct proto_ops inet6_stream_ops = {
 	.socketpair	   = sock_no_socketpair,	/* a do nothing	*/
 	.accept		   = inet_accept,		/* ok		*/
 	.getname	   = inet6_getname,
-	.poll		   = tcp_poll,			/* ok		*/
+	.poll_mask	   = tcp_poll_mask,		/* ok		*/
 	.ioctl		   = inet6_ioctl,		/* must change  */
 	.listen		   = inet_listen,		/* ok		*/
 	.shutdown	   = inet_shutdown,		/* ok		*/
@@ -601,7 +601,7 @@ const struct proto_ops inet6_dgram_ops = {
 	.socketpair	   = sock_no_socketpair,	/* a do nothing	*/
 	.accept		   = sock_no_accept,		/* a do nothing	*/
 	.getname	   = inet6_getname,
-	.poll		   = udp_poll,			/* ok		*/
+	.poll_mask	   = udp_poll_mask,		/* ok		*/
 	.ioctl		   = inet6_ioctl,		/* must change  */
 	.listen		   = sock_no_listen,		/* ok		*/
 	.shutdown	   = inet_shutdown,		/* ok		*/
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index bbcabbb..ebeaf47 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -529,22 +529,10 @@ static const struct seq_operations ac6_seq_ops = {
 	.show	=	ac6_seq_show,
 };
 
-static int ac6_seq_open(struct inode *inode, struct file *file)
-{
-	return seq_open_net(inode, file, &ac6_seq_ops,
-			    sizeof(struct ac6_iter_state));
-}
-
-static const struct file_operations ac6_seq_fops = {
-	.open		=	ac6_seq_open,
-	.read		=	seq_read,
-	.llseek		=	seq_lseek,
-	.release	=	seq_release_net,
-};
-
 int __net_init ac6_proc_init(struct net *net)
 {
-	if (!proc_create("anycast6", 0444, net->proc_net, &ac6_seq_fops))
+	if (!proc_create_net("anycast6", 0444, net->proc_net, &ac6_seq_ops,
+			sizeof(struct ac6_iter_state)))
 		return -ENOMEM;
 
 	return 0;
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index deab2db..01372dd 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -2209,15 +2209,6 @@ void fib6_gc_cleanup(void)
 }
 
 #ifdef CONFIG_PROC_FS
-
-struct ipv6_route_iter {
-	struct seq_net_private p;
-	struct fib6_walker w;
-	loff_t skip;
-	struct fib6_table *tbl;
-	int sernum;
-};
-
 static int ipv6_route_seq_show(struct seq_file *seq, void *v)
 {
 	struct rt6_info *rt = v;
@@ -2383,17 +2374,10 @@ static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
 	rcu_read_unlock_bh();
 }
 
-static const struct seq_operations ipv6_route_seq_ops = {
+const struct seq_operations ipv6_route_seq_ops = {
 	.start	= ipv6_route_seq_start,
 	.next	= ipv6_route_seq_next,
 	.stop	= ipv6_route_seq_stop,
 	.show	= ipv6_route_seq_show
 };
-
-int ipv6_route_open(struct inode *inode, struct file *file)
-{
-	return seq_open_net(inode, file, &ipv6_route_seq_ops,
-			    sizeof(struct ipv6_route_iter));
-}
-
 #endif /* CONFIG_PROC_FS */
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index c05c4e8..3eee763 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -754,6 +754,10 @@ static struct ip6_flowlabel *ip6fl_get_idx(struct seq_file *seq, loff_t pos)
 static void *ip6fl_seq_start(struct seq_file *seq, loff_t *pos)
 	__acquires(RCU)
 {
+	struct ip6fl_iter_state *state = ip6fl_seq_private(seq);
+
+	state->pid_ns = proc_pid_ns(file_inode(seq->file));
+
 	rcu_read_lock_bh();
 	return *pos ? ip6fl_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
 }
@@ -808,44 +812,10 @@ static const struct seq_operations ip6fl_seq_ops = {
 	.show	=	ip6fl_seq_show,
 };
 
-static int ip6fl_seq_open(struct inode *inode, struct file *file)
-{
-	struct seq_file *seq;
-	struct ip6fl_iter_state *state;
-	int err;
-
-	err = seq_open_net(inode, file, &ip6fl_seq_ops,
-			   sizeof(struct ip6fl_iter_state));
-
-	if (!err) {
-		seq = file->private_data;
-		state = ip6fl_seq_private(seq);
-		rcu_read_lock();
-		state->pid_ns = get_pid_ns(task_active_pid_ns(current));
-		rcu_read_unlock();
-	}
-	return err;
-}
-
-static int ip6fl_seq_release(struct inode *inode, struct file *file)
-{
-	struct seq_file *seq = file->private_data;
-	struct ip6fl_iter_state *state = ip6fl_seq_private(seq);
-	put_pid_ns(state->pid_ns);
-	return seq_release_net(inode, file);
-}
-
-static const struct file_operations ip6fl_seq_fops = {
-	.open		=	ip6fl_seq_open,
-	.read		=	seq_read,
-	.llseek		=	seq_lseek,
-	.release	=	ip6fl_seq_release,
-};
-
 static int __net_init ip6_flowlabel_proc_init(struct net *net)
 {
-	if (!proc_create("ip6_flowlabel", 0444, net->proc_net,
-			 &ip6fl_seq_fops))
+	if (!proc_create_net("ip6_flowlabel", 0444, net->proc_net,
+			&ip6fl_seq_ops, sizeof(struct ip6fl_iter_state)))
 		return -ENOMEM;
 	return 0;
 }
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 298fd8b..4a15529 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -439,19 +439,6 @@ static const struct seq_operations ip6mr_vif_seq_ops = {
 	.show  = ip6mr_vif_seq_show,
 };
 
-static int ip6mr_vif_open(struct inode *inode, struct file *file)
-{
-	return seq_open_net(inode, file, &ip6mr_vif_seq_ops,
-			    sizeof(struct mr_vif_iter));
-}
-
-static const struct file_operations ip6mr_vif_fops = {
-	.open    = ip6mr_vif_open,
-	.read    = seq_read,
-	.llseek  = seq_lseek,
-	.release = seq_release_net,
-};
-
 static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
 {
 	struct net *net = seq_file_net(seq);
@@ -512,19 +499,6 @@ static const struct seq_operations ipmr_mfc_seq_ops = {
 	.stop  = mr_mfc_seq_stop,
 	.show  = ipmr_mfc_seq_show,
 };
-
-static int ipmr_mfc_open(struct inode *inode, struct file *file)
-{
-	return seq_open_net(inode, file, &ipmr_mfc_seq_ops,
-			    sizeof(struct mr_mfc_iter));
-}
-
-static const struct file_operations ip6mr_mfc_fops = {
-	.open    = ipmr_mfc_open,
-	.read    = seq_read,
-	.llseek  = seq_lseek,
-	.release = seq_release_net,
-};
 #endif
 
 #ifdef CONFIG_IPV6_PIMSM_V2
@@ -1316,9 +1290,11 @@ static int __net_init ip6mr_net_init(struct net *net)
 
 #ifdef CONFIG_PROC_FS
 	err = -ENOMEM;
-	if (!proc_create("ip6_mr_vif", 0, net->proc_net, &ip6mr_vif_fops))
+	if (!proc_create_net("ip6_mr_vif", 0, net->proc_net, &ip6mr_vif_seq_ops,
+			sizeof(struct mr_vif_iter)))
 		goto proc_vif_fail;
-	if (!proc_create("ip6_mr_cache", 0, net->proc_net, &ip6mr_mfc_fops))
+	if (!proc_create_net("ip6_mr_cache", 0, net->proc_net, &ipmr_mfc_seq_ops,
+			sizeof(struct mr_mfc_iter)))
 		goto proc_cache_fail;
 #endif
 
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 793159d..975021d 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -2749,19 +2749,6 @@ static const struct seq_operations igmp6_mc_seq_ops = {
 	.show	=	igmp6_mc_seq_show,
 };
 
-static int igmp6_mc_seq_open(struct inode *inode, struct file *file)
-{
-	return seq_open_net(inode, file, &igmp6_mc_seq_ops,
-			    sizeof(struct igmp6_mc_iter_state));
-}
-
-static const struct file_operations igmp6_mc_seq_fops = {
-	.open		=	igmp6_mc_seq_open,
-	.read		=	seq_read,
-	.llseek		=	seq_lseek,
-	.release	=	seq_release_net,
-};
-
 struct igmp6_mcf_iter_state {
 	struct seq_net_private p;
 	struct net_device *dev;
@@ -2903,28 +2890,17 @@ static const struct seq_operations igmp6_mcf_seq_ops = {
 	.show	=	igmp6_mcf_seq_show,
 };
 
-static int igmp6_mcf_seq_open(struct inode *inode, struct file *file)
-{
-	return seq_open_net(inode, file, &igmp6_mcf_seq_ops,
-			    sizeof(struct igmp6_mcf_iter_state));
-}
-
-static const struct file_operations igmp6_mcf_seq_fops = {
-	.open		=	igmp6_mcf_seq_open,
-	.read		=	seq_read,
-	.llseek		=	seq_lseek,
-	.release	=	seq_release_net,
-};
-
 static int __net_init igmp6_proc_init(struct net *net)
 {
 	int err;
 
 	err = -ENOMEM;
-	if (!proc_create("igmp6", 0444, net->proc_net, &igmp6_mc_seq_fops))
+	if (!proc_create_net("igmp6", 0444, net->proc_net, &igmp6_mc_seq_ops,
+			sizeof(struct igmp6_mc_iter_state)))
 		goto out;
-	if (!proc_create("mcfilter6", 0444, net->proc_net,
-			 &igmp6_mcf_seq_fops))
+	if (!proc_create_net("mcfilter6", 0444, net->proc_net,
+			&igmp6_mcf_seq_ops,
+			sizeof(struct igmp6_mcf_iter_state)))
 		goto out_proc_net_igmp6;
 
 	err = 0;
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index 746eeae..96f56bf 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -24,6 +24,7 @@
 #include <net/protocol.h>
 #include <net/udp.h>
 #include <net/transp_v6.h>
+#include <linux/proc_fs.h>
 #include <net/ping.h>
 
 /* Compatibility glue so we can support IPv6 when it's compiled as a module */
@@ -215,26 +216,24 @@ static int ping_v6_seq_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
-static struct ping_seq_afinfo ping_v6_seq_afinfo = {
-	.name		= "icmp6",
-	.family		= AF_INET6,
-	.seq_fops       = &ping_seq_fops,
-	.seq_ops	= {
-		.start		= ping_v6_seq_start,
-		.show		= ping_v6_seq_show,
-		.next		= ping_seq_next,
-		.stop		= ping_seq_stop,
-	},
+static const struct seq_operations ping_v6_seq_ops = {
+	.start		= ping_v6_seq_start,
+	.show		= ping_v6_seq_show,
+	.next		= ping_seq_next,
+	.stop		= ping_seq_stop,
 };
 
 static int __net_init ping_v6_proc_init_net(struct net *net)
 {
-	return ping_proc_register(net, &ping_v6_seq_afinfo);
+	if (!proc_create_net("icmp6", 0444, net->proc_net, &ping_v6_seq_ops,
+			sizeof(struct ping_iter_state)))
+		return -ENOMEM;
+	return 0;
 }
 
 static void __net_init ping_v6_proc_exit_net(struct net *net)
 {
-	return ping_proc_unregister(net, &ping_v6_seq_afinfo);
+	remove_proc_entry("icmp6", net->proc_net);
 }
 
 static struct pernet_operations ping_v6_net_ops = {
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index a85f7e0..2356b4a 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -53,18 +53,6 @@ static int sockstat6_seq_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
-static int sockstat6_seq_open(struct inode *inode, struct file *file)
-{
-	return single_open_net(inode, file, sockstat6_seq_show);
-}
-
-static const struct file_operations sockstat6_seq_fops = {
-	.open	 = sockstat6_seq_open,
-	.read	 = seq_read,
-	.llseek	 = seq_lseek,
-	.release = single_release_net,
-};
-
 static const struct snmp_mib snmp6_ipstats_list[] = {
 /* ipv6 mib according to RFC 2465 */
 	SNMP_MIB_ITEM("Ip6InReceives", IPSTATS_MIB_INPKTS),
@@ -242,18 +230,6 @@ static int snmp6_seq_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
-static int snmp6_seq_open(struct inode *inode, struct file *file)
-{
-	return single_open_net(inode, file, snmp6_seq_show);
-}
-
-static const struct file_operations snmp6_seq_fops = {
-	.open	 = snmp6_seq_open,
-	.read	 = seq_read,
-	.llseek	 = seq_lseek,
-	.release = single_release_net,
-};
-
 static int snmp6_dev_seq_show(struct seq_file *seq, void *v)
 {
 	struct inet6_dev *idev = (struct inet6_dev *)seq->private;
@@ -267,18 +243,6 @@ static int snmp6_dev_seq_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
-static int snmp6_dev_seq_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, snmp6_dev_seq_show, PDE_DATA(inode));
-}
-
-static const struct file_operations snmp6_dev_seq_fops = {
-	.open	 = snmp6_dev_seq_open,
-	.read	 = seq_read,
-	.llseek	 = seq_lseek,
-	.release = single_release,
-};
-
 int snmp6_register_dev(struct inet6_dev *idev)
 {
 	struct proc_dir_entry *p;
@@ -291,9 +255,8 @@ int snmp6_register_dev(struct inet6_dev *idev)
 	if (!net->mib.proc_net_devsnmp6)
 		return -ENOENT;
 
-	p = proc_create_data(idev->dev->name, 0444,
-			     net->mib.proc_net_devsnmp6,
-			     &snmp6_dev_seq_fops, idev);
+	p = proc_create_single_data(idev->dev->name, 0444,
+			net->mib.proc_net_devsnmp6, snmp6_dev_seq_show, idev);
 	if (!p)
 		return -ENOMEM;
 
@@ -315,11 +278,12 @@ int snmp6_unregister_dev(struct inet6_dev *idev)
 
 static int __net_init ipv6_proc_init_net(struct net *net)
 {
-	if (!proc_create("sockstat6", 0444, net->proc_net,
-			 &sockstat6_seq_fops))
+	if (!proc_create_net_single("sockstat6", 0444, net->proc_net,
+			sockstat6_seq_show, NULL))
 		return -ENOMEM;
 
-	if (!proc_create("snmp6", 0444, net->proc_net, &snmp6_seq_fops))
+	if (!proc_create_net_single("snmp6", 0444, net->proc_net,
+			snmp6_seq_show, NULL))
 		goto proc_snmp6_fail;
 
 	net->mib.proc_net_devsnmp6 = proc_mkdir("dev_snmp6", net->proc_net);
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 5eb9b08..ce6f0d1 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -1304,21 +1304,10 @@ static const struct seq_operations raw6_seq_ops = {
 	.show =		raw6_seq_show,
 };
 
-static int raw6_seq_open(struct inode *inode, struct file *file)
-{
-	return raw_seq_open(inode, file, &raw_v6_hashinfo, &raw6_seq_ops);
-}
-
-static const struct file_operations raw6_seq_fops = {
-	.open =		raw6_seq_open,
-	.read =		seq_read,
-	.llseek =	seq_lseek,
-	.release =	seq_release_net,
-};
-
 static int __net_init raw6_init_net(struct net *net)
 {
-	if (!proc_create("raw6", 0444, net->proc_net, &raw6_seq_fops))
+	if (!proc_create_net_data("raw6", 0444, net->proc_net, &raw6_seq_ops,
+			sizeof(struct raw_iter_state), &raw_v6_hashinfo))
 		return -ENOMEM;
 
 	return 0;
@@ -1345,7 +1334,7 @@ void raw6_proc_exit(void)
 }
 #endif	/* CONFIG_PROC_FS */
 
-/* Same as inet6_dgram_ops, sans udp_poll.  */
+/* Same as inet6_dgram_ops, sans udp_poll_mask.  */
 const struct proto_ops inet6_sockraw_ops = {
 	.family		   = PF_INET6,
 	.owner		   = THIS_MODULE,
@@ -1355,7 +1344,7 @@ const struct proto_ops inet6_sockraw_ops = {
 	.socketpair	   = sock_no_socketpair,	/* a do nothing	*/
 	.accept		   = sock_no_accept,		/* a do nothing	*/
 	.getname	   = inet6_getname,
-	.poll		   = datagram_poll,		/* ok		*/
+	.poll_mask	   = datagram_poll_mask,	/* ok		*/
 	.ioctl		   = inet6_ioctl,		/* must change  */
 	.listen		   = sock_no_listen,		/* ok		*/
 	.shutdown	   = inet_shutdown,		/* ok		*/
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index f4d6173..a659876 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -4862,14 +4862,6 @@ static int ip6_route_dev_notify(struct notifier_block *this,
  */
 
 #ifdef CONFIG_PROC_FS
-
-static const struct file_operations ipv6_route_proc_fops = {
-	.open		= ipv6_route_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release_net,
-};
-
 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
 {
 	struct net *net = (struct net *)seq->private;
@@ -4884,18 +4876,6 @@ static int rt6_stats_seq_show(struct seq_file *seq, void *v)
 
 	return 0;
 }
-
-static int rt6_stats_seq_open(struct inode *inode, struct file *file)
-{
-	return single_open_net(inode, file, rt6_stats_seq_show);
-}
-
-static const struct file_operations rt6_stats_seq_fops = {
-	.open	 = rt6_stats_seq_open,
-	.read	 = seq_read,
-	.llseek	 = seq_lseek,
-	.release = single_release_net,
-};
 #endif	/* CONFIG_PROC_FS */
 
 #ifdef CONFIG_SYSCTL
@@ -5100,8 +5080,10 @@ static void __net_exit ip6_route_net_exit(struct net *net)
 static int __net_init ip6_route_net_init_late(struct net *net)
 {
 #ifdef CONFIG_PROC_FS
-	proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
-	proc_create("rt6_stats", 0444, net->proc_net, &rt6_stats_seq_fops);
+	proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
+			sizeof(struct ipv6_route_iter));
+	proc_create_net_single("rt6_stats", 0444, net->proc_net,
+			rt6_stats_seq_show, NULL);
 #endif
 	return 0;
 }
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 6d664d8..d2ce66b 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1909,30 +1909,28 @@ out:
 	return 0;
 }
 
-static const struct file_operations tcp6_afinfo_seq_fops = {
-	.open    = tcp_seq_open,
-	.read    = seq_read,
-	.llseek  = seq_lseek,
-	.release = seq_release_net
+static const struct seq_operations tcp6_seq_ops = {
+	.show		= tcp6_seq_show,
+	.start		= tcp_seq_start,
+	.next		= tcp_seq_next,
+	.stop		= tcp_seq_stop,
 };
 
 static struct tcp_seq_afinfo tcp6_seq_afinfo = {
-	.name		= "tcp6",
 	.family		= AF_INET6,
-	.seq_fops	= &tcp6_afinfo_seq_fops,
-	.seq_ops	= {
-		.show		= tcp6_seq_show,
-	},
 };
 
 int __net_init tcp6_proc_init(struct net *net)
 {
-	return tcp_proc_register(net, &tcp6_seq_afinfo);
+	if (!proc_create_net_data("tcp6", 0444, net->proc_net, &tcp6_seq_ops,
+			sizeof(struct tcp_iter_state), &tcp6_seq_afinfo))
+		return -ENOMEM;
+	return 0;
 }
 
 void tcp6_proc_exit(struct net *net)
 {
-	tcp_proc_unregister(net, &tcp6_seq_afinfo);
+	remove_proc_entry("tcp6", net->proc_net);
 }
 #endif
 
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index ea07300..00e2112 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1480,31 +1480,30 @@ int udp6_seq_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
-static const struct file_operations udp6_afinfo_seq_fops = {
-	.open     = udp_seq_open,
-	.read     = seq_read,
-	.llseek   = seq_lseek,
-	.release  = seq_release_net
+const struct seq_operations udp6_seq_ops = {
+	.start		= udp_seq_start,
+	.next		= udp_seq_next,
+	.stop		= udp_seq_stop,
+	.show		= udp6_seq_show,
 };
+EXPORT_SYMBOL(udp6_seq_ops);
 
 static struct udp_seq_afinfo udp6_seq_afinfo = {
-	.name		= "udp6",
 	.family		= AF_INET6,
 	.udp_table	= &udp_table,
-	.seq_fops	= &udp6_afinfo_seq_fops,
-	.seq_ops	= {
-		.show		= udp6_seq_show,
-	},
 };
 
 int __net_init udp6_proc_init(struct net *net)
 {
-	return udp_proc_register(net, &udp6_seq_afinfo);
+	if (!proc_create_net_data("udp6", 0444, net->proc_net, &udp6_seq_ops,
+			sizeof(struct udp_iter_state), &udp6_seq_afinfo))
+		return -ENOMEM;
+	return 0;
 }
 
 void udp6_proc_exit(struct net *net)
 {
-	udp_proc_unregister(net, &udp6_seq_afinfo);
+	remove_proc_entry("udp6", net->proc_net);
 }
 #endif /* CONFIG_PROC_FS */
 
diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c
index 14ae32b..5000ad6 100644
--- a/net/ipv6/udplite.c
+++ b/net/ipv6/udplite.c
@@ -12,6 +12,7 @@
  *		2 of the License, or (at your option) any later version.
  */
 #include <linux/export.h>
+#include <linux/proc_fs.h>
 #include "udp_impl.h"
 
 static int udplitev6_rcv(struct sk_buff *skb)
@@ -92,32 +93,23 @@ void udplitev6_exit(void)
 }
 
 #ifdef CONFIG_PROC_FS
-
-static const struct file_operations udplite6_afinfo_seq_fops = {
-	.open     = udp_seq_open,
-	.read     = seq_read,
-	.llseek   = seq_lseek,
-	.release  = seq_release_net
-};
-
 static struct udp_seq_afinfo udplite6_seq_afinfo = {
-	.name		= "udplite6",
 	.family		= AF_INET6,
 	.udp_table	= &udplite_table,
-	.seq_fops	= &udplite6_afinfo_seq_fops,
-	.seq_ops	= {
-		.show		= udp6_seq_show,
-	},
 };
 
 static int __net_init udplite6_proc_init_net(struct net *net)
 {
-	return udp_proc_register(net, &udplite6_seq_afinfo);
+	if (!proc_create_net_data("udplite6", 0444, net->proc_net,
+			&udp6_seq_ops, sizeof(struct udp_iter_state),
+			&udplite6_seq_afinfo))
+		return -ENOMEM;
+	return 0;
 }
 
 static void __net_exit udplite6_proc_exit_net(struct net *net)
 {
-	udp_proc_unregister(net, &udplite6_seq_afinfo);
+	remove_proc_entry("udplite6", net->proc_net);
 }
 
 static struct pernet_operations udplite6_net_ops = {
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index 893a022..68e8625 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -1488,14 +1488,11 @@ static inline __poll_t iucv_accept_poll(struct sock *parent)
 	return 0;
 }
 
-__poll_t iucv_sock_poll(struct file *file, struct socket *sock,
-			    poll_table *wait)
+static __poll_t iucv_sock_poll_mask(struct socket *sock, __poll_t events)
 {
 	struct sock *sk = sock->sk;
 	__poll_t mask = 0;
 
-	sock_poll_wait(file, sk_sleep(sk), wait);
-
 	if (sk->sk_state == IUCV_LISTEN)
 		return iucv_accept_poll(sk);
 
@@ -2388,7 +2385,7 @@ static const struct proto_ops iucv_sock_ops = {
 	.getname	= iucv_sock_getname,
 	.sendmsg	= iucv_sock_sendmsg,
 	.recvmsg	= iucv_sock_recvmsg,
-	.poll		= iucv_sock_poll,
+	.poll_mask	= iucv_sock_poll_mask,
 	.ioctl		= sock_no_ioctl,
 	.mmap		= sock_no_mmap,
 	.socketpair	= sock_no_socketpair,
diff --git a/net/kcm/kcmproc.c b/net/kcm/kcmproc.c
index 1fac925..370da2f 100644
--- a/net/kcm/kcmproc.c
+++ b/net/kcm/kcmproc.c
@@ -15,12 +15,6 @@
 #include <net/tcp.h>
 
 #ifdef CONFIG_PROC_FS
-struct kcm_seq_muxinfo {
-	char				*name;
-	const struct file_operations	*seq_fops;
-	const struct seq_operations	seq_ops;
-};
-
 static struct kcm_mux *kcm_get_first(struct seq_file *seq)
 {
 	struct net *net = seq_file_net(seq);
@@ -86,14 +80,6 @@ struct kcm_proc_mux_state {
 	int idx;
 };
 
-static int kcm_seq_open(struct inode *inode, struct file *file)
-{
-	struct kcm_seq_muxinfo *muxinfo = PDE_DATA(inode);
-
-	return seq_open_net(inode, file, &muxinfo->seq_ops,
-			   sizeof(struct kcm_proc_mux_state));
-}
-
 static void kcm_format_mux_header(struct seq_file *seq)
 {
 	struct net *net = seq_file_net(seq);
@@ -246,44 +232,13 @@ static int kcm_seq_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
-static const struct file_operations kcm_seq_fops = {
-	.open		= kcm_seq_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release_net,
+static const struct seq_operations kcm_seq_ops = {
+	.show	= kcm_seq_show,
+	.start	= kcm_seq_start,
+	.next	= kcm_seq_next,
+	.stop	= kcm_seq_stop,
 };
 
-static struct kcm_seq_muxinfo kcm_seq_muxinfo = {
-	.name		= "kcm",
-	.seq_fops	= &kcm_seq_fops,
-	.seq_ops	= {
-		.show	= kcm_seq_show,
-		.start	= kcm_seq_start,
-		.next	= kcm_seq_next,
-		.stop	= kcm_seq_stop,
-	}
-};
-
-static int kcm_proc_register(struct net *net, struct kcm_seq_muxinfo *muxinfo)
-{
-	struct proc_dir_entry *p;
-	int rc = 0;
-
-	p = proc_create_data(muxinfo->name, 0444, net->proc_net,
-			     muxinfo->seq_fops, muxinfo);
-	if (!p)
-		rc = -ENOMEM;
-	return rc;
-}
-EXPORT_SYMBOL(kcm_proc_register);
-
-static void kcm_proc_unregister(struct net *net,
-				struct kcm_seq_muxinfo *muxinfo)
-{
-	remove_proc_entry(muxinfo->name, net->proc_net);
-}
-EXPORT_SYMBOL(kcm_proc_unregister);
-
 static int kcm_stats_seq_show(struct seq_file *seq, void *v)
 {
 	struct kcm_psock_stats psock_stats;
@@ -390,30 +345,14 @@ static int kcm_stats_seq_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
-static int kcm_stats_seq_open(struct inode *inode, struct file *file)
-{
-	return single_open_net(inode, file, kcm_stats_seq_show);
-}
-
-static const struct file_operations kcm_stats_seq_fops = {
-	.open    = kcm_stats_seq_open,
-	.read    = seq_read,
-	.llseek  = seq_lseek,
-	.release = single_release_net,
-};
-
 static int kcm_proc_init_net(struct net *net)
 {
-	int err;
-
-	if (!proc_create("kcm_stats", 0444, net->proc_net,
-			 &kcm_stats_seq_fops)) {
-		err = -ENOMEM;
+	if (!proc_create_net_single("kcm_stats", 0444, net->proc_net,
+			 kcm_stats_seq_show, NULL))
 		goto out_kcm_stats;
-	}
 
-	err = kcm_proc_register(net, &kcm_seq_muxinfo);
-	if (err)
+	if (!proc_create_net("kcm", 0444, net->proc_net, &kcm_seq_ops,
+			sizeof(struct kcm_proc_mux_state)))
 		goto out_kcm;
 
 	return 0;
@@ -421,12 +360,12 @@ static int kcm_proc_init_net(struct net *net)
 out_kcm:
 	remove_proc_entry("kcm_stats", net->proc_net);
 out_kcm_stats:
-	return err;
+	return -ENOMEM;
 }
 
 static void kcm_proc_exit_net(struct net *net)
 {
-	kcm_proc_unregister(net, &kcm_seq_muxinfo);
+	remove_proc_entry("kcm", net->proc_net);
 	remove_proc_entry("kcm_stats", net->proc_net);
 }
 
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index d3601d4..84b7d5c 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -1336,9 +1336,9 @@ static void init_kcm_sock(struct kcm_sock *kcm, struct kcm_mux *mux)
 	struct list_head *head;
 	int index = 0;
 
-	/* For SOCK_SEQPACKET sock type, datagram_poll checks the sk_state, so
-	 * we set sk_state, otherwise epoll_wait always returns right away with
-	 * EPOLLHUP
+	/* For SOCK_SEQPACKET sock type, datagram_poll_mask checks the sk_state,
+	 * so  we set sk_state, otherwise epoll_wait always returns right away
+	 * with EPOLLHUP
 	 */
 	kcm->sk.sk_state = TCP_ESTABLISHED;
 
@@ -1903,7 +1903,7 @@ static const struct proto_ops kcm_dgram_ops = {
 	.socketpair =	sock_no_socketpair,
 	.accept =	sock_no_accept,
 	.getname =	sock_no_getname,
-	.poll =		datagram_poll,
+	.poll_mask =	datagram_poll_mask,
 	.ioctl =	kcm_ioctl,
 	.listen =	sock_no_listen,
 	.shutdown =	sock_no_shutdown,
@@ -1924,7 +1924,7 @@ static const struct proto_ops kcm_seqpacket_ops = {
 	.socketpair =	sock_no_socketpair,
 	.accept =	sock_no_accept,
 	.getname =	sock_no_getname,
-	.poll =		datagram_poll,
+	.poll_mask =	datagram_poll_mask,
 	.ioctl =	kcm_ioctl,
 	.listen =	sock_no_listen,
 	.shutdown =	sock_no_shutdown,
diff --git a/net/key/af_key.c b/net/key/af_key.c
index e62e52e..8bdc1cb 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -3751,7 +3751,7 @@ static const struct proto_ops pfkey_ops = {
 
 	/* Now the operations that really occur. */
 	.release	=	pfkey_release,
-	.poll		=	datagram_poll,
+	.poll_mask	=	datagram_poll_mask,
 	.sendmsg	=	pfkey_sendmsg,
 	.recvmsg	=	pfkey_recvmsg,
 };
@@ -3812,24 +3812,12 @@ static const struct seq_operations pfkey_seq_ops = {
 	.show	= pfkey_seq_show,
 };
 
-static int pfkey_seq_open(struct inode *inode, struct file *file)
-{
-	return seq_open_net(inode, file, &pfkey_seq_ops,
-			    sizeof(struct seq_net_private));
-}
-
-static const struct file_operations pfkey_proc_ops = {
-	.open	 = pfkey_seq_open,
-	.read	 = seq_read,
-	.llseek	 = seq_lseek,
-	.release = seq_release_net,
-};
-
 static int __net_init pfkey_init_proc(struct net *net)
 {
 	struct proc_dir_entry *e;
 
-	e = proc_create("pfkey", 0, net->proc_net, &pfkey_proc_ops);
+	e = proc_create_net("pfkey", 0, net->proc_net, &pfkey_seq_ops,
+			sizeof(struct seq_net_private));
 	if (e == NULL)
 		return -ENOMEM;
 
diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index a9c05b2..181073b 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -613,7 +613,7 @@ static const struct proto_ops l2tp_ip_ops = {
 	.socketpair	   = sock_no_socketpair,
 	.accept		   = sock_no_accept,
 	.getname	   = l2tp_ip_getname,
-	.poll		   = datagram_poll,
+	.poll_mask	   = datagram_poll_mask,
 	.ioctl		   = inet_ioctl,
 	.listen		   = sock_no_listen,
 	.shutdown	   = inet_shutdown,
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index 9573691..336e4c0 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -754,7 +754,7 @@ static const struct proto_ops l2tp_ip6_ops = {
 	.socketpair	   = sock_no_socketpair,
 	.accept		   = sock_no_accept,
 	.getname	   = l2tp_ip6_getname,
-	.poll		   = datagram_poll,
+	.poll_mask	   = datagram_poll_mask,
 	.ioctl		   = inet6_ioctl,
 	.listen		   = sock_no_listen,
 	.shutdown	   = inet_shutdown,
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index 1fd9e14..3d8ca12 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -1742,24 +1742,6 @@ static const struct seq_operations pppol2tp_seq_ops = {
 	.stop		= pppol2tp_seq_stop,
 	.show		= pppol2tp_seq_show,
 };
-
-/* Called when our /proc file is opened. We allocate data for use when
- * iterating our tunnel / session contexts and store it in the private
- * data of the seq_file.
- */
-static int pppol2tp_proc_open(struct inode *inode, struct file *file)
-{
-	return seq_open_net(inode, file, &pppol2tp_seq_ops,
-			    sizeof(struct pppol2tp_seq_data));
-}
-
-static const struct file_operations pppol2tp_proc_fops = {
-	.open		= pppol2tp_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release_net,
-};
-
 #endif /* CONFIG_PROC_FS */
 
 /*****************************************************************************
@@ -1771,8 +1753,8 @@ static __net_init int pppol2tp_init_net(struct net *net)
 	struct proc_dir_entry *pde;
 	int err = 0;
 
-	pde = proc_create("pppol2tp", 0444, net->proc_net,
-			  &pppol2tp_proc_fops);
+	pde = proc_create_net("pppol2tp", 0444, net->proc_net,
+			&pppol2tp_seq_ops, sizeof(struct pppol2tp_seq_data));
 	if (!pde) {
 		err = -ENOMEM;
 		goto out;
@@ -1806,7 +1788,7 @@ static const struct proto_ops pppol2tp_ops = {
 	.socketpair	= sock_no_socketpair,
 	.accept		= sock_no_accept,
 	.getname	= pppol2tp_getname,
-	.poll		= datagram_poll,
+	.poll_mask	= datagram_poll_mask,
 	.listen		= sock_no_listen,
 	.shutdown	= sock_no_shutdown,
 	.setsockopt	= pppol2tp_setsockopt,
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 1beeea9..804de84 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -1192,7 +1192,7 @@ static const struct proto_ops llc_ui_ops = {
 	.socketpair  = sock_no_socketpair,
 	.accept      = llc_ui_accept,
 	.getname     = llc_ui_getname,
-	.poll	     = datagram_poll,
+	.poll_mask   = datagram_poll_mask,
 	.ioctl       = llc_ui_ioctl,
 	.listen      = llc_ui_listen,
 	.shutdown    = llc_ui_shutdown,
diff --git a/net/llc/llc_proc.c b/net/llc/llc_proc.c
index 62ea0ae..f3a36c1 100644
--- a/net/llc/llc_proc.c
+++ b/net/llc/llc_proc.c
@@ -214,30 +214,6 @@ static const struct seq_operations llc_seq_core_ops = {
 	.show   = llc_seq_core_show,
 };
 
-static int llc_seq_socket_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &llc_seq_socket_ops);
-}
-
-static int llc_seq_core_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &llc_seq_core_ops);
-}
-
-static const struct file_operations llc_seq_socket_fops = {
-	.open		= llc_seq_socket_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
-static const struct file_operations llc_seq_core_fops = {
-	.open		= llc_seq_core_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
 static struct proc_dir_entry *llc_proc_dir;
 
 int __init llc_proc_init(void)
@@ -249,11 +225,11 @@ int __init llc_proc_init(void)
 	if (!llc_proc_dir)
 		goto out;
 
-	p = proc_create("socket", 0444, llc_proc_dir, &llc_seq_socket_fops);
+	p = proc_create_seq("socket", 0444, llc_proc_dir, &llc_seq_socket_ops);
 	if (!p)
 		goto out_socket;
 
-	p = proc_create("core", 0444, llc_proc_dir, &llc_seq_core_fops);
+	p = proc_create_seq("core", 0444, llc_proc_dir, &llc_seq_core_ops);
 	if (!p)
 		goto out_core;
 
diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c
index 1c98c90..c3db074 100644
--- a/net/netfilter/ipvs/ip_vs_app.c
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -587,25 +587,13 @@ static const struct seq_operations ip_vs_app_seq_ops = {
 	.stop  = ip_vs_app_seq_stop,
 	.show  = ip_vs_app_seq_show,
 };
-
-static int ip_vs_app_open(struct inode *inode, struct file *file)
-{
-	return seq_open_net(inode, file, &ip_vs_app_seq_ops,
-			    sizeof(struct seq_net_private));
-}
-
-static const struct file_operations ip_vs_app_fops = {
-	.open	 = ip_vs_app_open,
-	.read	 = seq_read,
-	.llseek  = seq_lseek,
-	.release = seq_release_net,
-};
 #endif
 
 int __net_init ip_vs_app_net_init(struct netns_ipvs *ipvs)
 {
 	INIT_LIST_HEAD(&ipvs->app_list);
-	proc_create("ip_vs_app", 0, ipvs->net->proc_net, &ip_vs_app_fops);
+	proc_create_net("ip_vs_app", 0, ipvs->net->proc_net, &ip_vs_app_seq_ops,
+			sizeof(struct seq_net_private));
 	return 0;
 }
 
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 75de465..61c3a38 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -1131,19 +1131,6 @@ static const struct seq_operations ip_vs_conn_seq_ops = {
 	.show  = ip_vs_conn_seq_show,
 };
 
-static int ip_vs_conn_open(struct inode *inode, struct file *file)
-{
-	return seq_open_net(inode, file, &ip_vs_conn_seq_ops,
-			    sizeof(struct ip_vs_iter_state));
-}
-
-static const struct file_operations ip_vs_conn_fops = {
-	.open    = ip_vs_conn_open,
-	.read    = seq_read,
-	.llseek  = seq_lseek,
-	.release = seq_release_net,
-};
-
 static const char *ip_vs_origin_name(unsigned int flags)
 {
 	if (flags & IP_VS_CONN_F_SYNC)
@@ -1207,20 +1194,6 @@ static const struct seq_operations ip_vs_conn_sync_seq_ops = {
 	.stop  = ip_vs_conn_seq_stop,
 	.show  = ip_vs_conn_sync_seq_show,
 };
-
-static int ip_vs_conn_sync_open(struct inode *inode, struct file *file)
-{
-	return seq_open_net(inode, file, &ip_vs_conn_sync_seq_ops,
-			    sizeof(struct ip_vs_iter_state));
-}
-
-static const struct file_operations ip_vs_conn_sync_fops = {
-	.open    = ip_vs_conn_sync_open,
-	.read    = seq_read,
-	.llseek  = seq_lseek,
-	.release = seq_release_net,
-};
-
 #endif
 
 
@@ -1380,9 +1353,11 @@ int __net_init ip_vs_conn_net_init(struct netns_ipvs *ipvs)
 {
 	atomic_set(&ipvs->conn_count, 0);
 
-	proc_create("ip_vs_conn", 0, ipvs->net->proc_net, &ip_vs_conn_fops);
-	proc_create("ip_vs_conn_sync", 0, ipvs->net->proc_net,
-		    &ip_vs_conn_sync_fops);
+	proc_create_net("ip_vs_conn", 0, ipvs->net->proc_net,
+			&ip_vs_conn_seq_ops, sizeof(struct ip_vs_iter_state));
+	proc_create_net("ip_vs_conn_sync", 0, ipvs->net->proc_net,
+			&ip_vs_conn_sync_seq_ops,
+			sizeof(struct ip_vs_iter_state));
 	return 0;
 }
 
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 3ecca06..141b150 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -2109,19 +2109,6 @@ static const struct seq_operations ip_vs_info_seq_ops = {
 	.show  = ip_vs_info_seq_show,
 };
 
-static int ip_vs_info_open(struct inode *inode, struct file *file)
-{
-	return seq_open_net(inode, file, &ip_vs_info_seq_ops,
-			sizeof(struct ip_vs_iter));
-}
-
-static const struct file_operations ip_vs_info_fops = {
-	.open    = ip_vs_info_open,
-	.read    = seq_read,
-	.llseek  = seq_lseek,
-	.release = seq_release_net,
-};
-
 static int ip_vs_stats_show(struct seq_file *seq, void *v)
 {
 	struct net *net = seq_file_single_net(seq);
@@ -2154,18 +2141,6 @@ static int ip_vs_stats_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
-static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
-{
-	return single_open_net(inode, file, ip_vs_stats_show);
-}
-
-static const struct file_operations ip_vs_stats_fops = {
-	.open = ip_vs_stats_seq_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release_net,
-};
-
 static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v)
 {
 	struct net *net = seq_file_single_net(seq);
@@ -2221,18 +2196,6 @@ static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v)
 
 	return 0;
 }
-
-static int ip_vs_stats_percpu_seq_open(struct inode *inode, struct file *file)
-{
-	return single_open_net(inode, file, ip_vs_stats_percpu_show);
-}
-
-static const struct file_operations ip_vs_stats_percpu_fops = {
-	.open = ip_vs_stats_percpu_seq_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release_net,
-};
 #endif
 
 /*
@@ -4039,10 +4002,12 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
 
 	spin_lock_init(&ipvs->tot_stats.lock);
 
-	proc_create("ip_vs", 0, ipvs->net->proc_net, &ip_vs_info_fops);
-	proc_create("ip_vs_stats", 0, ipvs->net->proc_net, &ip_vs_stats_fops);
-	proc_create("ip_vs_stats_percpu", 0, ipvs->net->proc_net,
-		    &ip_vs_stats_percpu_fops);
+	proc_create_net("ip_vs", 0, ipvs->net->proc_net, &ip_vs_info_seq_ops,
+			sizeof(struct ip_vs_iter));
+	proc_create_net_single("ip_vs_stats", 0, ipvs->net->proc_net,
+			ip_vs_stats_show, NULL);
+	proc_create_net_single("ip_vs_stats_percpu", 0, ipvs->net->proc_net,
+			ip_vs_stats_percpu_show, NULL);
 
 	if (ip_vs_control_net_init_sysctl(ipvs))
 		goto err;
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index 4b2b3d5..853b232 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -644,19 +644,6 @@ static const struct seq_operations exp_seq_ops = {
 	.stop = exp_seq_stop,
 	.show = exp_seq_show
 };
-
-static int exp_open(struct inode *inode, struct file *file)
-{
-	return seq_open_net(inode, file, &exp_seq_ops,
-			sizeof(struct ct_expect_iter_state));
-}
-
-static const struct file_operations exp_file_ops = {
-	.open    = exp_open,
-	.read    = seq_read,
-	.llseek  = seq_lseek,
-	.release = seq_release_net,
-};
 #endif /* CONFIG_NF_CONNTRACK_PROCFS */
 
 static int exp_proc_init(struct net *net)
@@ -666,8 +653,8 @@ static int exp_proc_init(struct net *net)
 	kuid_t root_uid;
 	kgid_t root_gid;
 
-	proc = proc_create("nf_conntrack_expect", 0440, net->proc_net,
-			   &exp_file_ops);
+	proc = proc_create_net("nf_conntrack_expect", 0440, net->proc_net,
+			&exp_seq_ops, sizeof(struct ct_expect_iter_state));
 	if (!proc)
 		return -ENOMEM;
 
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 037fec5..b642c0b 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -375,19 +375,6 @@ static const struct seq_operations ct_seq_ops = {
 	.show  = ct_seq_show
 };
 
-static int ct_open(struct inode *inode, struct file *file)
-{
-	return seq_open_net(inode, file, &ct_seq_ops,
-			sizeof(struct ct_iter_state));
-}
-
-static const struct file_operations ct_file_ops = {
-	.open    = ct_open,
-	.read    = seq_read,
-	.llseek  = seq_lseek,
-	.release = seq_release_net,
-};
-
 static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 {
 	struct net *net = seq_file_net(seq);
@@ -467,26 +454,14 @@ static const struct seq_operations ct_cpu_seq_ops = {
 	.show	= ct_cpu_seq_show,
 };
 
-static int ct_cpu_seq_open(struct inode *inode, struct file *file)
-{
-	return seq_open_net(inode, file, &ct_cpu_seq_ops,
-			    sizeof(struct seq_net_private));
-}
-
-static const struct file_operations ct_cpu_seq_fops = {
-	.open	 = ct_cpu_seq_open,
-	.read	 = seq_read,
-	.llseek	 = seq_lseek,
-	.release = seq_release_net,
-};
-
 static int nf_conntrack_standalone_init_proc(struct net *net)
 {
 	struct proc_dir_entry *pde;
 	kuid_t root_uid;
 	kgid_t root_gid;
 
-	pde = proc_create("nf_conntrack", 0440, net->proc_net, &ct_file_ops);
+	pde = proc_create_net("nf_conntrack", 0440, net->proc_net, &ct_seq_ops,
+			sizeof(struct ct_iter_state));
 	if (!pde)
 		goto out_nf_conntrack;
 
@@ -495,8 +470,8 @@ static int nf_conntrack_standalone_init_proc(struct net *net)
 	if (uid_valid(root_uid) && gid_valid(root_gid))
 		proc_set_user(pde, root_uid, root_gid);
 
-	pde = proc_create("nf_conntrack", 0444, net->proc_net_stat,
-			  &ct_cpu_seq_fops);
+	pde = proc_create_net("nf_conntrack", 0444, net->proc_net_stat,
+			&ct_cpu_seq_ops, sizeof(struct seq_net_private));
 	if (!pde)
 		goto out_stat_nf_conntrack;
 	return 0;
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index 6d03578..4264570 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -394,21 +394,6 @@ static const struct seq_operations nflog_seq_ops = {
 	.stop	= seq_stop,
 	.show	= seq_show,
 };
-
-static int nflog_open(struct inode *inode, struct file *file)
-{
-	return seq_open_net(inode, file, &nflog_seq_ops,
-			    sizeof(struct seq_net_private));
-}
-
-static const struct file_operations nflog_file_ops = {
-	.open	 = nflog_open,
-	.read	 = seq_read,
-	.llseek	 = seq_lseek,
-	.release = seq_release_net,
-};
-
-
 #endif /* PROC_FS */
 
 #ifdef CONFIG_SYSCTL
@@ -549,8 +534,8 @@ static int __net_init nf_log_net_init(struct net *net)
 	int ret = -ENOMEM;
 
 #ifdef CONFIG_PROC_FS
-	if (!proc_create("nf_log", 0444,
-			 net->nf.proc_netfilter, &nflog_file_ops))
+	if (!proc_create_net("nf_log", 0444, net->nf.proc_netfilter,
+			&nflog_seq_ops, sizeof(struct seq_net_private)))
 		return ret;
 #endif
 	ret = netfilter_log_sysctl_init(net);
diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c
index 6039b35..8ff4d22 100644
--- a/net/netfilter/nf_synproxy_core.c
+++ b/net/netfilter/nf_synproxy_core.c
@@ -310,23 +310,10 @@ static const struct seq_operations synproxy_cpu_seq_ops = {
 	.show		= synproxy_cpu_seq_show,
 };
 
-static int synproxy_cpu_seq_open(struct inode *inode, struct file *file)
-{
-	return seq_open_net(inode, file, &synproxy_cpu_seq_ops,
-			    sizeof(struct seq_net_private));
-}
-
-static const struct file_operations synproxy_cpu_seq_fops = {
-	.open		= synproxy_cpu_seq_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release_net,
-};
-
 static int __net_init synproxy_proc_init(struct net *net)
 {
-	if (!proc_create("synproxy", 0444, net->proc_net_stat,
-			 &synproxy_cpu_seq_fops))
+	if (!proc_create_net("synproxy", 0444, net->proc_net_stat,
+			&synproxy_cpu_seq_ops, sizeof(struct seq_net_private)))
 		return -ENOMEM;
 	return 0;
 }
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index 7b46aa4..c14822b 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -1046,20 +1046,6 @@ static const struct seq_operations nful_seq_ops = {
 	.stop	= seq_stop,
 	.show	= seq_show,
 };
-
-static int nful_open(struct inode *inode, struct file *file)
-{
-	return seq_open_net(inode, file, &nful_seq_ops,
-			    sizeof(struct iter_state));
-}
-
-static const struct file_operations nful_file_ops = {
-	.open	 = nful_open,
-	.read	 = seq_read,
-	.llseek	 = seq_lseek,
-	.release = seq_release_net,
-};
-
 #endif /* PROC_FS */
 
 static int __net_init nfnl_log_net_init(struct net *net)
@@ -1077,8 +1063,8 @@ static int __net_init nfnl_log_net_init(struct net *net)
 	spin_lock_init(&log->instances_lock);
 
 #ifdef CONFIG_PROC_FS
-	proc = proc_create("nfnetlink_log", 0440,
-			   net->nf.proc_netfilter, &nful_file_ops);
+	proc = proc_create_net("nfnetlink_log", 0440, net->nf.proc_netfilter,
+			&nful_seq_ops, sizeof(struct iter_state));
 	if (!proc)
 		return -ENOMEM;
 
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 74a0463..494a9ab 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -1469,20 +1469,6 @@ static const struct seq_operations nfqnl_seq_ops = {
 	.stop	= seq_stop,
 	.show	= seq_show,
 };
-
-static int nfqnl_open(struct inode *inode, struct file *file)
-{
-	return seq_open_net(inode, file, &nfqnl_seq_ops,
-			sizeof(struct iter_state));
-}
-
-static const struct file_operations nfqnl_file_ops = {
-	.open	 = nfqnl_open,
-	.read	 = seq_read,
-	.llseek	 = seq_lseek,
-	.release = seq_release_net,
-};
-
 #endif /* PROC_FS */
 
 static int __net_init nfnl_queue_net_init(struct net *net)
@@ -1496,8 +1482,8 @@ static int __net_init nfnl_queue_net_init(struct net *net)
 	spin_lock_init(&q->instances_lock);
 
 #ifdef CONFIG_PROC_FS
-	if (!proc_create("nfnetlink_queue", 0440,
-			 net->nf.proc_netfilter, &nfqnl_file_ops))
+	if (!proc_create_net("nfnetlink_queue", 0440, net->nf.proc_netfilter,
+			&nfqnl_seq_ops, sizeof(struct iter_state)))
 		return -ENOMEM;
 #endif
 	nf_register_queue_handler(net, &nfqh);
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index cb7cb30..55cb4d1 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -1495,15 +1495,10 @@ void *xt_unregister_table(struct xt_table *table)
 EXPORT_SYMBOL_GPL(xt_unregister_table);
 
 #ifdef CONFIG_PROC_FS
-struct xt_names_priv {
-	struct seq_net_private p;
-	u_int8_t af;
-};
 static void *xt_table_seq_start(struct seq_file *seq, loff_t *pos)
 {
-	struct xt_names_priv *priv = seq->private;
 	struct net *net = seq_file_net(seq);
-	u_int8_t af = priv->af;
+	u_int8_t af = (unsigned long)PDE_DATA(file_inode(seq->file));
 
 	mutex_lock(&xt[af].mutex);
 	return seq_list_start(&net->xt.tables[af], *pos);
@@ -1511,17 +1506,15 @@ static void *xt_table_seq_start(struct seq_file *seq, loff_t *pos)
 
 static void *xt_table_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
-	struct xt_names_priv *priv = seq->private;
 	struct net *net = seq_file_net(seq);
-	u_int8_t af = priv->af;
+	u_int8_t af = (unsigned long)PDE_DATA(file_inode(seq->file));
 
 	return seq_list_next(v, &net->xt.tables[af], pos);
 }
 
 static void xt_table_seq_stop(struct seq_file *seq, void *v)
 {
-	struct xt_names_priv *priv = seq->private;
-	u_int8_t af = priv->af;
+	u_int8_t af = (unsigned long)PDE_DATA(file_inode(seq->file));
 
 	mutex_unlock(&xt[af].mutex);
 }
@@ -1542,34 +1535,13 @@ static const struct seq_operations xt_table_seq_ops = {
 	.show	= xt_table_seq_show,
 };
 
-static int xt_table_open(struct inode *inode, struct file *file)
-{
-	int ret;
-	struct xt_names_priv *priv;
-
-	ret = seq_open_net(inode, file, &xt_table_seq_ops,
-			   sizeof(struct xt_names_priv));
-	if (!ret) {
-		priv = ((struct seq_file *)file->private_data)->private;
-		priv->af = (unsigned long)PDE_DATA(inode);
-	}
-	return ret;
-}
-
-static const struct file_operations xt_table_ops = {
-	.open	 = xt_table_open,
-	.read	 = seq_read,
-	.llseek	 = seq_lseek,
-	.release = seq_release_net,
-};
-
 /*
  * Traverse state for ip{,6}_{tables,matches} for helping crossing
  * the multi-AF mutexes.
  */
 struct nf_mttg_trav {
 	struct list_head *head, *curr;
-	uint8_t class, nfproto;
+	uint8_t class;
 };
 
 enum {
@@ -1586,6 +1558,7 @@ static void *xt_mttg_seq_next(struct seq_file *seq, void *v, loff_t *ppos,
 		[MTTG_TRAV_NFP_UNSPEC] = MTTG_TRAV_NFP_SPEC,
 		[MTTG_TRAV_NFP_SPEC]   = MTTG_TRAV_DONE,
 	};
+	uint8_t nfproto = (unsigned long)PDE_DATA(file_inode(seq->file));
 	struct nf_mttg_trav *trav = seq->private;
 
 	switch (trav->class) {
@@ -1600,9 +1573,9 @@ static void *xt_mttg_seq_next(struct seq_file *seq, void *v, loff_t *ppos,
 		if (trav->curr != trav->head)
 			break;
 		mutex_unlock(&xt[NFPROTO_UNSPEC].mutex);
-		mutex_lock(&xt[trav->nfproto].mutex);
+		mutex_lock(&xt[nfproto].mutex);
 		trav->head = trav->curr = is_target ?
-			&xt[trav->nfproto].target : &xt[trav->nfproto].match;
+			&xt[nfproto].target : &xt[nfproto].match;
 		trav->class = next_class[trav->class];
 		break;
 	case MTTG_TRAV_NFP_SPEC:
@@ -1634,6 +1607,7 @@ static void *xt_mttg_seq_start(struct seq_file *seq, loff_t *pos,
 
 static void xt_mttg_seq_stop(struct seq_file *seq, void *v)
 {
+	uint8_t nfproto = (unsigned long)PDE_DATA(file_inode(seq->file));
 	struct nf_mttg_trav *trav = seq->private;
 
 	switch (trav->class) {
@@ -1641,7 +1615,7 @@ static void xt_mttg_seq_stop(struct seq_file *seq, void *v)
 		mutex_unlock(&xt[NFPROTO_UNSPEC].mutex);
 		break;
 	case MTTG_TRAV_NFP_SPEC:
-		mutex_unlock(&xt[trav->nfproto].mutex);
+		mutex_unlock(&xt[nfproto].mutex);
 		break;
 	}
 }
@@ -1680,24 +1654,6 @@ static const struct seq_operations xt_match_seq_ops = {
 	.show	= xt_match_seq_show,
 };
 
-static int xt_match_open(struct inode *inode, struct file *file)
-{
-	struct nf_mttg_trav *trav;
-	trav = __seq_open_private(file, &xt_match_seq_ops, sizeof(*trav));
-	if (!trav)
-		return -ENOMEM;
-
-	trav->nfproto = (unsigned long)PDE_DATA(inode);
-	return 0;
-}
-
-static const struct file_operations xt_match_ops = {
-	.open	 = xt_match_open,
-	.read	 = seq_read,
-	.llseek	 = seq_lseek,
-	.release = seq_release_private,
-};
-
 static void *xt_target_seq_start(struct seq_file *seq, loff_t *pos)
 {
 	return xt_mttg_seq_start(seq, pos, true);
@@ -1732,24 +1688,6 @@ static const struct seq_operations xt_target_seq_ops = {
 	.show	= xt_target_seq_show,
 };
 
-static int xt_target_open(struct inode *inode, struct file *file)
-{
-	struct nf_mttg_trav *trav;
-	trav = __seq_open_private(file, &xt_target_seq_ops, sizeof(*trav));
-	if (!trav)
-		return -ENOMEM;
-
-	trav->nfproto = (unsigned long)PDE_DATA(inode);
-	return 0;
-}
-
-static const struct file_operations xt_target_ops = {
-	.open	 = xt_target_open,
-	.read	 = seq_read,
-	.llseek	 = seq_lseek,
-	.release = seq_release_private,
-};
-
 #define FORMAT_TABLES	"_tables_names"
 #define	FORMAT_MATCHES	"_tables_matches"
 #define FORMAT_TARGETS 	"_tables_targets"
@@ -1813,8 +1751,9 @@ int xt_proto_init(struct net *net, u_int8_t af)
 
 	strlcpy(buf, xt_prefix[af], sizeof(buf));
 	strlcat(buf, FORMAT_TABLES, sizeof(buf));
-	proc = proc_create_data(buf, 0440, net->proc_net, &xt_table_ops,
-				(void *)(unsigned long)af);
+	proc = proc_create_net_data(buf, 0440, net->proc_net, &xt_table_seq_ops,
+			sizeof(struct seq_net_private),
+			(void *)(unsigned long)af);
 	if (!proc)
 		goto out;
 	if (uid_valid(root_uid) && gid_valid(root_gid))
@@ -1822,8 +1761,9 @@ int xt_proto_init(struct net *net, u_int8_t af)
 
 	strlcpy(buf, xt_prefix[af], sizeof(buf));
 	strlcat(buf, FORMAT_MATCHES, sizeof(buf));
-	proc = proc_create_data(buf, 0440, net->proc_net, &xt_match_ops,
-				(void *)(unsigned long)af);
+	proc = proc_create_seq_private(buf, 0440, net->proc_net,
+			&xt_match_seq_ops, sizeof(struct nf_mttg_trav),
+			(void *)(unsigned long)af);
 	if (!proc)
 		goto out_remove_tables;
 	if (uid_valid(root_uid) && gid_valid(root_gid))
@@ -1831,8 +1771,9 @@ int xt_proto_init(struct net *net, u_int8_t af)
 
 	strlcpy(buf, xt_prefix[af], sizeof(buf));
 	strlcat(buf, FORMAT_TARGETS, sizeof(buf));
-	proc = proc_create_data(buf, 0440, net->proc_net, &xt_target_ops,
-				(void *)(unsigned long)af);
+	proc = proc_create_seq_private(buf, 0440, net->proc_net,
+			 &xt_target_seq_ops, sizeof(struct nf_mttg_trav),
+			 (void *)(unsigned long)af);
 	if (!proc)
 		goto out_remove_matches;
 	if (uid_valid(root_uid) && gid_valid(root_gid))
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 0cd7356..9b16402 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -57,9 +57,9 @@ static inline struct hashlimit_net *hashlimit_pernet(struct net *net)
 }
 
 /* need to declare this at the top */
-static const struct file_operations dl_file_ops_v2;
-static const struct file_operations dl_file_ops_v1;
-static const struct file_operations dl_file_ops;
+static const struct seq_operations dl_seq_ops_v2;
+static const struct seq_operations dl_seq_ops_v1;
+static const struct seq_operations dl_seq_ops;
 
 /* hash table crap */
 struct dsthash_dst {
@@ -272,7 +272,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg,
 {
 	struct hashlimit_net *hashlimit_net = hashlimit_pernet(net);
 	struct xt_hashlimit_htable *hinfo;
-	const struct file_operations *fops;
+	const struct seq_operations *ops;
 	unsigned int size, i;
 	int ret;
 
@@ -321,19 +321,19 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg,
 
 	switch (revision) {
 	case 1:
-		fops = &dl_file_ops_v1;
+		ops = &dl_seq_ops_v1;
 		break;
 	case 2:
-		fops = &dl_file_ops_v2;
+		ops = &dl_seq_ops_v2;
 		break;
 	default:
-		fops = &dl_file_ops;
+		ops = &dl_seq_ops;
 	}
 
-	hinfo->pde = proc_create_data(name, 0,
+	hinfo->pde = proc_create_seq_data(name, 0,
 		(family == NFPROTO_IPV4) ?
 		hashlimit_net->ipt_hashlimit : hashlimit_net->ip6t_hashlimit,
-		fops, hinfo);
+		ops, hinfo);
 	if (hinfo->pde == NULL) {
 		kfree(hinfo->name);
 		vfree(hinfo);
@@ -1057,7 +1057,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = {
 static void *dl_seq_start(struct seq_file *s, loff_t *pos)
 	__acquires(htable->lock)
 {
-	struct xt_hashlimit_htable *htable = s->private;
+	struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->private));
 	unsigned int *bucket;
 
 	spin_lock_bh(&htable->lock);
@@ -1074,7 +1074,7 @@ static void *dl_seq_start(struct seq_file *s, loff_t *pos)
 
 static void *dl_seq_next(struct seq_file *s, void *v, loff_t *pos)
 {
-	struct xt_hashlimit_htable *htable = s->private;
+	struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->private));
 	unsigned int *bucket = v;
 
 	*pos = ++(*bucket);
@@ -1088,7 +1088,7 @@ static void *dl_seq_next(struct seq_file *s, void *v, loff_t *pos)
 static void dl_seq_stop(struct seq_file *s, void *v)
 	__releases(htable->lock)
 {
-	struct xt_hashlimit_htable *htable = s->private;
+	struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->private));
 	unsigned int *bucket = v;
 
 	if (!IS_ERR(bucket))
@@ -1130,7 +1130,7 @@ static void dl_seq_print(struct dsthash_ent *ent, u_int8_t family,
 static int dl_seq_real_show_v2(struct dsthash_ent *ent, u_int8_t family,
 			       struct seq_file *s)
 {
-	const struct xt_hashlimit_htable *ht = s->private;
+	struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->private));
 
 	spin_lock(&ent->lock);
 	/* recalculate to show accurate numbers */
@@ -1145,7 +1145,7 @@ static int dl_seq_real_show_v2(struct dsthash_ent *ent, u_int8_t family,
 static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family,
 			       struct seq_file *s)
 {
-	const struct xt_hashlimit_htable *ht = s->private;
+	struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->private));
 
 	spin_lock(&ent->lock);
 	/* recalculate to show accurate numbers */
@@ -1160,7 +1160,7 @@ static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family,
 static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
 			    struct seq_file *s)
 {
-	const struct xt_hashlimit_htable *ht = s->private;
+	struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->private));
 
 	spin_lock(&ent->lock);
 	/* recalculate to show accurate numbers */
@@ -1174,7 +1174,7 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
 
 static int dl_seq_show_v2(struct seq_file *s, void *v)
 {
-	struct xt_hashlimit_htable *htable = s->private;
+	struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->private));
 	unsigned int *bucket = (unsigned int *)v;
 	struct dsthash_ent *ent;
 
@@ -1188,7 +1188,7 @@ static int dl_seq_show_v2(struct seq_file *s, void *v)
 
 static int dl_seq_show_v1(struct seq_file *s, void *v)
 {
-	struct xt_hashlimit_htable *htable = s->private;
+	struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->private));
 	unsigned int *bucket = v;
 	struct dsthash_ent *ent;
 
@@ -1202,7 +1202,7 @@ static int dl_seq_show_v1(struct seq_file *s, void *v)
 
 static int dl_seq_show(struct seq_file *s, void *v)
 {
-	struct xt_hashlimit_htable *htable = s->private;
+	struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->private));
 	unsigned int *bucket = v;
 	struct dsthash_ent *ent;
 
@@ -1235,62 +1235,6 @@ static const struct seq_operations dl_seq_ops = {
 	.show  = dl_seq_show
 };
 
-static int dl_proc_open_v2(struct inode *inode, struct file *file)
-{
-	int ret = seq_open(file, &dl_seq_ops_v2);
-
-	if (!ret) {
-		struct seq_file *sf = file->private_data;
-
-		sf->private = PDE_DATA(inode);
-	}
-	return ret;
-}
-
-static int dl_proc_open_v1(struct inode *inode, struct file *file)
-{
-	int ret = seq_open(file, &dl_seq_ops_v1);
-
-	if (!ret) {
-		struct seq_file *sf = file->private_data;
-		sf->private = PDE_DATA(inode);
-	}
-	return ret;
-}
-
-static int dl_proc_open(struct inode *inode, struct file *file)
-{
-	int ret = seq_open(file, &dl_seq_ops);
-
-	if (!ret) {
-		struct seq_file *sf = file->private_data;
-
-		sf->private = PDE_DATA(inode);
-	}
-	return ret;
-}
-
-static const struct file_operations dl_file_ops_v2 = {
-	.open    = dl_proc_open_v2,
-	.read    = seq_read,
-	.llseek  = seq_lseek,
-	.release = seq_release
-};
-
-static const struct file_operations dl_file_ops_v1 = {
-	.open    = dl_proc_open_v1,
-	.read    = seq_read,
-	.llseek  = seq_lseek,
-	.release = seq_release
-};
-
-static const struct file_operations dl_file_ops = {
-	.open    = dl_proc_open,
-	.read    = seq_read,
-	.llseek  = seq_lseek,
-	.release = seq_release
-};
-
 static int __net_init hashlimit_proc_net_init(struct net *net)
 {
 	struct hashlimit_net *hashlimit_net = hashlimit_pernet(net);
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 2e2dd88..1189b84 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -2635,21 +2635,6 @@ static const struct seq_operations netlink_seq_ops = {
 	.stop   = netlink_seq_stop,
 	.show   = netlink_seq_show,
 };
-
-
-static int netlink_seq_open(struct inode *inode, struct file *file)
-{
-	return seq_open_net(inode, file, &netlink_seq_ops,
-				sizeof(struct nl_seq_iter));
-}
-
-static const struct file_operations netlink_seq_fops = {
-	.open		= netlink_seq_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release_net,
-};
-
 #endif
 
 int netlink_register_notifier(struct notifier_block *nb)
@@ -2673,7 +2658,7 @@ static const struct proto_ops netlink_ops = {
 	.socketpair =	sock_no_socketpair,
 	.accept =	sock_no_accept,
 	.getname =	netlink_getname,
-	.poll =		datagram_poll,
+	.poll_mask =	datagram_poll_mask,
 	.ioctl =	netlink_ioctl,
 	.listen =	sock_no_listen,
 	.shutdown =	sock_no_shutdown,
@@ -2694,7 +2679,8 @@ static const struct net_proto_family netlink_family_ops = {
 static int __net_init netlink_net_init(struct net *net)
 {
 #ifdef CONFIG_PROC_FS
-	if (!proc_create("netlink", 0, net->proc_net, &netlink_seq_fops))
+	if (!proc_create_net("netlink", 0, net->proc_net, &netlink_seq_ops,
+			sizeof(struct nl_seq_iter)))
 		return -ENOMEM;
 #endif
 	return 0;
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index 4221d98..b97eb76 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -1338,18 +1338,6 @@ static const struct seq_operations nr_info_seqops = {
 	.stop = nr_info_stop,
 	.show = nr_info_show,
 };
-
-static int nr_info_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &nr_info_seqops);
-}
-
-static const struct file_operations nr_info_fops = {
-	.open = nr_info_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = seq_release,
-};
 #endif	/* CONFIG_PROC_FS */
 
 static const struct net_proto_family nr_family_ops = {
@@ -1367,7 +1355,7 @@ static const struct proto_ops nr_proto_ops = {
 	.socketpair	=	sock_no_socketpair,
 	.accept		=	nr_accept,
 	.getname	=	nr_getname,
-	.poll		=	datagram_poll,
+	.poll_mask	=	datagram_poll_mask,
 	.ioctl		=	nr_ioctl,
 	.listen		=	nr_listen,
 	.shutdown	=	sock_no_shutdown,
@@ -1450,9 +1438,9 @@ static int __init nr_proto_init(void)
 
 	nr_loopback_init();
 
-	proc_create("nr", 0444, init_net.proc_net, &nr_info_fops);
-	proc_create("nr_neigh", 0444, init_net.proc_net, &nr_neigh_fops);
-	proc_create("nr_nodes", 0444, init_net.proc_net, &nr_nodes_fops);
+	proc_create_seq("nr", 0444, init_net.proc_net, &nr_info_seqops);
+	proc_create_seq("nr_neigh", 0444, init_net.proc_net, &nr_neigh_seqops);
+	proc_create_seq("nr_nodes", 0444, init_net.proc_net, &nr_node_seqops);
 out:
 	return rc;
 fail:
diff --git a/net/netrom/nr_route.c b/net/netrom/nr_route.c
index b5a7dcb..6485f59 100644
--- a/net/netrom/nr_route.c
+++ b/net/netrom/nr_route.c
@@ -888,25 +888,13 @@ static int nr_node_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
-static const struct seq_operations nr_node_seqops = {
+const struct seq_operations nr_node_seqops = {
 	.start = nr_node_start,
 	.next = nr_node_next,
 	.stop = nr_node_stop,
 	.show = nr_node_show,
 };
 
-static int nr_node_info_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &nr_node_seqops);
-}
-
-const struct file_operations nr_nodes_fops = {
-	.open = nr_node_info_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = seq_release,
-};
-
 static void *nr_neigh_start(struct seq_file *seq, loff_t *pos)
 {
 	spin_lock_bh(&nr_neigh_list_lock);
@@ -954,25 +942,12 @@ static int nr_neigh_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
-static const struct seq_operations nr_neigh_seqops = {
+const struct seq_operations nr_neigh_seqops = {
 	.start = nr_neigh_start,
 	.next = nr_neigh_next,
 	.stop = nr_neigh_stop,
 	.show = nr_neigh_show,
 };
-
-static int nr_neigh_info_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &nr_neigh_seqops);
-}
-
-const struct file_operations nr_neigh_fops = {
-	.open = nr_neigh_info_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = seq_release,
-};
-
 #endif
 
 /*
diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c
index ea0c0c6..ab5bb14 100644
--- a/net/nfc/llcp_sock.c
+++ b/net/nfc/llcp_sock.c
@@ -548,16 +548,13 @@ static inline __poll_t llcp_accept_poll(struct sock *parent)
 	return 0;
 }
 
-static __poll_t llcp_sock_poll(struct file *file, struct socket *sock,
-				   poll_table *wait)
+static __poll_t llcp_sock_poll_mask(struct socket *sock, __poll_t events)
 {
 	struct sock *sk = sock->sk;
 	__poll_t mask = 0;
 
 	pr_debug("%p\n", sk);
 
-	sock_poll_wait(file, sk_sleep(sk), wait);
-
 	if (sk->sk_state == LLCP_LISTEN)
 		return llcp_accept_poll(sk);
 
@@ -899,7 +896,7 @@ static const struct proto_ops llcp_sock_ops = {
 	.socketpair     = sock_no_socketpair,
 	.accept         = llcp_sock_accept,
 	.getname        = llcp_sock_getname,
-	.poll           = llcp_sock_poll,
+	.poll_mask      = llcp_sock_poll_mask,
 	.ioctl          = sock_no_ioctl,
 	.listen         = llcp_sock_listen,
 	.shutdown       = sock_no_shutdown,
@@ -919,7 +916,7 @@ static const struct proto_ops llcp_rawsock_ops = {
 	.socketpair     = sock_no_socketpair,
 	.accept         = sock_no_accept,
 	.getname        = llcp_sock_getname,
-	.poll           = llcp_sock_poll,
+	.poll_mask      = llcp_sock_poll_mask,
 	.ioctl          = sock_no_ioctl,
 	.listen         = sock_no_listen,
 	.shutdown       = sock_no_shutdown,
diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c
index e2188de..60c3225 100644
--- a/net/nfc/rawsock.c
+++ b/net/nfc/rawsock.c
@@ -284,7 +284,7 @@ static const struct proto_ops rawsock_ops = {
 	.socketpair     = sock_no_socketpair,
 	.accept         = sock_no_accept,
 	.getname        = sock_no_getname,
-	.poll           = datagram_poll,
+	.poll_mask      = datagram_poll_mask,
 	.ioctl          = sock_no_ioctl,
 	.listen         = sock_no_listen,
 	.shutdown       = sock_no_shutdown,
@@ -304,7 +304,7 @@ static const struct proto_ops rawsock_raw_ops = {
 	.socketpair     = sock_no_socketpair,
 	.accept         = sock_no_accept,
 	.getname        = sock_no_getname,
-	.poll           = datagram_poll,
+	.poll_mask      = datagram_poll_mask,
 	.ioctl          = sock_no_ioctl,
 	.listen         = sock_no_listen,
 	.shutdown       = sock_no_shutdown,
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index acb7b86..674390b 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -4110,12 +4110,11 @@ static int packet_ioctl(struct socket *sock, unsigned int cmd,
 	return 0;
 }
 
-static __poll_t packet_poll(struct file *file, struct socket *sock,
-				poll_table *wait)
+static __poll_t packet_poll_mask(struct socket *sock, __poll_t events)
 {
 	struct sock *sk = sock->sk;
 	struct packet_sock *po = pkt_sk(sk);
-	__poll_t mask = datagram_poll(file, sock, wait);
+	__poll_t mask = datagram_poll_mask(sock, events);
 
 	spin_lock_bh(&sk->sk_receive_queue.lock);
 	if (po->rx_ring.pg_vec) {
@@ -4457,7 +4456,7 @@ static const struct proto_ops packet_ops_spkt = {
 	.socketpair =	sock_no_socketpair,
 	.accept =	sock_no_accept,
 	.getname =	packet_getname_spkt,
-	.poll =		datagram_poll,
+	.poll_mask =	datagram_poll_mask,
 	.ioctl =	packet_ioctl,
 	.listen =	sock_no_listen,
 	.shutdown =	sock_no_shutdown,
@@ -4478,7 +4477,7 @@ static const struct proto_ops packet_ops = {
 	.socketpair =	sock_no_socketpair,
 	.accept =	sock_no_accept,
 	.getname =	packet_getname,
-	.poll =		packet_poll,
+	.poll_mask =	packet_poll_mask,
 	.ioctl =	packet_ioctl,
 	.listen =	sock_no_listen,
 	.shutdown =	sock_no_shutdown,
@@ -4556,20 +4555,6 @@ static const struct seq_operations packet_seq_ops = {
 	.stop	= packet_seq_stop,
 	.show	= packet_seq_show,
 };
-
-static int packet_seq_open(struct inode *inode, struct file *file)
-{
-	return seq_open_net(inode, file, &packet_seq_ops,
-			    sizeof(struct seq_net_private));
-}
-
-static const struct file_operations packet_seq_fops = {
-	.open		= packet_seq_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release_net,
-};
-
 #endif
 
 static int __net_init packet_net_init(struct net *net)
@@ -4577,7 +4562,8 @@ static int __net_init packet_net_init(struct net *net)
 	mutex_init(&net->packet.sklist_lock);
 	INIT_HLIST_HEAD(&net->packet.sklist);
 
-	if (!proc_create("packet", 0, net->proc_net, &packet_seq_fops))
+	if (!proc_create_net("packet", 0, net->proc_net, &packet_seq_ops,
+			sizeof(struct seq_net_private)))
 		return -ENOMEM;
 
 	return 0;
diff --git a/net/phonet/pn_dev.c b/net/phonet/pn_dev.c
index 7778751..6cb4f60 100644
--- a/net/phonet/pn_dev.c
+++ b/net/phonet/pn_dev.c
@@ -320,7 +320,8 @@ static int __net_init phonet_init_net(struct net *net)
 {
 	struct phonet_net *pnn = phonet_pernet(net);
 
-	if (!proc_create("phonet", 0, net->proc_net, &pn_sock_seq_fops))
+	if (!proc_create_net("phonet", 0, net->proc_net, &pn_sock_seq_ops,
+			sizeof(struct seq_net_private)))
 		return -ENOMEM;
 
 	INIT_LIST_HEAD(&pnn->pndevs.list);
@@ -351,7 +352,8 @@ int __init phonet_device_init(void)
 	if (err)
 		return err;
 
-	proc_create("pnresource", 0, init_net.proc_net, &pn_res_seq_fops);
+	proc_create_net("pnresource", 0, init_net.proc_net, &pn_res_seq_ops,
+			sizeof(struct seq_net_private));
 	register_netdevice_notifier(&phonet_device_notifier);
 	err = phonet_netlink_register();
 	if (err)
diff --git a/net/phonet/socket.c b/net/phonet/socket.c
index f9b40e6..c295c4e 100644
--- a/net/phonet/socket.c
+++ b/net/phonet/socket.c
@@ -340,15 +340,12 @@ static int pn_socket_getname(struct socket *sock, struct sockaddr *addr,
 	return sizeof(struct sockaddr_pn);
 }
 
-static __poll_t pn_socket_poll(struct file *file, struct socket *sock,
-					poll_table *wait)
+static __poll_t pn_socket_poll_mask(struct socket *sock, __poll_t events)
 {
 	struct sock *sk = sock->sk;
 	struct pep_sock *pn = pep_sk(sk);
 	__poll_t mask = 0;
 
-	poll_wait(file, sk_sleep(sk), wait);
-
 	if (sk->sk_state == TCP_CLOSE)
 		return EPOLLERR;
 	if (!skb_queue_empty(&sk->sk_receive_queue))
@@ -448,7 +445,7 @@ const struct proto_ops phonet_dgram_ops = {
 	.socketpair	= sock_no_socketpair,
 	.accept		= sock_no_accept,
 	.getname	= pn_socket_getname,
-	.poll		= datagram_poll,
+	.poll_mask	= datagram_poll_mask,
 	.ioctl		= pn_socket_ioctl,
 	.listen		= sock_no_listen,
 	.shutdown	= sock_no_shutdown,
@@ -473,7 +470,7 @@ const struct proto_ops phonet_stream_ops = {
 	.socketpair	= sock_no_socketpair,
 	.accept		= pn_socket_accept,
 	.getname	= pn_socket_getname,
-	.poll		= pn_socket_poll,
+	.poll_mask	= pn_socket_poll_mask,
 	.ioctl		= pn_socket_ioctl,
 	.listen		= pn_socket_listen,
 	.shutdown	= sock_no_shutdown,
@@ -620,25 +617,12 @@ static int pn_sock_seq_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
-static const struct seq_operations pn_sock_seq_ops = {
+const struct seq_operations pn_sock_seq_ops = {
 	.start = pn_sock_seq_start,
 	.next = pn_sock_seq_next,
 	.stop = pn_sock_seq_stop,
 	.show = pn_sock_seq_show,
 };
-
-static int pn_sock_open(struct inode *inode, struct file *file)
-{
-	return seq_open_net(inode, file, &pn_sock_seq_ops,
-				sizeof(struct seq_net_private));
-}
-
-const struct file_operations pn_sock_seq_fops = {
-	.open = pn_sock_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = seq_release_net,
-};
 #endif
 
 static struct  {
@@ -802,23 +786,10 @@ static int pn_res_seq_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
-static const struct seq_operations pn_res_seq_ops = {
+const struct seq_operations pn_res_seq_ops = {
 	.start = pn_res_seq_start,
 	.next = pn_res_seq_next,
 	.stop = pn_res_seq_stop,
 	.show = pn_res_seq_show,
 };
-
-static int pn_res_open(struct inode *inode, struct file *file)
-{
-	return seq_open_net(inode, file, &pn_res_seq_ops,
-				sizeof(struct seq_net_private));
-}
-
-const struct file_operations pn_res_seq_fops = {
-	.open = pn_res_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = seq_release_net,
-};
 #endif
diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c
index 2aa07b5..1b5025e 100644
--- a/net/qrtr/qrtr.c
+++ b/net/qrtr/qrtr.c
@@ -1023,7 +1023,7 @@ static const struct proto_ops qrtr_proto_ops = {
 	.recvmsg	= qrtr_recvmsg,
 	.getname	= qrtr_getname,
 	.ioctl		= qrtr_ioctl,
-	.poll		= datagram_poll,
+	.poll_mask	= datagram_poll_mask,
 	.shutdown	= sock_no_shutdown,
 	.setsockopt	= sock_no_setsockopt,
 	.getsockopt	= sock_no_getsockopt,
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index 9ff5e0a..5b73fea 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -1453,18 +1453,6 @@ static const struct seq_operations rose_info_seqops = {
 	.stop = rose_info_stop,
 	.show = rose_info_show,
 };
-
-static int rose_info_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &rose_info_seqops);
-}
-
-static const struct file_operations rose_info_fops = {
-	.open = rose_info_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = seq_release,
-};
 #endif	/* CONFIG_PROC_FS */
 
 static const struct net_proto_family rose_family_ops = {
@@ -1482,7 +1470,7 @@ static const struct proto_ops rose_proto_ops = {
 	.socketpair	=	sock_no_socketpair,
 	.accept		=	rose_accept,
 	.getname	=	rose_getname,
-	.poll		=	datagram_poll,
+	.poll_mask	=	datagram_poll_mask,
 	.ioctl		=	rose_ioctl,
 	.listen		=	rose_listen,
 	.shutdown	=	sock_no_shutdown,
@@ -1567,13 +1555,13 @@ static int __init rose_proto_init(void)
 
 	rose_add_loopback_neigh();
 
-	proc_create("rose", 0444, init_net.proc_net, &rose_info_fops);
-	proc_create("rose_neigh", 0444, init_net.proc_net,
-		    &rose_neigh_fops);
-	proc_create("rose_nodes", 0444, init_net.proc_net,
-		    &rose_nodes_fops);
-	proc_create("rose_routes", 0444, init_net.proc_net,
-		    &rose_routes_fops);
+	proc_create_seq("rose", 0444, init_net.proc_net, &rose_info_seqops);
+	proc_create_seq("rose_neigh", 0444, init_net.proc_net,
+		    &rose_neigh_seqops);
+	proc_create_seq("rose_nodes", 0444, init_net.proc_net,
+		    &rose_node_seqops);
+	proc_create_seq("rose_routes", 0444, init_net.proc_net,
+		    &rose_route_seqops);
 out:
 	return rc;
 fail:
diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c
index 178619d..77e9f85 100644
--- a/net/rose/rose_route.c
+++ b/net/rose/rose_route.c
@@ -1143,25 +1143,13 @@ static int rose_node_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
-static const struct seq_operations rose_node_seqops = {
+const struct seq_operations rose_node_seqops = {
 	.start = rose_node_start,
 	.next = rose_node_next,
 	.stop = rose_node_stop,
 	.show = rose_node_show,
 };
 
-static int rose_nodes_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &rose_node_seqops);
-}
-
-const struct file_operations rose_nodes_fops = {
-	.open = rose_nodes_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = seq_release,
-};
-
 static void *rose_neigh_start(struct seq_file *seq, loff_t *pos)
 	__acquires(rose_neigh_list_lock)
 {
@@ -1226,26 +1214,13 @@ static int rose_neigh_show(struct seq_file *seq, void *v)
 }
 
 
-static const struct seq_operations rose_neigh_seqops = {
+const struct seq_operations rose_neigh_seqops = {
 	.start = rose_neigh_start,
 	.next = rose_neigh_next,
 	.stop = rose_neigh_stop,
 	.show = rose_neigh_show,
 };
 
-static int rose_neigh_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &rose_neigh_seqops);
-}
-
-const struct file_operations rose_neigh_fops = {
-	.open = rose_neigh_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = seq_release,
-};
-
-
 static void *rose_route_start(struct seq_file *seq, loff_t *pos)
 	__acquires(rose_route_list_lock)
 {
@@ -1311,25 +1286,12 @@ static int rose_route_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
-static const struct seq_operations rose_route_seqops = {
+struct seq_operations rose_route_seqops = {
 	.start = rose_route_start,
 	.next = rose_route_next,
 	.stop = rose_route_stop,
 	.show = rose_route_show,
 };
-
-static int rose_route_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &rose_route_seqops);
-}
-
-const struct file_operations rose_routes_fops = {
-	.open = rose_route_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = seq_release,
-};
-
 #endif /* CONFIG_PROC_FS */
 
 /*
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index 2b46304..3b1ac93 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -734,15 +734,11 @@ static int rxrpc_getsockopt(struct socket *sock, int level, int optname,
 /*
  * permit an RxRPC socket to be polled
  */
-static __poll_t rxrpc_poll(struct file *file, struct socket *sock,
-			       poll_table *wait)
+static __poll_t rxrpc_poll_mask(struct socket *sock, __poll_t events)
 {
 	struct sock *sk = sock->sk;
 	struct rxrpc_sock *rx = rxrpc_sk(sk);
-	__poll_t mask;
-
-	sock_poll_wait(file, sk_sleep(sk), wait);
-	mask = 0;
+	__poll_t mask = 0;
 
 	/* the socket is readable if there are any messages waiting on the Rx
 	 * queue */
@@ -949,7 +945,7 @@ static const struct proto_ops rxrpc_rpc_ops = {
 	.socketpair	= sock_no_socketpair,
 	.accept		= sock_no_accept,
 	.getname	= sock_no_getname,
-	.poll		= rxrpc_poll,
+	.poll_mask	= rxrpc_poll_mask,
 	.ioctl		= sock_no_ioctl,
 	.listen		= rxrpc_listen,
 	.shutdown	= rxrpc_shutdown,
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 19975d2..29923ec 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -1051,8 +1051,8 @@ void __rxrpc_queue_peer_error(struct rxrpc_peer *);
 /*
  * proc.c
  */
-extern const struct file_operations rxrpc_call_seq_fops;
-extern const struct file_operations rxrpc_connection_seq_fops;
+extern const struct seq_operations rxrpc_call_seq_ops;
+extern const struct seq_operations rxrpc_connection_seq_ops;
 
 /*
  * recvmsg.c
diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c
index c7a023f..5d6a773 100644
--- a/net/rxrpc/net_ns.c
+++ b/net/rxrpc/net_ns.c
@@ -97,8 +97,11 @@ static __net_init int rxrpc_init_net(struct net *net)
 	if (!rxnet->proc_net)
 		goto err_proc;
 
-	proc_create("calls", 0444, rxnet->proc_net, &rxrpc_call_seq_fops);
-	proc_create("conns", 0444, rxnet->proc_net, &rxrpc_connection_seq_fops);
+	proc_create_net("calls", 0444, rxnet->proc_net, &rxrpc_call_seq_ops,
+			sizeof(struct seq_net_private));
+	proc_create_net("conns", 0444, rxnet->proc_net,
+			&rxrpc_connection_seq_ops,
+			sizeof(struct seq_net_private));
 	return 0;
 
 err_proc:
diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c
index 7e45db0..d9fca8c 100644
--- a/net/rxrpc/proc.c
+++ b/net/rxrpc/proc.c
@@ -115,26 +115,13 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
-static const struct seq_operations rxrpc_call_seq_ops = {
+const struct seq_operations rxrpc_call_seq_ops = {
 	.start  = rxrpc_call_seq_start,
 	.next   = rxrpc_call_seq_next,
 	.stop   = rxrpc_call_seq_stop,
 	.show   = rxrpc_call_seq_show,
 };
 
-static int rxrpc_call_seq_open(struct inode *inode, struct file *file)
-{
-	return seq_open_net(inode, file, &rxrpc_call_seq_ops,
-			    sizeof(struct seq_net_private));
-}
-
-const struct file_operations rxrpc_call_seq_fops = {
-	.open		= rxrpc_call_seq_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
 /*
  * generate a list of extant virtual connections in /proc/net/rxrpc_conns
  */
@@ -207,23 +194,9 @@ print:
 	return 0;
 }
 
-static const struct seq_operations rxrpc_connection_seq_ops = {
+const struct seq_operations rxrpc_connection_seq_ops = {
 	.start  = rxrpc_connection_seq_start,
 	.next   = rxrpc_connection_seq_next,
 	.stop   = rxrpc_connection_seq_stop,
 	.show   = rxrpc_connection_seq_show,
 };
-
-
-static int rxrpc_connection_seq_open(struct inode *inode, struct file *file)
-{
-	return seq_open_net(inode, file, &rxrpc_connection_seq_ops,
-			    sizeof(struct seq_net_private));
-}
-
-const struct file_operations rxrpc_connection_seq_fops = {
-	.open		= rxrpc_connection_seq_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 106dae7e..54eca68 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -2092,23 +2092,11 @@ static int psched_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
-static int psched_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, psched_show, NULL);
-}
-
-static const struct file_operations psched_fops = {
-	.open = psched_open,
-	.read  = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
-
 static int __net_init psched_net_init(struct net *net)
 {
 	struct proc_dir_entry *e;
 
-	e = proc_create("psched", 0, net->proc_net, &psched_fops);
+	e = proc_create_single("psched", 0, net->proc_net, psched_show);
 	if (e == NULL)
 		return -ENOMEM;
 
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 0cd2e76..7339918 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -1010,7 +1010,7 @@ static const struct proto_ops inet6_seqpacket_ops = {
 	.socketpair	   = sock_no_socketpair,
 	.accept		   = inet_accept,
 	.getname	   = sctp_getname,
-	.poll		   = sctp_poll,
+	.poll_mask	   = sctp_poll_mask,
 	.ioctl		   = inet6_ioctl,
 	.listen		   = sctp_inet_listen,
 	.shutdown	   = inet_shutdown,
diff --git a/net/sctp/objcnt.c b/net/sctp/objcnt.c
index fd2684a..a6179b2 100644
--- a/net/sctp/objcnt.c
+++ b/net/sctp/objcnt.c
@@ -108,25 +108,13 @@ static const struct seq_operations sctp_objcnt_seq_ops = {
 	.show  = sctp_objcnt_seq_show,
 };
 
-static int sctp_objcnt_seq_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &sctp_objcnt_seq_ops);
-}
-
-static const struct file_operations sctp_objcnt_ops = {
-	.open	 = sctp_objcnt_seq_open,
-	.read	 = seq_read,
-	.llseek	 = seq_lseek,
-	.release = seq_release,
-};
-
 /* Initialize the objcount in the proc filesystem.  */
 void sctp_dbg_objcnt_init(struct net *net)
 {
 	struct proc_dir_entry *ent;
 
-	ent = proc_create("sctp_dbg_objcnt", 0,
-			  net->sctp.proc_net_sctp, &sctp_objcnt_ops);
+	ent = proc_create_seq("sctp_dbg_objcnt", 0,
+			  net->sctp.proc_net_sctp, &sctp_objcnt_seq_ops);
 	if (!ent)
 		pr_warn("sctp_dbg_objcnt: Unable to create /proc entry.\n");
 }
diff --git a/net/sctp/proc.c b/net/sctp/proc.c
index 1d9ccc6..ef5c9a8 100644
--- a/net/sctp/proc.c
+++ b/net/sctp/proc.c
@@ -88,19 +88,6 @@ static int sctp_snmp_seq_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
-/* Initialize the seq file operations for 'snmp' object. */
-static int sctp_snmp_seq_open(struct inode *inode, struct file *file)
-{
-	return single_open_net(inode, file, sctp_snmp_seq_show);
-}
-
-static const struct file_operations sctp_snmp_seq_fops = {
-	.open	 = sctp_snmp_seq_open,
-	.read	 = seq_read,
-	.llseek	 = seq_lseek,
-	.release = single_release_net,
-};
-
 /* Dump local addresses of an association/endpoint. */
 static void sctp_seq_dump_local_addrs(struct seq_file *seq, struct sctp_ep_common *epb)
 {
@@ -225,21 +212,6 @@ static const struct seq_operations sctp_eps_ops = {
 	.show  = sctp_eps_seq_show,
 };
 
-
-/* Initialize the seq file operations for 'eps' object. */
-static int sctp_eps_seq_open(struct inode *inode, struct file *file)
-{
-	return seq_open_net(inode, file, &sctp_eps_ops,
-			    sizeof(struct seq_net_private));
-}
-
-static const struct file_operations sctp_eps_seq_fops = {
-	.open	 = sctp_eps_seq_open,
-	.read	 = seq_read,
-	.llseek	 = seq_lseek,
-	.release = seq_release_net,
-};
-
 struct sctp_ht_iter {
 	struct seq_net_private p;
 	struct rhashtable_iter hti;
@@ -338,20 +310,6 @@ static const struct seq_operations sctp_assoc_ops = {
 	.show  = sctp_assocs_seq_show,
 };
 
-/* Initialize the seq file operations for 'assocs' object. */
-static int sctp_assocs_seq_open(struct inode *inode, struct file *file)
-{
-	return seq_open_net(inode, file, &sctp_assoc_ops,
-			    sizeof(struct sctp_ht_iter));
-}
-
-static const struct file_operations sctp_assocs_seq_fops = {
-	.open	 = sctp_assocs_seq_open,
-	.read	 = seq_read,
-	.llseek	 = seq_lseek,
-	.release = seq_release_net,
-};
-
 static int sctp_remaddr_seq_show(struct seq_file *seq, void *v)
 {
 	struct sctp_association *assoc;
@@ -431,36 +389,23 @@ static const struct seq_operations sctp_remaddr_ops = {
 	.show  = sctp_remaddr_seq_show,
 };
 
-static int sctp_remaddr_seq_open(struct inode *inode, struct file *file)
-{
-	return seq_open_net(inode, file, &sctp_remaddr_ops,
-			    sizeof(struct sctp_ht_iter));
-}
-
-static const struct file_operations sctp_remaddr_seq_fops = {
-	.open = sctp_remaddr_seq_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = seq_release_net,
-};
-
 /* Set up the proc fs entry for the SCTP protocol. */
 int __net_init sctp_proc_init(struct net *net)
 {
 	net->sctp.proc_net_sctp = proc_net_mkdir(net, "sctp", net->proc_net);
 	if (!net->sctp.proc_net_sctp)
 		return -ENOMEM;
-	if (!proc_create("snmp", 0444, net->sctp.proc_net_sctp,
-			 &sctp_snmp_seq_fops))
+	if (!proc_create_net_single("snmp", 0444, net->sctp.proc_net_sctp,
+			 sctp_snmp_seq_show, NULL))
 		goto cleanup;
-	if (!proc_create("eps", 0444, net->sctp.proc_net_sctp,
-			 &sctp_eps_seq_fops))
+	if (!proc_create_net("eps", 0444, net->sctp.proc_net_sctp,
+			&sctp_eps_ops, sizeof(struct seq_net_private)))
 		goto cleanup;
-	if (!proc_create("assocs", 0444, net->sctp.proc_net_sctp,
-			 &sctp_assocs_seq_fops))
+	if (!proc_create_net("assocs", 0444, net->sctp.proc_net_sctp,
+			&sctp_assoc_ops, sizeof(struct sctp_ht_iter)))
 		goto cleanup;
-	if (!proc_create("remaddr", 0444, net->sctp.proc_net_sctp,
-			 &sctp_remaddr_seq_fops))
+	if (!proc_create_net("remaddr", 0444, net->sctp.proc_net_sctp,
+			&sctp_remaddr_ops, sizeof(struct sctp_ht_iter)))
 		goto cleanup;
 	return 0;
 
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 6bf0a99..11d9337 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -1016,7 +1016,7 @@ static const struct proto_ops inet_seqpacket_ops = {
 	.socketpair	   = sock_no_socketpair,
 	.accept		   = inet_accept,
 	.getname	   = inet_getname,	/* Semantics are different.  */
-	.poll		   = sctp_poll,
+	.poll_mask	   = sctp_poll_mask,
 	.ioctl		   = inet_ioctl,
 	.listen		   = sctp_inet_listen,
 	.shutdown	   = inet_shutdown,	/* Looks harmless.  */
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index ae7e7c6..bf74709 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -7722,14 +7722,12 @@ out:
  * here, again, by modeling the current TCP/UDP code.  We don't have
  * a good way to test with it yet.
  */
-__poll_t sctp_poll(struct file *file, struct socket *sock, poll_table *wait)
+__poll_t sctp_poll_mask(struct socket *sock, __poll_t events)
 {
 	struct sock *sk = sock->sk;
 	struct sctp_sock *sp = sctp_sk(sk);
 	__poll_t mask;
 
-	poll_wait(file, sk_sleep(sk), wait);
-
 	sock_rps_record_flow(sk);
 
 	/* A TCP-style listening socket becomes readable when the accept queue
diff --git a/net/socket.c b/net/socket.c
index f10f1d9..2d752e9 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -117,8 +117,10 @@ static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from);
 static int sock_mmap(struct file *file, struct vm_area_struct *vma);
 
 static int sock_close(struct inode *inode, struct file *file);
-static __poll_t sock_poll(struct file *file,
-			      struct poll_table_struct *wait);
+static struct wait_queue_head *sock_get_poll_head(struct file *file,
+		__poll_t events);
+static __poll_t sock_poll_mask(struct file *file, __poll_t);
+static __poll_t sock_poll(struct file *file, struct poll_table_struct *wait);
 static long sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
 #ifdef CONFIG_COMPAT
 static long compat_sock_ioctl(struct file *file,
@@ -141,6 +143,8 @@ static const struct file_operations socket_file_ops = {
 	.llseek =	no_llseek,
 	.read_iter =	sock_read_iter,
 	.write_iter =	sock_write_iter,
+	.get_poll_head = sock_get_poll_head,
+	.poll_mask =	sock_poll_mask,
 	.poll =		sock_poll,
 	.unlocked_ioctl = sock_ioctl,
 #ifdef CONFIG_COMPAT
@@ -1114,27 +1118,48 @@ out_release:
 }
 EXPORT_SYMBOL(sock_create_lite);
 
-/* No kernel lock held - perfect */
-static __poll_t sock_poll(struct file *file, poll_table *wait)
+static struct wait_queue_head *sock_get_poll_head(struct file *file,
+		__poll_t events)
 {
-	__poll_t busy_flag = 0;
-	struct socket *sock;
+	struct socket *sock = file->private_data;
+
+	if (!sock->ops->poll_mask)
+		return NULL;
+	sock_poll_busy_loop(sock, events);
+	return sk_sleep(sock->sk);
+}
+
+static __poll_t sock_poll_mask(struct file *file, __poll_t events)
+{
+	struct socket *sock = file->private_data;
 
 	/*
-	 *      We can't return errors to poll, so it's either yes or no.
+	 * We need to be sure we are in sync with the socket flags modification.
+	 *
+	 * This memory barrier is paired in the wq_has_sleeper.
 	 */
-	sock = file->private_data;
+	smp_mb();
+
+	/* this socket can poll_ll so tell the system call */
+	return sock->ops->poll_mask(sock, events) |
+		(sk_can_busy_loop(sock->sk) ? POLL_BUSY_LOOP : 0);
+}
 
-	if (sk_can_busy_loop(sock->sk)) {
-		/* this socket can poll_ll so tell the system call */
-		busy_flag = POLL_BUSY_LOOP;
+/* No kernel lock held - perfect */
+static __poll_t sock_poll(struct file *file, poll_table *wait)
+{
+	struct socket *sock = file->private_data;
+	__poll_t events = poll_requested_events(wait), mask = 0;
 
-		/* once, only if requested by syscall */
-		if (wait && (wait->_key & POLL_BUSY_LOOP))
-			sk_busy_loop(sock->sk, 1);
+	if (sock->ops->poll) {
+		sock_poll_busy_loop(sock, events);
+		mask = sock->ops->poll(file, sock, wait);
+	} else if (sock->ops->poll_mask) {
+		sock_poll_wait(file, sock_get_poll_head(file, events), wait);
+		mask = sock->ops->poll_mask(sock, events);
 	}
 
-	return busy_flag | sock->ops->poll(file, sock, wait);
+	return mask | sock_poll_busy_flag(sock);
 }
 
 static int sock_mmap(struct file *file, struct vm_area_struct *vma)
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index c81ef5e..4fda18d 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -609,22 +609,6 @@ static int __rpc_rmdir(struct inode *dir, struct dentry *dentry)
 	return ret;
 }
 
-int rpc_rmdir(struct dentry *dentry)
-{
-	struct dentry *parent;
-	struct inode *dir;
-	int error;
-
-	parent = dget_parent(dentry);
-	dir = d_inode(parent);
-	inode_lock_nested(dir, I_MUTEX_PARENT);
-	error = __rpc_rmdir(dir, dentry);
-	inode_unlock(dir);
-	dput(parent);
-	return error;
-}
-EXPORT_SYMBOL_GPL(rpc_rmdir);
-
 static int __rpc_unlink(struct inode *dir, struct dentry *dentry)
 {
 	int ret;
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 6be2157..3bb4504 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -692,10 +692,9 @@ static int tipc_getname(struct socket *sock, struct sockaddr *uaddr,
 }
 
 /**
- * tipc_poll - read and possibly block on pollmask
+ * tipc_poll - read pollmask
  * @file: file structure associated with the socket
  * @sock: socket for which to calculate the poll bits
- * @wait: ???
  *
  * Returns pollmask value
  *
@@ -709,15 +708,12 @@ static int tipc_getname(struct socket *sock, struct sockaddr *uaddr,
  * imply that the operation will succeed, merely that it should be performed
  * and will not block.
  */
-static __poll_t tipc_poll(struct file *file, struct socket *sock,
-			      poll_table *wait)
+static __poll_t tipc_poll_mask(struct socket *sock, __poll_t events)
 {
 	struct sock *sk = sock->sk;
 	struct tipc_sock *tsk = tipc_sk(sk);
 	__poll_t revents = 0;
 
-	sock_poll_wait(file, sk_sleep(sk), wait);
-
 	if (sk->sk_shutdown & RCV_SHUTDOWN)
 		revents |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
 	if (sk->sk_shutdown == SHUTDOWN_MASK)
@@ -3028,7 +3024,7 @@ static const struct proto_ops msg_ops = {
 	.socketpair	= tipc_socketpair,
 	.accept		= sock_no_accept,
 	.getname	= tipc_getname,
-	.poll		= tipc_poll,
+	.poll_mask	= tipc_poll_mask,
 	.ioctl		= tipc_ioctl,
 	.listen		= sock_no_listen,
 	.shutdown	= tipc_shutdown,
@@ -3049,7 +3045,7 @@ static const struct proto_ops packet_ops = {
 	.socketpair	= tipc_socketpair,
 	.accept		= tipc_accept,
 	.getname	= tipc_getname,
-	.poll		= tipc_poll,
+	.poll_mask	= tipc_poll_mask,
 	.ioctl		= tipc_ioctl,
 	.listen		= tipc_listen,
 	.shutdown	= tipc_shutdown,
@@ -3070,7 +3066,7 @@ static const struct proto_ops stream_ops = {
 	.socketpair	= tipc_socketpair,
 	.accept		= tipc_accept,
 	.getname	= tipc_getname,
-	.poll		= tipc_poll,
+	.poll_mask	= tipc_poll_mask,
 	.ioctl		= tipc_ioctl,
 	.listen		= tipc_listen,
 	.shutdown	= tipc_shutdown,
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 68bb70a..95b02a7 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -638,9 +638,8 @@ static int unix_stream_connect(struct socket *, struct sockaddr *,
 static int unix_socketpair(struct socket *, struct socket *);
 static int unix_accept(struct socket *, struct socket *, int, bool);
 static int unix_getname(struct socket *, struct sockaddr *, int);
-static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
-static __poll_t unix_dgram_poll(struct file *, struct socket *,
-				    poll_table *);
+static __poll_t unix_poll_mask(struct socket *, __poll_t);
+static __poll_t unix_dgram_poll_mask(struct socket *, __poll_t);
 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
 static int unix_shutdown(struct socket *, int);
 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
@@ -681,7 +680,7 @@ static const struct proto_ops unix_stream_ops = {
 	.socketpair =	unix_socketpair,
 	.accept =	unix_accept,
 	.getname =	unix_getname,
-	.poll =		unix_poll,
+	.poll_mask =	unix_poll_mask,
 	.ioctl =	unix_ioctl,
 	.listen =	unix_listen,
 	.shutdown =	unix_shutdown,
@@ -704,7 +703,7 @@ static const struct proto_ops unix_dgram_ops = {
 	.socketpair =	unix_socketpair,
 	.accept =	sock_no_accept,
 	.getname =	unix_getname,
-	.poll =		unix_dgram_poll,
+	.poll_mask =	unix_dgram_poll_mask,
 	.ioctl =	unix_ioctl,
 	.listen =	sock_no_listen,
 	.shutdown =	unix_shutdown,
@@ -726,7 +725,7 @@ static const struct proto_ops unix_seqpacket_ops = {
 	.socketpair =	unix_socketpair,
 	.accept =	unix_accept,
 	.getname =	unix_getname,
-	.poll =		unix_dgram_poll,
+	.poll_mask =	unix_dgram_poll_mask,
 	.ioctl =	unix_ioctl,
 	.listen =	unix_listen,
 	.shutdown =	unix_shutdown,
@@ -2630,13 +2629,10 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 	return err;
 }
 
-static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
+static __poll_t unix_poll_mask(struct socket *sock, __poll_t events)
 {
 	struct sock *sk = sock->sk;
-	__poll_t mask;
-
-	sock_poll_wait(file, sk_sleep(sk), wait);
-	mask = 0;
+	__poll_t mask = 0;
 
 	/* exceptional events? */
 	if (sk->sk_err)
@@ -2665,15 +2661,11 @@ static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wa
 	return mask;
 }
 
-static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
-				    poll_table *wait)
+static __poll_t unix_dgram_poll_mask(struct socket *sock, __poll_t events)
 {
 	struct sock *sk = sock->sk, *other;
-	unsigned int writable;
-	__poll_t mask;
-
-	sock_poll_wait(file, sk_sleep(sk), wait);
-	mask = 0;
+	int writable;
+	__poll_t mask = 0;
 
 	/* exceptional events? */
 	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
@@ -2699,7 +2691,7 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
 	}
 
 	/* No write status requested, avoid expensive OUT tests. */
-	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
+	if (!(events & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
 		return mask;
 
 	writable = unix_writable(sk);
@@ -2852,20 +2844,6 @@ static const struct seq_operations unix_seq_ops = {
 	.stop   = unix_seq_stop,
 	.show   = unix_seq_show,
 };
-
-static int unix_seq_open(struct inode *inode, struct file *file)
-{
-	return seq_open_net(inode, file, &unix_seq_ops,
-			    sizeof(struct seq_net_private));
-}
-
-static const struct file_operations unix_seq_fops = {
-	.open		= unix_seq_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release_net,
-};
-
 #endif
 
 static const struct net_proto_family unix_family_ops = {
@@ -2884,7 +2862,8 @@ static int __net_init unix_net_init(struct net *net)
 		goto out;
 
 #ifdef CONFIG_PROC_FS
-	if (!proc_create("unix", 0, net->proc_net, &unix_seq_fops)) {
+	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
+			sizeof(struct seq_net_private))) {
 		unix_sysctl_unregister(net);
 		goto out;
 	}
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index c1076c1..bb5d5fa 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -850,18 +850,11 @@ static int vsock_shutdown(struct socket *sock, int mode)
 	return err;
 }
 
-static __poll_t vsock_poll(struct file *file, struct socket *sock,
-			       poll_table *wait)
+static __poll_t vsock_poll_mask(struct socket *sock, __poll_t events)
 {
-	struct sock *sk;
-	__poll_t mask;
-	struct vsock_sock *vsk;
-
-	sk = sock->sk;
-	vsk = vsock_sk(sk);
-
-	poll_wait(file, sk_sleep(sk), wait);
-	mask = 0;
+	struct sock *sk = sock->sk;
+	struct vsock_sock *vsk = vsock_sk(sk);
+	__poll_t mask = 0;
 
 	if (sk->sk_err)
 		/* Signify that there has been an error on this socket. */
@@ -1091,7 +1084,7 @@ static const struct proto_ops vsock_dgram_ops = {
 	.socketpair = sock_no_socketpair,
 	.accept = sock_no_accept,
 	.getname = vsock_getname,
-	.poll = vsock_poll,
+	.poll_mask = vsock_poll_mask,
 	.ioctl = sock_no_ioctl,
 	.listen = sock_no_listen,
 	.shutdown = vsock_shutdown,
@@ -1849,7 +1842,7 @@ static const struct proto_ops vsock_stream_ops = {
 	.socketpair = sock_no_socketpair,
 	.accept = vsock_accept,
 	.getname = vsock_getname,
-	.poll = vsock_poll,
+	.poll_mask = vsock_poll_mask,
 	.ioctl = sock_no_ioctl,
 	.listen = vsock_listen,
 	.shutdown = vsock_shutdown,
diff --git a/net/wireless/wext-proc.c b/net/wireless/wext-proc.c
index b4c4645..cadcf86 100644
--- a/net/wireless/wext-proc.c
+++ b/net/wireless/wext-proc.c
@@ -126,24 +126,11 @@ static const struct seq_operations wireless_seq_ops = {
 	.show  = wireless_dev_seq_show,
 };
 
-static int seq_open_wireless(struct inode *inode, struct file *file)
-{
-	return seq_open_net(inode, file, &wireless_seq_ops,
-			    sizeof(struct seq_net_private));
-}
-
-static const struct file_operations wireless_seq_fops = {
-	.open    = seq_open_wireless,
-	.read    = seq_read,
-	.llseek  = seq_lseek,
-	.release = seq_release_net,
-};
-
 int __net_init wext_proc_init(struct net *net)
 {
 	/* Create /proc/net/wireless entry */
-	if (!proc_create("wireless", 0444, net->proc_net,
-			 &wireless_seq_fops))
+	if (!proc_create_net("wireless", 0444, net->proc_net,
+			&wireless_seq_ops, sizeof(struct seq_net_private)))
 		return -ENOMEM;
 
 	return 0;
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index d49aa79..f93365a 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -1750,7 +1750,7 @@ static const struct proto_ops x25_proto_ops = {
 	.socketpair =	sock_no_socketpair,
 	.accept =	x25_accept,
 	.getname =	x25_getname,
-	.poll =		datagram_poll,
+	.poll_mask =	datagram_poll_mask,
 	.ioctl =	x25_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl = compat_x25_ioctl,
diff --git a/net/x25/x25_proc.c b/net/x25/x25_proc.c
index 64b415e..da52c9d 100644
--- a/net/x25/x25_proc.c
+++ b/net/x25/x25_proc.c
@@ -171,57 +171,21 @@ static const struct seq_operations x25_seq_forward_ops = {
 	.show   = x25_seq_forward_show,
 };
 
-static int x25_seq_socket_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &x25_seq_socket_ops);
-}
-
-static int x25_seq_route_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &x25_seq_route_ops);
-}
-
-static int x25_seq_forward_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &x25_seq_forward_ops);
-}
-
-static const struct file_operations x25_seq_socket_fops = {
-	.open		= x25_seq_socket_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
-static const struct file_operations x25_seq_route_fops = {
-	.open		= x25_seq_route_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
-static const struct file_operations x25_seq_forward_fops = {
-	.open		= x25_seq_forward_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
 int __init x25_proc_init(void)
 {
 	if (!proc_mkdir("x25", init_net.proc_net))
 		return -ENOMEM;
 
-	if (!proc_create("x25/route", 0444, init_net.proc_net,
-			 &x25_seq_route_fops))
+	if (!proc_create_seq("x25/route", 0444, init_net.proc_net,
+			 &x25_seq_route_ops))
 		goto out;
 
-	if (!proc_create("x25/socket", 0444, init_net.proc_net,
-			 &x25_seq_socket_fops))
+	if (!proc_create_seq("x25/socket", 0444, init_net.proc_net,
+			 &x25_seq_socket_ops))
 		goto out;
 
-	if (!proc_create("x25/forward", 0444, init_net.proc_net,
-			 &x25_seq_forward_fops))
+	if (!proc_create_seq("x25/forward", 0444, init_net.proc_net,
+			 &x25_seq_forward_ops))
 		goto out;
 	return 0;
 
diff --git a/net/xfrm/xfrm_proc.c b/net/xfrm/xfrm_proc.c
index ed06903..178318d 100644
--- a/net/xfrm/xfrm_proc.c
+++ b/net/xfrm/xfrm_proc.c
@@ -65,22 +65,10 @@ static int xfrm_statistics_seq_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
-static int xfrm_statistics_seq_open(struct inode *inode, struct file *file)
-{
-	return single_open_net(inode, file, xfrm_statistics_seq_show);
-}
-
-static const struct file_operations xfrm_statistics_seq_fops = {
-	.open	 = xfrm_statistics_seq_open,
-	.read	 = seq_read,
-	.llseek	 = seq_lseek,
-	.release = single_release_net,
-};
-
 int __net_init xfrm_proc_init(struct net *net)
 {
-	if (!proc_create("xfrm_stat", 0444, net->proc_net,
-			 &xfrm_statistics_seq_fops))
+	if (!proc_create_net_single("xfrm_stat", 0444, net->proc_net,
+			 xfrm_statistics_seq_show, NULL))
 		return -ENOMEM;
 	return 0;
 }
diff --git a/scripts/documentation-file-ref-check b/scripts/documentation-file-ref-check
index bc16599..2520bc1 100755
--- a/scripts/documentation-file-ref-check
+++ b/scripts/documentation-file-ref-check
@@ -1,15 +1,116 @@
-#!/bin/sh
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-2.0
+#
 # Treewide grep for references to files under Documentation, and report
 # non-existing files in stderr.
 
-for f in $(git ls-files); do
-	for ref in $(grep -ho "Documentation/[A-Za-z0-9_.,~/*+-]*" "$f"); do
-		# presume trailing . and , are not part of the name
-		ref=${ref%%[.,]}
-
-		# use ls to handle wildcards
-		if ! ls $ref >/dev/null 2>&1; then
-			echo "$f: $ref" >&2
-		fi
-	done
-done
+use warnings;
+use strict;
+use Getopt::Long qw(:config no_auto_abbrev);
+
+my $scriptname = $0;
+$scriptname =~ s,.*/([^/]+/),$1,;
+
+# Parse arguments
+my $help = 0;
+my $fix = 0;
+
+GetOptions(
+	'fix' => \$fix,
+	'h|help|usage' => \$help,
+);
+
+if ($help != 0) {
+    print "$scriptname [--help] [--fix-rst]\n";
+    exit -1;
+}
+
+# Step 1: find broken references
+print "Finding broken references. This may take a while...  " if ($fix);
+
+my %broken_ref;
+
+open IN, "git grep 'Documentation/'|"
+     or die "Failed to run git grep";
+while (<IN>) {
+	next if (!m/^([^:]+):(.*)/);
+
+	my $f = $1;
+	my $ln = $2;
+
+	# Makefiles contain nasty expressions to parse docs
+	next if ($f =~ m/Makefile/);
+	# Skip this script
+	next if ($f eq $scriptname);
+
+	if ($ln =~ m,\b(\S*)(Documentation/[A-Za-z0-9\_\.\,\~/\*+-]*),) {
+		my $prefix = $1;
+		my $ref = $2;
+		my $base = $2;
+
+		$ref =~ s/[\,\.]+$//;
+
+		my $fulref = "$prefix$ref";
+
+		$fulref =~ s/^(\<file|ref)://;
+		$fulref =~ s/^[\'\`]+//;
+		$fulref =~ s,^\$\(.*\)/,,;
+		$base =~ s,.*/,,;
+
+		# Remove URL false-positives
+		next if ($fulref =~ m/^http/);
+
+		# Check if exists, evaluating wildcards
+		next if (grep -e, glob("$ref $fulref"));
+
+		if ($fix) {
+			if (!($ref =~ m/(devicetree|scripts|Kconfig|Kbuild)/)) {
+				$broken_ref{$ref}++;
+			}
+		} else {
+			print STDERR "$f: $fulref\n";
+		}
+	}
+}
+
+exit 0 if (!$fix);
+
+# Step 2: Seek for file name alternatives
+print "Auto-fixing broken references. Please double-check the results\n";
+
+foreach my $ref (keys %broken_ref) {
+	my $new =$ref;
+
+	# get just the basename
+	$new =~ s,.*/,,;
+
+	# Seek for the same name on another place, as it may have been moved
+	my $f="";
+
+	$f = qx(find . -iname $new) if ($new);
+
+	# usual reason for breakage: file renamed to .rst
+	if (!$f) {
+		$new =~ s/\.txt$/.rst/;
+		$f=qx(find . -iname $new) if ($new);
+	}
+
+	my @find = split /\s+/, $f;
+
+	if (!$f) {
+		print STDERR "ERROR: Didn't find a replacement for $ref\n";
+	} elsif (scalar(@find) > 1) {
+		print STDERR "WARNING: Won't auto-replace, as found multiple files close to $ref:\n";
+		foreach my $j (@find) {
+			$j =~ s,^./,,;
+			print STDERR "    $j\n";
+		}
+	} else {
+		$f = $find[0];
+		$f =~ s,^./,,;
+		print "INFO: Replacing $ref to $f\n";
+		foreach my $j (qx(git grep -l $ref)) {
+			qx(sed "s\@$ref\@$f\@g" -i $j);
+		}
+	}
+}
diff --git a/scripts/spdxcheck.py b/scripts/spdxcheck.py
new file mode 100755
index 0000000..7deaef2
--- /dev/null
+++ b/scripts/spdxcheck.py
@@ -0,0 +1,284 @@
+#!/usr/bin/env python
+# SPDX-License-Identifier: GPL-2.0
+# Copyright Thomas Gleixner <tglx@linutronix.de>
+
+from argparse import ArgumentParser
+from ply import lex, yacc
+import traceback
+import sys
+import git
+import re
+import os
+
+class ParserException(Exception):
+    def __init__(self, tok, txt):
+        self.tok = tok
+        self.txt = txt
+
+class SPDXException(Exception):
+    def __init__(self, el, txt):
+        self.el = el
+        self.txt = txt
+
+class SPDXdata(object):
+    def __init__(self):
+        self.license_files = 0
+        self.exception_files = 0
+        self.licenses = [ ]
+        self.exceptions = { }
+
+# Read the spdx data from the LICENSES directory
+def read_spdxdata(repo):
+
+    # The subdirectories of LICENSES in the kernel source
+    license_dirs = [ "preferred", "other", "exceptions" ]
+    lictree = repo.heads.master.commit.tree['LICENSES']
+
+    spdx = SPDXdata()
+
+    for d in license_dirs:
+        for el in lictree[d].traverse():
+            if not os.path.isfile(el.path):
+                continue
+
+            exception = None
+            for l in open(el.path).readlines():
+                if l.startswith('Valid-License-Identifier:'):
+                    lid = l.split(':')[1].strip().upper()
+                    if lid in spdx.licenses:
+                        raise SPDXException(el, 'Duplicate License Identifier: %s' %lid)
+                    else:
+                        spdx.licenses.append(lid)
+
+                elif l.startswith('SPDX-Exception-Identifier:'):
+                    exception = l.split(':')[1].strip().upper()
+                    spdx.exceptions[exception] = []
+
+                elif l.startswith('SPDX-Licenses:'):
+                    for lic in l.split(':')[1].upper().strip().replace(' ', '').replace('\t', '').split(','):
+                        if not lic in spdx.licenses:
+                            raise SPDXException(None, 'Exception %s missing license %s' %(ex, lic))
+                        spdx.exceptions[exception].append(lic)
+
+                elif l.startswith("License-Text:"):
+                    if exception:
+                        if not len(spdx.exceptions[exception]):
+                            raise SPDXException(el, 'Exception %s is missing SPDX-Licenses' %excid)
+                        spdx.exception_files += 1
+                    else:
+                        spdx.license_files += 1
+                    break
+    return spdx
+
+class id_parser(object):
+
+    reserved = [ 'AND', 'OR', 'WITH' ]
+    tokens = [ 'LPAR', 'RPAR', 'ID', 'EXC' ] + reserved
+
+    precedence = ( ('nonassoc', 'AND', 'OR'), )
+
+    t_ignore = ' \t'
+
+    def __init__(self, spdx):
+        self.spdx = spdx
+        self.lasttok = None
+        self.lastid = None
+        self.lexer = lex.lex(module = self, reflags = re.UNICODE)
+        # Initialize the parser. No debug file and no parser rules stored on disk
+        # The rules are small enough to be generated on the fly
+        self.parser = yacc.yacc(module = self, write_tables = False, debug = False)
+        self.lines_checked = 0
+        self.checked = 0
+        self.spdx_valid = 0
+        self.spdx_errors = 0
+        self.curline = 0
+        self.deepest = 0
+
+    # Validate License and Exception IDs
+    def validate(self, tok):
+        id = tok.value.upper()
+        if tok.type == 'ID':
+            if not id in self.spdx.licenses:
+                raise ParserException(tok, 'Invalid License ID')
+            self.lastid = id
+        elif tok.type == 'EXC':
+            if not self.spdx.exceptions.has_key(id):
+                raise ParserException(tok, 'Invalid Exception ID')
+            if self.lastid not in self.spdx.exceptions[id]:
+                raise ParserException(tok, 'Exception not valid for license %s' %self.lastid)
+            self.lastid = None
+        elif tok.type != 'WITH':
+            self.lastid = None
+
+    # Lexer functions
+    def t_RPAR(self, tok):
+        r'\)'
+        self.lasttok = tok.type
+        return tok
+
+    def t_LPAR(self, tok):
+        r'\('
+        self.lasttok = tok.type
+        return tok
+
+    def t_ID(self, tok):
+        r'[A-Za-z.0-9\-+]+'
+
+        if self.lasttok == 'EXC':
+            print(tok)
+            raise ParserException(tok, 'Missing parentheses')
+
+        tok.value = tok.value.strip()
+        val = tok.value.upper()
+
+        if val in self.reserved:
+            tok.type = val
+        elif self.lasttok == 'WITH':
+            tok.type = 'EXC'
+
+        self.lasttok = tok.type
+        self.validate(tok)
+        return tok
+
+    def t_error(self, tok):
+        raise ParserException(tok, 'Invalid token')
+
+    def p_expr(self, p):
+        '''expr : ID
+                | ID WITH EXC
+                | expr AND expr
+                | expr OR expr
+                | LPAR expr RPAR'''
+        pass
+
+    def p_error(self, p):
+        if not p:
+            raise ParserException(None, 'Unfinished license expression')
+        else:
+            raise ParserException(p, 'Syntax error')
+
+    def parse(self, expr):
+        self.lasttok = None
+        self.lastid = None
+        self.parser.parse(expr, lexer = self.lexer)
+
+    def parse_lines(self, fd, maxlines, fname):
+        self.checked += 1
+        self.curline = 0
+        try:
+            for line in fd:
+                self.curline += 1
+                if self.curline > maxlines:
+                    break
+                self.lines_checked += 1
+                if line.find("SPDX-License-Identifier:") < 0:
+                    continue
+                expr = line.split(':')[1].replace('*/', '').strip()
+                self.parse(expr)
+                self.spdx_valid += 1
+                #
+                # Should we check for more SPDX ids in the same file and
+                # complain if there are any?
+                #
+                break
+
+        except ParserException as pe:
+            if pe.tok:
+                col = line.find(expr) + pe.tok.lexpos
+                tok = pe.tok.value
+                sys.stdout.write('%s: %d:%d %s: %s\n' %(fname, self.curline, col, pe.txt, tok))
+            else:
+                sys.stdout.write('%s: %d:0 %s\n' %(fname, self.curline, col, pe.txt))
+            self.spdx_errors += 1
+
+def scan_git_tree(tree):
+    for el in tree.traverse():
+        # Exclude stuff which would make pointless noise
+        # FIXME: Put this somewhere more sensible
+        if el.path.startswith("LICENSES"):
+            continue
+        if el.path.find("license-rules.rst") >= 0:
+            continue
+        if el.path == 'scripts/checkpatch.pl':
+            continue
+        if not os.path.isfile(el.path):
+            continue
+        parser.parse_lines(open(el.path), args.maxlines, el.path)
+
+def scan_git_subtree(tree, path):
+    for p in path.strip('/').split('/'):
+        tree = tree[p]
+    scan_git_tree(tree)
+
+if __name__ == '__main__':
+
+    ap = ArgumentParser(description='SPDX expression checker')
+    ap.add_argument('path', nargs='*', help='Check path or file. If not given full git tree scan. For stdin use "-"')
+    ap.add_argument('-m', '--maxlines', type=int, default=15,
+                    help='Maximum number of lines to scan in a file. Default 15')
+    ap.add_argument('-v', '--verbose', action='store_true', help='Verbose statistics output')
+    args = ap.parse_args()
+
+    # Sanity check path arguments
+    if '-' in args.path and len(args.path) > 1:
+        sys.stderr.write('stdin input "-" must be the only path argument\n')
+        sys.exit(1)
+
+    try:
+        # Use git to get the valid license expressions
+        repo = git.Repo(os.getcwd())
+        assert not repo.bare
+
+        # Initialize SPDX data
+        spdx = read_spdxdata(repo)
+
+        # Initilize the parser
+        parser = id_parser(spdx)
+
+    except SPDXException as se:
+        if se.el:
+            sys.stderr.write('%s: %s\n' %(se.el.path, se.txt))
+        else:
+            sys.stderr.write('%s\n' %se.txt)
+        sys.exit(1)
+
+    except Exception as ex:
+        sys.stderr.write('FAIL: %s\n' %ex)
+        sys.stderr.write('%s\n' %traceback.format_exc())
+        sys.exit(1)
+
+    try:
+        if len(args.path) and args.path[0] == '-':
+            parser.parse_lines(sys.stdin, args.maxlines, '-')
+        else:
+            if args.path:
+                for p in args.path:
+                    if os.path.isfile(p):
+                        parser.parse_lines(open(p), args.maxlines, p)
+                    elif os.path.isdir(p):
+                        scan_git_subtree(repo.head.reference.commit.tree, p)
+                    else:
+                        sys.stderr.write('path %s does not exist\n' %p)
+                        sys.exit(1)
+            else:
+                # Full git tree scan
+                scan_git_tree(repo.head.commit.tree)
+
+            if args.verbose:
+                sys.stderr.write('\n')
+                sys.stderr.write('License files:     %12d\n' %spdx.license_files)
+                sys.stderr.write('Exception files:   %12d\n' %spdx.exception_files)
+                sys.stderr.write('License IDs        %12d\n' %len(spdx.licenses))
+                sys.stderr.write('Exception IDs      %12d\n' %len(spdx.exceptions))
+                sys.stderr.write('\n')
+                sys.stderr.write('Files checked:     %12d\n' %parser.checked)
+                sys.stderr.write('Lines checked:     %12d\n' %parser.lines_checked)
+                sys.stderr.write('Files with SPDX:   %12d\n' %parser.spdx_valid)
+                sys.stderr.write('Files with errors: %12d\n' %parser.spdx_errors)
+
+            sys.exit(0)
+
+    except Exception as ex:
+        sys.stderr.write('FAIL: %s\n' %ex)
+        sys.stderr.write('%s\n' %traceback.format_exc())
+        sys.exit(1)
diff --git a/security/commoncap.c b/security/commoncap.c
index 1ce701f..f4c33ab 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -919,6 +919,8 @@ int cap_bprm_set_creds(struct linux_binprm *bprm)
 int cap_inode_setxattr(struct dentry *dentry, const char *name,
 		       const void *value, size_t size, int flags)
 {
+	struct user_namespace *user_ns = dentry->d_sb->s_user_ns;
+
 	/* Ignore non-security xattrs */
 	if (strncmp(name, XATTR_SECURITY_PREFIX,
 			sizeof(XATTR_SECURITY_PREFIX) - 1) != 0)
@@ -931,7 +933,7 @@ int cap_inode_setxattr(struct dentry *dentry, const char *name,
 	if (strcmp(name, XATTR_NAME_CAPS) == 0)
 		return 0;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!ns_capable(user_ns, CAP_SYS_ADMIN))
 		return -EPERM;
 	return 0;
 }
@@ -949,6 +951,8 @@ int cap_inode_setxattr(struct dentry *dentry, const char *name,
  */
 int cap_inode_removexattr(struct dentry *dentry, const char *name)
 {
+	struct user_namespace *user_ns = dentry->d_sb->s_user_ns;
+
 	/* Ignore non-security xattrs */
 	if (strncmp(name, XATTR_SECURITY_PREFIX,
 			sizeof(XATTR_SECURITY_PREFIX) - 1) != 0)
@@ -964,7 +968,7 @@ int cap_inode_removexattr(struct dentry *dentry, const char *name)
 		return 0;
 	}
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!ns_capable(user_ns, CAP_SYS_ADMIN))
 		return -EPERM;
 	return 0;
 }
diff --git a/security/integrity/evm/evm_crypto.c b/security/integrity/evm/evm_crypto.c
index a46fba3..facf9cd 100644
--- a/security/integrity/evm/evm_crypto.c
+++ b/security/integrity/evm/evm_crypto.c
@@ -200,7 +200,8 @@ static int evm_calc_hmac_or_hash(struct dentry *dentry,
 	int size;
 	bool ima_present = false;
 
-	if (!(inode->i_opflags & IOP_XATTR))
+	if (!(inode->i_opflags & IOP_XATTR) ||
+	    inode->i_sb->s_user_ns != &init_user_ns)
 		return -EOPNOTSUPP;
 
 	desc = init_desc(type);
diff --git a/security/keys/proc.c b/security/keys/proc.c
index fbc4af5..5af2934 100644
--- a/security/keys/proc.c
+++ b/security/keys/proc.c
@@ -18,7 +18,6 @@
 #include <asm/errno.h>
 #include "internal.h"
 
-static int proc_keys_open(struct inode *inode, struct file *file);
 static void *proc_keys_start(struct seq_file *p, loff_t *_pos);
 static void *proc_keys_next(struct seq_file *p, void *v, loff_t *_pos);
 static void proc_keys_stop(struct seq_file *p, void *v);
@@ -31,14 +30,6 @@ static const struct seq_operations proc_keys_ops = {
 	.show	= proc_keys_show,
 };
 
-static const struct file_operations proc_keys_fops = {
-	.open		= proc_keys_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
-static int proc_key_users_open(struct inode *inode, struct file *file);
 static void *proc_key_users_start(struct seq_file *p, loff_t *_pos);
 static void *proc_key_users_next(struct seq_file *p, void *v, loff_t *_pos);
 static void proc_key_users_stop(struct seq_file *p, void *v);
@@ -51,13 +42,6 @@ static const struct seq_operations proc_key_users_ops = {
 	.show	= proc_key_users_show,
 };
 
-static const struct file_operations proc_key_users_fops = {
-	.open		= proc_key_users_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
 /*
  * Declare the /proc files.
  */
@@ -65,11 +49,11 @@ static int __init key_proc_init(void)
 {
 	struct proc_dir_entry *p;
 
-	p = proc_create("keys", 0, NULL, &proc_keys_fops);
+	p = proc_create_seq("keys", 0, NULL, &proc_keys_ops);
 	if (!p)
 		panic("Cannot create /proc/keys\n");
 
-	p = proc_create("key-users", 0, NULL, &proc_key_users_fops);
+	p = proc_create_seq("key-users", 0, NULL, &proc_key_users_ops);
 	if (!p)
 		panic("Cannot create /proc/key-users\n");
 
@@ -96,11 +80,6 @@ static struct rb_node *key_serial_next(struct seq_file *p, struct rb_node *n)
 	return n;
 }
 
-static int proc_keys_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &proc_keys_ops);
-}
-
 static struct key *find_ge_key(struct seq_file *p, key_serial_t id)
 {
 	struct user_namespace *user_ns = seq_user_ns(p);
@@ -293,15 +272,6 @@ static struct rb_node *key_user_first(struct user_namespace *user_ns, struct rb_
 	return __key_user_next(user_ns, n);
 }
 
-/*
- * Implement "/proc/key-users" to provides a list of the key users and their
- * quotas.
- */
-static int proc_key_users_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &proc_key_users_ops);
-}
-
 static void *proc_key_users_start(struct seq_file *p, loff_t *_pos)
 	__acquires(key_user_lock)
 {
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 179dd20..99c4675 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -274,11 +274,10 @@ static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dent
  * Try reloading inode security labels that have been marked as invalid.  The
  * @may_sleep parameter indicates when sleeping and thus reloading labels is
  * allowed; when set to false, returns -ECHILD when the label is
- * invalid.  The @opt_dentry parameter should be set to a dentry of the inode;
- * when no dentry is available, set it to NULL instead.
+ * invalid.  The @dentry parameter should be set to a dentry of the inode.
  */
 static int __inode_security_revalidate(struct inode *inode,
-				       struct dentry *opt_dentry,
+				       struct dentry *dentry,
 				       bool may_sleep)
 {
 	struct inode_security_struct *isec = inode->i_security;
@@ -295,7 +294,7 @@ static int __inode_security_revalidate(struct inode *inode,
 		 * @opt_dentry is NULL and no dentry for this inode can be
 		 * found; in that case, continue using the old label.
 		 */
-		inode_doinit_with_dentry(inode, opt_dentry);
+		inode_doinit_with_dentry(inode, dentry);
 	}
 	return 0;
 }
diff --git a/tools/memory-model/Documentation/cheatsheet.txt b/tools/memory-model/Documentation/cheatsheet.txt
index 956b1ae4..33ba98d 100644
--- a/tools/memory-model/Documentation/cheatsheet.txt
+++ b/tools/memory-model/Documentation/cheatsheet.txt
@@ -1,6 +1,6 @@
                                   Prior Operation     Subsequent Operation
                                   ---------------  ---------------------------
-                               C  Self  R  W  RWM  Self  R  W  DR  DW  RMW  SV
+                               C  Self  R  W  RMW  Self  R  W  DR  DW  RMW  SV
                               --  ----  -  -  ---  ----  -  -  --  --  ---  --
 
 Store, e.g., WRITE_ONCE()            Y                                       Y
@@ -14,7 +14,7 @@ smp_wmb()                                  Y    W           Y       Y    W
 smp_mb() & synchronize_rcu()  CP        Y  Y    Y        Y  Y   Y   Y    Y
 Successful full non-void RMW  CP     Y  Y  Y    Y     Y  Y  Y   Y   Y    Y   Y
 smp_mb__before_atomic()       CP        Y  Y    Y        a  a   a   a    Y
-smp_mb__after_atomic()        CP        a  a    Y        Y  Y   Y   Y
+smp_mb__after_atomic()        CP        a  a    Y        Y  Y   Y   Y    Y
 
 
 Key:	C:	Ordering is cumulative
@@ -26,4 +26,5 @@ Key:	C:	Ordering is cumulative
 	DR:	Dependent read (address dependency)
 	DW:	Dependent write (address, data, or control dependency)
 	RMW:	Atomic read-modify-write operation
-	SV	Same-variable access
+	SELF:	Orders self, as opposed to accesses before and/or after
+	SV:	Orders later accesses to the same variable
diff --git a/tools/memory-model/Documentation/explanation.txt b/tools/memory-model/Documentation/explanation.txt
index a727c82..1b09f31 100644
--- a/tools/memory-model/Documentation/explanation.txt
+++ b/tools/memory-model/Documentation/explanation.txt
@@ -27,7 +27,7 @@ Explanation of the Linux-Kernel Memory Consistency Model
   19. AND THEN THERE WAS ALPHA
   20. THE HAPPENS-BEFORE RELATION: hb
   21. THE PROPAGATES-BEFORE RELATION: pb
-  22. RCU RELATIONS: link, gp-link, rscs-link, and rcu-path
+  22. RCU RELATIONS: rcu-link, gp, rscs, rcu-fence, and rb
   23. ODDS AND ENDS
 
 
@@ -1451,8 +1451,8 @@ they execute means that it cannot have cycles.  This requirement is
 the content of the LKMM's "propagation" axiom.
 
 
-RCU RELATIONS: link, gp-link, rscs-link, and rcu-path
------------------------------------------------------
+RCU RELATIONS: rcu-link, gp, rscs, rcu-fence, and rb
+----------------------------------------------------
 
 RCU (Read-Copy-Update) is a powerful synchronization mechanism.  It
 rests on two concepts: grace periods and read-side critical sections.
@@ -1509,8 +1509,8 @@ y, which occurs before the end of the critical section, did not
 propagate to P1 before the end of the grace period, violating the
 Guarantee.
 
-In the kernel's implementations of RCU, the business about stores
-propagating to every CPU is realized by placing strong fences at
+In the kernel's implementations of RCU, the requirements for stores
+to propagate to every CPU are fulfilled by placing strong fences at
 suitable places in the RCU-related code.  Thus, if a critical section
 starts before a grace period does then the critical section's CPU will
 execute an smp_mb() fence after the end of the critical section and
@@ -1523,72 +1523,124 @@ executes.
 What exactly do we mean by saying that a critical section "starts
 before" or "ends after" a grace period?  Some aspects of the meaning
 are pretty obvious, as in the example above, but the details aren't
-entirely clear.  The LKMM formalizes this notion by means of a
-relation with the unfortunately generic name "link".  It is a very
-general relation; among other things, X ->link Z includes cases where
-X happens-before or is equal to some event Y which is equal to or
-comes before Z in the coherence order.  Taking Y = Z, this says that
-X ->rfe Z implies X ->link Z, and taking Y = X, it says that X ->fr Z
-and X ->co Z each imply X ->link Z.
-
-The formal definition of the link relation is more than a little
+entirely clear.  The LKMM formalizes this notion by means of the
+rcu-link relation.  rcu-link encompasses a very general notion of
+"before": Among other things, X ->rcu-link Z includes cases where X
+happens-before or is equal to some event Y which is equal to or comes
+before Z in the coherence order.  When Y = Z this says that X ->rfe Z
+implies X ->rcu-link Z.  In addition, when Y = X it says that X ->fr Z
+and X ->co Z each imply X ->rcu-link Z.
+
+The formal definition of the rcu-link relation is more than a little
 obscure, and we won't give it here.  It is closely related to the pb
 relation, and the details don't matter unless you want to comb through
 a somewhat lengthy formal proof.  Pretty much all you need to know
-about link is the information in the preceding paragraph.
-
-The LKMM goes on to define the gp-link and rscs-link relations.  They
-bring grace periods and read-side critical sections into the picture,
-in the following way:
-
-	E ->gp-link F means there is a synchronize_rcu() fence event S
-	and an event X such that E ->po S, either S ->po X or S = X,
-	and X ->link F.  In other words, E and F are connected by a
-	grace period followed by an instance of link.
-
-	E ->rscs-link F means there is a critical section delimited by
-	an rcu_read_lock() fence L and an rcu_read_unlock() fence U,
-	and an event X such that E ->po U, either L ->po X or L = X,
-	and X ->link F.  Roughly speaking, this says that some event
-	in the same critical section as E is connected by link to F.
-
-If we think of the link relation as standing for an extended "before",
-then E ->gp-link F says that E executes before a grace period which
-ends before F executes.  (In fact it says more than this, because it
-includes cases where E executes before a grace period and some store
-propagates to F's CPU before F executes and doesn't propagate to some
-other CPU until after the grace period ends.)  Similarly,
-E ->rscs-link F says that E is part of (or before the start of) a
-critical section which starts before F executes.
+about rcu-link is the information in the preceding paragraph.
+
+The LKMM also defines the gp and rscs relations.  They bring grace
+periods and read-side critical sections into the picture, in the
+following way:
+
+	E ->gp F means there is a synchronize_rcu() fence event S such
+	that E ->po S and either S ->po F or S = F.  In simple terms,
+	there is a grace period po-between E and F.
+
+	E ->rscs F means there is a critical section delimited by an
+	rcu_read_lock() fence L and an rcu_read_unlock() fence U, such
+	that E ->po U and either L ->po F or L = F.  You can think of
+	this as saying that E and F are in the same critical section
+	(in fact, it also allows E to be po-before the start of the
+	critical section and F to be po-after the end).
+
+If we think of the rcu-link relation as standing for an extended
+"before", then X ->gp Y ->rcu-link Z says that X executes before a
+grace period which ends before Z executes.  (In fact it covers more
+than this, because it also includes cases where X executes before a
+grace period and some store propagates to Z's CPU before Z executes
+but doesn't propagate to some other CPU until after the grace period
+ends.)  Similarly, X ->rscs Y ->rcu-link Z says that X is part of (or
+before the start of) a critical section which starts before Z
+executes.
+
+The LKMM goes on to define the rcu-fence relation as a sequence of gp
+and rscs links separated by rcu-link links, in which the number of gp
+links is >= the number of rscs links.  For example:
+
+	X ->gp Y ->rcu-link Z ->rscs T ->rcu-link U ->gp V
+
+would imply that X ->rcu-fence V, because this sequence contains two
+gp links and only one rscs link.  (It also implies that X ->rcu-fence T
+and Z ->rcu-fence V.)  On the other hand:
+
+	X ->rscs Y ->rcu-link Z ->rscs T ->rcu-link U ->gp V
+
+does not imply X ->rcu-fence V, because the sequence contains only
+one gp link but two rscs links.
+
+The rcu-fence relation is important because the Grace Period Guarantee
+means that rcu-fence acts kind of like a strong fence.  In particular,
+if W is a write and we have W ->rcu-fence Z, the Guarantee says that W
+will propagate to every CPU before Z executes.
+
+To prove this in full generality requires some intellectual effort.
+We'll consider just a very simple case:
+
+	W ->gp X ->rcu-link Y ->rscs Z.
+
+This formula means that there is a grace period G and a critical
+section C such that:
+
+	1. W is po-before G;
+
+	2. X is equal to or po-after G;
+
+	3. X comes "before" Y in some sense;
+
+	4. Y is po-before the end of C;
+
+	5. Z is equal to or po-after the start of C.
+
+From 2 - 4 we deduce that the grace period G ends before the critical
+section C.  Then the second part of the Grace Period Guarantee says
+not only that G starts before C does, but also that W (which executes
+on G's CPU before G starts) must propagate to every CPU before C
+starts.  In particular, W propagates to every CPU before Z executes
+(or finishes executing, in the case where Z is equal to the
+rcu_read_lock() fence event which starts C.)  This sort of reasoning
+can be expanded to handle all the situations covered by rcu-fence.
+
+Finally, the LKMM defines the RCU-before (rb) relation in terms of
+rcu-fence.  This is done in essentially the same way as the pb
+relation was defined in terms of strong-fence.  We will omit the
+details; the end result is that E ->rb F implies E must execute before
+F, just as E ->pb F does (and for much the same reasons).
 
 Putting this all together, the LKMM expresses the Grace Period
-Guarantee by requiring that there are no cycles consisting of gp-link
-and rscs-link connections in which the number of gp-link instances is
->= the number of rscs-link instances.  It does this by defining the
-rcu-path relation to link events E and F whenever it is possible to
-pass from E to F by a sequence of gp-link and rscs-link connections
-with at least as many of the former as the latter.  The LKMM's "rcu"
-axiom then says that there are no events E such that E ->rcu-path E.
-
-Justifying this axiom takes some intellectual effort, but it is in
-fact a valid formalization of the Grace Period Guarantee.  We won't
-attempt to go through the detailed argument, but the following
-analysis gives a taste of what is involved.  Suppose we have a
-violation of the first part of the Guarantee: A critical section
-starts before a grace period, and some store propagates to the
-critical section's CPU before the end of the critical section but
-doesn't propagate to some other CPU until after the end of the grace
-period.
+Guarantee by requiring that the rb relation does not contain a cycle.
+Equivalently, this "rcu" axiom requires that there are no events E and
+F with E ->rcu-link F ->rcu-fence E.  Or to put it a third way, the
+axiom requires that there are no cycles consisting of gp and rscs
+alternating with rcu-link, where the number of gp links is >= the
+number of rscs links.
+
+Justifying the axiom isn't easy, but it is in fact a valid
+formalization of the Grace Period Guarantee.  We won't attempt to go
+through the detailed argument, but the following analysis gives a
+taste of what is involved.  Suppose we have a violation of the first
+part of the Guarantee: A critical section starts before a grace
+period, and some store propagates to the critical section's CPU before
+the end of the critical section but doesn't propagate to some other
+CPU until after the end of the grace period.
 
 Putting symbols to these ideas, let L and U be the rcu_read_lock() and
 rcu_read_unlock() fence events delimiting the critical section in
 question, and let S be the synchronize_rcu() fence event for the grace
 period.  Saying that the critical section starts before S means there
 are events E and F where E is po-after L (which marks the start of the
-critical section), E is "before" F in the sense of the link relation,
-and F is po-before the grace period S:
+critical section), E is "before" F in the sense of the rcu-link
+relation, and F is po-before the grace period S:
 
-	L ->po E ->link F ->po S.
+	L ->po E ->rcu-link F ->po S.
 
 Let W be the store mentioned above, let Z come before the end of the
 critical section and witness that W propagates to the critical
@@ -1600,16 +1652,19 @@ some event X which is po-after S.  Symbolically, this amounts to:
 
 The fr link from Y to W indicates that W has not propagated to Y's CPU
 at the time that Y executes.  From this, it can be shown (see the
-discussion of the link relation earlier) that X and Z are connected by
-link, yielding:
+discussion of the rcu-link relation earlier) that X and Z are related
+by rcu-link, yielding:
+
+	S ->po X ->rcu-link Z ->po U.
+
+The formulas say that S is po-between F and X, hence F ->gp X.  They
+also say that Z comes before the end of the critical section and E
+comes after its start, hence Z ->rscs E.  From all this we obtain:
 
-	S ->po X ->link Z ->po U.
+	F ->gp X ->rcu-link Z ->rscs E ->rcu-link F,
 
-These formulas say that S is po-between F and X, hence F ->gp-link Z
-via X.  They also say that Z comes before the end of the critical
-section and E comes after its start, hence Z ->rscs-link F via E.  But
-now we have a forbidden cycle: F ->gp-link Z ->rscs-link F.  Thus the
-"rcu" axiom rules out this violation of the Grace Period Guarantee.
+a forbidden cycle.  Thus the "rcu" axiom rules out this violation of
+the Grace Period Guarantee.
 
 For something a little more down-to-earth, let's see how the axiom
 works out in practice.  Consider the RCU code example from above, this
@@ -1635,18 +1690,18 @@ time with statement labels added to the memory access instructions:
 	}
 
 
-If r2 = 0 at the end then P0's store at X overwrites the value
-that P1's load at Z reads from, so we have Z ->fre X and thus
-Z ->link X.  In addition, there is a synchronize_rcu() between Y and
-Z, so therefore we have Y ->gp-link X.
+If r2 = 0 at the end then P0's store at X overwrites the value that
+P1's load at Z reads from, so we have Z ->fre X and thus Z ->rcu-link X.
+In addition, there is a synchronize_rcu() between Y and Z, so therefore
+we have Y ->gp Z.
 
 If r1 = 1 at the end then P1's load at Y reads from P0's store at W,
-so we have W ->link Y.  In addition, W and X are in the same critical
-section, so therefore we have X ->rscs-link Y.
+so we have W ->rcu-link Y.  In addition, W and X are in the same critical
+section, so therefore we have X ->rscs W.
 
-This gives us a cycle, Y ->gp-link X ->rscs-link Y, with one gp-link
-and one rscs-link, violating the "rcu" axiom.  Hence the outcome is
-not allowed by the LKMM, as we would expect.
+Then X ->rscs W ->rcu-link Y ->gp Z ->rcu-link X is a forbidden cycle,
+violating the "rcu" axiom.  Hence the outcome is not allowed by the
+LKMM, as we would expect.
 
 For contrast, let's see what can happen in a more complicated example:
 
@@ -1682,15 +1737,11 @@ For contrast, let's see what can happen in a more complicated example:
 	}
 
 If r0 = r1 = r2 = 1 at the end, then similar reasoning to before shows
-that W ->rscs-link Y via X, Y ->gp-link U via Z, and U ->rscs-link W
-via V.  And just as before, this gives a cycle:
-
-	W ->rscs-link Y ->gp-link U ->rscs-link W.
-
-However, this cycle has fewer gp-link instances than rscs-link
-instances, and consequently the outcome is not forbidden by the LKMM.
-The following instruction timing diagram shows how it might actually
-occur:
+that W ->rscs X ->rcu-link Y ->gp Z ->rcu-link U ->rscs V ->rcu-link W.
+However this cycle is not forbidden, because the sequence of relations
+contains fewer instances of gp (one) than of rscs (two).  Consequently
+the outcome is allowed by the LKMM.  The following instruction timing
+diagram shows how it might actually occur:
 
 P0			P1			P2
 --------------------	--------------------	--------------------
diff --git a/tools/memory-model/Documentation/references.txt b/tools/memory-model/Documentation/references.txt
index ba2e34c..b177f3e 100644
--- a/tools/memory-model/Documentation/references.txt
+++ b/tools/memory-model/Documentation/references.txt
@@ -63,15 +63,22 @@ o	Shaked Flur, Susmit Sarkar, Christopher Pulte, Kyndylan Nienhuis,
 	Principles of Programming Languages (POPL 2017). ACM, New York,
 	NY, USA, 429–442.
 
+o	Christopher Pulte, Shaked Flur, Will Deacon, Jon French,
+	Susmit Sarkar, and Peter Sewell. 2018. "Simplifying ARM concurrency:
+	multicopy-atomic axiomatic and operational models for ARMv8". In
+	Proceedings of the ACM on Programming Languages, Volume 2, Issue
+	POPL, Article No. 19. ACM, New York, NY, USA.
+
 
 Linux-kernel memory model
 =========================
 
-o	Andrea Parri, Alan Stern, Luc Maranget, Paul E. McKenney,
-	and Jade Alglave.  2017. "A formal model of
-	Linux-kernel memory ordering - companion webpage".
-	http://moscova.inria.fr/∼maranget/cats7/linux/. (2017). [Online;
-	accessed 30-January-2017].
+o	Jade Alglave, Luc Maranget, Paul E. McKenney, Andrea Parri, and
+	Alan Stern.  2018. "Frightening small children and disconcerting
+	grown-ups: Concurrency in the Linux kernel". In Proceedings of
+	the 23rd International Conference on Architectural Support for
+	Programming Languages and Operating Systems (ASPLOS 2018). ACM,
+	New York, NY, USA, 405-418.  Webpage: http://diy.inria.fr/linux/.
 
 o	Jade Alglave, Luc Maranget, Paul E. McKenney, Andrea Parri, and
 	Alan Stern.  2017.  "A formal kernel memory-ordering model (part 1)"
diff --git a/tools/memory-model/README b/tools/memory-model/README
index 0b3a5f3..734f7fe 100644
--- a/tools/memory-model/README
+++ b/tools/memory-model/README
@@ -20,7 +20,7 @@ that litmus test to be exercised within the Linux kernel.
 REQUIREMENTS
 ============
 
-Version 7.48 of the "herd7" and "klitmus7" tools must be downloaded
+Version 7.49 of the "herd7" and "klitmus7" tools must be downloaded
 separately:
 
   https://github.com/herd/herdtools7
diff --git a/tools/memory-model/linux-kernel.bell b/tools/memory-model/linux-kernel.bell
index 432c7cf..64f5740 100644
--- a/tools/memory-model/linux-kernel.bell
+++ b/tools/memory-model/linux-kernel.bell
@@ -5,10 +5,10 @@
  * Copyright (C) 2017 Alan Stern <stern@rowland.harvard.edu>,
  *                    Andrea Parri <parri.andrea@gmail.com>
  *
- * An earlier version of this file appears in the companion webpage for
+ * An earlier version of this file appeared in the companion webpage for
  * "Frightening small children and disconcerting grown-ups: Concurrency
  * in the Linux kernel" by Alglave, Maranget, McKenney, Parri, and Stern,
- * which is to appear in ASPLOS 2018.
+ * which appeared in ASPLOS 2018.
  *)
 
 "Linux-kernel memory consistency model"
diff --git a/tools/memory-model/linux-kernel.cat b/tools/memory-model/linux-kernel.cat
index df97db0..59b5cbe 100644
--- a/tools/memory-model/linux-kernel.cat
+++ b/tools/memory-model/linux-kernel.cat
@@ -5,10 +5,10 @@
  * Copyright (C) 2017 Alan Stern <stern@rowland.harvard.edu>,
  *                    Andrea Parri <parri.andrea@gmail.com>
  *
- * An earlier version of this file appears in the companion webpage for
+ * An earlier version of this file appeared in the companion webpage for
  * "Frightening small children and disconcerting grown-ups: Concurrency
  * in the Linux kernel" by Alglave, Maranget, McKenney, Parri, and Stern,
- * which is to appear in ASPLOS 2018.
+ * which appeared in ASPLOS 2018.
  *)
 
 "Linux-kernel memory consistency model"
@@ -100,22 +100,29 @@ let rscs = po ; crit^-1 ; po?
  * one but two non-rf relations, but only in conjunction with an RCU
  * read-side critical section.
  *)
-let link = hb* ; pb* ; prop
+let rcu-link = hb* ; pb* ; prop
 
-(* Chains that affect the RCU grace-period guarantee *)
-let gp-link = gp ; link
-let rscs-link = rscs ; link
+(*
+ * Any sequence containing at least as many grace periods as RCU read-side
+ * critical sections (joined by rcu-link) acts as a generalized strong fence.
+ *)
+let rec rcu-fence = gp |
+	(gp ; rcu-link ; rscs) |
+	(rscs ; rcu-link ; gp) |
+	(gp ; rcu-link ; rcu-fence ; rcu-link ; rscs) |
+	(rscs ; rcu-link ; rcu-fence ; rcu-link ; gp) |
+	(rcu-fence ; rcu-link ; rcu-fence)
+
+(* rb orders instructions just as pb does *)
+let rb = prop ; rcu-fence ; hb* ; pb*
+
+irreflexive rb as rcu
 
 (*
- * A cycle containing at least as many grace periods as RCU read-side
- * critical sections is forbidden.
+ * The happens-before, propagation, and rcu constraints are all
+ * expressions of temporal ordering.  They could be replaced by
+ * a single constraint on an "executes-before" relation, xb:
+ *
+ * let xb = hb | pb | rb
+ * acyclic xb as executes-before
  *)
-let rec rcu-path =
-	gp-link |
-	(gp-link ; rscs-link) |
-	(rscs-link ; gp-link) |
-	(rcu-path ; rcu-path) |
-	(gp-link ; rcu-path ; rscs-link) |
-	(rscs-link ; rcu-path ; gp-link)
-
-irreflexive rcu-path as rcu
diff --git a/tools/memory-model/linux-kernel.def b/tools/memory-model/linux-kernel.def
index 397e4e6..6fa3eb2 100644
--- a/tools/memory-model/linux-kernel.def
+++ b/tools/memory-model/linux-kernel.def
@@ -1,9 +1,9 @@
 // SPDX-License-Identifier: GPL-2.0+
 //
-// An earlier version of this file appears in the companion webpage for
+// An earlier version of this file appeared in the companion webpage for
 // "Frightening small children and disconcerting grown-ups: Concurrency
 // in the Linux kernel" by Alglave, Maranget, McKenney, Parri, and Stern,
-// which is to appear in ASPLOS 2018.
+// which appeared in ASPLOS 2018.
 
 // ONCE
 READ_ONCE(X) __load{once}(X)
@@ -14,14 +14,15 @@ smp_store_release(X,V) { __store{release}(*X,V); }
 smp_load_acquire(X) __load{acquire}(*X)
 rcu_assign_pointer(X,V) { __store{release}(X,V); }
 rcu_dereference(X) __load{once}(X)
+smp_store_mb(X,V) { __store{once}(X,V); __fence{mb}; }
 
 // Fences
-smp_mb() { __fence{mb} ; }
-smp_rmb() { __fence{rmb} ; }
-smp_wmb() { __fence{wmb} ; }
-smp_mb__before_atomic() { __fence{before-atomic} ; }
-smp_mb__after_atomic() { __fence{after-atomic} ; }
-smp_mb__after_spinlock() { __fence{after-spinlock} ; }
+smp_mb() { __fence{mb}; }
+smp_rmb() { __fence{rmb}; }
+smp_wmb() { __fence{wmb}; }
+smp_mb__before_atomic() { __fence{before-atomic}; }
+smp_mb__after_atomic() { __fence{after-atomic}; }
+smp_mb__after_spinlock() { __fence{after-spinlock}; }
 
 // Exchange
 xchg(X,V)  __xchg{mb}(X,V)
@@ -34,26 +35,27 @@ cmpxchg_acquire(X,V,W) __cmpxchg{acquire}(X,V,W)
 cmpxchg_release(X,V,W) __cmpxchg{release}(X,V,W)
 
 // Spinlocks
-spin_lock(X) { __lock(X) ; }
-spin_unlock(X) { __unlock(X) ; }
+spin_lock(X) { __lock(X); }
+spin_unlock(X) { __unlock(X); }
 spin_trylock(X) __trylock(X)
+spin_is_locked(X) __islocked(X)
 
 // RCU
 rcu_read_lock() { __fence{rcu-lock}; }
-rcu_read_unlock() { __fence{rcu-unlock};}
+rcu_read_unlock() { __fence{rcu-unlock}; }
 synchronize_rcu() { __fence{sync-rcu}; }
 synchronize_rcu_expedited() { __fence{sync-rcu}; }
 
 // Atomic
 atomic_read(X) READ_ONCE(*X)
-atomic_set(X,V) { WRITE_ONCE(*X,V) ; }
+atomic_set(X,V) { WRITE_ONCE(*X,V); }
 atomic_read_acquire(X) smp_load_acquire(X)
 atomic_set_release(X,V) { smp_store_release(X,V); }
 
-atomic_add(V,X) { __atomic_op(X,+,V) ; }
-atomic_sub(V,X) { __atomic_op(X,-,V) ; }
-atomic_inc(X)   { __atomic_op(X,+,1) ; }
-atomic_dec(X)   { __atomic_op(X,-,1) ; }
+atomic_add(V,X) { __atomic_op(X,+,V); }
+atomic_sub(V,X) { __atomic_op(X,-,V); }
+atomic_inc(X)   { __atomic_op(X,+,1); }
+atomic_dec(X)   { __atomic_op(X,-,1); }
 
 atomic_add_return(V,X) __atomic_op_return{mb}(X,+,V)
 atomic_add_return_relaxed(V,X) __atomic_op_return{once}(X,+,V)
diff --git a/tools/memory-model/litmus-tests/.gitignore b/tools/memory-model/litmus-tests/.gitignore
new file mode 100644
index 0000000..6e2ddc54
--- /dev/null
+++ b/tools/memory-model/litmus-tests/.gitignore
@@ -0,0 +1 @@
+*.litmus.out
diff --git a/tools/memory-model/litmus-tests/IRIW+mbonceonces+OnceOnce.litmus b/tools/memory-model/litmus-tests/IRIW+mbonceonces+OnceOnce.litmus
index 50d5db9..98a3716 100644
--- a/tools/memory-model/litmus-tests/IRIW+mbonceonces+OnceOnce.litmus
+++ b/tools/memory-model/litmus-tests/IRIW+mbonceonces+OnceOnce.litmus
@@ -7,7 +7,7 @@ C IRIW+mbonceonces+OnceOnce
  * between each pairs of reads.  In other words, is smp_mb() sufficient to
  * cause two different reading processes to agree on the order of a pair
  * of writes, where each write is to a different variable by a different
- * process?
+ * process?  This litmus test exercises LKMM's "propagation" rule.
  *)
 
 {}
diff --git a/tools/memory-model/litmus-tests/MP+polockmbonce+poacquiresilsil.litmus b/tools/memory-model/litmus-tests/MP+polockmbonce+poacquiresilsil.litmus
new file mode 100644
index 0000000..50f4d62
--- /dev/null
+++ b/tools/memory-model/litmus-tests/MP+polockmbonce+poacquiresilsil.litmus
@@ -0,0 +1,35 @@
+C MP+polockmbonce+poacquiresilsil
+
+(*
+ * Result: Never
+ *
+ * Do spinlocks combined with smp_mb__after_spinlock() provide order
+ * to outside observers using spin_is_locked() to sense the lock-held
+ * state, ordered by acquire?  Note that when the first spin_is_locked()
+ * returns false and the second true, we know that the smp_load_acquire()
+ * executed before the lock was acquired (loosely speaking).
+ *)
+
+{
+}
+
+P0(spinlock_t *lo, int *x)
+{
+	spin_lock(lo);
+	smp_mb__after_spinlock();
+	WRITE_ONCE(*x, 1);
+	spin_unlock(lo);
+}
+
+P1(spinlock_t *lo, int *x)
+{
+	int r1;
+	int r2;
+	int r3;
+
+	r1 = smp_load_acquire(x);
+	r2 = spin_is_locked(lo);
+	r3 = spin_is_locked(lo);
+}
+
+exists (1:r1=1 /\ 1:r2=0 /\ 1:r3=1)
diff --git a/tools/memory-model/litmus-tests/MP+polockonce+poacquiresilsil.litmus b/tools/memory-model/litmus-tests/MP+polockonce+poacquiresilsil.litmus
new file mode 100644
index 0000000..abf81e7
--- /dev/null
+++ b/tools/memory-model/litmus-tests/MP+polockonce+poacquiresilsil.litmus
@@ -0,0 +1,34 @@
+C MP+polockonce+poacquiresilsil
+
+(*
+ * Result: Sometimes
+ *
+ * Do spinlocks provide order to outside observers using spin_is_locked()
+ * to sense the lock-held state, ordered by acquire?  Note that when the
+ * first spin_is_locked() returns false and the second true, we know that
+ * the smp_load_acquire() executed before the lock was acquired (loosely
+ * speaking).
+ *)
+
+{
+}
+
+P0(spinlock_t *lo, int *x)
+{
+	spin_lock(lo);
+	WRITE_ONCE(*x, 1);
+	spin_unlock(lo);
+}
+
+P1(spinlock_t *lo, int *x)
+{
+	int r1;
+	int r2;
+	int r3;
+
+	r1 = smp_load_acquire(x);
+	r2 = spin_is_locked(lo);
+	r3 = spin_is_locked(lo);
+}
+
+exists (1:r1=1 /\ 1:r2=0 /\ 1:r3=1)
diff --git a/tools/memory-model/litmus-tests/README b/tools/memory-model/litmus-tests/README
index 04096fb..17eb9a8 100644
--- a/tools/memory-model/litmus-tests/README
+++ b/tools/memory-model/litmus-tests/README
@@ -23,7 +23,8 @@ IRIW+mbonceonces+OnceOnce.litmus
 	between each pairs of reads.  In other words, is smp_mb()
 	sufficient to cause two different reading processes to agree on
 	the order of a pair of writes, where each write is to a different
-	variable by a different process?
+	variable by a different process?  This litmus test is forbidden
+	by LKMM's propagation rule.
 
 IRIW+poonceonces+OnceOnce.litmus
 	Test of independent reads from independent writes with nothing
@@ -63,6 +64,16 @@ LB+poonceonces.litmus
 MP+onceassign+derefonce.litmus
 	As below, but with rcu_assign_pointer() and an rcu_dereference().
 
+MP+polockmbonce+poacquiresilsil.litmus
+	Protect the access with a lock and an smp_mb__after_spinlock()
+	in one process, and use an acquire load followed by a pair of
+	spin_is_locked() calls in the other process.
+
+MP+polockonce+poacquiresilsil.litmus
+	Protect the access with a lock in one process, and use an
+	acquire load followed by a pair of spin_is_locked() calls
+	in the other process.
+
 MP+polocks.litmus
 	As below, but with the second access of the writer process
 	and the first access of reader process protected by a lock.
@@ -109,8 +120,10 @@ S+wmbonceonce+poacquireonce.litmus
 
 WRC+poonceonces+Once.litmus
 WRC+pooncerelease+rmbonceonce+Once.litmus
-	These two are members of an extension of the MP litmus-test class
-	in which the first write is moved to a separate process.
+	These two are members of an extension of the MP litmus-test
+	class in which the first write is moved to a separate process.
+	The second is forbidden because smp_store_release() is
+	A-cumulative in LKMM.
 
 Z6.0+pooncelock+pooncelock+pombonce.litmus
 	Is the ordering provided by a spin_unlock() and a subsequent
diff --git a/tools/memory-model/litmus-tests/WRC+pooncerelease+rmbonceonce+Once.litmus b/tools/memory-model/litmus-tests/WRC+pooncerelease+rmbonceonce+Once.litmus
index 97fcbff..ad3448b 100644
--- a/tools/memory-model/litmus-tests/WRC+pooncerelease+rmbonceonce+Once.litmus
+++ b/tools/memory-model/litmus-tests/WRC+pooncerelease+rmbonceonce+Once.litmus
@@ -5,7 +5,9 @@ C WRC+pooncerelease+rmbonceonce+Once
  *
  * This litmus test is an extension of the message-passing pattern, where
  * the first write is moved to a separate process.  Because it features
- * a release and a read memory barrier, it should be forbidden.
+ * a release and a read memory barrier, it should be forbidden.  More
+ * specifically, this litmus test is forbidden because smp_store_release()
+ * is A-cumulative in LKMM.
  *)
 
 {}
diff --git a/tools/memory-model/lock.cat b/tools/memory-model/lock.cat
index ba4a4ec..305ded1 100644
--- a/tools/memory-model/lock.cat
+++ b/tools/memory-model/lock.cat
@@ -4,46 +4,72 @@
  * Copyright (C) 2017 Alan Stern <stern@rowland.harvard.edu>
  *)
 
-(* Generate coherence orders and handle lock operations *)
+(*
+ * Generate coherence orders and handle lock operations
+ *
+ * Warning: spin_is_locked() crashes herd7 versions strictly before 7.48.
+ * spin_is_locked() is functional from herd7 version 7.49.
+ *)
 
 include "cross.cat"
 
-(* From lock reads to their partner lock writes *)
-let lk-rmw = ([LKR] ; po-loc ; [LKW]) \ (po ; po)
-let rmw = rmw | lk-rmw
-
 (*
- * A paired LKR must always see an unlocked value; spin_lock() calls nested
- * inside a critical section (for the same lock) always deadlock.
+ * The lock-related events generated by herd are as follows:
+ *
+ * LKR		Lock-Read: the read part of a spin_lock() or successful
+ *			spin_trylock() read-modify-write event pair
+ * LKW		Lock-Write: the write part of a spin_lock() or successful
+ *			spin_trylock() RMW event pair
+ * UL		Unlock: a spin_unlock() event
+ * LF		Lock-Fail: a failed spin_trylock() event
+ * RL		Read-Locked: a spin_is_locked() event which returns True
+ * RU		Read-Unlocked: a spin_is_locked() event which returns False
+ *
+ * LKR and LKW events always come paired, like all RMW event sequences.
+ *
+ * LKR, LF, RL, and RU are read events; LKR has Acquire ordering.
+ * LKW and UL are write events; UL has Release ordering.
+ * LKW, LF, RL, and RU have no ordering properties.
  *)
-empty ([LKW] ; po-loc ; [domain(lk-rmw)]) \ (po-loc ; [UL] ; po-loc)
-	as lock-nest
 
-(* The litmus test is invalid if an LKW event is not part of an RMW pair *)
-flag ~empty LKW \ range(lk-rmw) as unpaired-LKW
+(* Backward compatibility *)
+let RL = try RL with emptyset
+let RU = try RU with emptyset
 
-(* This will be allowed if we implement spin_is_locked() *)
-flag ~empty LKR \ domain(lk-rmw) as unpaired-LKR
+(* Treat RL as a kind of LF: a read with no ordering properties *)
+let LF = LF | RL
 
-(* There should be no R or W accesses to spinlocks *)
-let ALL-LOCKS = LKR | LKW | UL | LF
+(* There should be no ordinary R or W accesses to spinlocks *)
+let ALL-LOCKS = LKR | LKW | UL | LF | RU
 flag ~empty [M \ IW] ; loc ; [ALL-LOCKS] as mixed-lock-accesses
 
+(* Link Lock-Reads to their RMW-partner Lock-Writes *)
+let lk-rmw = ([LKR] ; po-loc ; [LKW]) \ (po ; po)
+let rmw = rmw | lk-rmw
+
+(* The litmus test is invalid if an LKR/LKW event is not part of an RMW pair *)
+flag ~empty LKW \ range(lk-rmw) as unpaired-LKW
+flag ~empty LKR \ domain(lk-rmw) as unpaired-LKR
+
+(*
+ * An LKR must always see an unlocked value; spin_lock() calls nested
+ * inside a critical section (for the same lock) always deadlock.
+ *)
+empty ([LKW] ; po-loc ; [LKR]) \ (po-loc ; [UL] ; po-loc) as lock-nest
+
 (* The final value of a spinlock should not be tested *)
 flag ~empty [FW] ; loc ; [ALL-LOCKS] as lock-final
 
-
 (*
  * Put lock operations in their appropriate classes, but leave UL out of W
  * until after the co relation has been generated.
  *)
-let R = R | LKR | LF
+let R = R | LKR | LF | RU
 let W = W | LKW
 
 let Release = Release | UL
 let Acquire = Acquire | LKR
 
-
 (* Match LKW events to their corresponding UL events *)
 let critical = ([LKW] ; po-loc ; [UL]) \ (po-loc ; [LKW | UL] ; po-loc)
 
@@ -53,27 +79,48 @@ flag ~empty UL \ range(critical) as unmatched-unlock
 let UNMATCHED-LKW = LKW \ domain(critical)
 empty ([UNMATCHED-LKW] ; loc ; [UNMATCHED-LKW]) \ id as unmatched-locks
 
-
 (* rfi for LF events: link each LKW to the LF events in its critical section *)
 let rfi-lf = ([LKW] ; po-loc ; [LF]) \ ([LKW] ; po-loc ; [UL] ; po-loc)
 
 (* rfe for LF events *)
 let all-possible-rfe-lf =
-  (*
-   * Given an LF event r, compute the possible rfe edges for that event
-   * (all those starting from LKW events in other threads),
-   * and then convert that relation to a set of single-edge relations.
-   *)
-  let possible-rfe-lf r =
-    let pair-to-relation p = p ++ 0
-    in map pair-to-relation ((LKW * {r}) & loc & ext)
-  (* Do this for each LF event r that isn't in rfi-lf *)
-  in map possible-rfe-lf (LF \ range(rfi-lf))
+	(*
+	 * Given an LF event r, compute the possible rfe edges for that event
+	 * (all those starting from LKW events in other threads),
+	 * and then convert that relation to a set of single-edge relations.
+	 *)
+	let possible-rfe-lf r =
+		let pair-to-relation p = p ++ 0
+		in map pair-to-relation ((LKW * {r}) & loc & ext)
+	(* Do this for each LF event r that isn't in rfi-lf *)
+	in map possible-rfe-lf (LF \ range(rfi-lf))
 
 (* Generate all rf relations for LF events *)
 with rfe-lf from cross(all-possible-rfe-lf)
-let rf = rf | rfi-lf | rfe-lf
+let rf-lf = rfe-lf | rfi-lf
+
+(*
+ * RU, i.e., spin_is_locked() returning False, is slightly different.
+ * We rely on the memory model to rule out cases where spin_is_locked()
+ * within one of the lock's critical sections returns False.
+ *)
+
+(* rfi for RU events: an RU may read from the last po-previous UL *)
+let rfi-ru = ([UL] ; po-loc ; [RU]) \ ([UL] ; po-loc ; [LKW] ; po-loc)
+
+(* rfe for RU events: an RU may read from an external UL or the initial write *)
+let all-possible-rfe-ru =
+	let possible-rfe-ru r =
+		let pair-to-relation p = p ++ 0
+		in map pair-to-relation (((UL | IW) * {r}) & loc & ext)
+	in map possible-rfe-ru RU
+
+(* Generate all rf relations for RU events *)
+with rfe-ru from cross(all-possible-rfe-ru)
+let rf-ru = rfe-ru | rfi-ru
 
+(* Final rf relation *)
+let rf = rf | rf-lf | rf-ru
 
 (* Generate all co relations, including LKW events but not UL *)
 let co0 = co0 | ([IW] ; loc ; [LKW]) |
diff --git a/tools/memory-model/scripts/checkalllitmus.sh b/tools/memory-model/scripts/checkalllitmus.sh
new file mode 100644
index 0000000..af0aa15
--- /dev/null
+++ b/tools/memory-model/scripts/checkalllitmus.sh
@@ -0,0 +1,73 @@
+#!/bin/sh
+#
+# Run herd tests on all .litmus files in the specified directory (which
+# defaults to litmus-tests) and check each file's result against a "Result:"
+# comment within that litmus test.  If the verification result does not
+# match that specified in the litmus test, this script prints an error
+# message prefixed with "^^^".  It also outputs verification results to
+# a file whose name is that of the specified litmus test, but with ".out"
+# appended.
+#
+# Usage:
+#	sh checkalllitmus.sh [ directory ]
+#
+# The LINUX_HERD_OPTIONS environment variable may be used to specify
+# arguments to herd, whose default is defined by the checklitmus.sh script.
+# Thus, one would normally run this in the directory containing the memory
+# model, specifying the pathname of the litmus test to check.
+#
+# This script makes no attempt to run the litmus tests concurrently.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, you can access it online at
+# http://www.gnu.org/licenses/gpl-2.0.html.
+#
+# Copyright IBM Corporation, 2018
+#
+# Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+
+litmusdir=${1-litmus-tests}
+if test -d "$litmusdir" -a -r "$litmusdir" -a -x "$litmusdir"
+then
+	:
+else
+	echo ' --- ' error: $litmusdir is not an accessible directory
+	exit 255
+fi
+
+# Find the checklitmus script.  If it is not where we expect it, then
+# assume that the caller has the PATH environment variable set
+# appropriately.
+if test -x scripts/checklitmus.sh
+then
+	clscript=scripts/checklitmus.sh
+else
+	clscript=checklitmus.sh
+fi
+
+# Run the script on all the litmus tests in the specified directory
+ret=0
+for i in litmus-tests/*.litmus
+do
+	if ! $clscript $i
+	then
+		ret=1
+	fi
+done
+if test "$ret" -ne 0
+then
+	echo " ^^^ VERIFICATION MISMATCHES"
+else
+	echo All litmus tests verified as was expected.
+fi
+exit $ret
diff --git a/tools/memory-model/scripts/checklitmus.sh b/tools/memory-model/scripts/checklitmus.sh
new file mode 100644
index 0000000..e2e4774
--- /dev/null
+++ b/tools/memory-model/scripts/checklitmus.sh
@@ -0,0 +1,86 @@
+#!/bin/sh
+#
+# Run a herd test and check the result against a "Result:" comment within
+# the litmus test.  If the verification result does not match that specified
+# in the litmus test, this script prints an error message prefixed with
+# "^^^" and exits with a non-zero status.  It also outputs verification
+# results to a file whose name is that of the specified litmus test, but
+# with ".out" appended.
+#
+# Usage:
+#	sh checklitmus.sh file.litmus
+#
+# The LINUX_HERD_OPTIONS environment variable may be used to specify
+# arguments to herd, which default to "-conf linux-kernel.cfg".  Thus,
+# one would normally run this in the directory containing the memory model,
+# specifying the pathname of the litmus test to check.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, you can access it online at
+# http://www.gnu.org/licenses/gpl-2.0.html.
+#
+# Copyright IBM Corporation, 2018
+#
+# Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+
+litmus=$1
+herdoptions=${LINUX_HERD_OPTIONS--conf linux-kernel.cfg}
+
+if test -f "$litmus" -a -r "$litmus"
+then
+	:
+else
+	echo ' --- ' error: \"$litmus\" is not a readable file
+	exit 255
+fi
+if grep -q '^ \* Result: ' $litmus
+then
+	outcome=`grep -m 1 '^ \* Result: ' $litmus | awk '{ print $3 }'`
+else
+	outcome=specified
+fi
+
+echo Herd options: $herdoptions > $litmus.out
+/usr/bin/time herd7 -o ~/tmp $herdoptions $litmus >> $litmus.out 2>&1
+grep "Herd options:" $litmus.out
+grep '^Observation' $litmus.out
+if grep -q '^Observation' $litmus.out
+then
+	:
+else
+	cat $litmus.out
+	echo ' ^^^ Verification error'
+	echo ' ^^^ Verification error' >> $litmus.out 2>&1
+	exit 255
+fi
+if test "$outcome" = DEADLOCK
+then
+	echo grep 3 and 4
+	if grep '^Observation' $litmus.out | grep -q 'Never 0 0$'
+	then
+		ret=0
+	else
+		echo " ^^^ Unexpected non-$outcome verification"
+		echo " ^^^ Unexpected non-$outcome verification" >> $litmus.out 2>&1
+		ret=1
+	fi
+elif grep '^Observation' $litmus.out | grep -q $outcome || test "$outcome" = Maybe
+then
+	ret=0
+else
+	echo " ^^^ Unexpected non-$outcome verification"
+	echo " ^^^ Unexpected non-$outcome verification" >> $litmus.out 2>&1
+	ret=1
+fi
+tail -2 $litmus.out | head -1
+exit $ret
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh b/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh
new file mode 100755
index 0000000..98f650c
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh
@@ -0,0 +1,56 @@
+#!/bin/sh
+#
+# Invoke a text editor on all console.log files for all runs with diagnostics,
+# that is, on all such files having a console.log.diags counterpart.
+# Note that both console.log.diags and console.log are passed to the
+# editor (currently defaulting to "vi"), allowing the user to get an
+# idea of what to search for in the console.log file.
+#
+# Usage: kvm-find-errors.sh directory
+#
+# The "directory" above should end with the date/time directory, for example,
+# "tools/testing/selftests/rcutorture/res/2018.02.25-14:27:27".
+
+rundir="${1}"
+if test -z "$rundir" -o ! -d "$rundir"
+then
+	echo Usage: $0 directory
+fi
+editor=${EDITOR-vi}
+
+# Find builds with errors
+files=
+for i in ${rundir}/*/Make.out
+do
+	if egrep -q "error:|warning:" < $i
+	then
+		egrep "error:|warning:" < $i > $i.diags
+		files="$files $i.diags $i"
+	fi
+done
+if test -n "$files"
+then
+	$editor $files
+else
+	echo No build errors.
+fi
+if grep -q -e "--buildonly" < ${rundir}/log
+then
+	echo Build-only run, no console logs to check.
+fi
+
+# Find console logs with errors
+files=
+for i in ${rundir}/*/console.log
+do
+	if test -r $i.diags
+	then
+		files="$files $i.diags $i"
+	fi
+done
+if test -n "$files"
+then
+	$editor $files
+else
+	echo No errors in console logs.
+fi
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh
index c2e1bb6..477ecb1 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh
@@ -34,11 +34,15 @@ fi
 
 configfile=`echo $i | sed -e 's/^.*\///'`
 ngps=`grep ver: $i/console.log 2> /dev/null | tail -1 | sed -e 's/^.* ver: //' -e 's/ .*$//'`
+stopstate="`grep 'End-test grace-period state: g' $i/console.log 2> /dev/null |
+	    tail -1 | sed -e 's/^\[[ 0-9.]*] //' |
+	    awk '{ print \"[\" $1 \" \" $5 \" \" $6 \" \" $7 \"]\"; }' |
+	    tr -d '\012\015'`"
 if test -z "$ngps"
 then
-	echo "$configfile -------"
+	echo "$configfile ------- " $stopstate
 else
-	title="$configfile ------- $ngps grace periods"
+	title="$configfile ------- $ngps GPs"
 	dur=`sed -e 's/^.* rcutorture.shutdown_secs=//' -e 's/ .*$//' < $i/qemu-cmd 2> /dev/null`
 	if test -z "$dur"
 	then
@@ -46,9 +50,9 @@ else
 	else
 		ngpsps=`awk -v ngps=$ngps -v dur=$dur '
 			BEGIN { print ngps / dur }' < /dev/null`
-		title="$title ($ngpsps per second)"
+		title="$title ($ngpsps/s)"
 	fi
-	echo $title
+	echo $title $stopstate
 	nclosecalls=`grep --binary-files=text 'torture: Reader Batch' $i/console.log | tail -1 | awk '{for (i=NF-8;i<=NF;i++) sum+=$i; } END {print sum}'`
 	if test -z "$nclosecalls"
 	then
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh
index f7e988f..c27e978 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh
@@ -48,10 +48,6 @@ do
 				cat $i/Make.oldconfig.err
 			fi
 			parse-build.sh $i/Make.out $configfile
-			if test "$TORTURE_SUITE" != rcuperf
-			then
-				parse-torture.sh $i/console.log $configfile
-			fi
 			parse-console.sh $i/console.log $configfile
 			if test -r $i/Warnings
 			then
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
index 5f8fbb0..c5b0f94 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
@@ -267,5 +267,4 @@ then
 	echo Unknown PID, cannot kill qemu command
 fi
 
-parse-torture.sh $resdir/console.log $title
 parse-console.sh $resdir/console.log $title
diff --git a/tools/testing/selftests/rcutorture/bin/parse-console.sh b/tools/testing/selftests/rcutorture/bin/parse-console.sh
index 08aa7d5..1729343 100755
--- a/tools/testing/selftests/rcutorture/bin/parse-console.sh
+++ b/tools/testing/selftests/rcutorture/bin/parse-console.sh
@@ -24,57 +24,146 @@
 #
 # Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
 
+T=${TMPDIR-/tmp}/parse-console.sh.$$
 file="$1"
 title="$2"
 
+trap 'rm -f $T.seq $T.diags' 0
+
 . functions.sh
 
+# Check for presence and readability of console output file
+if test -f "$file" -a -r "$file"
+then
+	:
+else
+	echo $title unreadable console output file: $file
+	exit 1
+fi
 if grep -Pq '\x00' < $file
 then
 	print_warning Console output contains nul bytes, old qemu still running?
 fi
-egrep 'Badness|WARNING:|Warn|BUG|===========|Call Trace:|Oops:|detected stalls on CPUs/tasks:|self-detected stall on CPU|Stall ended before state dump start|\?\?\? Writer stall state|rcu_.*kthread starved for' < $file | grep -v 'ODEBUG: ' | grep -v 'Warning: unable to open an initial console' > $1.diags
-if test -s $1.diags
+cat /dev/null > $file.diags
+
+# Check for proper termination, except that rcuperf runs don't indicate this.
+if test "$TORTURE_SUITE" != rcuperf
 then
-	print_warning Assertion failure in $file $title
-	# cat $1.diags
+	# check for abject failure
+
+	if grep -q FAILURE $file || grep -q -e '-torture.*!!!' $file
+	then
+		nerrs=`grep --binary-files=text '!!!' $file |
+		tail -1 |
+		awk '
+		{
+			for (i=NF-8;i<=NF;i++)
+				sum+=$i;
+		}
+		END { print sum }'`
+		print_bug $title FAILURE, $nerrs instances
+		exit
+	fi
+
+	grep --binary-files=text 'torture:.*ver:' $file |
+	egrep --binary-files=text -v '\(null\)|rtc: 000000000* ' |
+	sed -e 's/^(initramfs)[^]]*] //' -e 's/^\[[^]]*] //' |
+	awk '
+	BEGIN	{
+		ver = 0;
+		badseq = 0;
+		}
+
+		{
+		if (!badseq && ($5 + 0 != $5 || $5 <= ver)) {
+			badseqno1 = ver;
+			badseqno2 = $5;
+			badseqnr = NR;
+			badseq = 1;
+		}
+		ver = $5
+		}
+
+	END	{
+		if (badseq) {
+			if (badseqno1 == badseqno2 && badseqno2 == ver)
+				print "GP HANG at " ver " torture stat " badseqnr;
+			else
+				print "BAD SEQ " badseqno1 ":" badseqno2 " last:" ver " version " badseqnr;
+		}
+		}' > $T.seq
+
+	if grep -q SUCCESS $file
+	then
+		if test -s $T.seq
+		then
+			print_warning $title `cat $T.seq`
+			echo "   " $file
+			exit 2
+		fi
+	else
+		if grep -q "_HOTPLUG:" $file
+		then
+			print_warning HOTPLUG FAILURES $title `cat $T.seq`
+			echo "   " $file
+			exit 3
+		fi
+		echo $title no success message, `grep --binary-files=text 'ver:' $file | wc -l` successful version messages
+		if test -s $T.seq
+		then
+			print_warning $title `cat $T.seq`
+		fi
+		exit 2
+	fi
+fi | tee -a $file.diags
+
+egrep 'Badness|WARNING:|Warn|BUG|===========|Call Trace:|Oops:|detected stalls on CPUs/tasks:|self-detected stall on CPU|Stall ended before state dump start|\?\?\? Writer stall state|rcu_.*kthread starved for' < $file |
+grep -v 'ODEBUG: ' |
+grep -v 'Warning: unable to open an initial console' > $T.diags
+if test -s $T.diags
+then
+	print_warning "Assertion failure in $file $title"
+	# cat $T.diags
 	summary=""
-	n_badness=`grep -c Badness $1`
+	n_badness=`grep -c Badness $file`
 	if test "$n_badness" -ne 0
 	then
 		summary="$summary  Badness: $n_badness"
 	fi
-	n_warn=`grep -v 'Warning: unable to open an initial console' $1 | egrep -c 'WARNING:|Warn'`
+	n_warn=`grep -v 'Warning: unable to open an initial console' $file | egrep -c 'WARNING:|Warn'`
 	if test "$n_warn" -ne 0
 	then
 		summary="$summary  Warnings: $n_warn"
 	fi
-	n_bugs=`egrep -c 'BUG|Oops:' $1`
+	n_bugs=`egrep -c 'BUG|Oops:' $file`
 	if test "$n_bugs" -ne 0
 	then
 		summary="$summary  Bugs: $n_bugs"
 	fi
-	n_calltrace=`grep -c 'Call Trace:' $1`
+	n_calltrace=`grep -c 'Call Trace:' $file`
 	if test "$n_calltrace" -ne 0
 	then
 		summary="$summary  Call Traces: $n_calltrace"
 	fi
-	n_lockdep=`grep -c =========== $1`
+	n_lockdep=`grep -c =========== $file`
 	if test "$n_badness" -ne 0
 	then
 		summary="$summary  lockdep: $n_badness"
 	fi
-	n_stalls=`egrep -c 'detected stalls on CPUs/tasks:|self-detected stall on CPU|Stall ended before state dump start|\?\?\? Writer stall state' $1`
+	n_stalls=`egrep -c 'detected stalls on CPUs/tasks:|self-detected stall on CPU|Stall ended before state dump start|\?\?\? Writer stall state' $file`
 	if test "$n_stalls" -ne 0
 	then
 		summary="$summary  Stalls: $n_stalls"
 	fi
-	n_starves=`grep -c 'rcu_.*kthread starved for' $1`
+	n_starves=`grep -c 'rcu_.*kthread starved for' $file`
 	if test "$n_starves" -ne 0
 	then
 		summary="$summary  Starves: $n_starves"
 	fi
 	print_warning Summary: $summary
-else
-	rm $1.diags
+	cat $T.diags >> $file.diags
+fi
+if ! test -s $file.diags
+then
+	rm -f $file.diags
 fi
diff --git a/tools/testing/selftests/rcutorture/bin/parse-torture.sh b/tools/testing/selftests/rcutorture/bin/parse-torture.sh
deleted file mode 100755
index 5987e50..0000000
--- a/tools/testing/selftests/rcutorture/bin/parse-torture.sh
+++ /dev/null
@@ -1,105 +0,0 @@
-#!/bin/bash
-#
-# Check the console output from a torture run for goodness.
-# The "file" is a pathname on the local system, and "title" is
-# a text string for error-message purposes.
-#
-# The file must contain torture output, but can be interspersed
-# with other dmesg text, as in console-log output.
-#
-# Usage: parse-torture.sh file title
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, you can access it online at
-# http://www.gnu.org/licenses/gpl-2.0.html.
-#
-# Copyright (C) IBM Corporation, 2011
-#
-# Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
-
-T=${TMPDIR-/tmp}/parse-torture.sh.$$
-file="$1"
-title="$2"
-
-trap 'rm -f $T.seq' 0
-
-. functions.sh
-
-# check for presence of torture output file.
-
-if test -f "$file" -a -r "$file"
-then
-	:
-else
-	echo $title unreadable torture output file: $file
-	exit 1
-fi
-
-# check for abject failure
-
-if grep -q FAILURE $file || grep -q -e '-torture.*!!!' $file
-then
-	nerrs=`grep --binary-files=text '!!!' $file | tail -1 | awk '{for (i=NF-8;i<=NF;i++) sum+=$i; } END {print sum}'`
-	print_bug $title FAILURE, $nerrs instances
-	echo "   " $url
-	exit
-fi
-
-grep --binary-files=text 'torture:.*ver:' $file | egrep --binary-files=text -v '\(null\)|rtc: 000000000* ' | sed -e 's/^(initramfs)[^]]*] //' -e 's/^\[[^]]*] //' |
-awk '
-BEGIN	{
-	ver = 0;
-	badseq = 0;
-	}
-
-	{
-	if (!badseq && ($5 + 0 != $5 || $5 <= ver)) {
-		badseqno1 = ver;
-		badseqno2 = $5;
-		badseqnr = NR;
-		badseq = 1;
-	}
-	ver = $5
-	}
-
-END	{
-	if (badseq) {
-		if (badseqno1 == badseqno2 && badseqno2 == ver)
-			print "GP HANG at " ver " torture stat " badseqnr;
-		else
-			print "BAD SEQ " badseqno1 ":" badseqno2 " last:" ver " version " badseqnr;
-	}
-	}' > $T.seq
-
-if grep -q SUCCESS $file
-then
-	if test -s $T.seq
-	then
-		print_warning $title $title `cat $T.seq`
-		echo "   " $file
-		exit 2
-	fi
-else
-	if grep -q "_HOTPLUG:" $file
-	then
-		print_warning HOTPLUG FAILURES $title `cat $T.seq`
-		echo "   " $file
-		exit 3
-	fi
-	echo $title no success message, `grep --binary-files=text 'ver:' $file | wc -l` successful version messages
-	if test -s $T.seq
-	then
-		print_warning $title `cat $T.seq`
-	fi
-	exit 2
-fi
diff --git a/tools/virtio/linux/dma-mapping.h b/tools/virtio/linux/dma-mapping.h
index 1571e24..f91aeb5f 100644
--- a/tools/virtio/linux/dma-mapping.h
+++ b/tools/virtio/linux/dma-mapping.h
@@ -6,8 +6,6 @@
 # error Virtio userspace code does not support CONFIG_HAS_DMA
 #endif
 
-#define PCI_DMA_BUS_IS_PHYS 1
-
 enum dma_data_direction {
 	DMA_BIDIRECTIONAL = 0,
 	DMA_TO_DEVICE = 1,
diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index 7f6a944..8d90de2 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -1401,6 +1401,7 @@ static void kvm_send_hwpoison_signal(unsigned long address,
 {
 	siginfo_t info;
 
+	clear_siginfo(&info);
 	info.si_signo   = SIGBUS;
 	info.si_errno   = 0;
 	info.si_code    = BUS_MCEERR_AR;
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 6e865e8..90d30fb 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -397,7 +397,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
 	 * Check if there was an event already pending on the eventfd
 	 * before we registered, and trigger it as if we didn't miss it.
 	 */
-	events = f.file->f_op->poll(f.file, &irqfd->pt);
+	events = vfs_poll(f.file, &irqfd->pt);
 
 	if (events & EPOLLIN)
 		schedule_work(&irqfd->inject);